From 22a0ac0fc4ae0899e07564323a1a789602043ed8 Mon Sep 17 00:00:00 2001 From: Jorgen T Date: Thu, 13 Jul 2023 15:15:41 +0200 Subject: [PATCH 01/96] [CODEGEN] Added GPU abstraction to CODEGEN --- .../iolibs/template_files/gpu/Bridge.h | 30 +- .../template_files/gpu/BridgeKernels.cc | 7 +- .../iolibs/template_files/gpu/BridgeKernels.h | 6 +- .../gpu/CommonRandomNumberKernel.cc | 3 +- .../template_files/gpu/CrossSectionKernels.cc | 5 +- .../template_files/gpu/CrossSectionKernels.h | 4 +- .../iolibs/template_files/gpu/CudaRuntime.h | 4 +- .../gpu/CurandRandomNumberKernel.cc | 10 +- .../template_files/gpu/EventStatistics.h | 2 +- .../template_files/gpu/GpuAbstraction.h | 79 +++++ .../iolibs/template_files/gpu/GpuRuntime.h | 80 +++++ .../iolibs/template_files/gpu/MadgraphTest.h | 6 +- .../gpu/MatrixElementKernels.cc | 24 +- .../template_files/gpu/MatrixElementKernels.h | 6 +- .../template_files/gpu/MemoryAccessHelpers.h | 2 +- .../template_files/gpu/MemoryAccessMomenta.h | 24 +- .../gpu/MemoryAccessRandomNumbers.h | 2 +- .../template_files/gpu/MemoryAccessVectors.h | 2 +- .../iolibs/template_files/gpu/MemoryBuffers.h | 61 ++-- .../gpu/RamboSamplingKernels.cc | 18 +- .../template_files/gpu/RamboSamplingKernels.h | 4 +- .../template_files/gpu/RandomNumberKernels.h | 4 +- .../iolibs/template_files/gpu/check_sa.cc | 74 ++-- .../template_files/gpu/cpp_hel_amps_h.inc | 2 +- .../iolibs/template_files/gpu/cudacpp.mk | 323 +++++++++++------- .../iolibs/template_files/gpu/fbridge.cc | 10 +- .../iolibs/template_files/gpu/fsampler.cc | 6 +- .../iolibs/template_files/gpu/mgOnGpuConfig.h | 25 +- .../template_files/gpu/mgOnGpuCxtypes.h | 16 +- .../template_files/gpu/mgOnGpuFptypes.h | 8 +- .../template_files/gpu/mgOnGpuVectors.h | 18 +- .../iolibs/template_files/gpu/process_cc.inc | 2 +- .../gpu/process_function_definitions.inc | 32 +- .../iolibs/template_files/gpu/process_h.inc | 8 +- .../template_files/gpu/process_matrix.inc | 8 +- .../gpu/process_sigmaKin_function.inc | 9 +- .../iolibs/template_files/gpu/rambo.h | 6 +- .../iolibs/template_files/gpu/runTest.cc | 10 +- .../iolibs/template_files/gpu/testmisc.cc | 2 +- .../iolibs/template_files/gpu/testxxx.cc | 6 +- .../PLUGIN/CUDACPP_SA_OUTPUT/output.py | 6 +- 41 files changed, 601 insertions(+), 353 deletions(-) create mode 100644 epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/GpuAbstraction.h create mode 100644 epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/GpuRuntime.h diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/Bridge.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/Bridge.h index bf8b5e024d..51241e9840 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/Bridge.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/Bridge.h @@ -23,7 +23,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +83,7 @@ namespace mg5amcCpu Bridge& operator=( const Bridge& ) = delete; Bridge& operator=( Bridge&& ) = delete; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL /** * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != gpublocks*gputhreads * (this is needed for BridgeKernel tests rather than for actual production use in Fortran) @@ -150,7 +150,7 @@ namespace mg5amcCpu unsigned int m_nevt; // number of events int m_nGoodHel; // the number of good 
helicities (-1 initially when they have not yet been calculated) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL int m_gputhreads; // number of gpu threads (default set from number of events, can be modified) int m_gpublocks; // number of gpu blocks (default set from number of events, can be modified) DeviceBuffer m_devMomentaF; @@ -187,12 +187,12 @@ namespace mg5amcCpu // Forward declare transposition methods // -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template<typename Tin, typename Tout> __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL template<typename Tin, typename Tout> void hst_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); @@ -209,7 +209,7 @@ namespace mg5amcCpu Bridge::Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ) : m_nevt( nevtF ) , m_nGoodHel( -1 ) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL , m_gputhreads( 256 ) // default number of gpu threads , m_gpublocks( m_nevt / m_gputhreads ) // this ensures m_nevt <= m_gpublocks*m_gputhreads , m_devMomentaF( m_nevt ) @@ -233,7 +233,7 @@ namespace mg5amcCpu { if( nparF != CPPProcess::npar ) throw std::runtime_error( "Bridge constructor: npar mismatch" ); if( np4F != CPPProcess::np4 ) throw std::runtime_error( "Bridge constructor: np4 mismatch" ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( ( m_nevt < s_gputhreadsmin ) || ( m_nevt % s_gputhreadsmin != 0 ) ) throw std::runtime_error( "Bridge constructor: nevt should be a multiple of " + std::to_string( s_gputhreadsmin ) ); while( m_nevt != m_gpublocks * m_gputhreads ) @@ -249,7 +249,7 @@ #else std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate is called from several Fortran threads? @@ -262,7 +262,7 @@ process.initProc( paramCard ); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template<typename FORTRANFPTYPE> void Bridge<FORTRANFPTYPE>::set_gpugrid( const int gpublocks, const int gputhreads ) { @@ -276,7 +276,7 @@ } #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template<typename FORTRANFPTYPE> void Bridge<FORTRANFPTYPE>::gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -291,14 +291,14 @@ constexpr int neppM = MemoryAccessMomenta::neppM; if constexpr( neppM == 1 && std::is_same_v<FORTRANFPTYPE, fptype> ) { - checkCuda( cudaMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), gpuMemcpyHostToDevice ); } else { - checkCuda( cudaMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), gpuMemcpyHostToDevice ); const int thrPerEvt = CPPProcess::npar * CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 event per thread) //const int thrPerEvt = 1; // AV: try new alg with 1 event per thread...
this seems slower - dev_transposeMomentaF2C<<>>( m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); + gpuLaunchKernel( dev_transposeMomentaF2C, m_gpublocks * thrPerEvt, m_gputhreads, m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); } if constexpr( std::is_same_v ) { @@ -341,7 +341,7 @@ namespace mg5amcCpu } #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL template void Bridge::cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -396,7 +396,7 @@ namespace mg5amcCpu // - C++ array: momenta[npagM][npar][np4][neppM] with nevt=npagM*neppM (AOSOA) // -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ) { diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/BridgeKernels.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/BridgeKernels.cc index d58066c9c1..6034db93ec 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/BridgeKernels.cc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/BridgeKernels.cc @@ -5,13 +5,14 @@ #include "BridgeKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMomenta.h" #include //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -45,7 +46,7 @@ namespace mg5amcCpu //============================================================================ -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu { @@ -96,7 +97,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/BridgeKernels.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/BridgeKernels.h index 15eb4bff4d..7c7feb692a 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/BridgeKernels.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/BridgeKernels.h @@ -12,7 +12,7 @@ #include "MatrixElementKernels.h" #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -49,7 +49,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a CPU host class BridgeKernelHost final : public BridgeKernelBase { @@ -89,7 +89,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a GPU device class BridgeKernelDevice : public BridgeKernelBase { diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CommonRandomNumberKernel.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CommonRandomNumberKernel.cc index 985b39f576..f17b9c0ad7 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CommonRandomNumberKernel.cc +++ 
b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CommonRandomNumberKernel.cc @@ -4,12 +4,13 @@ // Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "CommonRandomNumbers.h" +#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CrossSectionKernels.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CrossSectionKernels.cc index 0b355a3c8d..36ca2a94d4 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CrossSectionKernels.cc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CrossSectionKernels.cc @@ -5,6 +5,7 @@ #include "CrossSectionKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessWeights.h" #include "MemoryBuffers.h" @@ -77,7 +78,7 @@ debug_me_is_abnormal( const fptype& me, size_t ievtALL ) //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -185,7 +186,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CrossSectionKernels.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CrossSectionKernels.h index 7933ca4bbf..ff2350a14d 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CrossSectionKernels.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CrossSectionKernels.h @@ -13,7 +13,7 @@ //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -96,7 +96,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating the calculation of event statistics on a GPU device class CrossSectionKernelDevice : public CrossSectionKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CudaRuntime.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CudaRuntime.h index 64ce52f4b3..df0c3f3df8 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CudaRuntime.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CudaRuntime.h @@ -15,7 +15,7 @@ //-------------------------------------------------------------------------- // See https://stackoverflow.com/a/14038590 -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ #define checkCuda( code ) { assertCuda( code, __FILE__, __LINE__ ); } inline void assertCuda( cudaError_t code, const char* file, int line, bool abort = true ) { @@ -29,7 +29,7 @@ inline void assertCuda( cudaError_t code, const char* file, int line, bool abort //-------------------------------------------------------------------------- 
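A minimal sketch (not part of the patch) of the guard pattern this commit applies in every hunk: the CUDA-only __CUDACC__ guard becomes MGONGPUCPP_GPUIMPL, which the new GpuAbstraction.h (added below) defines for both nvcc and hipcc builds, so one macro now selects the GPU namespace on either backend:

    #ifdef MGONGPUCPP_GPUIMPL
    namespace mg5amcGpu // GPU build (CUDA or HIP)
    #else
    namespace mg5amcCpu // CPU build: the same source compiles into a separate namespace
    #endif
    {
      // ... identical code, compiled once per target ...
    }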
-#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { // Instantiate a CudaRuntime at the beginning of the application's main to diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CurandRandomNumberKernel.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CurandRandomNumberKernel.cc index eb56333b03..5b33207ad0 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CurandRandomNumberKernel.cc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CurandRandomNumberKernel.cc @@ -3,7 +3,7 @@ // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. // Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" @@ -22,7 +22,7 @@ inline void assertCurand( curandStatus_t code, const char *file, int line, bool } #endif /* clang-format on */ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_CUDACC namespace mg5amcGpu #else namespace mg5amcCpu @@ -36,7 +36,7 @@ namespace mg5amcCpu { if( m_isOnDevice ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_CUDACC if( !m_rnarray.isOnDevice() ) throw std::runtime_error( "CurandRandomNumberKernel on device with a host random number array" ); #else @@ -114,7 +114,7 @@ namespace mg5amcCpu /* printf( "\nCurandRandomNumberKernel::generateRnarray size = %d\n", (int)m_rnarray.size() ); fptype* data = m_rnarray.data(); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( m_rnarray.isOnDevice() ) { data = new fptype[m_rnarray.size()](); @@ -123,7 +123,7 @@ #endif for( int i = 0; i < ( (int)m_rnarray.size() / 4 ); i++ ) printf( "[%4d] %f %f %f %f\n", i * 4, data[i * 4], data[i * 4 + 1], data[i * 4 + 2], data[i * 4 + 3] ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( m_rnarray.isOnDevice() ) delete[] data; #endif */ diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/EventStatistics.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/EventStatistics.h index 48b51e0a49..e7d7f3b3c3 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/EventStatistics.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/EventStatistics.h @@ -16,7 +16,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/GpuAbstraction.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/GpuAbstraction.h new file mode 100644 index 0000000000..98a0124b55 --- /dev/null +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/GpuAbstraction.h @@ -0,0 +1,79 @@ +#ifndef MG5AMC_GPUABSTRACTION_H +#define MG5AMC_GPUABSTRACTION_H 1 + +#include <cassert> + +#ifdef __CUDACC__ + #define MGONGPUCPP_CUDACC 1 +#endif + +#ifdef __HIPCC__ + #include "hip/hip_runtime.h" + #define MGONGPUCPP_HIPCC 1 +#endif + +#ifdef MGONGPUCPP_CUDACC + + // Defines correct compiler + #define MGONGPUCPP_GPUIMPL MGONGPUCPP_CUDACC + + //-------------------------------------------------------------------------- + + #define gpuError_t cudaError_t + #define gpuPeekAtLastError cudaPeekAtLastError + #define gpuGetErrorString cudaGetErrorString + #define gpuSuccess
cudaSuccess + + #define gpuMallocHost(ptr, size) checkGpu( cudaMallocHost(ptr, size) ) + #define gpuMalloc(ptr, size) checkGpu( cudaMalloc(ptr, size) ) + + #define gpuMemcpy(dstData, srcData, srcBytes, func) checkGpu( cudaMemcpy(dstData, srcData, srcBytes, func) ) + #define gpuMemcpyHostToDevice cudaMemcpyHostToDevice + #define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost + #define gpuMemcpyToSymbol(type1, type2, size) checkGpu( cudaMemcpyToSymbol(type1, type2, size) ) + + #define gpuFree(ptr) checkGpu( cudaFree(ptr) ) + #define gpuFreeHost(ptr) checkGpu( cudaFreeHost(ptr) ) + + #define gpuSetDevice cudaSetDevice + #define gpuDeviceSynchronize cudaDeviceSynchronize + #define gpuDeviceReset cudaDeviceReset + + #define gpuLaunchKernel( kernel, blocks, threads, ...) kernel<<<blocks, threads>>>( __VA_ARGS__ ) + #define gpuLaunchKernelSharedMem(kernel, blocks, threads, sharedMem, ...) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ ) + +//-------------------------------------------------------------------------- + +#elif defined MGONGPUCPP_HIPCC + + // Defines correct compiler + #define MGONGPUCPP_GPUIMPL MGONGPUCPP_HIPCC + + //-------------------------------------------------------------------------- + + #define gpuError_t hipError_t + #define gpuPeekAtLastError hipPeekAtLastError + #define gpuGetErrorString hipGetErrorString + #define gpuSuccess hipSuccess + + #define gpuMallocHost(ptr, size) checkGpu( hipHostMalloc(ptr, size) ) // NB: hipHostMalloc, as hipMallocHost is deprecated + #define gpuMalloc(ptr, size) checkGpu( hipMalloc(ptr, size) ) + + #define gpuMemcpy(dstData, srcData, srcBytes, func) checkGpu( hipMemcpy(dstData, srcData, srcBytes, func) ) + #define gpuMemcpyHostToDevice hipMemcpyHostToDevice + #define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost + #define gpuMemcpyToSymbol(type1, type2, size) checkGpu( hipMemcpyToSymbol(type1, type2, size) ) + + #define gpuFree(ptr) checkGpu( hipFree(ptr) ) + #define gpuFreeHost(ptr) checkGpu( hipHostFree(ptr) ) + + #define gpuSetDevice hipSetDevice + #define gpuDeviceSynchronize hipDeviceSynchronize + #define gpuDeviceReset hipDeviceReset + + #define gpuLaunchKernel( kernel, blocks, threads, ...) kernel<<<blocks, threads>>>( __VA_ARGS__ ) + #define gpuLaunchKernelSharedMem(kernel, blocks, threads, sharedMem, ...) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ ) + +#endif + +#endif // MG5AMC_GPUABSTRACTION_H \ No newline at end of file diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/GpuRuntime.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/GpuRuntime.h new file mode 100644 index 0000000000..86c9179f4c --- /dev/null +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/GpuRuntime.h @@ -0,0 +1,80 @@ +#ifndef MG5AMC_GPURUNTIME_H +#define MG5AMC_GPURUNTIME_H 1 + +// MG5AMC on GPU uses the CUDA runtime API, not the lower level CUDA driver API +// See https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html#driver-vs-runtime-api + +#include "GpuAbstraction.h" + +#include <iostream> + +//-------------------------------------------------------------------------- + +// See https://stackoverflow.com/a/14038590 +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#define checkGpu( code ) { assertGpu( code, __FILE__, __LINE__ ); } +inline void assertGpu( gpuError_t code, const char* file, int line, bool abort = true ) +{ + if( code != gpuSuccess ) + { + printf( "ERROR!
assertGpu: '%s' (%d) in %s:%d\n", gpuGetErrorString( code ), code, file, line ); + if( abort ) assert( code == gpuSuccess ); + } +} +#endif /* clang-format on */ + +//-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +{ + // Instantiate a GpuRuntime at the beginning of the application's main to + // invoke gpuSetDevice(0) in the constructor and book a gpuDeviceReset() call in the destructor + // *** FIXME! This will all need to be designed differently when going to multi-GPU nodes! *** + struct GpuRuntime final + { + GpuRuntime( const bool debug = true ) + : m_debug( debug ) { setUp( m_debug ); } + ~GpuRuntime() { tearDown( m_debug ); } + GpuRuntime( const GpuRuntime& ) = delete; + GpuRuntime( GpuRuntime&& ) = delete; + GpuRuntime& operator=( const GpuRuntime& ) = delete; + GpuRuntime& operator=( GpuRuntime&& ) = delete; + bool m_debug; + + // Set up CUDA application + // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** + // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization + static void setUp( const bool debug = true ) + { + // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization + // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! + /* + // [We initially added cudaFree(0) to "ease profile analysis" only because it shows up as a big recognizable block!] + // No explicit initialization is needed: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#initialization + // It is not clear what cudaFree(0) does at all: https://stackoverflow.com/questions/69967813/ + if ( debug ) std::cout << "__CudaRuntime: calling cudaFree(0)" << std::endl; + checkCuda( cudaFree( 0 ) ); // SLOW! + */ + // Replace cudaFree(0) by cudaSetDevice(0), even if it is not really needed either + // (but see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs) + if( debug ) std::cout << "__GpuRuntime: calling gpuSetDevice(0)" << std::endl; + checkGpu( gpuSetDevice( 0 ) ); // SLOW!
+ } + + // Tear down CUDA application (call cudaDeviceReset) + // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** + // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck + // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking + static void tearDown( const bool debug = true ) + { + if( debug ) std::cout << "__GpuRuntime: calling gpuDeviceReset()" << std::endl; + checkGpu( gpuDeviceReset() ); + } + }; +} +#endif + +//-------------------------------------------------------------------------- + +#endif // MG5AMC_GPURUNTIME_H diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MadgraphTest.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MadgraphTest.h index ffe3b84d53..3fa9f13a82 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MadgraphTest.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MadgraphTest.h @@ -21,7 +21,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; @@ -200,7 +200,7 @@ class MadgraphTest : public testing::TestWithParam // Since we link both the CPU-only and GPU tests into the same executable, we prevent // a multiply defined symbol by only compiling this in the non-CUDA phase: -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL /// Compare momenta and matrix elements. /// This uses an implementation of TestDriverBase to run a madgraph workflow, @@ -309,6 +309,6 @@ TEST_P( MadgraphTest, CompareMomentaAndME ) } } -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL #endif /* MADGRAPHTEST_H_ */ diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.cc index 30257195b6..dd3eee4ea3 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.cc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.cc @@ -6,7 +6,7 @@ #include "MatrixElementKernels.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" // Includes the abstraction for Nvidia/AMD compilation #include "MemoryAccessMomenta.h" #include "MemoryBuffers.h" @@ -14,7 +14,7 @@ //============================================================================ -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu { @@ -143,7 +143,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { @@ -202,13 +202,13 @@ namespace mg5amcGpu PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); DeviceBufferHelicityMask devIsGoodHel( ncomb ); // ... 0d1.
Compute good helicity mask on the device - computeDependentCouplings<<<m_gpublocks, m_gputhreads>>>( m_gs.data(), m_couplings.data() ); + gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - sigmaKin_getGoodHel<<<m_gpublocks, m_gputhreads>>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); + gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); #else - sigmaKin_getGoodHel<<<m_gpublocks, m_gputhreads>>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); + gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); #endif - checkCuda( cudaPeekAtLastError() ); + checkGpu( gpuPeekAtLastError() ); // ... 0d2. Copy back good helicity mask to the host copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); // ... 0d3. Copy back good helicity list to constant memory on the device @@ -219,19 +219,19 @@ void MatrixElementKernelDevice::computeMatrixElements( const unsigned int channelId ) { - computeDependentCouplings<<<m_gpublocks, m_gputhreads>>>( m_gs.data(), m_couplings.data() ); + gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); #ifndef MGONGPU_NSIGHT_DEBUG constexpr unsigned int sharedMemSize = 0; #else constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - sigmaKin<<<m_gpublocks, m_gputhreads, sharedMemSize>>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); + gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); #else - sigmaKin<<<m_gpublocks, m_gputhreads, sharedMemSize>>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); + gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); #endif - checkCuda( cudaPeekAtLastError() ); - checkCuda( cudaDeviceSynchronize() ); + checkGpu( gpuPeekAtLastError() ); + checkGpu( gpuDeviceSynchronize() ); } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.h index 23e84757a2..4477a385ed 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.h @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -81,7 +81,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a
CPU host class MatrixElementKernelHost final : public MatrixElementKernelBase, public NumberOfEvents { @@ -130,7 +130,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a GPU device class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessHelpers.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessHelpers.h index c82a6c7635..67306c3922 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessHelpers.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessHelpers.h @@ -105,7 +105,7 @@ class KernelAccessHelper : public MemoryAccessHelper } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid //printf( "kernelAccessRecord: ievt=%d threadId=%d\n", ievt, threadIdx.x ); return T::ieventAccessRecord( buffer, ievt ); // NB fptype and fptype_sv coincide for CUDA diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessMomenta.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessMomenta.h index 29266de32c..dc4bb2aa22 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessMomenta.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessMomenta.h @@ -13,7 +13,7 @@ #include "MemoryAccessVectors.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -28,17 +28,17 @@ namespace mg5amcCpu { public: - // Number of Events Per Page in the momenta AOSOA memory buffer layout - // (these are all best kept as a compile-time constants: see issue #23) -#ifdef __CUDACC__ /* clang-format off */ - // ----------------------------------------------------------------------------------------------- - // --- GPUs: neppM is best set to a power of 2 times the number of fptype's in a 32-byte cacheline - // --- This is relevant to ensure coalesced access to momenta in global memory - // --- Note that neppR is hardcoded and may differ from neppM and neppV on some platforms - // ----------------------------------------------------------------------------------------------- - //static constexpr int neppM = 64/sizeof(fptype); // 2x 32-byte GPU cache lines (512 bits): 8 (DOUBLE) or 16 (FLOAT) - static constexpr int neppM = 32/sizeof(fptype); // (DEFAULT) 32-byte GPU cache line (256 bits): 4 (DOUBLE) or 8 (FLOAT) - //static constexpr int neppM = 1; // *** NB: this is equivalent to AOS *** (slower: 1.03E9 instead of 1.11E9 in eemumu) + // Number of Events Per Page in the momenta AOSOA memory buffer layout + // (these are all best kept as a compile-time constants: see issue #23) +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ + // ----------------------------------------------------------------------------------------------- + // --- GPUs: neppM is best set to a power of 2 times the number of fptype's in a 32-byte cacheline + // --- This is 
relevant to ensure coalesced access to momenta in global memory + // --- Note that neppR is hardcoded and may differ from neppM and neppV on some platforms + // ----------------------------------------------------------------------------------------------- + //static constexpr int neppM = 64/sizeof(fptype); // 2x 32-byte GPU cache lines (512 bits): 8 (DOUBLE) or 16 (FLOAT) + static constexpr int neppM = 32/sizeof(fptype); // (DEFAULT) 32-byte GPU cache line (256 bits): 4 (DOUBLE) or 8 (FLOAT) + //static constexpr int neppM = 1; // *** NB: this is equivalent to AOS *** (slower: 1.03E9 instead of 1.11E9 in eemumu) #else // ----------------------------------------------------------------------------------------------- // --- CPUs: neppM is best set equal to the number of fptype's (neppV) in a vector register diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessRandomNumbers.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessRandomNumbers.h index e2988d39f3..949a42066d 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessRandomNumbers.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessRandomNumbers.h @@ -11,7 +11,7 @@ #include "CPPProcess.h" #include "MemoryAccessHelpers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessVectors.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessVectors.h index e9b197368e..a9ae26b6dc 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessVectors.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessVectors.h @@ -10,7 +10,7 @@ #include "mgOnGpuVectors.h" -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu // this is only needed for CPU SIMD vectorization { diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryBuffers.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryBuffers.h index 48306a9d41..d6ba45dcad 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryBuffers.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryBuffers.h @@ -11,12 +11,11 @@ #include "mgOnGpuCxtypes.h" #include "CPPProcess.h" -#include "CudaRuntime.h" #include "Parameters_%(model_name)s.h" #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -87,7 +86,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL constexpr bool HostBufferALIGNED = false; // ismisaligned=false constexpr bool HostBufferMISALIGNED = true; // ismisaligned=true @@ -119,7 +118,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer template class PinnedHostBufferBase : public BufferBase @@ -128,18 +127,18 @@ namespace mg5amcCpu PinnedHostBufferBase( const size_t size ) : BufferBase( size, false ) { - checkCuda( 
cudaMallocHost( &( this->m_data ), this->bytes() ) ); + gpuMallocHost( &( this->m_data ), this->bytes() ); } virtual ~PinnedHostBufferBase() { - checkCuda( cudaFreeHost( this->m_data ) ); + gpuFreeHost( this->m_data ); } }; #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer template class DeviceBufferBase : public BufferBase @@ -148,18 +147,18 @@ namespace mg5amcCpu DeviceBufferBase( const size_t size ) : BufferBase( size, true ) { - checkCuda( cudaMalloc( &( this->m_data ), this->bytes() ) ); + gpuMalloc( &( this->m_data ), this->bytes() ); } virtual ~DeviceBufferBase() { - checkCuda( cudaFree( this->m_data ) ); + gpuFree( this->m_data ); } }; #endif //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for a given number of events template class HostBuffer : public HostBufferBase, virtual private NumberOfEvents @@ -175,7 +174,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer for a given number of events template class PinnedHostBuffer : public PinnedHostBufferBase, virtual private NumberOfEvents @@ -191,7 +190,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer for a given number of events template class DeviceBuffer : public DeviceBufferBase, virtual private NumberOfEvents @@ -213,7 +212,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta random numbers constexpr size_t sizePerEventRndNumMomenta = MemoryBuffers::np4 * MemoryBuffers::nparf; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta random numbers typedef HostBuffer HostBufferRndNumMomenta; #else @@ -232,7 +231,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer with ONE fptype per event constexpr size_t sizePerEventOneFp = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer with ONE fptype per event typedef HostBuffer HostBufferOneFp; #else @@ -257,7 +256,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for Gs constexpr size_t sizePerEventGs = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferGs; #else @@ -276,7 +275,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for numerators constexpr size_t sizePerEventNumerators = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferNumerators; #else @@ -296,7 +295,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for denominators constexpr size_t sizePerEventDenominators = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferDenominators; #else @@ -315,7 +314,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for random numbers constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; -#ifndef 
__CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferCouplings; #else @@ -333,7 +332,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta constexpr size_t sizePerEventMomenta = MemoryBuffers::np4 * MemoryBuffers::npar; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta typedef HostBuffer HostBufferMomenta; //typedef HostBuffer HostBufferMomenta; // TEST MISALIGNMENT! @@ -352,7 +351,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for sampling weights constexpr size_t sizePerEventWeights = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for sampling weights typedef HostBuffer HostBufferWeights; #else @@ -370,7 +369,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for matrix elements constexpr size_t sizePerEventMatrixElements = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for matrix elements typedef HostBuffer HostBufferMatrixElements; #else @@ -385,7 +384,7 @@ namespace mg5amcCpu // A base class encapsulating a memory buffer for the helicity mask typedef BufferBase BufferHelicityMask; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for the helicity mask typedef HostBufferBase HostBufferHelicityMask; #else @@ -403,7 +402,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for wavefunctions constexpr size_t sizePerEventWavefunctions = MemoryBuffers::nw6 * MemoryBuffers::nx2; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for wavefunctions typedef HostBuffer HostBufferWavefunctions; #else @@ -421,7 +420,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity random numbers constexpr size_t sizePerEventRndNumHelicity = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity random numbers typedef HostBuffer HostBufferRndNumHelicity; #else @@ -439,7 +438,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color random numbers constexpr size_t sizePerEventRndNumColor = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color random numbers typedef HostBuffer HostBufferRndNumColor; #else @@ -457,7 +456,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity selection constexpr size_t sizePerEventSelectedHelicity = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity selection typedef HostBuffer HostBufferSelectedHelicity; #else @@ -475,7 +474,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color selection constexpr size_t sizePerEventSelectedColor = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color selection typedef HostBuffer HostBufferSelectedColor; #else @@ -487,7 +486,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy { @@ -504,13 +503,13 @@ namespace mg5amcCpu throw 
std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyHostToDevice ); } #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void copyHostFromDevice( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy { @@ -527,7 +526,7 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyDeviceToHost ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyDeviceToHost ); } #endif diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/RamboSamplingKernels.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/RamboSamplingKernels.cc index da68aa9255..8745b084d3 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/RamboSamplingKernels.cc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/RamboSamplingKernels.cc @@ -5,7 +5,7 @@ #include "RamboSamplingKernels.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessRandomNumbers.h" #include "MemoryAccessWeights.h" @@ -14,7 +14,7 @@ #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -92,7 +92,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingKernelDevice::RamboSamplingKernelDevice( const fptype energy, // input: energy const BufferRndNumMomenta& rndmom, // input: random numbers in [0,1] BufferMomenta& momenta, // output: momenta @@ -135,7 +135,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaInitialDevice( const fptype energy, fptype* momenta ) @@ -147,17 +147,17 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaInitial() { - getMomentaInitialDevice<<>>( m_energy, m_momenta.data() ); + gpuLaunchKernel( getMomentaInitialDevice, m_gpublocks, m_gputhreads, m_energy, m_momenta.data() ); } #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaFinalDevice( const fptype energy, const fptype* rndmom, @@ -171,11 +171,11 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaFinal() { - getMomentaFinalDevice<<>>( m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); + gpuLaunchKernel( getMomentaFinalDevice, m_gpublocks, m_gputhreads, m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); } #endif diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/RamboSamplingKernels.h 
b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/RamboSamplingKernels.h index 184089efd7..fe63a7bb77 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/RamboSamplingKernels.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/RamboSamplingKernels.h @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating RAMBO phase space sampling on a GPU device class RamboSamplingKernelDevice final : public SamplingKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/RandomNumberKernels.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/RandomNumberKernels.h index 188a72c2c9..0c215f2583 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/RandomNumberKernels.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/RandomNumberKernels.h @@ -8,7 +8,7 @@ #include "mgOnGpuConfig.h" -// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when __CUDACC__ is not defined +// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when MGONGPUCPP_GPUIMPL is not defined #ifndef MGONGPU_HAS_NO_CURAND //#include "curand.h" struct curandGenerator_st; // forward definition from curand.h @@ -16,7 +16,7 @@ struct curandGenerator_st; // forward definition from curand.h #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/check_sa.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/check_sa.cc index f5f08dc64e..9a39220077 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/check_sa.cc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/check_sa.cc @@ -63,7 +63,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -77,7 +77,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) 
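A minimal sketch (not part of the patch) of what the launch macros used in the RamboSamplingKernels.cc and MatrixElementKernels.cc hunks above expand to; the kernel and arguments are taken from the getMomentaInitial launch shown earlier, and the expansion shown is the CUDA branch of GpuAbstraction.h:

    // portable launch, as written throughout this patch
    gpuLaunchKernel( getMomentaInitialDevice, m_gpublocks, m_gputhreads, m_energy, m_momenta.data() );
    // what the CUDA branch of the macro expands to
    getMomentaInitialDevice<<<m_gpublocks, m_gputhreads>>>( m_energy, m_momenta.data() );
    // variant with dynamic shared memory, as used for the sigmaKin launches
    gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, /* ...kernel args... */ );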
-#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -102,7 +102,7 @@ main( int argc, char** argv ) CurandHost = 1, CurandDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU #elif not defined MGONGPU_HAS_NO_CURAND RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand @@ -115,7 +115,7 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU @@ -145,7 +145,7 @@ main( int argc, char** argv ) } else if( arg == "--curdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rndgen = RandomNumberMode::CurandDevice; #else throw std::runtime_error( "CurandDevice is not supported on CPUs" ); @@ -165,7 +165,7 @@ main( int argc, char** argv ) } else if( arg == "--rmbdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -239,13 +239,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -263,7 +263,7 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 00. 
Initialise cuda // Instantiate a CudaRuntime at the beginnining of the application's main to @@ -292,7 +292,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -300,7 +300,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -308,7 +308,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -316,7 +316,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -333,7 +333,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -342,7 +342,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -351,7 +351,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -359,7 +359,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -367,7 +367,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -394,7 +394,7 @@ main( int argc, char** argv ) const bool onDevice = false; prnk.reset( new CurandRandomNumberKernel( hstRndmom, onDevice ) ); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL else { const bool onDevice = true; @@ -421,7 +421,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -432,7 +432,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -440,7 +440,7 @@ main( int argc, char** 
argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -482,7 +482,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -514,7 +514,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -559,7 +559,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -588,7 +588,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -729,7 +729,7 @@ main( int argc, char** argv ) rndgentxt = "CURAND HOST"; else if( rndgen == RandomNumberMode::CurandDevice ) rndgentxt = "CURAND DEVICE"; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rndgentxt += " (CUDA code)"; #else rndgentxt += " (C++ code)"; @@ -738,7 +738,7 @@ main( int argc, char** argv ) // Workflow description summary std::string wrkflwtxt; // -- CUDA or C++? -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL wrkflwtxt += "CUD:"; #else wrkflwtxt += "CPP:"; @@ -754,7 +754,7 @@ main( int argc, char** argv ) wrkflwtxt += "???+"; // no path to this statement #endif // -- CUCOMPLEX or THRUST or STD complex numbers? -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #if defined MGONGPU_CUCXTYPE_CUCOMPLEX wrkflwtxt += "CUX:"; #elif defined MGONGPU_CUCXTYPE_THRUST @@ -789,7 +789,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -845,7 +845,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -864,7 +864,7 @@ main( int argc, char** argv ) #endif // Dump all configuration parameters and all results std::cout << std::string( SEP79, '*' ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" @@ -892,7 +892,7 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST @@ -906,7 +906,7 @@ main( int argc, char** argv ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? 
" == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -937,7 +937,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1033,7 +1033,7 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST @@ -1048,7 +1048,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? " == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cpp_hel_amps_h.inc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cpp_hel_amps_h.inc index 562af241af..594fb770c5 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cpp_hel_amps_h.inc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cpp_hel_amps_h.inc @@ -26,7 +26,7 @@ //#include //#include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk index 222d75f846..d98dca1eb3 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk @@ -32,7 +32,7 @@ UNAME_P := $(shell uname -p) #=== Configure common compiler flags for C++ and CUDA INCFLAGS = -I. 
-OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here +OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here # Dependency on src directory MG5AMC_COMMONLIB = mg5amc_common @@ -103,71 +103,139 @@ endif #------------------------------------------------------------------------------- -#=== Configure the CUDA compiler - -# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) -# This is because it is impossible to pass this to "CUFLAGS += -ccbin " below -ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside - $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") - override CUDA_HOME=disabled -endif - -# If CUDA_HOME is not set, try to set it from the location of nvcc -ifndef CUDA_HOME - CUDA_HOME = $(patsubst %%bin/nvcc,%%,$(shell which nvcc 2>/dev/null)) - $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") -endif - -# Set NVCC as $(CUDA_HOME)/bin/nvcc if it exists -ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) - NVCC = $(CUDA_HOME)/bin/nvcc - USE_NVTX ?=-DUSE_NVTX - # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html - # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ - # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). - # Embed device code for 70, and PTX for 70+. - # Export MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to use another value or list of values (see #533). - # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). - MADGRAPH_CUDA_ARCHITECTURE ?= 70 - ###CUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 - ###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 - comma:=, - CUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) - CUINC = -I$(CUDA_HOME)/include/ - CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here! 
- CUOPTFLAGS = -lineinfo - CUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math - ###CUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow - ###NVCC_VERSION = $(shell $(NVCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) - CUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h - # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) -else ifneq ($(origin REQUIRE_CUDA),undefined) - # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) - $(error No cuda installation found (set CUDA_HOME or make nvcc visible in PATH)) -else - # No cuda. Switch cuda compilation off and go to common random numbers in C++ - $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda) - override NVCC= - override USE_NVTX= - override CUINC= - override CURANDLIBFLAGS= -endif -export NVCC -export CUFLAGS +CUDA_COMPILER_PATH := $(shell compiler="`which nvcc 2>/dev/null`" && while [ -L "$$compiler" ]; do compiler=`readlink "$$compiler"`; done && echo "$$compiler") +HIP_COMPILER_PATH := $(shell compiler="`which hipcc 2>/dev/null`" && while [ -L "$$compiler" ]; do compiler=`readlink "$$compiler"`; done && echo "$$compiler") + +ifeq ($(findstring nvcc,$(CUDA_COMPILER_PATH)),nvcc) + #=== Configure the CUDA compiler + + # If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) + # This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below + ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside + $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") + override CUDA_HOME=disabled + endif + + # If CUDA_HOME is not set, try to set it from the location of NVCC + ifndef CUDA_HOME + CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null)) + $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") + endif + + # Set GPUCC as $(CUDA_HOME)/bin/nvcc if it exists + ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) + GPUCC = $(CUDA_HOME)/bin/nvcc + USE_NVTX ?=-DUSE_NVTX + # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html + # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ + # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). + # Embed device code for 70, and PTX for 70+. + # Export MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to use another value or list of values (see #533). + # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). 
+ MADGRAPH_CUDA_ARCHITECTURE ?= 70 + ###CUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 + ###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 + comma:=, + CUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) + CUINC = -I$(CUDA_HOME)/include/ + CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here! + CUOPTFLAGS = -lineinfo + GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + ###GPUCC_VERSION = $(shell $(GPUCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) + GPUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h + # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + + CUBUILDRULEFLAGS = -Xcompiler -fPIC -c + CCBUILDRULEFLAGS = -Xcompiler -fPIC -c -x cu + + CUDATESTFLAGS = -lcuda + + else ifneq ($(origin REQUIRE_CUDA),undefined) + # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) + $(error No cuda installation found (set CUDA_HOME or make nvcc visible in PATH)) + else + # No cuda. Switch cuda compilation off and go to common random numbers in C++ + $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda) + override GPUCC= + override USE_NVTX= + override CUINC= + override CURANDLIBFLAGS= + endif + + # Set the host C++ compiler for GPUCC via "-ccbin " + # (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) + GPUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) + + # Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) + ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) + GPUFLAGS += -allow-unsupported-compiler + endif -# Set the host C++ compiler for nvcc via "-ccbin " -# (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..."
is not supported) -CUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) +else ifeq ($(findstring hipcc,$(HIP_COMPILER_PATH)),hipcc) + #=== Configure the HIP compiler + + # If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable HIP builds (issue #505) + # (this check is kept aligned with the CUDA configuration above) + ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside + $(warning HIP builds are not supported for multi-word CXX "$(CXX)") + override HIP_HOME=disabled + endif + + # If HIP_HOME is not set, try to set it from the location of hipcc + ifndef HIP_HOME + HIP_HOME = $(patsubst %bin/hipcc,%,$(HIP_COMPILER_PATH)) + $(warning HIP_HOME was not set: using "$(HIP_HOME)") + endif + + # Set GPUCC as $(HIP_HOME)/bin/hipcc if it exists + ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),) + GPUCC = $(HIP_HOME)/bin/hipcc + + # Should maybe find something equivalent to this in HIP + #USE_NVTX ?=-DUSE_NVTX + + HIPARCHFLAGS = -target x86_64-linux-gnu --offload-arch=gfx90a + HIPINC = -I$(HIP_HOME)/include/ + + # -DHIP_FAST_MATH equivalent to -use_fast_math in HIP + # (But only for single precision line 208: https://rocm-developer-tools.github.io/HIP/hcc__detail_2math__functions_8h_source.html) + GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -DHIP_FAST_MATH -DHIP_PLATFORM=amd -fPIC + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + GPUFLAGS += -std=c++17 + # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + + CUBUILDRULEFLAGS = -fPIC -c + CCBUILDRULEFLAGS = -fPIC -c + + else ifneq ($(origin REQUIRE_HIP),undefined) + # If REQUIRE_HIP is set but no hip is found, stop here (e.g. for CI tests on GPU #443) + $(error No hip installation found (set HIP_HOME or make hipcc visible in PATH)) + else + # No hip.
Switch hip compilation off and go to common random numbers in C++ + $(warning HIP_HOME is not set or is invalid: export HIP_HOME to compile with hip) + override GPUCC= + override USE_NVTX= + override CUINC= + override CURANDLIBFLAGS= + endif + + # Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) + ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) + GPUFLAGS += -allow-unsupported-compiler + endif -# Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) -ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) -CUFLAGS += -allow-unsupported-compiler endif + #------------------------------------------------------------------------------- #=== Configure ccache for C++ and CUDA builds @@ -179,9 +247,9 @@ endif #ifeq ($(USECCACHE)$(shell echo $(AR) | grep ccache),1) # override AR:=ccache $(AR) #endif -ifneq ($(NVCC),) - ifeq ($(USECCACHE)$(shell echo $(NVCC) | grep ccache),1) - override NVCC:=ccache $(NVCC) +ifneq ($(GPUCC),) + ifeq ($(USECCACHE)$(shell echo $(GPUCC) | grep ccache),1) + override GPUCC:=ccache $(GPUCC) endif endif @@ -191,11 +259,11 @@ endif # PowerPC-specific CXX compiler flags (being reviewed) ifeq ($(UNAME_P),ppc64le) - CXXFLAGS+= -mcpu=power9 -mtune=power9 # gains ~2-3%% both for none and sse4 + CXXFLAGS+= -mcpu=power9 -mtune=power9 # gains ~2-3% both for none and sse4 # Throughput references without the extra flags below: none=1.41-1.42E6, sse4=2.15-2.19E6 ###CXXFLAGS+= -DNO_WARN_X86_INTRINSICS # no change ###CXXFLAGS+= -fpeel-loops # no change - ###CXXFLAGS+= -funroll-loops # gains ~1%% for none, loses ~1%% for sse4 + ###CXXFLAGS+= -funroll-loops # gains ~1% for none, loses ~1% for sse4 ###CXXFLAGS+= -ftree-vectorize # no change ###CXXFLAGS+= -flto # would increase to none=4.08-4.12E6, sse4=4.99-5.03E6! else @@ -205,7 +273,7 @@ endif # PowerPC-specific CUDA compiler flags (to be reviewed!) 
ifeq ($(UNAME_P),ppc64le) - CUFLAGS+= -Xcompiler -mno-float128 + GPUFLAGS+= -Xcompiler -mno-float128 endif #------------------------------------------------------------------------------- @@ -215,10 +283,10 @@ endif # Set the default OMPFLAGS choice ifneq ($(shell $(CXX) --version | egrep '^Intel'),) override OMPFLAGS = -fopenmp -###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without nvcc but not ok with nvcc before #578) +###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without GPUCC but not ok with GPUCC before #578) else ifneq ($(shell $(CXX) --version | egrep '^(clang)'),) override OMPFLAGS = -fopenmp -###override OMPFLAGS = # disable OpenMP MT on clang (was not ok without or with nvcc before #578) +###override OMPFLAGS = # disable OpenMP MT on clang (was not ok without or with GPUCC before #578) else ifneq ($(shell $(CXX) --version | egrep '^(Apple clang)'),) override OMPFLAGS = # disable OpenMP MT on Apple clang (builds fail in the CI #578) else @@ -269,7 +337,10 @@ endif # Set the default RNDGEN (random number generator) choice ifeq ($(RNDGEN),) - ifeq ($(NVCC),) + ifeq ($(GPUCC),) + override RNDGEN = hasNoCurand + # Edge case for HIP compilation (curand is CUDA-only) + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) override RNDGEN = hasNoCurand else ifeq ($(RNDGEN),) override RNDGEN = hasCurand @@ -344,13 +415,13 @@ CXXFLAGS+= $(AVXFLAGS) $(info FPTYPE=$(FPTYPE)) ifeq ($(FPTYPE),d) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE - CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE else ifeq ($(FPTYPE),f) CXXFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT - CUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT else ifeq ($(FPTYPE),m) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT - CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT else $(error Unknown FPTYPE='$(FPTYPE)': only 'd', 'f' and 'm' are supported) endif @@ -359,7 +430,7 @@ endif $(info HELINL=$(HELINL)) ifeq ($(HELINL),1) CXXFLAGS += -DMGONGPU_INLINE_HELAMPS - CUFLAGS += -DMGONGPU_INLINE_HELAMPS + GPUFLAGS += -DMGONGPU_INLINE_HELAMPS else ifneq ($(HELINL),0) $(error Unknown HELINL='$(HELINL)': only '0' and '1' are supported) endif @@ -368,7 +439,7 @@ endif $(info HRDCOD=$(HRDCOD)) ifeq ($(HRDCOD),1) CXXFLAGS += -DMGONGPU_HARDCODE_PARAM - CUFLAGS += -DMGONGPU_HARDCODE_PARAM + GPUFLAGS += -DMGONGPU_HARDCODE_PARAM else ifneq ($(HRDCOD),0) $(error Unknown HRDCOD='$(HRDCOD)': only '0' and '1' are supported) endif @@ -420,11 +491,11 @@ ifeq ($(UNAME_S),Darwin) override CULIBFLAGSRPATH2 = else # RPATH to cuda/cpp libs when linking executables - override CXXLIBFLAGSRPATH = -Wl,-rpath,$(LIBDIRRPATH) - override CULIBFLAGSRPATH = -Xlinker -rpath,$(LIBDIRRPATH) + override CXXLIBFLAGSRPATH = -Wl,-rpath=$(LIBDIRRPATH) + override CULIBFLAGSRPATH = -Xlinker -rpath=$(LIBDIRRPATH) # RPATH to common lib when linking cuda/cpp libs - override CXXLIBFLAGSRPATH2 = -Wl,-rpath,'$$ORIGIN' - override CULIBFLAGSRPATH2 = -Xlinker -rpath,'$$ORIGIN' + override CXXLIBFLAGSRPATH2 = -Wl,-rpath='$$ORIGIN' + override CULIBFLAGSRPATH2 = -Xlinker -rpath='$$ORIGIN' endif # Setting LD_LIBRARY_PATH or DYLD_LIBRARY_PATH in the RUNTIME is no longer necessary (neither on Linux nor on Mac) @@ -437,7 +508,7 @@ override RUNTIME = cxx_main=$(BUILDDIR)/check.exe fcxx_main=$(BUILDDIR)/fcheck.exe -ifneq ($(NVCC),) +ifneq ($(GPUCC),)
cu_main=$(BUILDDIR)/gcheck.exe fcu_main=$(BUILDDIR)/fgcheck.exe else @@ -468,28 +539,32 @@ $(BUILDDIR)/.build.$(TAG): @touch $(BUILDDIR)/.build.$(TAG) # Generic target and build rules: objects from CUDA compilation -ifneq ($(NVCC),) -$(BUILDDIR)/%%.o : %%.cu *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) +ifneq ($(GPUCC),) +$(BUILDDIR)/%.o : %.cu *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CUBUILDRULEFLAGS) $< -o $@ -$(BUILDDIR)/%%_cu.o : %%.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) +$(BUILDDIR)/%_cu.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CCBUILDRULEFLAGS) $< -o $@ endif +# (for nvcc, CCBUILDRULEFLAGS above includes '-x cu' to compile .cc files as CUDA) # Generic target and build rules: objects from C++ compilation # (NB do not include CUINC here! add it only for NVTX or curand #679) -$(BUILDDIR)/%%.o : %%.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) +$(BUILDDIR)/%.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi $(CXX) $(CPPFLAGS) $(CXXFLAGS) -fPIC -c $< -o $@ # Apply special build flags only to CrossSectionKernel.cc and gCrossSectionKernel.cu (no fast math, see #117 and #516) +# Added edge case for HIP compilation ifeq ($(shell $(CXX) --version | grep ^nvc++),) -$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS := $(filter-out -ffast-math,$(CXXFLAGS)) $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math +$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math ifeq ($(findstring nvcc,$(GPUCC)),nvcc) + $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -fno-fast-math +else + $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -fno-fast-math endif endif @@ -505,10 +580,10 @@ ifeq ($(RNDGEN),hasCurand) $(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CUINC) endif -# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in nvcc with icx2023 (#592) +# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in GPUCC with icx2023 (#592) ifneq ($(shell $(CXX) --version | egrep '^(Intel)'),) -ifneq ($(NVCC),) -CUFLAGS += -Xcompiler -Wno-deprecated-builtins +ifneq ($(GPUCC),) +GPUFLAGS += -Wno-deprecated-builtins endif endif @@ -516,8 +591,8 @@ endif # This patch does remove the warning, but I prefer to keep it disabled for the moment...
###ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang|Intel)'),) ###$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -Wno-overriding-t-option -###ifneq ($(NVCC),) -###$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -Wno-overriding-t-option +###ifneq ($(GPUCC),) +###$(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -Wno-overriding-t-option ###endif ###endif @@ -544,7 +619,7 @@ MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp cxx_objects_lib=$(BUILDDIR)/CPPProcess.o $(BUILDDIR)/MatrixElementKernels.o $(BUILDDIR)/BridgeKernels.o $(BUILDDIR)/CrossSectionKernels.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel.o $(BUILDDIR)/RamboSamplingKernels.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) MG5AMC_CULIB = mg5amc_$(processid_short)_cuda cu_objects_lib=$(BUILDDIR)/gCPPProcess.o $(BUILDDIR)/gMatrixElementKernels.o $(BUILDDIR)/gBridgeKernels.o $(BUILDDIR)/gCrossSectionKernels.o cu_objects_exe=$(BUILDDIR)/gCommonRandomNumberKernel.o $(BUILDDIR)/gRamboSamplingKernels.o @@ -556,17 +631,17 @@ $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: cxx_objects_lib += $(BUILDDIR)/fbridge.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(CXX) -shared -o $@ $(cxx_objects_lib) $(CXXLIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: cu_objects_lib += $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cu_objects_lib) - $(NVCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) endif #------------------------------------------------------------------------------- # Target (and build rules): Fortran include files -###$(INCDIR)/%%.inc : ../%%.inc +###$(INCDIR)/%.inc : ../%.inc ### @if [ ! 
-d $(INCDIR) ]; then echo "mkdir -p $(INCDIR)"; mkdir -p $(INCDIR); fi ### \cp $< $@ @@ -577,27 +652,27 @@ $(cxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PAT $(cxx_main): $(BUILDDIR)/check_sa.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CXX) -o $@ $(BUILDDIR)/check_sa.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CURANDLIBFLAGS) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 -$(cu_main): LIBFLAGS += -L$(patsubst %%bin/nvc++,%%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc +$(cu_main): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(cu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(cu_main): $(BUILDDIR)/gcheck_sa.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o - $(NVCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) + $(GPUCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) endif #------------------------------------------------------------------------------- # Generic target and build rules: objects from Fortran compilation -$(BUILDDIR)/%%.o : %%.f *.inc +$(BUILDDIR)/%.o : %.f *.inc @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi $(FC) -I. -c $< -o $@ # Generic target and build rules: objects from Fortran compilation -###$(BUILDDIR)/%%.o : %%.f *.inc +###$(BUILDDIR)/%.o : %.f *.inc ### @if [ ! -d $(INCDIR) ]; then echo "mkdir -p $(INCDIR)"; mkdir -p $(INCDIR); fi ### @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi ### $(FC) -I. 
-I$(INCDIR) -c $< -o $@ @@ -612,17 +687,17 @@ $(fcxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PA $(fcxx_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') endif ifeq ($(UNAME_S),Darwin) $(fcu_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(fcu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(fcu_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) - $(NVCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) + $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) endif #------------------------------------------------------------------------------- @@ -634,7 +709,7 @@ $(BUILDDIR)/testxxx.o: testxxx_cc_ref.txt $(testmain): $(BUILDDIR)/testxxx.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testxxx.o # Comment out this line to skip the C++ test of xxx functions -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testxxx_cu.o: $(GTESTLIBS) $(BUILDDIR)/testxxx_cu.o: INCFLAGS += $(GTESTINC) $(BUILDDIR)/testxxx_cu.o: testxxx_cc_ref.txt @@ -647,7 +722,7 @@ $(BUILDDIR)/testmisc.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testmisc.o # Comment out this line to skip the C++ miscellaneous tests -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testmisc_cu.o: $(GTESTLIBS) $(BUILDDIR)/testmisc_cu.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc_cu.o @@ -659,14 +734,14 @@ $(BUILDDIR)/runTest.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/runTest.o $(testmain): cxx_objects_exe += $(BUILDDIR)/runTest.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/runTest_cu.o: $(GTESTLIBS) $(BUILDDIR)/runTest_cu.o: INCFLAGS += $(GTESTINC) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 -$(testmain): LIBFLAGS += -L$(patsubst %%bin/nvc++,%%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc +$(testmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(testmain): 
$(BUILDDIR)/runTest_cu.o $(testmain): cu_objects_exe += $(BUILDDIR)/runTest_cu.o @@ -688,14 +763,14 @@ $(testmain): LIBFLAGS += -lgomp endif endif -ifeq ($(NVCC),) # link only runTest.o +ifeq ($(GPUCC),) # link only runTest.o $(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS) $(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS) else # link both runTest.o and runTest_cu.o $(testmain): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) $(GTESTLIBS) - $(NVCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) -lcuda + $(GPUCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS) endif # Use flock (Linux only, no Mac) to allow 'make -j' if googletest has not yet been downloaded https://stackoverflow.com/a/32666215 @@ -798,9 +873,9 @@ ifeq ($(USECCACHE),1) ccache --version | head -1 endif @echo "" - @echo NVCC=$(NVCC) -ifneq ($(NVCC),) - $(NVCC) --version + @echo GPUCC=$(GPUCC) +ifneq ($(GPUCC),) + $(GPUCC) --version endif @echo "" @echo CXX=$(CXX) @@ -819,7 +894,7 @@ endif # Target: check (run the C++ test executable) # [NB THIS IS WHAT IS USED IN THE GITHUB CI!] -ifneq ($(NVCC),) +ifneq ($(GPUCC),) check: runTest cmpFcheck cmpFGcheck else check: runTest cmpFcheck @@ -850,14 +925,14 @@ cmpFcheck: all.$(TAG) @echo @echo "$(BUILDDIR)/check.exe --common -p 2 32 2" @echo "$(BUILDDIR)/fcheck.exe 2 32 2" - @me1=$(shell $(RUNTIME) $(BUILDDIR)/check.exe --common -p 2 32 2 | grep MeanMatrix | awk '{print $$4}'); me2=$(shell $(RUNTIME) $(BUILDDIR)/fcheck.exe 2 32 2 | grep Average | awk '{print $$4}'); echo "Avg ME (C++/C++) = $${me1}"; echo "Avg ME (F77/C++) = $${me2}"; if [ "$${me2}" == "NaN" ]; then echo "ERROR! Fortran calculation (F77/C++) returned NaN"; elif [ "$${me2}" == "" ]; then echo "ERROR! Fortran calculation (F77/C++) crashed"; else python3 -c "me1=$${me1}; me2=$${me2}; reldif=abs((me2-me1)/me1); print('Relative difference =', reldif); ok = reldif <= 2E-4; print ( '%%s (relative difference %%s 2E-4)' %% ( ('OK','<=') if ok else ('ERROR','>') ) ); import sys; sys.exit(0 if ok else 1)"; fi + @me1=$(shell $(RUNTIME) $(BUILDDIR)/check.exe --common -p 2 32 2 | grep MeanMatrix | awk '{print $$4}'); me2=$(shell $(RUNTIME) $(BUILDDIR)/fcheck.exe 2 32 2 | grep Average | awk '{print $$4}'); echo "Avg ME (C++/C++) = $${me1}"; echo "Avg ME (F77/C++) = $${me2}"; if [ "$${me2}" == "NaN" ]; then echo "ERROR! Fortran calculation (F77/C++) returned NaN"; elif [ "$${me2}" == "" ]; then echo "ERROR! 
Fortran calculation (F77/C++) crashed"; else python3 -c "me1=$${me1}; me2=$${me2}; reldif=abs((me2-me1)/me1); print('Relative difference =', reldif); ok = reldif <= 2E-4; print ( '%s (relative difference %s 2E-4)' % ( ('OK','<=') if ok else ('ERROR','>') ) ); import sys; sys.exit(0 if ok else 1)"; fi # Target: cmpFGcheck (compare ME results from the CUDA and Fortran with CUDA MEs standalone executables, with a small number of events) cmpFGcheck: all.$(TAG) @echo @echo "$(BUILDDIR)/gcheck.exe --common -p 2 32 2" @echo "$(BUILDDIR)/fgcheck.exe 2 32 2" - @me1=$(shell $(RUNTIME) $(BUILDDIR)/gcheck.exe --common -p 2 32 2 | grep MeanMatrix | awk '{print $$4}'); me2=$(shell $(RUNTIME) $(BUILDDIR)/fgcheck.exe 2 32 2 | grep Average | awk '{print $$4}'); echo "Avg ME (C++/CUDA) = $${me1}"; echo "Avg ME (F77/CUDA) = $${me2}"; if [ "$${me2}" == "NaN" ]; then echo "ERROR! Fortran calculation (F77/CUDA) crashed"; elif [ "$${me2}" == "" ]; then echo "ERROR! Fortran calculation (F77/CUDA) crashed"; else python3 -c "me1=$${me1}; me2=$${me2}; reldif=abs((me2-me1)/me1); print('Relative difference =', reldif); ok = reldif <= 2E-4; print ( '%%s (relative difference %%s 2E-4)' %% ( ('OK','<=') if ok else ('ERROR','>') ) ); import sys; sys.exit(0 if ok else 1)"; fi + @me1=$(shell $(RUNTIME) $(BUILDDIR)/gcheck.exe --common -p 2 32 2 | grep MeanMatrix | awk '{print $$4}'); me2=$(shell $(RUNTIME) $(BUILDDIR)/fgcheck.exe 2 32 2 | grep Average | awk '{print $$4}'); echo "Avg ME (C++/CUDA) = $${me1}"; echo "Avg ME (F77/CUDA) = $${me2}"; if [ "$${me2}" == "NaN" ]; then echo "ERROR! Fortran calculation (F77/CUDA) returned NaN"; elif [ "$${me2}" == "" ]; then echo "ERROR! Fortran calculation (F77/CUDA) crashed"; else python3 -c "me1=$${me1}; me2=$${me2}; reldif=abs((me2-me1)/me1); print('Relative difference =', reldif); ok = reldif <= 2E-4; print ( '%s (relative difference %s 2E-4)' % ( ('OK','<=') if ok else ('ERROR','>') ) ); import sys; sys.exit(0 if ok else 1)"; fi # Target: memcheck (run the CUDA standalone executable gcheck.exe with a small number of events through cuda-memcheck) memcheck: all.$(TAG) diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/fbridge.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/fbridge.cc index 2d2b36d560..34ca33ded6 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/fbridge.cc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/fbridge.cc @@ -5,7 +5,7 @@ #include "Bridge.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" extern "C" { @@ -22,7 +22,7 @@ extern "C" * Using the same Fortran MadEvent code, linking to the heterogeneous library would allow access to both CPU and GPU implementations. * The specific heterogeneous configuration (how many GPUs, how many threads on each CPU, etc) could be loaded in CUDA/C++ from a data file.
*/ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -46,7 +46,7 @@ extern "C" */ void fbridgecreate_( CppObjectInFortran** ppbridge, const int* pnevtF, const int* pnparF, const int* pnp4F ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL CudaRuntime::setUp(); #endif // (NB: CPPProcess::initProc no longer needs to be executed here because it is called in the Bridge constructor) @@ -65,7 +65,7 @@ extern "C" Bridge* pbridge = dynamic_cast*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgedelete_: invalid Bridge address" ); delete pbridge; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL CudaRuntime::tearDown(); #endif } @@ -96,7 +96,7 @@ extern "C" { Bridge* pbridge = dynamic_cast*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgesequence_: invalid Bridge address" ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Use the device/GPU implementation in the CUDA library // (there is also a host implementation in this library) pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, *pchannelId, mes, selhel, selcol ); diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/fsampler.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/fsampler.cc index 2fb445372d..acffa7c19e 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/fsampler.cc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/fsampler.cc @@ -13,7 +13,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -40,7 +40,7 @@ namespace mg5amcCpu private: const int m_nevt; // The number of events in each iteration int m_iiter; // The iteration counter (for random number seeding) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta m_hstRndmom; // Memory buffers for random numbers HostBufferMomenta m_hstMomenta; // Memory buffers for momenta HostBufferWeights m_hstWeights; // Memory buffers for sampling weights @@ -105,7 +105,7 @@ namespace mg5amcCpu extern "C" { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuConfig.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuConfig.h index 02bfdcf8f5..5b04029787 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuConfig.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuConfig.h @@ -6,6 +6,8 @@ #ifndef MGONGPUCONFIG_H #define MGONGPUCONFIG_H 1 +#include "GpuRuntime.h" // Includes the GPU abstraction + // HARDCODED AT CODE GENERATION TIME: DO NOT MODIFY (#473) // There are two different code bases for standalone_cudacpp (without multichannel) and madevent+cudacpp (with multichannel) %(mgongpu_supports_multichannel)s @@ -15,9 +17,10 @@ // Choose if curand is supported for generating random numbers // For C++, by default, do not inline, but allow this macro to be set from outside with e.g. 
-DMGONGPU_HAS_NO_CURAND -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_CUDACC #undef MGONGPU_HAS_NO_CURAND -#else +#elif defined MGONGPUCPP_HIPCC +#define MGONGPU_HAS_NO_CURAND 1 //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 #endif @@ -53,20 +56,20 @@ ////#define MGONGPU_HARDCODE_PARAM 1 // Complex type in c++: std::complex or cxsmpl (CHOOSE ONLY ONE) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) #define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif // Complex type in cuda: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define MGONGPU_CUCXTYPE_THRUST 1 // default (~1.15E9/double, ~3.2E9/float) //#define MGONGPU_CUCXTYPE_CUCOMPLEX 1 // ~10 percent slower (1.03E9/double, ~2.8E9/float) //#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) #endif // Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #undef MGONGPU_NSIGHT_DEBUG // default //#define MGONGPU_NSIGHT_DEBUG 1 #endif @@ -85,14 +88,14 @@ #endif // SANITY CHECKS (c++ complex number implementation) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL #error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL #endif #endif // SANITY CHECKS (cuda complex number implementation) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL #error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL #endif @@ -132,7 +135,7 @@ namespace mgOnGpu // Alignment requirement for using reinterpret_cast with SIMD vectorized code // (using reinterpret_cast with non aligned memory may lead to segmentation faults!) // Only needed for C++ code but can be enforced also in NVCC builds of C++ code using CUDA>=11.2 and C++17 (#318, #319, #333) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL constexpr int cppAlign = 64; // alignment requirement for SIMD vectorization (64-byte i.e. 
512-bit) #endif @@ -143,7 +146,7 @@ using mgOnGpu::fptype; using mgOnGpu::fptype2; // C++ SIMD vectorization width (this will be used to set neppV) -#ifdef __CUDACC__ // CUDA implementation has no SIMD +#ifdef MGONGPUCPP_GPUIMPL // CUDA implementation has no SIMD #undef MGONGPU_CPPSIMD #elif defined __AVX512VL__ && defined MGONGPU_PVW512 // C++ "512z" AVX512 with 512 width (512-bit ie 64-byte): 8 (DOUBLE) or 16 (FLOAT) #ifdef MGONGPU_FPTYPE_DOUBLE @@ -175,7 +178,7 @@ using mgOnGpu::fptype2; // Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation // Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) -#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */ +#if defined MGONGPUCPP_GPUIMPL && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */ #define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; #define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } #define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } @@ -188,7 +191,7 @@ using mgOnGpu::fptype2; #endif /* clang-format on */ // Define empty CUDA declaration specifiers for C++ -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #define __global__ #define __host__ #define __device__ diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuCxtypes.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuCxtypes.h index b56348bc58..b5e1f1a495 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuCxtypes.h @@ -19,7 +19,7 @@ #include // Complex type in cuda: thrust or cucomplex or cxsmpl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #if defined MGONGPU_CUCXTYPE_THRUST #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wtautological-compare" // for icpx2021/clang13 (https://stackoverflow.com/a/15864661) @@ -216,7 +216,7 @@ namespace mg5amcCpu #endif { // --- Type definitions (complex type: cxtype) -#ifdef __CUDACC__ // cuda +#ifdef MGONGPUCPP_GPUIMPL // cuda #if defined MGONGPU_CUCXTYPE_THRUST typedef thrust::complex cxtype; #elif defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -301,7 +301,7 @@ namespace mg5amcCpu //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust +#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust //------------------------------ // CUDA - using thrust::complex @@ -337,11 +337,11 @@ namespace mg5amcCpu return c; } -#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST +#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex +#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex //------------------------------ // CUDA - using cuComplex @@ -556,11 +556,11 @@ namespace mg5amcCpu return cxmake( c.real(), c.imag() ); } -#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX //========================================================================== -#if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX 
// c++ + stdcomplex +#if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex //------------------------------ // C++ - using std::complex @@ -604,7 +604,7 @@ namespace mg5amcCpu } #endif -#endif // #if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX +#endif // #if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX //========================================================================== diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuFptypes.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuFptypes.h index 905c97d700..d9a955c235 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuFptypes.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuFptypes.h @@ -20,7 +20,7 @@ namespace mg5amcCpu { //========================================================================== -#ifdef __CUDACC__ // cuda +#ifdef MGONGPUCPP_GPUIMPL // cuda //------------------------------ // Floating point types - Cuda @@ -64,11 +64,11 @@ namespace mg5amcCpu #endif } -#endif // #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //------------------------------ // Floating point types - C++ @@ -92,7 +92,7 @@ namespace mg5amcCpu return std::sqrt( f ); } -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuVectors.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuVectors.h index e1299ba81e..de12c1d24f 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuVectors.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuVectors.h @@ -9,6 +9,8 @@ #include "mgOnGpuCxtypes.h" #include "mgOnGpuFptypes.h" +#include "GpuAbstraction.h" + #include //========================================================================== @@ -131,7 +133,7 @@ namespace mg5amcCpu #endif #endif -#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef __CUDACC__) +#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef MGONGPUCPP_GPUIMPL) const int neppV = 1; @@ -153,13 +155,13 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu #endif { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Printout to stream for user defined types @@ -805,11 +807,11 @@ namespace mg5amcCpu #endif // #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //------------------------------ // Vector types - CUDA @@ -853,12 +855,12 @@ namespace mg5amcCpu return mask; } -#endif // #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL 
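The fptypes/cxtypes/vectors hunks above only rename the compilation guard; the process hunks below additionally route runtime calls through the new gpu* aliases (gpuMemcpyToSymbol here, gpuDeviceReset in runTest.cc further down). GpuAbstraction.h itself is added by this patch but its body is not shown in this excerpt, so the following is only a hedged sketch of the vendor mapping it plausibly provides, where checkHip is a hypothetical analogue of the existing checkCuda wrapper:

    // Hedged sketch, not the actual GpuAbstraction.h: one plausible mapping
    // for the gpu* aliases used in the hunks below. checkCuda exists in this
    // codebase; checkHip is a hypothetical HIP-side error-checking analogue,
    // and recent HIP releases may require wrapping the first argument in
    // HIP_SYMBOL( symbol ).
    #if defined MGONGPUCPP_CUDACC // nvcc build
    #define gpuMemcpyToSymbol( symbol, src, bytes ) checkCuda( cudaMemcpyToSymbol( symbol, src, bytes ) )
    #define gpuDeviceReset() checkCuda( cudaDeviceReset() )
    #elif defined MGONGPUCPP_HIPCC // hipcc build
    #define gpuMemcpyToSymbol( symbol, src, bytes ) checkHip( hipMemcpyToSymbol( symbol, src, bytes ) )
    #define gpuDeviceReset() checkHip( hipDeviceReset() )
    #endif

With this indirection the generated process code stays vendor-neutral: the same gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) call compiles under both nvcc and hipcc.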
//========================================================================== // Scalar-or-vector types: scalar in CUDA, vector or scalar in C++ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL typedef bool bool_sv; typedef fptype fptype_sv; typedef fptype2 fptype2_sv; @@ -879,7 +881,7 @@ namespace mg5amcCpu #endif // Scalar-or-vector zeros: scalar in CUDA, vector or scalar in C++ -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ inline __host__ __device__ cxtype cxzero_sv(){ return cxtype( 0, 0 ); } #elif defined MGONGPU_CPPSIMD inline cxtype_v cxzero_sv() { return cxtype_v(); } // RRRR=0000 IIII=0000 diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_cc.inc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_cc.inc index 778e210468..9dceb45708 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_cc.inc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_cc.inc @@ -14,7 +14,7 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" %(hel_amps_h)s #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_function_definitions.inc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_function_definitions.inc index c3c0812b94..d4e999733f 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_function_definitions.inc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_function_definitions.inc @@ -10,7 +10,9 @@ // Class member functions for calculating the matrix elements for %(process_lines)s -#ifdef __CUDACC__ +#include "GpuRuntime.h" + +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -44,7 +46,7 @@ namespace mg5amcCpu %(cipdhrdcod)s %(cipchrdcod)s #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL %(cipddevice)s %(cipcdevice)s #else @@ -54,7 +56,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -80,8 +82,8 @@ namespace mg5amcCpu // Helicities for the process [NB do keep 'static' for this constexpr array, see issue #283] // *** NB There is no automatic check yet that these are in the same order as Fortran! #569 *** %(all_helicities)s -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -117,7 +119,7 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory %(cipdassign)s %(cipcassign)s -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL %(cipd2tipdSym)s %(cipc2tipcSym)s #else @@ -150,7 +152,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] 
// [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -215,12 +217,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -241,7 +243,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -367,9 +369,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -393,7 +395,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: color selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -414,7 +416,7 @@ // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { %(den_factors)s }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_h.inc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_h.inc index 893f7f3215..8a9de336f2 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_h.inc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_h.inc @@ -23,7 +23,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -32,7 +32,7 @@ namespace mg5amcCpu %(process_class_definitions)s //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -45,7 +45,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel(
const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -75,7 +75,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_matrix.inc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_matrix.inc index 1e473edcf8..241c50a9d1 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_matrix.inc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_matrix.inc @@ -7,6 +7,8 @@ ! Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. !========================================================================== +#include "GpuAbstraction.h" + // *** COLOR CHOICE BELOW *** // Store the leading color flows for choice of color if( jamp2_sv ) // disable color choice if nullptr @@ -17,7 +19,7 @@ // (This method used to be called %(process_class_name)s::matrix_%(proc_name)s(%(matrix_args)s)?) %(color_matrix_lines)s -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -74,7 +76,7 @@ #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -133,7 +135,7 @@ MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%%6d ihel=%%2d me_running=%%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_sigmaKin_function.inc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_sigmaKin_function.inc index 9fcd58196b..59c1623c5a 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_sigmaKin_function.inc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_sigmaKin_function.inc @@ -6,9 +6,12 @@ ! Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. ! Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
!========================================================================== + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -36,7 +39,7 @@ // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -240,7 +243,7 @@ // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/rambo.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/rambo.h index e02ea52496..3a331b979a 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/rambo.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/rambo.h @@ -18,7 +18,7 @@ #include // Simplified rambo version for 2 to N (with N>=2) processes with massless particles -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +83,7 @@ namespace mg5amcCpu static bool first = true; if( first ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if constexpr( M_ACCESS::isOnDevice() ) // avoid { const int ievt0 = 0; @@ -166,7 +166,7 @@ namespace mg5amcCpu wt = po2log; if( nparf != 2 ) wt = ( 2. * nparf - 4. ) * log( energy ) + z[nparf - 1]; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // issue warnings if weight is too small or too large static int iwarn[5] = { 0, 0, 0, 0, 0 }; if( wt < -180. 
) diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/runTest.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/runTest.cc index d4a760a71b..6f20a7248a 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/runTest.cc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/runTest.cc @@ -18,7 +18,7 @@ #include "RandomNumberKernels.h" #include "epoch_process_id.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -35,7 +35,7 @@ struct CUDA_CPU_TestBase : public TestDriverBase : TestDriverBase( npar, refFileName ) {} }; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL struct CPUTest : public CUDA_CPU_TestBase { // Struct data members (process, and memory structures for random numbers, momenta, matrix elements and weights on host and device) @@ -119,7 +119,7 @@ struct CPUTest : public CUDA_CPU_TestBase }; #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL struct CUDATest : public CUDA_CPU_TestBase { // Reset the device when our test goes out of scope. Note that this should happen after @@ -128,7 +128,7 @@ struct CUDATest : public CUDA_CPU_TestBase { ~DeviceReset() { - checkCuda( cudaDeviceReset() ); // this is needed by cuda-memcheck --leak-check full + gpuDeviceReset(); // this is needed by cuda-memcheck --leak-check full } } deviceResetter; @@ -256,7 +256,7 @@ INSTANTIATE_TEST_SUITE_P( prefix, \ test_suite_name, \ testing::Values( new CUDATest( MG_EPOCH_REFERENCE_FILE_NAME ) ) ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL MG_INSTANTIATE_TEST_SUITE_GPU( XTESTID_GPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); #else MG_INSTANTIATE_TEST_SUITE_CPU( XTESTID_CPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/testmisc.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/testmisc.cc index 895d6eeb56..5d00e2c06c 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/testmisc.cc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/testmisc.cc @@ -17,7 +17,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_MISC #else #define TESTID( s ) s##_CPU_MISC diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/testxxx.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/testxxx.cc index 3e6569b553..6f8736c120 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/testxxx.cc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/testxxx.cc @@ -24,7 +24,7 @@ #include #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_XXX #else #define TESTID( s ) s##_CPU_XXX @@ -76,7 +76,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) assert( nevt %% neppM == 0 ); // nevt must be a multiple of neppM assert( nevt %% neppV == 0 ); // nevt must be a multiple of neppV // Fill in the input momenta -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL mg5amcGpu::PinnedHostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] #else mg5amcCpu::HostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] @@ -321,7 +321,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), 
testxxx ) { for( int ievt = 0; ievt < nevt; ievt++ ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/output.py b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/output.py index aebab6f1a7..a947f262b0 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/output.py +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/output.py @@ -86,9 +86,9 @@ class PLUGIN_ProcessExporter(PLUGIN_export_cpp.ProcessExporterGPU): 'CMake': [s+'CMake/Compilers.txt', s+'CMake/Platforms.txt', s+'CMake/Macros.txt'], 'src': [s+'gpu/rambo.h', s+'read_slha.h', s+'read_slha.cc', s+'gpu/mgOnGpuFptypes.h', s+'gpu/mgOnGpuCxtypes.h', s+'gpu/mgOnGpuVectors.h', - s+'CMake/src/CMakeLists.txt'], + s+'CMake/src/CMakeLists.txt', s+'gpu/GpuRuntime.h', s+'gpu/GpuAbstraction.h'], 'SubProcesses': [s+'gpu/nvtx.h', s+'gpu/timer.h', s+'gpu/timermap.h', - s+'gpu/ompnumthreads.h', s+'gpu/CudaRuntime.h', + s+'gpu/ompnumthreads.h', s+'gpu/GpuRuntime.h', s+'gpu/GpuAbstraction.h', s+'gpu/MemoryAccessHelpers.h', s+'gpu/MemoryAccessVectors.h', s+'gpu/MemoryAccessMatrixElements.h', s+'gpu/MemoryAccessMomenta.h', s+'gpu/MemoryAccessRandomNumbers.h', s+'gpu/MemoryAccessWeights.h', @@ -109,7 +109,7 @@ class PLUGIN_ProcessExporter(PLUGIN_export_cpp.ProcessExporterGPU): s+'CMake/SubProcesses/CMakeLists.txt'], 'test': [s+'gpu/cudacpp_test.mk']} to_link_in_P = ['nvtx.h', 'timer.h', 'timermap.h', - 'ompnumthreads.h', 'CudaRuntime.h', + 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', From 81cf765fb205389563a1627f595215474c87d983 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Thu, 13 Jul 2023 15:55:32 +0200 Subject: [PATCH 02/96] [jthip] change % to %% in CODEGEN cudacpp.mk --- .../iolibs/template_files/gpu/cudacpp.mk | 28 +++++++++---------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk index d98dca1eb3..f024f15ce7 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk @@ -118,7 +118,7 @@ ifeq ($(findstring nvcc,$(CUDA_COMPILER_PATH)),nvcc) # If CUDA_HOME is not set, try to set it from the location of NVCC ifndef CUDA_HOME - CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null)) + CUDA_HOME = $(patsubst %%bin/nvcc,%%,$(shell which nvcc 2>/dev/null)) $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") endif @@ -188,7 +188,7 @@ else ifeq ($(findstring hipcc,$(HIP_COMPILER_PATH)),hipcc) # If HIP_HOME is not set, try to set it from the location of GPUCC ifndef HIP_HOME - HIP_HOME = $(patsubst %bin/hipcc,%,$(HIP_COMPILER_PATH)) + HIP_HOME = $(patsubst %%bin/hipcc,%%,$(HIP_COMPILER_PATH)) $(warning HIP_HOME was not set: using "$(HIP_HOME)") endif @@ -259,11 +259,11 @@ endif # PowerPC-specific CXX compiler flags (being reviewed) ifeq ($(UNAME_P),ppc64le) - CXXFLAGS+= -mcpu=power9 -mtune=power9 # gains ~2-3% both for none and sse4 + CXXFLAGS+= -mcpu=power9 -mtune=power9 # gains ~2-3%% both for none and sse4 # Throughput references without the extra flags below: none=1.41-1.42E6, sse4=2.15-2.19E6 
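# (Illustrative note on this patch, assuming the usual CODEGEN workflow: this
# makefile is emitted through Python %-substitution at code generation time,
# like the %(model_name)s tokens in the other template files, so every literal
# '%' in a patsubst pattern or comment must be escaped as '%%' to survive the
# expansion. For example, the template text '$(patsubst %%bin/nvcc,%%,...)' is
# written out as '$(patsubst %bin/nvcc,%,...)' in the generated cudacpp.mk.)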
###CXXFLAGS+= -DNO_WARN_X86_INTRINSICS # no change ###CXXFLAGS+= -fpeel-loops # no change - ###CXXFLAGS+= -funroll-loops # gains ~1% for none, loses ~1% for sse4 + ###CXXFLAGS+= -funroll-loops # gains ~1%% for none, loses ~1%% for sse4 ###CXXFLAGS+= -ftree-vectorize # no change ###CXXFLAGS+= -flto # would increase to none=4.08-4.12E6, sse4=4.99-5.03E6! else @@ -540,11 +540,11 @@ $(BUILDDIR)/.build.$(TAG): # Generic target and build rules: objects from CUDA compilation ifneq ($(GPUCC),) -$(BUILDDIR)/%.o : %.cu *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) +$(BUILDDIR)/%%.o : %%.cu *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CUBUILDRULEFLAGS) $< -o $@ -$(BUILDDIR)/%_cu.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) +$(BUILDDIR)/%%_cu.o : %%.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CCBUILDRULEFLAGS) $< -o $@ endif @@ -552,7 +552,7 @@ endif # Generic target and build rules: objects from C++ compilation # (NB do not include CUINC here! add it only for NVTX or curand #679) -$(BUILDDIR)/%.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) +$(BUILDDIR)/%%.o : %%.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi $(CXX) $(CPPFLAGS) $(CXXFLAGS) -fPIC -c $< -o $@ @@ -641,7 +641,7 @@ endif #------------------------------------------------------------------------------- # Target (and build rules): Fortran include files -###$(INCDIR)/%.inc : ../%.inc +###$(INCDIR)/%%.inc : ../%%.inc ### @if [ ! -d $(INCDIR) ]; then echo "mkdir -p $(INCDIR)"; mkdir -p $(INCDIR); fi ### \cp $< $@ @@ -657,7 +657,7 @@ ifneq ($(shell $(CXX) --version | grep ^Intel),) $(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') $(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 -$(cu_main): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc +$(cu_main): LIBFLAGS += -L$(patsubst %%bin/nvc++,%%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(cu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(cu_main): $(BUILDDIR)/gcheck_sa.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o @@ -667,12 +667,12 @@ endif #------------------------------------------------------------------------------- # Generic target and build rules: objects from Fortran compilation -$(BUILDDIR)/%.o : %.f *.inc +$(BUILDDIR)/%%.o : %%.f *.inc @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi $(FC) -I. -c $< -o $@ # Generic target and build rules: objects from Fortran compilation -###$(BUILDDIR)/%.o : %.f *.inc +###$(BUILDDIR)/%%.o : %%.f *.inc ### @if [ ! -d $(INCDIR) ]; then echo "mkdir -p $(INCDIR)"; mkdir -p $(INCDIR); fi ### @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi ### $(FC) -I. 
-I$(INCDIR) -c $< -o $@ @@ -741,7 +741,7 @@ ifneq ($(shell $(CXX) --version | grep ^Intel),) $(testmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') $(testmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 -$(testmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc +$(testmain): LIBFLAGS += -L$(patsubst %%bin/nvc++,%%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(testmain): $(BUILDDIR)/runTest_cu.o $(testmain): cu_objects_exe += $(BUILDDIR)/runTest_cu.o @@ -925,14 +925,14 @@ cmpFcheck: all.$(TAG) @echo @echo "$(BUILDDIR)/check.exe --common -p 2 32 2" @echo "$(BUILDDIR)/fcheck.exe 2 32 2" - @me1=$(shell $(RUNTIME) $(BUILDDIR)/check.exe --common -p 2 32 2 | grep MeanMatrix | awk '{print $$4}'); me2=$(shell $(RUNTIME) $(BUILDDIR)/fcheck.exe 2 32 2 | grep Average | awk '{print $$4}'); echo "Avg ME (C++/C++) = $${me1}"; echo "Avg ME (F77/C++) = $${me2}"; if [ "$${me2}" == "NaN" ]; then echo "ERROR! Fortran calculation (F77/C++) returned NaN"; elif [ "$${me2}" == "" ]; then echo "ERROR! Fortran calculation (F77/C++) crashed"; else python3 -c "me1=$${me1}; me2=$${me2}; reldif=abs((me2-me1)/me1); print('Relative difference =', reldif); ok = reldif <= 2E-4; print ( '%s (relative difference %s 2E-4)' % ( ('OK','<=') if ok else ('ERROR','>') ) ); import sys; sys.exit(0 if ok else 1)"; fi + @me1=$(shell $(RUNTIME) $(BUILDDIR)/check.exe --common -p 2 32 2 | grep MeanMatrix | awk '{print $$4}'); me2=$(shell $(RUNTIME) $(BUILDDIR)/fcheck.exe 2 32 2 | grep Average | awk '{print $$4}'); echo "Avg ME (C++/C++) = $${me1}"; echo "Avg ME (F77/C++) = $${me2}"; if [ "$${me2}" == "NaN" ]; then echo "ERROR! Fortran calculation (F77/C++) returned NaN"; elif [ "$${me2}" == "" ]; then echo "ERROR! Fortran calculation (F77/C++) crashed"; else python3 -c "me1=$${me1}; me2=$${me2}; reldif=abs((me2-me1)/me1); print('Relative difference =', reldif); ok = reldif <= 2E-4; print ( '%%s (relative difference %%s 2E-4)' %% ( ('OK','<=') if ok else ('ERROR','>') ) ); import sys; sys.exit(0 if ok else 1)"; fi # Target: cmpFGcheck (compare ME results from the CUDA and Fortran with CUDA MEs standalone executables, with a small number of events) cmpFGcheck: all.$(TAG) @echo @echo "$(BUILDDIR)/gcheck.exe --common -p 2 32 2" @echo "$(BUILDDIR)/fgcheck.exe 2 32 2" - @me1=$(shell $(RUNTIME) $(BUILDDIR)/gcheck.exe --common -p 2 32 2 | grep MeanMatrix | awk '{print $$4}'); me2=$(shell $(RUNTIME) $(BUILDDIR)/fgcheck.exe 2 32 2 | grep Average | awk '{print $$4}'); echo "Avg ME (C++/CUDA) = $${me1}"; echo "Avg ME (F77/CUDA) = $${me2}"; if [ "$${me2}" == "NaN" ]; then echo "ERROR! Fortran calculation (F77/CUDA) crashed"; elif [ "$${me2}" == "" ]; then echo "ERROR! Fortran calculation (F77/CUDA) crashed"; else python3 -c "me1=$${me1}; me2=$${me2}; reldif=abs((me2-me1)/me1); print('Relative difference =', reldif); ok = reldif <= 2E-4; print ( '%s (relative difference %s 2E-4)' % ( ('OK','<=') if ok else ('ERROR','>') ) ); import sys; sys.exit(0 if ok else 1)"; fi + @me1=$(shell $(RUNTIME) $(BUILDDIR)/gcheck.exe --common -p 2 32 2 | grep MeanMatrix | awk '{print $$4}'); me2=$(shell $(RUNTIME) $(BUILDDIR)/fgcheck.exe 2 32 2 | grep Average | awk '{print $$4}'); echo "Avg ME (C++/CUDA) = $${me1}"; echo "Avg ME (F77/CUDA) = $${me2}"; if [ "$${me2}" == "NaN" ]; then echo "ERROR! 
Fortran calculation (F77/CUDA) crashed"; elif [ "$${me2}" == "" ]; then echo "ERROR! Fortran calculation (F77/CUDA) crashed"; else python3 -c "me1=$${me1}; me2=$${me2}; reldif=abs((me2-me1)/me1); print('Relative difference =', reldif); ok = reldif <= 2E-4; print ( '%%s (relative difference %%s 2E-4)' %% ( ('OK','<=') if ok else ('ERROR','>') ) ); import sys; sys.exit(0 if ok else 1)"; fi # Target: memcheck (run the CUDA standalone executable gcheck.exe with a small number of events through cuda-memcheck) memcheck: all.$(TAG) From b83f8c94157989c99937013fa3f2756de07a99e9 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Thu, 13 Jul 2023 16:25:15 +0200 Subject: [PATCH 03/96] [jthip] clang-format GpuAbstraction.h both in CODEGEN and in ggttgg.mad --- .../template_files/gpu/GpuAbstraction.h | 86 +++++++++---------- .../gg_ttgg.mad/SubProcesses/GpuAbstraction.h | 79 +++++++++++++++++ 2 files changed, 122 insertions(+), 43 deletions(-) create mode 100644 epochX/cudacpp/gg_ttgg.mad/SubProcesses/GpuAbstraction.h diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/GpuAbstraction.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/GpuAbstraction.h index 98a0124b55..2f000e33d1 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/GpuAbstraction.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/GpuAbstraction.h @@ -4,75 +4,75 @@ #include #ifdef MGONGPUCPP_GPUIMPL - #define MGONGPUCPP_CUDACC 1 +#define MGONGPUCPP_CUDACC 1 #endif #ifdef __HIPCC__ - #include "hip/hip_runtime.h" - #define MGONGPUCPP_HIPCC 1 +#include "hip/hip_runtime.h" +#define MGONGPUCPP_HIPCC 1 #endif #ifdef MGONGPUCPP_CUDACC - // Defines correct compiler - #define MGONGPUCPP_GPUIMPL MGONGPUCPP_GPUIMPL +// Defines correct compiler +#define MGONGPUCPP_GPUIMPL MGONGPUCPP_GPUIMPL - //-------------------------------------------------------------------------- +//-------------------------------------------------------------------------- - #define gpuError_t cudaError_t - #define gpuPeekAtLastError cudaPeekAtLastError - #define gpuGetErrorString cudaGetErrorString - #define gpuSuccess cudaSuccess +#define gpuError_t cudaError_t +#define gpuPeekAtLastError cudaPeekAtLastError +#define gpuGetErrorString cudaGetErrorString +#define gpuSuccess cudaSuccess - #define gpuMallocHost(ptr, size) checkGpu( cudaMallocHost(ptr, size) ) - #define gpuMalloc(ptr, size) checkGpu( cudaMalloc(ptr, size) ) +#define gpuMallocHost( ptr, size ) checkGpu( cudaMallocHost( ptr, size ) ) +#define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) ) - #define gpuMemcpy(dstData, srcData, srcBytes, func) checkGpu( cudaMemcpy(dstData, srcData, srcBytes, func) ) - #define gpuMemcpyHostToDevice cudaMemcpyHostToDevice - #define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost - #define gpuMemcpyToSymbol(type1, type2, size) checkGpu( cudaMemcpyToSymbol(type1, type2, size) ) +#define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( cudaMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemcpyHostToDevice cudaMemcpyHostToDevice +#define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost +#define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( cudaMemcpyToSymbol( type1, type2, size ) ) - #define gpuFree(ptr) checkGpu( cudaFree(ptr) ) - #define gpuFreeHost(ptr) checkGpu( cudaFreeHost(ptr) ) +#define gpuFree( ptr ) checkGpu( cudaFree( ptr ) ) +#define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) ) - 
#define gpuSetDevice cudaSetDevice
- #define gpuDeviceSynchronize cudaDeviceSynchronize
- #define gpuDeviceReset cudaDeviceReset
+#define gpuSetDevice cudaSetDevice
+#define gpuDeviceSynchronize cudaDeviceSynchronize
+#define gpuDeviceReset cudaDeviceReset

- #define gpuLaunchKernel( kernel, blocks, threads, ...) kernel<<<blocks, threads>>> (__VA_ARGS__)
- #define gpuLaunchKernelSharedMem(kernel, blocks, threads, sharedMem, ...) kernel<<<blocks, threads, sharedMem>>>(__VA_ARGS__)
+#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ )
+#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ )

 //--------------------------------------------------------------------------

 #elif defined MGONGPUCPP_HIPCC

- // Defines correct compiler
- #define MGONGPUCPP_GPUIMPL __HCC__
+// Defines correct compiler
+#define MGONGPUCPP_GPUIMPL __HCC__

- //--------------------------------------------------------------------------
+//--------------------------------------------------------------------------

- #define gpuError_t hipError_t
- #define gpuPeekAtLastError hipPeekAtLastError
- #define gpuGetErrorString hipGetErrorString
- #define gpuSuccess hipSuccess
+#define gpuError_t hipError_t
+#define gpuPeekAtLastError hipPeekAtLastError
+#define gpuGetErrorString hipGetErrorString
+#define gpuSuccess hipSuccess

- #define gpuMallocHost(ptr, size) checkGpu( hipHostMalloc(ptr, size) ) // HostMalloc better
- #define gpuMalloc(ptr, size) checkGpu( hipMalloc(ptr, size) )
+#define gpuMallocHost( ptr, size ) checkGpu( hipHostMalloc( ptr, size ) ) // HostMalloc better
+#define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) )

- #define gpuMemcpy(dstData, srcData, srcBytes, func) checkGpu( hipMemcpy(dstData, srcData, srcBytes, func) )
- #define gpuMemcpyHostToDevice hipMemcpyHostToDevice
- #define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost
- #define gpuMemcpyToSymbol(type1, type2, size) checkGpu( hipMemcpyToSymbol(type1, type2, size) )
+#define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( hipMemcpy( dstData, srcData, srcBytes, func ) )
+#define gpuMemcpyHostToDevice hipMemcpyHostToDevice
+#define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost
+#define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( hipMemcpyToSymbol( type1, type2, size ) )

- #define gpuFree(ptr) checkGpu( hipFree(ptr) )
- #define gpuFreeHost(ptr) checkGpu( hipHostFree(ptr) )
+#define gpuFree( ptr ) checkGpu( hipFree( ptr ) )
+#define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) )

- #define gpuSetDevice hipSetDevice
- #define gpuDeviceSynchronize hipDeviceSynchronize
- #define gpuDeviceReset hipDeviceReset
+#define gpuSetDevice hipSetDevice
+#define gpuDeviceSynchronize hipDeviceSynchronize
+#define gpuDeviceReset hipDeviceReset

- #define gpuLaunchKernel( kernel, blocks, threads, ...) kernel<<<blocks, threads>>> (__VA_ARGS__)
- #define gpuLaunchKernelSharedMem(kernel, blocks, threads, sharedMem, ...) kernel<<<blocks, threads, sharedMem>>>(__VA_ARGS__)
+#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ )
+#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ )

 #endif
diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/GpuAbstraction.h
new file mode 100644
index 0000000000..2f000e33d1
--- /dev/null
+++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/GpuAbstraction.h
@@ -0,0 +1,79 @@
+#ifndef MG5AMC_GPUABSTRACTION_H
+#define MG5AMC_GPUABSTRACTION_H 1
+
+#include
+
+#ifdef MGONGPUCPP_GPUIMPL
+#define MGONGPUCPP_CUDACC 1
+#endif
+
+#ifdef __HIPCC__
+#include "hip/hip_runtime.h"
+#define MGONGPUCPP_HIPCC 1
+#endif
+
+#ifdef MGONGPUCPP_CUDACC
+
+// Defines correct compiler
+#define MGONGPUCPP_GPUIMPL MGONGPUCPP_GPUIMPL
+
+//--------------------------------------------------------------------------
+
+#define gpuError_t cudaError_t
+#define gpuPeekAtLastError cudaPeekAtLastError
+#define gpuGetErrorString cudaGetErrorString
+#define gpuSuccess cudaSuccess
+
+#define gpuMallocHost( ptr, size ) checkGpu( cudaMallocHost( ptr, size ) )
+#define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) )
+
+#define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( cudaMemcpy( dstData, srcData, srcBytes, func ) )
+#define gpuMemcpyHostToDevice cudaMemcpyHostToDevice
+#define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost
+#define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( cudaMemcpyToSymbol( type1, type2, size ) )
+
+#define gpuFree( ptr ) checkGpu( cudaFree( ptr ) )
+#define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) )
+
+#define gpuSetDevice cudaSetDevice
+#define gpuDeviceSynchronize cudaDeviceSynchronize
+#define gpuDeviceReset cudaDeviceReset
+
+#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ )
+#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ )
+
+//--------------------------------------------------------------------------
+
+#elif defined MGONGPUCPP_HIPCC
+
+// Defines correct compiler
+#define MGONGPUCPP_GPUIMPL __HCC__
+
+//--------------------------------------------------------------------------
+
+#define gpuError_t hipError_t
+#define gpuPeekAtLastError hipPeekAtLastError
+#define gpuGetErrorString hipGetErrorString
+#define gpuSuccess hipSuccess
+
+#define gpuMallocHost( ptr, size ) checkGpu( hipHostMalloc( ptr, size ) ) // HostMalloc better
+#define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) )
+
+#define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( hipMemcpy( dstData, srcData, srcBytes, func ) )
+#define gpuMemcpyHostToDevice hipMemcpyHostToDevice
+#define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost
+#define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( hipMemcpyToSymbol( type1, type2, size ) )
+
+#define gpuFree( ptr ) checkGpu( hipFree( ptr ) )
+#define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) )
+
+#define gpuSetDevice hipSetDevice
+#define gpuDeviceSynchronize hipDeviceSynchronize
+#define gpuDeviceReset hipDeviceReset
+
+#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ )
+#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ )
+
+#endif
+
+#endif // MG5AMC_GPUABSTRACTION_H
\ No newline at end of file
From 1afbafc2a84c6a8d2d4c9e867e3fb8baae0843c5 Mon Sep 17 00:00:00 2001
From: Andrea Valassi
Date: Thu, 13 Jul 2023 16:27:55 +0200
Subject: [PATCH 04/96] [jthip] clang-format GpuRuntime.h both in CODEGEN and in ggttgg.mad

---
 .../madgraph/iolibs/template_files/gpu/GpuRuntime.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/GpuRuntime.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/GpuRuntime.h
index 86c9179f4c..895a662e52 100644
--- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/GpuRuntime.h
+++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/GpuRuntime.h
@@ -59,7 +59,7 @@ namespace mg5amcGpu
     // Replace cudaFree(0) by cudaSetDevice(0), even if it is not really needed either
     // (but see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs)
     if( debug ) std::cout << "__GpuRuntime: calling GpuSetDevice(0)" << std::endl;
-    checkGpu ( gpuSetDevice( 0 ) ); // SLOW!
+    checkGpu( gpuSetDevice( 0 ) ); // SLOW!
   }

   // Tear down CUDA application (call cudaDeviceReset)
From d1f5c5ba18ab19e76d818c052528c13d3af3d756 Mon Sep 17 00:00:00 2001
From: Jorgen T
Date: Mon, 17 Jul 2023 14:47:32 +0200
Subject: [PATCH 05/96] Made the code-generated files the same as the template files in gg_ttgg

---
 .../madgraph/iolibs/template_files/gpu/Bridge.h              | 4 ++--
 .../iolibs/template_files/gpu/MatrixElementKernels.cc        | 4 ++--
 .../madgraph/iolibs/template_files/gpu/fbridge.cc            | 4 ++--
 .../cudacpp/gg_ttgg.mad/SubProcesses/MatrixElementKernels.cc | 4 ++--
 4 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/Bridge.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/Bridge.h
index 51241e9840..bcdfe29154 100644
--- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/Bridge.h
+++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/Bridge.h
@@ -291,11 +291,11 @@ namespace mg5amcCpu
     constexpr int neppM = MemoryAccessMomenta::neppM;
     if constexpr( neppM == 1 && std::is_same_v<FORTRANFPTYPE, fptype> )
     {
-      gpuMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), cudaMemcpyHostToDevice );
+      gpuMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), gpuMemcpyHostToDevice );
     }
     else
     {
-      gpuMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), cudaMemcpyHostToDevice );
+      gpuMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), gpuMemcpyHostToDevice );
       const int thrPerEvt = CPPProcess::npar * CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 event per thread)
       //const int thrPerEvt = 1; // AV: try new alg with 1 event per thread...
this seems slower gpuLaunchKernel( dev_transposeMomentaF2C, m_gpublocks * thrPerEvt, m_gputhreads, m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.cc index dd3eee4ea3..a9e20e114f 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.cc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.cc @@ -226,9 +226,9 @@ namespace mg5amcGpu constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernelSharedmem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); + gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); #else - gpuLaunchKernelSharedmem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); + gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); #endif checkGpu( gpuPeekAtLastError() ); checkGpu( gpuDeviceSynchronize() ); diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/fbridge.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/fbridge.cc index 34ca33ded6..592a8c74bb 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/fbridge.cc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/fbridge.cc @@ -47,7 +47,7 @@ extern "C" void fbridgecreate_( CppObjectInFortran** ppbridge, const int* pnevtF, const int* pnparF, const int* pnp4F ) { #ifdef MGONGPUCPP_GPUIMPL - CudaRuntime::setUp(); + GpuRuntime::setUp(); #endif // (NB: CPPProcess::initProc no longer needs to be executed here because it is called in the Bridge constructor) // FIXME: disable OMP in Bridge when called from Fortran @@ -66,7 +66,7 @@ extern "C" if( pbridge == 0 ) throw std::runtime_error( "fbridgedelete_: invalid Bridge address" ); delete pbridge; #ifdef MGONGPUCPP_GPUIMPL - CudaRuntime::tearDown(); + GpuRuntime::tearDown(); #endif } diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MatrixElementKernels.cc index 30257195b6..7a3d4c1b75 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MatrixElementKernels.cc @@ -6,7 +6,7 @@ #include "MatrixElementKernels.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" // Includes the abstraction for Nvidia/AMD compilation #include "MemoryAccessMomenta.h" #include "MemoryBuffers.h" @@ -202,7 +202,7 @@ namespace mg5amcGpu PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); 
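// (Illustrative note, assuming the GpuAbstraction.h macros introduced above:
// a launch such as
//   gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() );
// expands to
//   computeDependentCouplings<<<m_gpublocks, m_gputhreads>>>( m_gs.data(), m_couplings.data() );
// under both nvcc and hipcc, which is why the triple-chevron launches below
// can be replaced one-for-one without touching the kernel signatures.)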
DeviceBufferHelicityMask devIsGoodHel( ncomb );
     // ... 0d1. Compute good helicity mask on the device
-    computeDependentCouplings<<<m_gpublocks, m_gputhreads>>>( m_gs.data(), m_couplings.data() );
+    gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     sigmaKin_getGoodHel<<<m_gpublocks, m_gputhreads>>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() );
 #else
From 1b5c0fdff6208b18ecb2e292571c5aea9f482a23 Mon Sep 17 00:00:00 2001
From: Andrea Valassi
Date: Tue, 18 Jul 2023 18:11:04 +0200
Subject: [PATCH 06/96] [jthip] backport to CODEGEN from ggttgg.mad

---
 .../template_files/cpp_model_parameters_h.inc | 14 ++-
 .../iolibs/template_files/gpu/CudaRuntime.h   | 85 ------
 .../iolibs/template_files/gpu/check_sa.cc     | 59 ++++++++-----
 .../iolibs/template_files/gpu/mgOnGpuConfig.h | 73 ++++++++++------
 .../cudacpp/gg_ttgg.mad/src/mgOnGpuConfig.h   |  2 +
 .../cudacpp/gg_ttgg.mad/src/mgOnGpuVectors.h  |  2 +
 6 files changed, 96 insertions(+), 139 deletions(-)
 delete mode 100644 epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CudaRuntime.h

diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/cpp_model_parameters_h.inc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/cpp_model_parameters_h.inc
index 0250c160ed..ef3d99d07c 100644
--- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/cpp_model_parameters_h.inc
+++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/cpp_model_parameters_h.inc
@@ -172,7 +172,7 @@ namespace mg5amcCpu
 #pragma GCC diagnostic push
 #pragma GCC diagnostic ignored "-Wunused-variable" // e.g. <>
 #pragma GCC diagnostic ignored "-Wunused-parameter" // e.g. <>
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 #pragma nv_diagnostic push
 #pragma nv_diag_suppress 177 // e.g. <>
 #endif
@@ -194,9 +194,9 @@ namespace mg5amcCpu
 %(dcoupsetdcoup)s
     }
 %(eftspecial2)s
-      return out;
-    }
-#ifdef __CUDACC__
+    return out;
+  }
+#ifdef MGONGPUCPP_GPUIMPL
 #pragma GCC diagnostic pop
 #pragma nv_diagnostic pop
 #endif
@@ -212,6 +212,12 @@ namespace mg5amcCpu

 //==========================================================================

+#ifdef MGONGPUCPP_GPUIMPL
+namespace mg5amcGpu
+#else
+namespace mg5amcCpu
+#endif
+{
 #pragma GCC diagnostic push
 #ifndef __clang__
 #pragma GCC diagnostic ignored "-Wunused-but-set-variable" // e.g. <>
diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CudaRuntime.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CudaRuntime.h
deleted file mode 100644
index df0c3f3df8..0000000000
--- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CudaRuntime.h
+++ /dev/null
@@ -1,85 +0,0 @@
-// Copyright (C) 2020-2023 CERN and UCLouvain.
-// Licensed under the GNU Lesser General Public License (version 3 or later).
-// Created by: S. Roiser (Jul 2020) for the MG5aMC CUDACPP plugin.
-// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
- -#ifndef MG5AMC_CUDARUNTIME_H -#define MG5AMC_CUDARUNTIME_H 1 - -// MG5AMC on GPU uses the CUDA runtime API, not the lower level CUDA driver API -// See https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html#driver-vs-runtime-api - -#include -#include - -//-------------------------------------------------------------------------- - -// See https://stackoverflow.com/a/14038590 -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ -#define checkCuda( code ) { assertCuda( code, __FILE__, __LINE__ ); } -inline void assertCuda( cudaError_t code, const char* file, int line, bool abort = true ) -{ - if( code != cudaSuccess ) - { - printf( "ERROR! assertCuda: '%s' (%d) in %s:%d\n", cudaGetErrorString( code ), code, file, line ); - if( abort ) assert( code == cudaSuccess ); - } -} -#endif /* clang-format on */ - -//-------------------------------------------------------------------------- - -#ifdef MGONGPUCPP_GPUIMPL -namespace mg5amcGpu -{ - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - // *** FIXME! This will all need to be designed differently when going to multi-GPU nodes! *** - struct CudaRuntime final - { - CudaRuntime( const bool debug = true ) - : m_debug( debug ) { setUp( m_debug ); } - ~CudaRuntime() { tearDown( m_debug ); } - CudaRuntime( const CudaRuntime& ) = delete; - CudaRuntime( CudaRuntime&& ) = delete; - CudaRuntime& operator=( const CudaRuntime& ) = delete; - CudaRuntime& operator=( CudaRuntime&& ) = delete; - bool m_debug; - - // Set up CUDA application - // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** - // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization - static void setUp( const bool debug = true ) - { - // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization - // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! - /* - // [We initially added cudaFree(0) to "ease profile analysis" only because it shows up as a big recognizable block!] - // No explicit initialization is needed: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#initialization - // It is not clear what cudaFree(0) does at all: https://stackoverflow.com/questions/69967813/ - if ( debug ) std::cout << "__CudaRuntime: calling cudaFree(0)" << std::endl; - checkCuda( cudaFree( 0 ) ); // SLOW! - */ - // Replace cudaFree(0) by cudaSetDevice(0), even if it is not really needed either - // (but see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs) - if( debug ) std::cout << "__CudaRuntime: calling cudaSetDevice(0)" << std::endl; - checkCuda( cudaSetDevice( 0 ) ); // SLOW! 
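// (For reference, a minimal sketch of how the deleted checkCuda/assertCuda
// machinery is assumed to reappear in gpu-agnostic form in GpuRuntime.h,
// built on the gpuError_t/gpuSuccess/gpuGetErrorString aliases from
// GpuAbstraction.h; the exact implementation is not shown in this patch:)
//   #define checkGpu( code ) { assertGpu( code, __FILE__, __LINE__ ); }
//   inline void assertGpu( gpuError_t code, const char* file, int line, bool abort = true )
//   {
//     if( code != gpuSuccess )
//     {
//       printf( "ERROR! assertGpu: '%s' (%d) in %s:%d\n", gpuGetErrorString( code ), code, file, line );
//       if( abort ) assert( code == gpuSuccess );
//     }
//   }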
- } - - // Tear down CUDA application (call cudaDeviceReset) - // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** - // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck - // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking - static void tearDown( const bool debug = true ) - { - if( debug ) std::cout << "__CudaRuntime: calling cudaDeviceReset()" << std::endl; - checkCuda( cudaDeviceReset() ); - } - }; - -} -#endif - -//-------------------------------------------------------------------------- - -#endif // MG5AMC_CUDARUNTIME_H diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/check_sa.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/check_sa.cc index 9a39220077..491dfc02e1 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/check_sa.cc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/check_sa.cc @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -102,12 +103,12 @@ main( int argc, char** argv ) CurandHost = 1, CurandDevice = 2 }; -#ifdef MGONGPUCPP_GPUIMPL - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU +#ifdef __CUDACC__ + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on CUDA GPU #elif not defined MGONGPU_HAS_NO_CURAND RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #else - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on CPU if build has no curand + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // default on CPU if build has no curand and on HIP GPU #endif // Rambo sampling mode (NB RamboHost implies CommonRandom or CurandHost!) enum class RamboSamplingMode @@ -145,10 +146,10 @@ main( int argc, char** argv ) } else if( arg == "--curdev" ) { -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ rndgen = RandomNumberMode::CurandDevice; #else - throw std::runtime_error( "CurandDevice is not supported on CPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs or on HIP GPUs" ); #endif } else if( arg == "--curhst" ) @@ -265,12 +266,12 @@ main( int argc, char** argv ) #ifdef MGONGPUCPP_GPUIMPL - // --- 00. Initialise cuda - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - const std::string cdinKey = "00 CudaInit"; + // --- 00. Initialise GPU + // Instantiate a GpuRuntime at the beginnining of the application's main. + // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. + const std::string cdinKey = "00 GpuInit"; timermap.start( cdinKey ); - CudaRuntime cudaRuntime( debug ); + GpuRuntime GpuRuntime( debug ); #endif // --- 0a. 
Initialise physics process @@ -394,7 +395,7 @@ main( int argc, char** argv ) const bool onDevice = false; prnk.reset( new CurandRandomNumberKernel( hstRndmom, onDevice ) ); } -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ else { const bool onDevice = true; @@ -403,7 +404,7 @@ main( int argc, char** argv ) #else else { - throw std::logic_error( "CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "CurandDevice is not supported on CPUs or HIP GPUs" ); // INTERNAL ERROR (no path to this statement) } #endif #else @@ -729,17 +730,21 @@ main( int argc, char** argv ) rndgentxt = "CURAND HOST"; else if( rndgen == RandomNumberMode::CurandDevice ) rndgentxt = "CURAND DEVICE"; -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? -#ifdef MGONGPUCPP_GPUIMPL + // -- CUDA or HIP or C++? +#ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; #endif @@ -754,7 +759,7 @@ main( int argc, char** argv ) wrkflwtxt += "???+"; // no path to this statement #endif // -- CUCOMPLEX or THRUST or STD complex numbers? -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX wrkflwtxt += "CUX:"; #elif defined MGONGPU_CUCXTYPE_THRUST @@ -764,6 +769,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_CUCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -864,8 +875,10 @@ main( int argc, char** argv ) #endif // Dump all configuration parameters and all results std::cout << std::string( SEP79, '*' ) << std::endl -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -892,14 +905,14 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef MGONGPUCPP_GPUIMPL #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) @@ -1033,14 +1046,14 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef MGONGPUCPP_GPUIMPL #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... 
#endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuConfig.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuConfig.h index 5b04029787..1811de4699 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuConfig.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuConfig.h @@ -6,21 +6,31 @@ #ifndef MGONGPUCONFIG_H #define MGONGPUCONFIG_H 1 -#include "GpuRuntime.h" // Includes the GPU abstraction - // HARDCODED AT CODE GENERATION TIME: DO NOT MODIFY (#473) // There are two different code bases for standalone_cudacpp (without multichannel) and madevent+cudacpp (with multichannel) %(mgongpu_supports_multichannel)s +// Is this a GPU (CUDA, HIP) or CPU implementation? +#ifdef __CUDACC__ +#define MGONGPUCPP_GPUIMPL cuda +#elif defined __HIPCC__ +#define MGONGPUCPP_GPUIMPL hip +#else +#undef MGONGPUCPP_GPUIMPL +#endif + // ** NB1 Throughputs (e.g. 6.8E8) are events/sec for "./gcheck.exe -p 65536 128 12" // ** NB2 Baseline on b7g47n0004 fluctuates (probably depends on load on other VMs) // Choose if curand is supported for generating random numbers +// For CUDA, by default, it is supported +// For HIP, by default, it is not supported // For C++, by default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_CURAND -#ifdef MGONGPUCPP_CUDACC +#ifdef __CUDACC__ #undef MGONGPU_HAS_NO_CURAND -#elif defined MGONGPUCPP_HIPCC +#elif defined __HIPCC__ #define MGONGPU_HAS_NO_CURAND 1 +#else //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 #endif @@ -55,23 +65,28 @@ //#undef MGONGPU_HARDCODE_PARAM // default ////#define MGONGPU_HARDCODE_PARAM 1 -// Complex type in c++: std::complex or cxsmpl (CHOOSE ONLY ONE) -#ifndef MGONGPUCPP_GPUIMPL -//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) -#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) -#endif - -// Complex type in cuda: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) -#ifdef MGONGPUCPP_GPUIMPL +// Complex type in CUDA: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) +#ifdef __CUDACC__ #define MGONGPU_CUCXTYPE_THRUST 1 // default (~1.15E9/double, ~3.2E9/float) //#define MGONGPU_CUCXTYPE_CUCOMPLEX 1 // ~10 percent slower (1.03E9/double, ~2.8E9/float) //#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) + +// Complex type in HIP: cxsmpl (ONLY ONE OPTION POSSIBLE) +#elif defined __HIPCC__ +#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) + +// Complex type in C++: std::complex or cxsmpl (CHOOSE ONLY ONE) +#else +//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) +#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif -// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation -#ifdef MGONGPUCPP_GPUIMPL -#undef MGONGPU_NSIGHT_DEBUG // default +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +#ifdef __CUDACC__ +#undef MGONGPU_NSIGHT_DEBUG // default in CUDA //#define MGONGPU_NSIGHT_DEBUG 1 +#else +#undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif // SANITY CHECKS (floating point precision for everything but 
color algebra #537) @@ -87,17 +102,21 @@ #error You cannot use double precision for color algebra and single precision elsewhere #endif -// SANITY CHECKS (c++ complex number implementation) -#ifndef MGONGPUCPP_GPUIMPL -#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL +// SANITY CHECKS (CUDA complex number implementation) +#ifdef __CUDACC__ +#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX for CUDA +#elif defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CXSMPL for CUDA +#elif defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE OF MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL for CUDA #endif #endif -// SANITY CHECKS (cuda complex number implementation) -#ifdef MGONGPUCPP_GPUIMPL -#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL +// SANITY CHECKS (C++ complex number implementation) +#ifndef MGONGPUCPP_GPUIMPL +#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL for C++ #endif #endif @@ -146,7 +165,7 @@ using mgOnGpu::fptype; using mgOnGpu::fptype2; // C++ SIMD vectorization width (this will be used to set neppV) -#ifdef MGONGPUCPP_GPUIMPL // CUDA implementation has no SIMD +#ifdef MGONGPUCPP_GPUIMPL // CUDA and HIP implementations have no SIMD #undef MGONGPU_CPPSIMD #elif defined __AVX512VL__ && defined MGONGPU_PVW512 // C++ "512z" AVX512 with 512 width (512-bit ie 64-byte): 8 (DOUBLE) or 16 (FLOAT) #ifdef MGONGPU_FPTYPE_DOUBLE @@ -176,9 +195,9 @@ using mgOnGpu::fptype2; #undef MGONGPU_CPPSIMD #endif -// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation // Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) -#if defined MGONGPUCPP_GPUIMPL && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */ +#if defined __CUDA__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */ #define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; #define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } #define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } @@ -190,7 +209,7 @@ using mgOnGpu::fptype2; #define mgDebugFinalise() { /*noop*/ } #endif /* clang-format on */ -// Define empty CUDA declaration specifiers for C++ +// Define empty CUDA/HIP declaration specifiers for C++ #ifndef MGONGPUCPP_GPUIMPL #define __global__ #define __host__ diff --git a/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuConfig.h index cacab1031a..ed3e219f8a 100644 --- a/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuConfig.h @@ -69,6 +69,8 @@ #ifdef __CUDACC__ #undef MGONGPU_NSIGHT_DEBUG // default //#define MGONGPU_NSIGHT_DEBUG 1 +#else +#undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif // SANITY CHECKS (floating 
point precision for everything but color algebra #537) diff --git a/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuVectors.h b/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuVectors.h index e1299ba81e..e91f5927d6 100644 --- a/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuVectors.h @@ -9,6 +9,8 @@ #include "mgOnGpuCxtypes.h" #include "mgOnGpuFptypes.h" +#include "GpuAbstraction.h" + #include //========================================================================== From 0f1b8115d006f512a5081586916498f54fe1b90c Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Tue, 18 Jul 2023 18:25:34 +0200 Subject: [PATCH 07/96] [jthip] complete backport to CODEGEN from ggttgg.mad, including a few improvements --- .../gpu/CurandRandomNumberKernel.cc | 4 +-- .../template_files/gpu/GpuAbstraction.h | 27 +++++-------------- .../iolibs/template_files/gpu/MemoryBuffers.h | 1 + .../template_files/gpu/mgOnGpuVectors.h | 2 -- .../iolibs/template_files/gpu/process_cc.inc | 1 - .../gpu/process_function_definitions.inc | 2 -- .../template_files/gpu/process_matrix.inc | 2 -- .../CUDACPP_SA_OUTPUT/model_handling.py | 20 +++++++------- .../SubProcesses/P1_gg_ttxgg/check_sa.cc | 4 ++- 9 files changed, 24 insertions(+), 39 deletions(-) diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CurandRandomNumberKernel.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CurandRandomNumberKernel.cc index 5b33207ad0..98ec214eaf 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CurandRandomNumberKernel.cc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CurandRandomNumberKernel.cc @@ -22,7 +22,7 @@ inline void assertCurand( curandStatus_t code, const char *file, int line, bool } #endif /* clang-format on */ -#ifdef MGONGPUCPP_CUDACC +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -36,7 +36,7 @@ namespace mg5amcCpu { if( m_isOnDevice ) { -#ifdef MGONGPUCPP_CUDACC +#ifdef __CUDACC__ if( !m_rnarray.isOnDevice() ) throw std::runtime_error( "CurandRandomNumberKernel on device with a host random number array" ); #else diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/GpuAbstraction.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/GpuAbstraction.h index 2f000e33d1..427c82c05d 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/GpuAbstraction.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/GpuAbstraction.h @@ -3,22 +3,10 @@ #include -#ifdef MGONGPUCPP_GPUIMPL -#define MGONGPUCPP_CUDACC 1 -#endif - -#ifdef __HIPCC__ -#include "hip/hip_runtime.h" -#define MGONGPUCPP_HIPCC 1 -#endif - -#ifdef MGONGPUCPP_CUDACC - -// Defines correct compiler -#define MGONGPUCPP_GPUIMPL MGONGPUCPP_GPUIMPL - //-------------------------------------------------------------------------- +#ifdef __CUDACC__ + #define gpuError_t cudaError_t #define gpuPeekAtLastError cudaPeekAtLastError #define gpuGetErrorString cudaGetErrorString @@ -44,12 +32,9 @@ //-------------------------------------------------------------------------- -#elif defined MGONGPUCPP_HIPCC - -// Defines correct compiler -#define MGONGPUCPP_GPUIMPL __HCC__ +#elif defined __HIPCC__ -//-------------------------------------------------------------------------- +#include "hip/hip_runtime.h" #define gpuError_t hipError_t 
#define gpuPeekAtLastError hipPeekAtLastError @@ -74,6 +59,8 @@ #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) #define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_ARGS__ ) +//-------------------------------------------------------------------------- + #endif -#endif // MG5AMC_GPUABSTRACTION_H \ No newline at end of file +#endif // MG5AMC_GPUABSTRACTION_H diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryBuffers.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryBuffers.h index d6ba45dcad..522e6ce100 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryBuffers.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryBuffers.h @@ -11,6 +11,7 @@ #include "mgOnGpuCxtypes.h" #include "CPPProcess.h" +#include "GpuRuntime.h" #include "Parameters_%(model_name)s.h" #include diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuVectors.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuVectors.h index de12c1d24f..fbfe68f6c1 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuVectors.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuVectors.h @@ -9,8 +9,6 @@ #include "mgOnGpuCxtypes.h" #include "mgOnGpuFptypes.h" -#include "GpuAbstraction.h" - #include //========================================================================== diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_cc.inc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_cc.inc index 9dceb45708..95400f42db 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_cc.inc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_cc.inc @@ -14,7 +14,6 @@ #include "mgOnGpuConfig.h" -#include "GpuRuntime.h" %(hel_amps_h)s #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_function_definitions.inc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_function_definitions.inc index d4e999733f..aa8f899798 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_function_definitions.inc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_function_definitions.inc @@ -10,8 +10,6 @@ // Class member functions for calculating the matrix elements for %(process_lines)s -#include "GpuRuntime.h" - #ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_matrix.inc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_matrix.inc index 241c50a9d1..3cfbf668ca 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_matrix.inc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_matrix.inc @@ -7,8 +7,6 @@ ! Further modified by: A. 
Valassi (2021-2023) for the MG5aMC CUDACPP plugin. !========================================================================== -#include "GpuAbstraction.h" - // *** COLOR CHOICE BELOW *** // Store the leading color flows for choice of color if( jamp2_sv ) // disable color choice if nullptr diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/model_handling.py b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/model_handling.py index 8bf85c5a55..abfd2428b6 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/model_handling.py +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/model_handling.py @@ -1092,7 +1092,7 @@ def get_process_function_definitions(self, write=True): %(len(coupling_indep), ' ), cxmake( m_pars->'.join(coupling_indep)) # AV only indep! replace_dict['cipcdevice'] = '__device__ __constant__ fptype cIPC[%i];'%(2*len(coupling_indep)) replace_dict['cipcstatic'] = 'static fptype cIPC[%i];'%(2*len(coupling_indep)) - replace_dict['cipc2tipcSym'] = 'checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, %i * sizeof( cxtype ) ) );'%len(coupling_indep) + replace_dict['cipc2tipcSym'] = 'gpuMemcpyToSymbol( cIPC, tIPC, %i * sizeof( cxtype ) );'%len(coupling_indep) replace_dict['cipc2tipc'] = 'memcpy( cIPC, tIPC, %i * sizeof( cxtype ) );'%len(coupling_indep) replace_dict['cipcdump'] = '\n //for ( i=0; i<%i; i++ ) std::cout << std::setprecision(17) << "tIPC[i] = " << tIPC[i] << std::endl;'%len(coupling_indep) coup_str_hrd = '__device__ const fptype cIPC[%s] = { ' % (len(coupling_indep)*2) @@ -1103,7 +1103,7 @@ def get_process_function_definitions(self, write=True): replace_dict['cipcassign'] = '//const cxtype tIPC[0] = { ... }; // nicoup=0' replace_dict['cipcdevice'] = '__device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0' replace_dict['cipcstatic'] = 'static fptype* cIPC = nullptr; // unused as nicoup=0' - replace_dict['cipc2tipcSym'] = '//checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, %i * sizeof( cxtype ) ) ); // nicoup=0'%len(coupling_indep) + replace_dict['cipc2tipcSym'] = '//gpuMemcpyToSymbol( cIPC, tIPC, %i * sizeof( cxtype ) ); // nicoup=0'%len(coupling_indep) replace_dict['cipc2tipc'] = '//memcpy( cIPC, tIPC, %i * sizeof( cxtype ) ); // nicoup=0'%len(coupling_indep) replace_dict['cipcdump'] = '' replace_dict['cipchrdcod'] = '__device__ const fptype* cIPC = nullptr; // unused as nicoup=0' @@ -1112,7 +1112,7 @@ def get_process_function_definitions(self, write=True): %(len(params), ', (fptype)m_pars->'.join(params)) replace_dict['cipddevice'] = '__device__ __constant__ fptype cIPD[%i];'%(len(params)) replace_dict['cipdstatic'] = 'static fptype cIPD[%i];'%(len(params)) - replace_dict['cipd2tipdSym'] = 'checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, %i * sizeof( fptype ) ) );'%len(params) + replace_dict['cipd2tipdSym'] = 'gpuMemcpyToSymbol( cIPD, tIPD, %i * sizeof( fptype ) );'%len(params) replace_dict['cipd2tipd'] = 'memcpy( cIPD, tIPD, %i * sizeof( fptype ) );'%len(params) replace_dict['cipddump'] = '\n //for ( i=0; i<%i; i++ ) std::cout << std::setprecision(17) << "tIPD[i] = " << tIPD[i] << std::endl;'%len(params) param_str_hrd = '__device__ const fptype cIPD[%s] = { ' % len(params) @@ -1123,7 +1123,7 @@ def get_process_function_definitions(self, write=True): replace_dict['cipdassign'] = '//const fptype tIPD[0] = { ... 
}; // nparam=0' replace_dict['cipddevice'] = '//__device__ __constant__ fptype* cIPD = nullptr; // unused as nparam=0' replace_dict['cipdstatic'] = '//static fptype* cIPD = nullptr; // unused as nparam=0' - replace_dict['cipd2tipdSym'] = '//checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, %i * sizeof( fptype ) ) ); // nparam=0'%len(params) + replace_dict['cipd2tipdSym'] = '//gpuMemcpyToSymbol( cIPD, tIPD, %i * sizeof( fptype ) ); // nparam=0'%len(params) replace_dict['cipd2tipd'] = '//memcpy( cIPD, tIPD, %i * sizeof( fptype ) ); // nparam=0'%len(params) replace_dict['cipddump'] = '' replace_dict['cipdhrdcod'] = '//__device__ const fptype* cIPD = nullptr; // unused as nparam=0' @@ -1195,13 +1195,13 @@ def get_all_sigmaKin_lines(self, color_amplitudes, class_name): fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -1228,7 +1228,7 @@ def get_all_sigmaKin_lines(self, color_amplitudes, class_name): #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( \"calculate_wavefunctions: ihel=%2d\\n\", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( \"calculate_wavefunctions: ievt00=%d\\n\", ievt00 ); #endif""") nwavefuncs = self.matrix_elements[0].get_number_of_wavefunctions() @@ -1265,7 +1265,7 @@ def get_all_sigmaKin_lines(self, color_amplitudes, class_name): #endif for( int iParity = 0; iParity < nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif""") ret_lines += helas_calls @@ -1665,8 +1665,10 @@ def super_get_matrix_element_calls(self, matrix_element, color_amplitudes, multi allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -1782,7 +1784,7 @@ def get_external(self, wf, argument): split_line2 = [ str.lstrip(' ').rstrip(' ') for str in split_line2] # AV split_line2.insert(2, '0') # add parameter fmass=0 line2 = ', '.join(split_line2) - text = '#if not( defined __CUDACC__ and defined MGONGPU_TEST_DIVERGENCE )\n %s\n#else\n if( ( blockDim.x * blockIdx.x + threadIdx.x ) %% 2 == 0 )\n %s\n else\n %s\n#endif\n' # AV + text = '#if not( defined MGONGPUCPP_GPUIMPL and defined MGONGPU_TEST_DIVERGENCE )\n %s\n#else\n if( ( blockDim.x * blockIdx.x + threadIdx.x ) %% 2 == 0 )\n %s\n else\n %s\n#endif\n' # AV return text % (line, line, line2) text = '%s\n' # AV return text % line diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/check_sa.cc 
b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/check_sa.cc index f5f08dc64e..ceed439cb0 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/check_sa.cc +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/check_sa.cc @@ -740,6 +740,8 @@ main( int argc, char** argv ) // -- CUDA or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; #endif @@ -1040,7 +1042,7 @@ main( int argc, char** argv ) << "\"THRUST::COMPLEX\"," << std::endl #endif #else - << "\"STD::COMPLEX\"," << std::endl + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" From 71ff5e2e63d7ff1310f07604f32ea9980e424c46 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Tue, 18 Jul 2023 19:06:32 +0200 Subject: [PATCH 08/96] [jthip] in CODEGEN, remove the copying to src of GpuRuntime.h and GpuAbstraction.h --- epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/output.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/output.py b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/output.py index a947f262b0..d97ab3b4de 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/output.py +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/output.py @@ -86,7 +86,7 @@ class PLUGIN_ProcessExporter(PLUGIN_export_cpp.ProcessExporterGPU): 'CMake': [s+'CMake/Compilers.txt', s+'CMake/Platforms.txt', s+'CMake/Macros.txt'], 'src': [s+'gpu/rambo.h', s+'read_slha.h', s+'read_slha.cc', s+'gpu/mgOnGpuFptypes.h', s+'gpu/mgOnGpuCxtypes.h', s+'gpu/mgOnGpuVectors.h', - s+'CMake/src/CMakeLists.txt', s+'gpu/GpuRuntime.h', s+'gpu/GpuAbstraction.h'], + s+'CMake/src/CMakeLists.txt' ], 'SubProcesses': [s+'gpu/nvtx.h', s+'gpu/timer.h', s+'gpu/timermap.h', s+'gpu/ompnumthreads.h', s+'gpu/GpuRuntime.h', s+'gpu/GpuAbstraction.h', s+'gpu/MemoryAccessHelpers.h', s+'gpu/MemoryAccessVectors.h', From a37fb41ac45b3d66c42436d68467521d4b1f6281 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Tue, 18 Jul 2023 19:24:36 +0200 Subject: [PATCH 09/96] [jthip] In CODEGEN, acknowledge Joergen in each file and in COPYRIGHT/AUTHORS --- .../CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/AUTHORS | 2 +- .../madgraph/iolibs/template_files/COPYRIGHT | 1 + .../iolibs/template_files/cpp_model_parameters_cc.inc | 2 +- .../madgraph/iolibs/template_files/gpu/Bridge.h | 2 +- .../madgraph/iolibs/template_files/gpu/BridgeKernels.cc | 2 +- .../madgraph/iolibs/template_files/gpu/BridgeKernels.h | 2 +- .../iolibs/template_files/gpu/CommonRandomNumberKernel.cc | 2 +- .../iolibs/template_files/gpu/CrossSectionKernels.cc | 2 +- .../madgraph/iolibs/template_files/gpu/CrossSectionKernels.h | 2 +- .../iolibs/template_files/gpu/CurandRandomNumberKernel.cc | 2 +- .../madgraph/iolibs/template_files/gpu/EventStatistics.h | 2 +- .../madgraph/iolibs/template_files/gpu/GpuAbstraction.h | 5 +++++ .../madgraph/iolibs/template_files/gpu/GpuRuntime.h | 5 +++++ .../madgraph/iolibs/template_files/gpu/MadgraphTest.h | 2 +- .../iolibs/template_files/gpu/MatrixElementKernels.cc | 2 +- .../iolibs/template_files/gpu/MatrixElementKernels.h | 2 +- .../madgraph/iolibs/template_files/gpu/MemoryAccessHelpers.h | 2 +- .../madgraph/iolibs/template_files/gpu/MemoryAccessMomenta.h | 2 +- .../iolibs/template_files/gpu/MemoryAccessRandomNumbers.h | 2 +- .../madgraph/iolibs/template_files/gpu/MemoryAccessVectors.h | 2 +- .../madgraph/iolibs/template_files/gpu/MemoryBuffers.h | 2 +- 
.../iolibs/template_files/gpu/RamboSamplingKernels.cc | 2 +- .../iolibs/template_files/gpu/RamboSamplingKernels.h | 2 +- .../madgraph/iolibs/template_files/gpu/RandomNumberKernels.h | 2 +- .../madgraph/iolibs/template_files/gpu/check_sa.cc | 2 +- .../madgraph/iolibs/template_files/gpu/cpp_hel_amps_h.inc | 2 +- .../madgraph/iolibs/template_files/gpu/cudacpp.mk | 2 +- .../madgraph/iolibs/template_files/gpu/fbridge.cc | 2 +- .../madgraph/iolibs/template_files/gpu/fsampler.cc | 2 +- .../madgraph/iolibs/template_files/gpu/mgOnGpuConfig.h | 2 +- .../madgraph/iolibs/template_files/gpu/mgOnGpuCxtypes.h | 2 +- .../madgraph/iolibs/template_files/gpu/mgOnGpuFptypes.h | 2 +- .../madgraph/iolibs/template_files/gpu/process_cc.inc | 2 +- .../template_files/gpu/process_function_definitions.inc | 2 +- .../madgraph/iolibs/template_files/gpu/process_h.inc | 2 +- .../madgraph/iolibs/template_files/gpu/process_matrix.inc | 2 +- .../iolibs/template_files/gpu/process_sigmaKin_function.inc | 2 +- .../madgraph/iolibs/template_files/gpu/rambo.h | 2 +- .../madgraph/iolibs/template_files/gpu/runTest.cc | 2 +- .../madgraph/iolibs/template_files/gpu/testmisc.cc | 2 +- .../madgraph/iolibs/template_files/gpu/testxxx.cc | 2 +- .../CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/model_handling.py | 2 +- epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/output.py | 2 +- 43 files changed, 51 insertions(+), 40 deletions(-) diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/AUTHORS b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/AUTHORS index 8541e954b9..0aeb2c8a87 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/AUTHORS +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/AUTHORS @@ -10,6 +10,7 @@ generates includes the following authors: Stephan Hageboeck (CERN) Olivier Mattelaer (Universite Catholique de Louvain, original author) Stefan Roiser (CERN, original author) + Joergen Teig (CERN) Andrea Valassi (CERN, original author) Zenny Wettersten (CERN) @@ -28,5 +29,4 @@ acknowledged collaboration with the following collaborators: Taran Singhania (PES University Bangalore) David Smith (CERN) Carl Vuosalo (University of Wisconsin-Madison) - Joergen Teig (CERN) diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/COPYRIGHT b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/COPYRIGHT index a134b5fef9..84a883fbb0 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/COPYRIGHT +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/COPYRIGHT @@ -15,6 +15,7 @@ The full development team currently includes the following authors : Stephan Hageboeck (CERN) Olivier Mattelaer (Universite Catholique de Louvain, original author) Stefan Roiser (CERN, original author) + Joergen Teig (CERN) Andrea Valassi (CERN, original author) Zenny Wettersten (CERN) See https://github.com/madgraph5/madgraph4gpu for more details. 
For the full diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/cpp_model_parameters_cc.inc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/cpp_model_parameters_cc.inc index 3c231bdbd6..05b664981d 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/cpp_model_parameters_cc.inc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/cpp_model_parameters_cc.inc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by %(info_lines)s diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/Bridge.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/Bridge.h index bcdfe29154..89437b4c42 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/Bridge.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/Bridge.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef BRIDGE_H #define BRIDGE_H 1 diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/BridgeKernels.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/BridgeKernels.cc index 6034db93ec..eaf4037a24 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/BridgeKernels.cc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/BridgeKernels.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "BridgeKernels.h" diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/BridgeKernels.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/BridgeKernels.h index 7c7feb692a..3efef8ce97 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/BridgeKernels.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/BridgeKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. 
Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef BRIDGEKERNELS_H #define BRIDGEKERNELS_H 1 diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CommonRandomNumberKernel.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CommonRandomNumberKernel.cc index f17b9c0ad7..010bc4cbd0 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CommonRandomNumberKernel.cc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CommonRandomNumberKernel.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "CommonRandomNumbers.h" #include "GpuAbstraction.h" diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CrossSectionKernels.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CrossSectionKernels.cc index 36ca2a94d4..c15b39844d 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CrossSectionKernels.cc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CrossSectionKernels.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "CrossSectionKernels.h" diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CrossSectionKernels.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CrossSectionKernels.h index ff2350a14d..4d9659e04e 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CrossSectionKernels.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CrossSectionKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef CROSSSECTIONKERNELS_H #define CROSSSECTIONKERNELS_H 1 diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CurandRandomNumberKernel.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CurandRandomNumberKernel.cc index 98ec214eaf..38c477c17a 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CurandRandomNumberKernel.cc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CurandRandomNumberKernel.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. 
Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "GpuRuntime.h" #include "MemoryBuffers.h" diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/EventStatistics.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/EventStatistics.h index e7d7f3b3c3..b425a5bade 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/EventStatistics.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/EventStatistics.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef EventStatistics_H #define EventStatistics_H 1 diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/GpuAbstraction.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/GpuAbstraction.h index 427c82c05d..6a7d9c05c0 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/GpuAbstraction.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/GpuAbstraction.h @@ -1,3 +1,8 @@ +// Copyright (C) 2020-2023 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: J. Teig (Jul 2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. + #ifndef MG5AMC_GPUABSTRACTION_H #define MG5AMC_GPUABSTRACTION_H 1 diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/GpuRuntime.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/GpuRuntime.h index 895a662e52..93579ef08b 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/GpuRuntime.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/GpuRuntime.h @@ -1,3 +1,8 @@ +// Copyright (C) 2020-2023 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: J. Teig (Jun 2023, based on earlier work by S. Roiser) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. + #ifndef MG5AMC_GPURUNTIME_H #define MG5AMC_GPURUNTIME_H 1 diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MadgraphTest.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MadgraphTest.h index 3fa9f13a82..176338151a 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MadgraphTest.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MadgraphTest.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Dec 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. 
Hageboeck, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #ifndef MADGRAPHTEST_H_ #define MADGRAPHTEST_H_ 1 diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.cc index a9e20e114f..d6d6c4f179 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.cc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "MatrixElementKernels.h" diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.h index 4477a385ed..72bd8f195b 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef MATRIXELEMENTKERNELS_H #define MATRIXELEMENTKERNELS_H 1 diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessHelpers.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessHelpers.h index 67306c3922..db73e4e064 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessHelpers.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessHelpers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessHelpers_H #define MemoryAccessHelpers_H 1 diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessMomenta.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessMomenta.h index dc4bb2aa22..d3f5a15bd2 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessMomenta.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessMomenta.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. 
// Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessMomenta_H #define MemoryAccessMomenta_H 1 diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessRandomNumbers.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessRandomNumbers.h index 949a42066d..40cb089135 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessRandomNumbers.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessRandomNumbers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessRandomNumbers_H #define MemoryAccessRandomNumbers_H 1 diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessVectors.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessVectors.h index a9ae26b6dc..08faccff0f 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessVectors.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessVectors.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessVectors_H #define MemoryAccessVectors_H 1 diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryBuffers.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryBuffers.h index 522e6ce100..f29b8c5357 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryBuffers.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryBuffers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021, based on earlier work by S. Hageboeck) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#ifndef MemoryBuffers_H #define MemoryBuffers_H 1 diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/RamboSamplingKernels.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/RamboSamplingKernels.cc index 8745b084d3..79abbcc4f8 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/RamboSamplingKernels.cc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/RamboSamplingKernels.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "RamboSamplingKernels.h" diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/RamboSamplingKernels.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/RamboSamplingKernels.h index fe63a7bb77..7c214cd74b 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/RamboSamplingKernels.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/RamboSamplingKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef RAMBOSAMPLINGKERNELS_H #define RAMBOSAMPLINGKERNELS_H 1 diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/RandomNumberKernels.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/RandomNumberKernels.h index 0c215f2583..21d63beeac 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/RandomNumberKernels.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/RandomNumberKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef RANDOMNUMBERKERNELS_H #define RANDOMNUMBERKERNELS_H 1 diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/check_sa.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/check_sa.cc index 491dfc02e1..734c2f83f8 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/check_sa.cc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. 
Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cpp_hel_amps_h.inc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cpp_hel_amps_h.inc index 594fb770c5..b9840f1374 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cpp_hel_amps_h.inc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cpp_hel_amps_h.inc @@ -5,7 +5,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for %(output_name)s by %(info_lines)s diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk index f024f15ce7..699ce2c4e0 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk @@ -1,7 +1,7 @@ # Copyright (C) 2020-2023 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +# Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: different names (e.g. cudacpp.mk and cudacpp_src.mk) are used in the Subprocess and src directories diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/fbridge.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/fbridge.cc index 592a8c74bb..22ce3f5115 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/fbridge.cc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/fbridge.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Oct 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#include "Bridge.h" #include "CPPProcess.h" diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/fsampler.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/fsampler.cc index acffa7c19e..3743934f41 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/fsampler.cc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/fsampler.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Feb 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "mgOnGpuConfig.h" diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuConfig.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuConfig.h index 1811de4699..35ad042b75 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuConfig.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuConfig.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUCONFIG_H #define MGONGPUCONFIG_H 1 diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuCxtypes.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuCxtypes.h index b5e1f1a495..687d449117 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuCxtypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022, based on earlier work by D. Smith) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUCXTYPES_H #define MGONGPUCXTYPES_H 1 diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuFptypes.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuFptypes.h index d9a955c235..83a46c1d4e 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuFptypes.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuFptypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
+// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUFPTYPES_H #define MGONGPUFPTYPES_H 1 diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_cc.inc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_cc.inc index 95400f42db..815fd8d5b7 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_cc.inc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_cc.inc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by %(info_lines)s diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_function_definitions.inc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_function_definitions.inc index aa8f899798..be10dba1de 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_function_definitions.inc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_function_definitions.inc @@ -4,7 +4,7 @@ ! Copyright (C) 2020-2023 CERN and UCLouvain. ! Licensed under the GNU Lesser General Public License (version 3 or later). ! Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -! Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +! Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. !========================================================================== //========================================================================== // Class member functions for calculating the matrix elements for diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_h.inc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_h.inc index 8a9de336f2..2c3adf57e2 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_h.inc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_h.inc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
//========================================================================== // This file has been automatically generated for CUDA/C++ standalone by %(info_lines)s diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_matrix.inc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_matrix.inc index 3cfbf668ca..960f029d8d 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_matrix.inc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_matrix.inc @@ -4,7 +4,7 @@ ! Copyright (C) 2020-2023 CERN and UCLouvain. ! Licensed under the GNU Lesser General Public License (version 3 or later). ! Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -! Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +! Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. !========================================================================== // *** COLOR CHOICE BELOW *** diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_sigmaKin_function.inc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_sigmaKin_function.inc index 59c1623c5a..b84a96d6ec 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_sigmaKin_function.inc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_sigmaKin_function.inc @@ -4,7 +4,7 @@ ! Copyright (C) 2020-2023 CERN and UCLouvain. ! Licensed under the GNU Lesser General Public License (version 3 or later). ! Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -! Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +! Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. !========================================================================== #include "GpuAbstraction.h" diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/rambo.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/rambo.h index 3a331b979a..cd7e1008ea 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/rambo.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/rambo.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
//========================================================================== #include "mgOnGpuConfig.h" diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/runTest.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/runTest.cc index 6f20a7248a..7f8d6ffd12 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/runTest.cc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/runTest.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/testmisc.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/testmisc.cc index 5d00e2c06c..8f3480c45f 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/testmisc.cc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/testmisc.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*misc to run only testmisc.cc tests //---------------------------------------------------------------------------- diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/testxxx.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/testxxx.cc index 6f8736c120..ffaf9ad005 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/testxxx.cc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/testxxx.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Apr 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
//---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/model_handling.py b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/model_handling.py index abfd2428b6..1a38085af9 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/model_handling.py +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/model_handling.py @@ -1,7 +1,7 @@ # Copyright (C) 2020-2023 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: O. Mattelaer (Sep 2021) for the MG5aMC CUDACPP plugin. -# Further modified by: O. Mattelaer, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +# Further modified by: O. Mattelaer, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. import os diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/output.py b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/output.py index d97ab3b4de..585d065c39 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/output.py +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/output.py @@ -1,7 +1,7 @@ # Copyright (C) 2020-2023 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: O. Mattelaer (Sep 2021) for the MG5aMC CUDACPP plugin. -# Further modified by: O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2021-2023) for the MG5aMC CUDACPP plugin. +# Further modified by: O. Mattelaer, S. Roiser, A. Valassi, J. Teig, Z. Wettersten (2021-2023) for the MG5aMC CUDACPP plugin. import os From 428aa50b5d76514535c638b6aad2dbe56756396b Mon Sep 17 00:00:00 2001 From: Jorgen T Date: Thu, 20 Jul 2023 14:41:55 +0200 Subject: [PATCH 10/96] [CODEGEN] Added HIP runtime include in mgOnGpuConfig.h in codegen --- .../madgraph/iolibs/template_files/gpu/GpuAbstraction.h | 2 -- .../madgraph/iolibs/template_files/gpu/mgOnGpuConfig.h | 1 + 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/GpuAbstraction.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/GpuAbstraction.h index 6a7d9c05c0..9c467b1e04 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/GpuAbstraction.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/GpuAbstraction.h @@ -39,8 +39,6 @@ #elif defined __HIPCC__ -#include "hip/hip_runtime.h" - #define gpuError_t hipError_t #define gpuPeekAtLastError hipPeekAtLastError #define gpuGetErrorString hipGetErrorString diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuConfig.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuConfig.h index 35ad042b75..8da9429de8 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuConfig.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuConfig.h @@ -15,6 +15,7 @@ #define MGONGPUCPP_GPUIMPL cuda #elif defined __HIPCC__ #define MGONGPUCPP_GPUIMPL hip +#include "hip/hip_runtime.h" #else #undef MGONGPUCPP_GPUIMPL #endif From 24fbbb6067e21cbbcc057ecb7d5b11f6b89a8922 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Tue, 25 Jul 2023 16:59:59 +0200 Subject: [PATCH 
11/96] [jthip/namespace] backport latest changes from ggttgg.mad to CODEGEN --- .../iolibs/template_files/cpp_model_parameters_cc.inc | 2 +- .../iolibs/template_files/cpp_model_parameters_h.inc | 6 +++--- .../iolibs/template_files/gpu/CurandRandomNumberKernel.cc | 4 ++-- .../iolibs/template_files/gpu/MemoryAccessAmplitudes.h | 2 +- .../iolibs/template_files/gpu/MemoryAccessCouplings.h | 2 +- .../template_files/gpu/MemoryAccessCouplingsFixed.h | 2 +- .../iolibs/template_files/gpu/MemoryAccessDenominators.h | 2 +- .../madgraph/iolibs/template_files/gpu/MemoryAccessGs.h | 2 +- .../template_files/gpu/MemoryAccessMatrixElements.h | 2 +- .../iolibs/template_files/gpu/MemoryAccessNumerators.h | 2 +- .../iolibs/template_files/gpu/MemoryAccessWavefunctions.h | 2 +- .../madgraph/iolibs/template_files/gpu/mgOnGpuCxtypes.h | 8 ++++---- .../madgraph/iolibs/template_files/gpu/mgOnGpuVectors.h | 2 +- .../madgraph/iolibs/template_files/gpu/testmisc.cc | 4 ++-- .../madgraph/iolibs/template_files/gpu/testxxx.cc | 6 +++--- 15 files changed, 24 insertions(+), 24 deletions(-) diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/cpp_model_parameters_cc.inc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/cpp_model_parameters_cc.inc index 05b664981d..54ce4c64cf 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/cpp_model_parameters_cc.inc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/cpp_model_parameters_cc.inc @@ -15,7 +15,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/cpp_model_parameters_h.inc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/cpp_model_parameters_h.inc index ef3d99d07c..5ab7aa7abd 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/cpp_model_parameters_h.inc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/cpp_model_parameters_h.inc @@ -25,7 +25,7 @@ #include "read_slha.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -85,7 +85,7 @@ namespace mg5amcCpu #include // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -155,7 +155,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CurandRandomNumberKernel.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CurandRandomNumberKernel.cc index 38c477c17a..08a16f6f2c 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CurandRandomNumberKernel.cc +++ 
b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CurandRandomNumberKernel.cc @@ -22,7 +22,7 @@ inline void assertCurand( curandStatus_t code, const char *file, int line, bool } #endif /* clang-format on */ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -36,7 +36,7 @@ namespace mg5amcCpu { if( m_isOnDevice ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !m_rnarray.isOnDevice() ) throw std::runtime_error( "CurandRandomNumberKernel on device with a host random number array" ); #else diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessAmplitudes.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessAmplitudes.h index 573b3bbbc9..ffb76e93de 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessAmplitudes.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessAmplitudes.h @@ -15,7 +15,7 @@ #define MGONGPU_TRIVIAL_AMPLITUDES 1 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessCouplings.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessCouplings.h index 1afc589b11..b4b76f3842 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessCouplings.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessCouplings.h @@ -15,7 +15,7 @@ #include "MemoryBuffers.h" // for HostBufferCouplings::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessCouplingsFixed.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessCouplingsFixed.h index dc0d93afff..ffcdf4dbef 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessCouplingsFixed.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessCouplingsFixed.h @@ -14,7 +14,7 @@ //#include "MemoryAccessHelpers.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessDenominators.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessDenominators.h index 3bce635718..66f2d32a6b 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessDenominators.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessDenominators.h @@ -10,7 +10,7 @@ #include "MemoryAccessGs.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types 
which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessGs.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessGs.h index 31311aa375..4c726b30f3 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessGs.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessGs.h @@ -13,7 +13,7 @@ #include "MemoryBuffers.h" // for HostBufferMatrixElements::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessMatrixElements.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessMatrixElements.h index f32e6fea5b..3741011971 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessMatrixElements.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessMatrixElements.h @@ -13,7 +13,7 @@ #include "MemoryBuffers.h" // for HostBufferMatrixElements::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessNumerators.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessNumerators.h index b152183b28..18991f4fa6 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessNumerators.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessNumerators.h @@ -10,7 +10,7 @@ #include "MemoryAccessGs.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessWavefunctions.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessWavefunctions.h index 5428aaf933..33bef4559e 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessWavefunctions.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessWavefunctions.h @@ -15,7 +15,7 @@ #define MGONGPU_TRIVIAL_WAVEFUNCTIONS 1 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuCxtypes.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuCxtypes.h 
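// (A minimal sketch of the recurring pattern in this backport, assuming only what the hunks above show: MGONGPUCPP_GPUIMPL is defined by mgOnGpuConfig.h under both nvcc and hipcc, so a single backend-neutral macro now selects the namespace for any GPU build.)
//   #ifdef MGONGPUCPP_GPUIMPL
//   namespace mg5amcGpu // CUDA and HIP builds
//   #else
//   namespace mg5amcCpu // C++-only builds
//   #endif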
index 687d449117..6ae0c42ecb 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuCxtypes.h @@ -82,7 +82,7 @@ namespace mgOnGpu /* clang-format off */ using mgOnGpu::cxsmpl; // Printout to stream for user defined types -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -209,7 +209,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -249,7 +249,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -627,7 +627,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuVectors.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuVectors.h index fbfe68f6c1..cdae04326b 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuVectors.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuVectors.h @@ -32,7 +32,7 @@ #endif // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/testmisc.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/testmisc.cc index 8f3480c45f..ba9e59a8a3 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/testmisc.cc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/testmisc.cc @@ -26,7 +26,7 @@ #define XTESTID( s ) TESTID( s ) // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -59,7 +59,7 @@ namespace mg5amcCpu TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testmisc ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/testxxx.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/testxxx.cc index ffaf9ad005..786cf10171 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/testxxx.cc +++ 
b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/testxxx.cc @@ -32,7 +32,7 @@ #define XTESTID( s ) TESTID( s ) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -42,7 +42,7 @@ namespace mg5amcCpu int FPEhandlerIevt = -1; inline void FPEhandler( int sig ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL std::cerr << "Floating Point Exception (GPU): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; #else std::cerr << "Floating Point Exception (CPU neppV=" << neppV << "): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; @@ -53,7 +53,7 @@ namespace mg5amcCpu TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; From 10df7037f3552bf533521cfb819907a07d584b57 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Tue, 25 Jul 2023 17:06:31 +0200 Subject: [PATCH 12/96] [jthip] in CODEGEN, backport also cudacpp_src.mk using GPUCC instead of NVCC --- .../iolibs/template_files/gpu/cudacpp_src.mk | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp_src.mk b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp_src.mk index dac2e47d1d..f3a26552db 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp_src.mk +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp_src.mk @@ -38,13 +38,13 @@ endif #------------------------------------------------------------------------------- -#=== Configure the CUDA compiler (note: NVCC is already exported including ccache) +#=== Configure the CUDA compiler (note: GPUCC is already exported including ccache) -###$(info NVCC=$(NVCC)) +###$(info GPUCC=$(GPUCC)) #------------------------------------------------------------------------------- -#=== Configure ccache for C++ builds (note: NVCC is already exported including ccache) +#=== Configure ccache for C++ builds (note: GPUCC is already exported including ccache) # Enable ccache if USECCACHE=1 ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) @@ -246,20 +246,20 @@ $(BUILDDIR)/%%.o : %%.cc *.h $(BUILDDIR)/.build.$(TAG) # Generic target and build rules: objects from CUDA compilation $(BUILDDIR)/%%_cu.o : %%.cc *.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@ + $(GPUCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@ #------------------------------------------------------------------------------- cxx_objects=$(addprefix $(BUILDDIR)/, Parameters_%(model)s.o read_slha.o) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) cu_objects=$(addprefix $(BUILDDIR)/, Parameters_%(model)s_cu.o) endif # Target (and build rules): common (src) library -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) $(cu_objects) @if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi - $(NVCC) -shared -o $@ $(cxx_objects) $(cu_objects) + $(GPUCC) -shared -o $@ $(cxx_objects) $(cu_objects) else $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) @if [ ! 
-d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi From 43e0c646fe37112ee40e0c6fac196f4d1414dce4 Mon Sep 17 00:00:00 2001 From: Jorgen T Date: Thu, 10 Aug 2023 15:10:26 +0200 Subject: [PATCH 13/96] [CODEGEN] Added changes from gg_ttgg.mad to code generator --- .../madgraph/iolibs/template_files/gpu/cudacpp.mk | 2 ++ .../madgraph/iolibs/template_files/gpu/cudacpp_src.mk | 11 +++++++++-- .../iolibs/template_files/gpu/mgOnGpuCxtypes.h | 2 +- 3 files changed, 12 insertions(+), 3 deletions(-) diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk index 699ce2c4e0..9fb389be2c 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk @@ -216,6 +216,8 @@ else ifeq ($(findstring hipcc,$(HIP_COMPILER_PATH)),hipcc) CUBUILDRULEFLAGS = -fPIC -c CCBUILDRULEFLAGS = -fPIC -c + export HIPARCHFLAGS + else ifneq ($(origin REQUIRE_HIP),undefined) # If REQUIRE_HIP is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) $(error No hip installation found (set HIP_HOME or make GPUCC visible in PATH)) diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp_src.mk b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp_src.mk index f3a26552db..d28c92ec13 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp_src.mk +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp_src.mk @@ -19,7 +19,7 @@ SHELL := /bin/bash #=== Configure common compiler flags for CUDA and C++ INCFLAGS = -I. -OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here +OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here #------------------------------------------------------------------------------- @@ -85,6 +85,13 @@ endif ###$(info OMPFLAGS=$(OMPFLAGS)) CXXFLAGS += $(OMPFLAGS) +# Add correct -DHIP_PLATFORM when compiling for HIP +ifeq ($(findstring nvcc,$(GPUCC)),nvcc) + GPUFLAGS += -Xcompiler -fPIC -c -x cu +else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) + GPUFLAGS += $(HIPARCHFLAGS) -DHIP_PLATFORM=amd -fPIC -c +endif + # Set the build flags appropriate to each AVX choice (example: "make AVX=none") # [NB MGONGPU_PVW512 is needed because "-mprefer-vector-width=256" is not exposed in a macro] # [See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=96476] @@ -246,7 +253,7 @@ $(BUILDDIR)/%%.o : %%.cc *.h $(BUILDDIR)/.build.$(TAG) # Generic target and build rules: objects from CUDA compilation $(BUILDDIR)/%%_cu.o : %%.cc *.h $(BUILDDIR)/.build.$(TAG) @if [ !
-d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(GPUCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $< -o $@ #------------------------------------------------------------------------------- diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuCxtypes.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuCxtypes.h index 6ae0c42ecb..46d9f02733 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuCxtypes.h @@ -92,7 +92,7 @@ namespace mg5amcCpu inline __host__ std::ostream& operator<<( std::ostream& out, const cxsmpl& c ) { - out << std::complex( c.real(), c.imag() ); + out << std::complex( c.real(), c.imag() ); return out; } From e99a2b87773025a9098e8ad3933cf408f01cedf2 Mon Sep 17 00:00:00 2001 From: Jorgen T Date: Thu, 10 Aug 2023 15:44:56 +0200 Subject: [PATCH 14/96] [CODEGEN] Added export of GPUCC and GPUFLAGS to codegen --- .../madgraph/iolibs/template_files/gpu/cudacpp.mk | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk index 9fb389be2c..e1d691f1d6 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk @@ -237,7 +237,9 @@ else ifeq ($(findstring hipcc,$(HIP_COMPILER_PATH)),hipcc) endif - +export GPUCC +export GPUFLAGS + #------------------------------------------------------------------------------- #=== Configure ccache for C++ and CUDA builds From 4adb62fe5a480b7d5aec864e0fea21e466b1e76e Mon Sep 17 00:00:00 2001 From: Jorgen T Date: Thu, 10 Aug 2023 17:20:16 +0200 Subject: [PATCH 15/96] Fixed warning and changed HIPARCHFLAGS export so it exports to cudacpp_src.mk --- .../madgraph/iolibs/template_files/gpu/cudacpp.mk | 10 +++++----- epochX/cudacpp/gg_ttgg.mad/SubProcesses/runTest.cc | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk index e1d691f1d6..b6703137aa 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk @@ -179,11 +179,11 @@ ifeq ($(findstring nvcc,$(CUDA_COMPILER_PATH)),nvcc) else ifeq ($(findstring hipcc,$(HIP_COMPILER_PATH)),hipcc) #=== Configure the HIP compiler - # If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) + # If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA/HIP builds (issue #505) # This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside - $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") - override CUDA_HOME=disabled + $(warning HIP builds are not supported for multi-word CXX "$(CXX)") + override HIP_HOME=disabled 
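# (Rough usage sketch, assuming the HIP_HOME semantics coded here: any value for which the later $(wildcard $(HIP_HOME)/bin/hipcc) check fails disables the HIP build.)
#   HIP_HOME=disabled make   # skip hipcc even if it is installed, forcing a CUDA or C++-only build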
endif # If HIP_HOME is not set, try to set it from the location of GPUCC @@ -216,8 +216,6 @@ else ifeq ($(findstring hipcc,$(HIP_COMPILER_PATH)),hipcc) CUBUILDRULEFLAGS = -fPIC -c CCBUILDRULEFLAGS = -fPIC -c - export HIPARCHFLAGS - else ifneq ($(origin REQUIRE_HIP),undefined) # If REQUIRE_HIP is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) $(error No hip installation found (set HIP_HOME or make GPUCC visible in PATH)) @@ -240,6 +238,8 @@ endif export GPUCC export GPUFLAGS +export HIPARCHFLAGS + #------------------------------------------------------------------------------- #=== Configure ccache for C++ and CUDA builds diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/runTest.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/runTest.cc index d4a760a71b..904cb78a72 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/runTest.cc +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/runTest.cc @@ -128,7 +128,7 @@ struct CUDATest : public CUDA_CPU_TestBase { ~DeviceReset() { - checkCuda( cudaDeviceReset() ); // this is needed by cuda-memcheck --leak-check full + checkGpu( gpuDeviceReset() ); // this is needed by cuda-memcheck --leak-check full } } deviceResetter; From e18c882d7612f24cf65aa1a63bf02628f55de2ab Mon Sep 17 00:00:00 2001 From: Jorgen T Date: Thu, 10 Aug 2023 17:43:28 +0200 Subject: [PATCH 16/96] [CODEGEN] Fixed error in runTest.cc and reverted changes in cudacpp_src.mk and cudacpp.mk --- .../madgraph/iolibs/template_files/gpu/cudacpp.mk | 2 -- .../madgraph/iolibs/template_files/gpu/cudacpp_src.mk | 2 +- .../madgraph/iolibs/template_files/gpu/runTest.cc | 2 +- 3 files changed, 2 insertions(+), 4 deletions(-) diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk index b6703137aa..14ea0f52d6 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk @@ -238,8 +238,6 @@ endif export GPUCC export GPUFLAGS -export HIPARCHFLAGS - #------------------------------------------------------------------------------- #=== Configure ccache for C++ and CUDA builds diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp_src.mk b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp_src.mk index d28c92ec13..7eda8524c0 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp_src.mk +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp_src.mk @@ -89,7 +89,7 @@ CXXFLAGS += $(OMPFLAGS) ifeq ($(findstring nvcc,$(GPUCC)),nvcc) GPUFLAGS += -Xcompiler -fPIC -c -x cu else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) - GPUFLAGS += $(HIPARCHFLAGS) -DHIP_PLATFORM=amd -fPIC -c + GPUFLAGS += -fPIC -c endif # Set the build flags appropriate to each AVX choice (example: "make AVX=none") diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/runTest.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/runTest.cc index 7f8d6ffd12..de327f2321 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/runTest.cc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/runTest.cc @@ -128,7 +128,7 @@ struct 
CUDATest : public CUDA_CPU_TestBase { ~DeviceReset() { - gpuDeviceReset(); // this is needed by cuda-memcheck --leak-check full + checkGpu( gpuDeviceReset() ); // this is needed by cuda-memcheck --leak-check full } } deviceResetter; From 597de7394d2ce496d43c43bb1fef7dd310582f6c Mon Sep 17 00:00:00 2001 From: Jorgen T Date: Thu, 13 Jul 2023 15:15:41 +0200 Subject: [PATCH 17/96] [CODEGEN] Added GPU abstraction to CODEGEN --- .../iolibs/template_files/gpu/check_sa.cc | 20 +++++++++-------- .../iolibs/template_files/gpu/cudacpp.mk | 22 +++++++++---------- .../iolibs/template_files/gpu/mgOnGpuConfig.h | 4 +++- .../template_files/gpu/mgOnGpuVectors.h | 2 ++ .../template_files/gpu/process_matrix.inc | 2 ++ .../PLUGIN/CUDACPP_SA_OUTPUT/output.py | 2 +- 6 files changed, 30 insertions(+), 22 deletions(-) diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/check_sa.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/check_sa.cc index 734c2f83f8..611db19653 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/check_sa.cc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/check_sa.cc @@ -103,8 +103,8 @@ main( int argc, char** argv ) CurandHost = 1, CurandDevice = 2 }; -#ifdef __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on CUDA GPU +#ifdef MGONGPUCPP_GPUIMPL + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU #elif not defined MGONGPU_HAS_NO_CURAND RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #else @@ -146,7 +146,7 @@ main( int argc, char** argv ) } else if( arg == "--curdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rndgen = RandomNumberMode::CurandDevice; #else throw std::runtime_error( "CurandDevice is not supported on CPUs or on HIP GPUs" ); @@ -395,7 +395,7 @@ main( int argc, char** argv ) const bool onDevice = false; prnk.reset( new CurandRandomNumberKernel( hstRndmom, onDevice ) ); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL else { const bool onDevice = true; @@ -730,7 +730,7 @@ main( int argc, char** argv ) rndgentxt = "CURAND HOST"; else if( rndgen == RandomNumberMode::CurandDevice ) rndgentxt = "CURAND DEVICE"; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rndgentxt += " (CUDA code)"; #elif defined __HIPCC__ rndgentxt += " (HIP code)"; @@ -740,8 +740,8 @@ main( int argc, char** argv ) // Workflow description summary std::string wrkflwtxt; - // -- CUDA or HIP or C++? -#ifdef __CUDACC__ + // -- CUDA or C++? +#ifdef MGONGPUCPP_GPUIMPL wrkflwtxt += "CUD:"; #elif defined __HIPCC__ wrkflwtxt += "HIP:"; @@ -759,7 +759,7 @@ main( int argc, char** argv ) wrkflwtxt += "???+"; // no path to this statement #endif // -- CUCOMPLEX or THRUST or STD complex numbers? 
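// (For reference, a minimal sketch of the GpuAbstraction.h mapping behind names like gpuError_t and gpuDeviceReset used in this series; the HIP column matches the GpuAbstraction.h hunk shown earlier, and the CUDA column is assumed to mirror it with the corresponding cuda* runtime calls.)
//   #ifdef __CUDACC__
//     #define gpuError_t cudaError_t
//     #define gpuPeekAtLastError cudaPeekAtLastError
//     #define gpuGetErrorString cudaGetErrorString
//     #define gpuDeviceReset cudaDeviceReset
//   #elif defined __HIPCC__
//     #define gpuError_t hipError_t
//     #define gpuPeekAtLastError hipPeekAtLastError
//     #define gpuGetErrorString hipGetErrorString
//     #define gpuDeviceReset hipDeviceReset
//   #endif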
-#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #if defined MGONGPU_CUCXTYPE_CUCOMPLEX wrkflwtxt += "CUX:"; #elif defined MGONGPU_CUCXTYPE_THRUST @@ -875,7 +875,7 @@ main( int argc, char** argv ) #endif // Dump all configuration parameters and all results std::cout << std::string( SEP79, '*' ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" #elif defined __HIPCC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" @@ -905,6 +905,7 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif +#ifdef MGONGPUCPP_GPUIMPL #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST @@ -1046,6 +1047,7 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " +#ifdef MGONGPUCPP_GPUIMPL #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk index 401868a61c..965c0e36bf 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk @@ -262,11 +262,11 @@ endif # PowerPC-specific CXX compiler flags (being reviewed) ifeq ($(UNAME_P),ppc64le) - CXXFLAGS+= -mcpu=power9 -mtune=power9 # gains ~2-3%% both for none and sse4 + CXXFLAGS+= -mcpu=power9 -mtune=power9 # gains ~2-3% both for none and sse4 # Throughput references without the extra flags below: none=1.41-1.42E6, sse4=2.15-2.19E6 ###CXXFLAGS+= -DNO_WARN_X86_INTRINSICS # no change ###CXXFLAGS+= -fpeel-loops # no change - ###CXXFLAGS+= -funroll-loops # gains ~1%% for none, loses ~1%% for sse4 + ###CXXFLAGS+= -funroll-loops # gains ~1% for none, loses ~1% for sse4 ###CXXFLAGS+= -ftree-vectorize # no change ###CXXFLAGS+= -flto # would increase to none=4.08-4.12E6, sse4=4.99-5.03E6! else @@ -547,7 +547,7 @@ $(BUILDDIR)/%%.o : %%.cu *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CUBUILDRULEFLAGS) $< -o $@ -$(BUILDDIR)/%%_cu.o : %%.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) +$(BUILDDIR)/%_cu.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CCBUILDRULEFLAGS) $< -o $@ endif @@ -555,7 +555,7 @@ endif # Generic target and build rules: objects from C++ compilation # (NB do not include CUINC here! add it only for NVTX or curand #679) -$(BUILDDIR)/%%.o : %%.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) +$(BUILDDIR)/%.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi $(CXX) $(CPPFLAGS) $(CXXFLAGS) -fPIC -c $< -o $@ @@ -644,7 +644,7 @@ endif #------------------------------------------------------------------------------- # Target (and build rules): Fortran include files -###$(INCDIR)/%%.inc : ../%%.inc +###$(INCDIR)/%.inc : ../%.inc ### @if [ ! 
-d $(INCDIR) ]; then echo "mkdir -p $(INCDIR)"; mkdir -p $(INCDIR); fi ### \cp $< $@ @@ -660,7 +660,7 @@ ifneq ($(shell $(CXX) --version | grep ^Intel),) $(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') $(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 -$(cu_main): LIBFLAGS += -L$(patsubst %%bin/nvc++,%%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc +$(cu_main): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(cu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(cu_main): $(BUILDDIR)/gcheck_sa.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o @@ -670,12 +670,12 @@ endif #------------------------------------------------------------------------------- # Generic target and build rules: objects from Fortran compilation -$(BUILDDIR)/%%.o : %%.f *.inc +$(BUILDDIR)/%.o : %.f *.inc @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi $(FC) -I. -c $< -o $@ # Generic target and build rules: objects from Fortran compilation -###$(BUILDDIR)/%%.o : %%.f *.inc +###$(BUILDDIR)/%.o : %.f *.inc ### @if [ ! -d $(INCDIR) ]; then echo "mkdir -p $(INCDIR)"; mkdir -p $(INCDIR); fi ### @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi ### $(FC) -I. -I$(INCDIR) -c $< -o $@ @@ -744,7 +744,7 @@ ifneq ($(shell $(CXX) --version | grep ^Intel),) $(testmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') $(testmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 -$(testmain): LIBFLAGS += -L$(patsubst %%bin/nvc++,%%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc +$(testmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(testmain): $(BUILDDIR)/runTest_cu.o $(testmain): cu_objects_exe += $(BUILDDIR)/runTest_cu.o @@ -928,14 +928,14 @@ cmpFcheck: all.$(TAG) @echo @echo "$(BUILDDIR)/check.exe --common -p 2 32 2" @echo "$(BUILDDIR)/fcheck.exe 2 32 2" - @me1=$(shell $(RUNTIME) $(BUILDDIR)/check.exe --common -p 2 32 2 | grep MeanMatrix | awk '{print $$4}'); me2=$(shell $(RUNTIME) $(BUILDDIR)/fcheck.exe 2 32 2 | grep Average | awk '{print $$4}'); echo "Avg ME (C++/C++) = $${me1}"; echo "Avg ME (F77/C++) = $${me2}"; if [ "$${me2}" == "NaN" ]; then echo "ERROR! Fortran calculation (F77/C++) returned NaN"; elif [ "$${me2}" == "" ]; then echo "ERROR! Fortran calculation (F77/C++) crashed"; else python3 -c "me1=$${me1}; me2=$${me2}; reldif=abs((me2-me1)/me1); print('Relative difference =', reldif); ok = reldif <= 2E-4; print ( '%%s (relative difference %%s 2E-4)' %% ( ('OK','<=') if ok else ('ERROR','>') ) ); import sys; sys.exit(0 if ok else 1)"; fi + @me1=$(shell $(RUNTIME) $(BUILDDIR)/check.exe --common -p 2 32 2 | grep MeanMatrix | awk '{print $$4}'); me2=$(shell $(RUNTIME) $(BUILDDIR)/fcheck.exe 2 32 2 | grep Average | awk '{print $$4}'); echo "Avg ME (C++/C++) = $${me1}"; echo "Avg ME (F77/C++) = $${me2}"; if [ "$${me2}" == "NaN" ]; then echo "ERROR! Fortran calculation (F77/C++) returned NaN"; elif [ "$${me2}" == "" ]; then echo "ERROR! 
Fortran calculation (F77/C++) crashed"; else python3 -c "me1=$${me1}; me2=$${me2}; reldif=abs((me2-me1)/me1); print('Relative difference =', reldif); ok = reldif <= 2E-4; print ( '%s (relative difference %s 2E-4)' % ( ('OK','<=') if ok else ('ERROR','>') ) ); import sys; sys.exit(0 if ok else 1)"; fi # Target: cmpFGcheck (compare ME results from the CUDA and Fortran with CUDA MEs standalone executables, with a small number of events) cmpFGcheck: all.$(TAG) @echo @echo "$(BUILDDIR)/gcheck.exe --common -p 2 32 2" @echo "$(BUILDDIR)/fgcheck.exe 2 32 2" - @me1=$(shell $(RUNTIME) $(BUILDDIR)/gcheck.exe --common -p 2 32 2 | grep MeanMatrix | awk '{print $$4}'); me2=$(shell $(RUNTIME) $(BUILDDIR)/fgcheck.exe 2 32 2 | grep Average | awk '{print $$4}'); echo "Avg ME (C++/CUDA) = $${me1}"; echo "Avg ME (F77/CUDA) = $${me2}"; if [ "$${me2}" == "NaN" ]; then echo "ERROR! Fortran calculation (F77/CUDA) crashed"; elif [ "$${me2}" == "" ]; then echo "ERROR! Fortran calculation (F77/CUDA) crashed"; else python3 -c "me1=$${me1}; me2=$${me2}; reldif=abs((me2-me1)/me1); print('Relative difference =', reldif); ok = reldif <= 2E-4; print ( '%%s (relative difference %%s 2E-4)' %% ( ('OK','<=') if ok else ('ERROR','>') ) ); import sys; sys.exit(0 if ok else 1)"; fi + @me1=$(shell $(RUNTIME) $(BUILDDIR)/gcheck.exe --common -p 2 32 2 | grep MeanMatrix | awk '{print $$4}'); me2=$(shell $(RUNTIME) $(BUILDDIR)/fgcheck.exe 2 32 2 | grep Average | awk '{print $$4}'); echo "Avg ME (C++/CUDA) = $${me1}"; echo "Avg ME (F77/CUDA) = $${me2}"; if [ "$${me2}" == "NaN" ]; then echo "ERROR! Fortran calculation (F77/CUDA) crashed"; elif [ "$${me2}" == "" ]; then echo "ERROR! Fortran calculation (F77/CUDA) crashed"; else python3 -c "me1=$${me1}; me2=$${me2}; reldif=abs((me2-me1)/me1); print('Relative difference =', reldif); ok = reldif <= 2E-4; print ( '%s (relative difference %s 2E-4)' % ( ('OK','<=') if ok else ('ERROR','>') ) ); import sys; sys.exit(0 if ok else 1)"; fi # Target: memcheck (run the CUDA standalone executable gcheck.exe with a small number of events through cuda-memcheck) memcheck: all.$(TAG) diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuConfig.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuConfig.h index 8da9429de8..3fa1fff9a3 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuConfig.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuConfig.h @@ -6,6 +6,8 @@ #ifndef MGONGPUCONFIG_H #define MGONGPUCONFIG_H 1 +#include "GpuRuntime.h" // Includes the GPU abstraction + // HARDCODED AT CODE GENERATION TIME: DO NOT MODIFY (#473) // There are two different code bases for standalone_cudacpp (without multichannel) and madevent+cudacpp (with multichannel) %(mgongpu_supports_multichannel)s @@ -27,7 +29,7 @@ // For CUDA, by default, it is supported // For HIP, by default, it is not supported // For C++, by default, do not inline, but allow this macro to be set from outside with e.g. 
-DMGONGPU_HAS_NO_CURAND -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_CUDACC #undef MGONGPU_HAS_NO_CURAND #elif defined __HIPCC__ #define MGONGPU_HAS_NO_CURAND 1 diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuVectors.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuVectors.h index cdae04326b..dd8b83752d 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuVectors.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuVectors.h @@ -9,6 +9,8 @@ #include "mgOnGpuCxtypes.h" #include "mgOnGpuFptypes.h" +#include "GpuAbstraction.h" + #include //========================================================================== diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_matrix.inc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_matrix.inc index 960f029d8d..84e324a679 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_matrix.inc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_matrix.inc @@ -7,6 +7,8 @@ ! Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. !========================================================================== +#include "GpuAbstraction.h" + // *** COLOR CHOICE BELOW *** // Store the leading color flows for choice of color if( jamp2_sv ) // disable color choice if nullptr diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/output.py b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/output.py index ed7b1985dd..e08746a1b6 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/output.py +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/output.py @@ -88,7 +88,7 @@ class PLUGIN_ProcessExporter(PLUGIN_export_cpp.ProcessExporterGPU): 'CMake': [s+'CMake/Compilers.txt', s+'CMake/Platforms.txt', s+'CMake/Macros.txt'], 'src': [s+'gpu/rambo.h', s+'read_slha.h', s+'read_slha.cc', s+'gpu/mgOnGpuFptypes.h', s+'gpu/mgOnGpuCxtypes.h', s+'gpu/mgOnGpuVectors.h', - s+'CMake/src/CMakeLists.txt' ], + s+'CMake/src/CMakeLists.txt', s+'gpu/GpuRuntime.h', s+'gpu/GpuAbstraction.h'], 'SubProcesses': [s+'gpu/nvtx.h', s+'gpu/timer.h', s+'gpu/timermap.h', s+'gpu/ompnumthreads.h', s+'gpu/GpuRuntime.h', s+'gpu/GpuAbstraction.h', s+'gpu/MemoryAccessHelpers.h', s+'gpu/MemoryAccessVectors.h', From 1a6496ab3b9bdb5bbc03b3790b2ed06f23749e30 Mon Sep 17 00:00:00 2001 From: Jorgen T Date: Fri, 29 Sep 2023 16:25:12 +0200 Subject: [PATCH 18/96] Updated first name in Author list --- .../CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/AUTHORS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/AUTHORS b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/AUTHORS index 0aeb2c8a87..71519d1ad8 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/AUTHORS +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/AUTHORS @@ -10,7 +10,7 @@ generates includes the following authors: Stephan Hageboeck (CERN) Olivier Mattelaer (Universite Catholique de Louvain, original author) Stefan Roiser (CERN, original author) - Joergen Teig (CERN) + Jorgen Teig (CERN) Andrea Valassi (CERN, original author) Zenny Wettersten (CERN) From 
d2e2f47a303a9b1c25805d96d04beb4f07b57575 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Wed, 24 Jan 2024 10:16:55 +0100 Subject: [PATCH 19/96] [jt774] (before merging upstream/master) improve logic of "if CUDA else HIP else neither" in CODEGEN cudacpp.mk --- .../iolibs/template_files/gpu/cudacpp.mk | 228 +++++++++--------- 1 file changed, 108 insertions(+), 120 deletions(-) diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk index 965c0e36bf..2864673ead 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk @@ -30,7 +30,7 @@ UNAME_P := $(shell uname -p) include ../../Source/make_opts #------------------------------------------------------------------------------- -#=== Configure common compiler flags for C++ and CUDA +#=== Configure common compiler flags for C++ and CUDA/HIP INCFLAGS = -I. OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here @@ -104,69 +104,73 @@ endif #------------------------------------------------------------------------------- -CUDA_COMPILER_PATH := $(shell compiler="`which nvcc 2>/dev/null`" && while [ -L "$$compiler" ]; do compiler=`readlink "$$compiler"`; done && echo "$$compiler") -HIP_COMPILER_PATH := $(shell compiler="`which hipcc 2>/dev/null`" && while [ -L "$$compiler" ]; do compiler=`readlink "$$compiler"`; done && echo "$$compiler") - -ifeq ($(findstring nvcc,$(CUDA_COMPILER_PATH)),nvcc) - #=== Configure the CUDA compiler - - # If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) - # This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below - ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside - $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") - override CUDA_HOME=disabled - endif - - # If CUDA_HOME is not set, try to set it from the location of NVCC - ifndef CUDA_HOME - CUDA_HOME = $(patsubst %%bin/nvcc,%%,$(shell which nvcc 2>/dev/null)) - $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") - endif - - # Set GPUCC as $(CUDA_HOME)/bin/nvcc if it exists - ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) - GPUCC = $(CUDA_HOME)/bin/nvcc - USE_NVTX ?=-DUSE_NVTX - # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html - # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ - # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). - # Embed device code for 70, and PTX for 70+. - # Export MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to use another value or list of values (see #533). - # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). 
- MADGRAPH_CUDA_ARCHITECTURE ?= 70 - ###CUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 - ###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 - comma:=, - CUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) - CUINC = -I$(CUDA_HOME)/include/ - CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here! - CUOPTFLAGS = -lineinfo - GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math - ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow - ###GPUCC_VERSION = $(shell $(GPUCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) - GPUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h - # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) - - CUBUILDRULEFLAGS = -Xcompiler -fPIC -c - CCBUILDRULEFLAGS = -Xcompiler -fPIC -c -x cu - - CUDATESTFLAGS = -lcuda - - else ifneq ($(origin REQUIRE_CUDA),undefined) - # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) - $(error No cuda installation found (set CUDA_HOME or make GPUCC visible in PATH)) - else - # No cuda. Switch cuda compilation off and go to common random numbers in C++ - $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda) - override GPUCC= - override USE_NVTX= - override CUINC= - override CURANDLIBFLAGS= - endif +#=== Configure the GPU compiler (CUDA or HIP) + +# FIXME! (AV 24.01.2024) +# In the current implementation (without separate builds for C++ and CUDA/HIP), we first check for cudacc and hipcc in CUDA_HOME and HIP_HOME. +# If CUDA_HOME or HIP_HOME are not set, try to determine them from the path to cudacc and hipcc. +# While convoluted, this is currently necessary to allow disabling CUDA/HIP builds by setting CUDA_HOME or HIP_HOME to invalid paths. +# This will (probably?) be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775). 
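# (A rough sketch of the selection cascade implemented below, using only what this makefile itself encodes:
#   nvcc found via CUDA_HOME            -> GPUCC=$(CUDA_HOME)/bin/nvcc with CUDA flags
#   no nvcc, hipcc found via HIP_HOME   -> GPUCC=$(HIP_HOME)/bin/hipcc with HIP flags
#   neither found                       -> GPUCC emptied, C++-only build with common random numbers
#   REQUIRE_CUDA or REQUIRE_HIP set     -> hard $(error) instead of the silent fallback)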
+ +# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA and HIP builds (issue #505) +# This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below +ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside + $(warning CUDA and HIP builds are not supported for multi-word CXX "$(CXX)") + override CUDA_HOME=disabled + override HIP_HOME=disabled +endif + +# If CUDA_HOME is not set, try to set it from the path to nvcc +ifndef CUDA_HOME + CUDA_HOME = $(patsubst %%bin/nvcc,%%,$(shell which nvcc 2>/dev/null)) + $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") +endif + +# If HIP_HOME is not set, try to set it from the path to hipcc +ifndef HIP_HOME + HIP_HOME = $(patsubst %%bin/hipcc,%%,$(HIP_COMPILER_PATH)) + $(warning HIP_HOME was not set: using "$(HIP_HOME)") +endif + +# FIXME! (AV 24.01.2024) +# In the current implementation (without separate builds for C++ and CUDA/HIP), +# builds are performed for HIP only if CUDA is not found in the path. +# If both CUDA and HIP are installed, HIP builds can be triggered by unsetting CUDA_HOME. +# This will be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775). + +#--- Option 1: CUDA exists -> use CUDA + +# Set GPUCC as $(CUDA_HOME)/bin/nvcc if it exists +ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) + + GPUCC = $(CUDA_HOME)/bin/nvcc + USE_NVTX ?=-DUSE_NVTX + # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html + # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ + # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). + # Embed device code for 70, and PTX for 70+. + # Export MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to use another value or list of values (see #533). + # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). + MADGRAPH_CUDA_ARCHITECTURE ?= 70 + ###CUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 + ###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 + comma:=, + CUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) + CUINC = -I$(CUDA_HOME)/include/ + CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here! 
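# (Usage sketch for the architecture knob above, assuming the comma-separated list semantics of CUARCHFLAGS: e.g. "MADGRAPH_CUDA_ARCHITECTURE=70,80 make" embeds device code and PTX for both V100 and A100, see #533.)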
+ CUOPTFLAGS = -lineinfo + GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + ###GPUCC_VERSION = $(shell $(GPUCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) + GPUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h + # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + CUBUILDRULEFLAGS = -Xcompiler -fPIC -c + CCBUILDRULEFLAGS = -Xcompiler -fPIC -c -x cu + CUDATESTFLAGS = -lcuda # Set the host C++ compiler for GPUCC via "-ccbin " # (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) @@ -177,71 +181,55 @@ ifeq ($(findstring nvcc,$(CUDA_COMPILER_PATH)),nvcc) GPUFLAGS += -allow-unsupported-compiler endif -else ifeq ($(findstring hipcc,$(HIP_COMPILER_PATH)),hipcc) - #=== Configure the HIP compiler +else ifneq ($(origin REQUIRE_CUDA),undefined) - # If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA/HIP builds (issue #505) - # This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below - ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside - $(warning HIP builds are not supported for multi-word CXX "$(CXX)") - override HIP_HOME=disabled - endif + # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. 
for CI tests on GPU #443) + $(error No cuda installation found (set CUDA_HOME or make GPUCC visible in PATH)) - # If HIP_HOME is not set, try to set it from the location of GPUCC - ifndef HIP_HOME - HIP_HOME = $(patsubst %%bin/hipcc,%%,$(HIP_COMPILER_PATH)) - $(warning HIP_HOME was not set: using "$(HIP_HOME)") - endif +#--- Option 2: CUDA does not exist, HIP exists -> use HIP - # Set GPUCC as $(HIP_HOME)/bin/hipcc if it exists - ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),) - GPUCC = $(HIP_HOME)/bin/hipcc - - # Should maybe find something equivelant to this in HIP - #USE_NVTX ?=-DUSE_NVTX - - HIPARCHFLAGS = -target x86_64-linux-gnu --offload-arch=gfx90a - HIPINC = -I$(HIP_HOME)/include/ - - # -DHIP_FAST_MATH equivelant to -use_fast_math in HIP - # (But only for single precision line 208: https://rocm-developer-tools.github.io/HIP/hcc__detail_2math__functions_8h_source.html) - GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -DHIP_FAST_MATH -DHIP_PLATFORM=amd -fPIC - ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow - GPUFLAGS += -std=c++17 - # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) - - CUBUILDRULEFLAGS = -fPIC -c - CCBUILDRULEFLAGS = -fPIC -c - - else ifneq ($(origin REQUIRE_HIP),undefined) - # If REQUIRE_HIP is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) - $(error No hip installation found (set HIP_HOME or make GPUCC visible in PATH)) - else - # No hip. Switch hip compilation off and go to common random numbers in C++ - $(warning HIP_HOME is not set or is invalid: export HIP_HOME to compile with hip) - override GPUCC= - override USE_NVTX= - override CUINC= - override CURANDLIBFLAGS= - endif +# Set GPUCC as $(HIP_HOME)/bin/hipcc if it exists +else ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),) - # Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) - ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) - GPUFLAGS += -allow-unsupported-compiler - endif + GPUCC = $(HIP_HOME)/bin/hipcc + #USE_NVTX ?=-DUSE_NVTX # should maybe find something equivalent to this in HIP? + HIPARCHFLAGS = -target x86_64-linux-gnu --offload-arch=gfx90a + HIPINC = -I$(HIP_HOME)/include/ + # Note: -DHIP_FAST_MATH is equivalent to -use_fast_math in HIP + # (but only for single precision line 208: https://rocm-developer-tools.github.io/HIP/hcc__detail_2math__functions_8h_source.html) + GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -DHIP_FAST_MATH -DHIP_PLATFORM=amd -fPIC + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + GPUFLAGS += -std=c++17 + ###GPUFLAGS+= --maxrregcount 255 # (AV: is this option valid on HIP and meaningful on AMD GPUs?) + CUBUILDRULEFLAGS = -fPIC -c + CCBUILDRULEFLAGS = -fPIC -c + +else ifneq ($(origin REQUIRE_HIP),undefined) + + # If REQUIRE_HIP is set but no HIP is found, stop here (e.g. 
for CI tests on GPU #443) + $(error No hip installation found (set HIP_HOME or make GPUCC visible in PATH)) + +#--- Option 3: CUDA does not exist, HIP does not exist -> switch off both CUDA and HIP + +else + + # No cudacc and no hipcc: switch CUDA and HIP compilation off and go to common random numbers in C++ + $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda) + $(warning HIP_HOME is not set or is invalid: export HIP_HOME to compile with hip) + override GPUCC= + override USE_NVTX= + override CUINC= + override CURANDLIBFLAGS= endif +# Export GPUCC (so that it can also be used in cudacpp_src.mk?) export GPUCC export GPUFLAGS #------------------------------------------------------------------------------- -#=== Configure ccache for C++ and CUDA builds +#=== Configure ccache for C++ and CUDA/HIP builds # Enable ccache if USECCACHE=1 ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) @@ -258,7 +246,7 @@ endif #------------------------------------------------------------------------------- -#=== Configure PowerPC-specific compiler flags for C++ and CUDA +#=== Configure PowerPC-specific compiler flags for C++ and CUDA/HIP # PowerPC-specific CXX compiler flags (being reviewed) ifeq ($(UNAME_P),ppc64le) @@ -274,7 +262,7 @@ else ######CXXFLAGS+= -fno-semantic-interposition # no benefit (neither alone, nor combined with -flto) endif -# PowerPC-specific CUDA compiler flags (to be reviewed!) +# PowerPC-specific CUDA/HIP compiler flags (to be reviewed!) ifeq ($(UNAME_P),ppc64le) GPUFLAGS+= -Xcompiler -mno-float128 endif @@ -360,7 +348,7 @@ export OMPFLAGS #------------------------------------------------------------------------------- -#=== Set the CUDA/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN +#=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN # Set the build flags appropriate to OMPFLAGS $(info OMPFLAGS=$(OMPFLAGS)) From 8e9120cc2ac6f7bc12bf5a8b9a3caec6e0311f93 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Wed, 24 Jan 2024 10:18:06 +0100 Subject: [PATCH 20/96] [jt774] (before merging upstream/master) remove CODEGEN #cudacpp.mk# --- .../iolibs/template_files/gpu/#cudacpp.mk# | 867 ------------------ 1 file changed, 867 deletions(-) delete mode 100644 epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/#cudacpp.mk# diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/#cudacpp.mk# b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/#cudacpp.mk# deleted file mode 100644 index e238257ab6..0000000000 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/#cudacpp.mk# +++ /dev/null @@ -1,867 +0,0 @@ -# Copyright (C) 2020-2023 CERN and UCLouvain. -# Licensed under the GNU Lesser General Public License (version 3 or later). -# Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. - -#=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) -#=== NB: different names (e.g.
cudacpp.mk and cudacpp_src.mk) are used in the Subprocess and src directories - -CUDACPP_MAKEFILE = $(word $(words $(MAKEFILE_LIST)),$(MAKEFILE_LIST)) -CUDACPP_SRC_MAKEFILE = cudacpp_src.mk - -#------------------------------------------------------------------------------- - -#=== Use bash in the Makefile (https://www.gnu.org/software/make/manual/html_node/Choosing-the-Shell.html) - -SHELL := /bin/bash - -#------------------------------------------------------------------------------- - -#=== Detect O/S and architecture (assuming uname is available, https://en.wikipedia.org/wiki/Uname) - -# Detect O/S kernel (Linux, Darwin...) -UNAME_S := $(shell uname -s) -###$(info UNAME_S='$(UNAME_S)') - -# Detect architecture (x86_64, ppc64le...) -UNAME_P := $(shell uname -p) -###$(info UNAME_P='$(UNAME_P)') - -include ../../Source/make_opts -#------------------------------------------------------------------------------- - -#=== Configure common compiler flags for C++ and CUDA - -INCFLAGS = -I. -OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here - -# Dependency on src directory -MG5AMC_COMMONLIB = mg5amc_common -LIBFLAGS = -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -INCFLAGS += -I../../src - -# Compiler-specific googletest build directory (#125 and #738) -ifneq ($(shell $(CXX) --version | grep '^Intel(R) oneAPI DPC++/C++ Compiler'),) -override CXXNAME = icpx$(shell $(CXX) --version | head -1 | cut -d' ' -f5) -else ifneq ($(shell $(CXX) --version | egrep '^clang'),) -override CXXNAME = clang$(shell $(CXX) --version | head -1 | cut -d' ' -f3) -else ifneq ($(shell $(CXX) --version | grep '^g++ (GCC)'),) -override CXXNAME = gcc$(shell $(CXX) --version | head -1 | cut -d' ' -f3) -else -override CXXNAME = unknown -endif -###$(info CXXNAME=$(CXXNAME)) -override CXXNAMESUFFIX = _$(CXXNAME) -export CXXNAMESUFFIX - -# Dependency on test directory -# Within the madgraph4gpu git repo: by default use a common gtest installation in /test (optionally use an external or local gtest) -# Outside the madgraph4gpu git repo: by default do not build the tests (optionally use an external or local gtest) -###GTEST_ROOT = /cvmfs/sft.cern.ch/lcg/releases/gtest/1.11.0-21e8c/x86_64-centos8-gcc11-opt/# example of an external gtest installation -###LOCALGTEST = yes# comment this out (or use make LOCALGTEST=yes) to build tests using a local gtest installation -TESTDIRCOMMON = ../../../../../test -TESTDIRLOCAL = ../../test -ifneq ($(wildcard $(GTEST_ROOT)),) -TESTDIR = -else ifneq ($(LOCALGTEST),) -TESTDIR=$(TESTDIRLOCAL) -GTEST_ROOT = $(TESTDIR)/googletest/install$(CXXNAMESUFFIX) -else ifneq ($(wildcard ../../../../../epochX/cudacpp/CODEGEN),) -TESTDIR = $(TESTDIRCOMMON) -GTEST_ROOT = $(TESTDIR)/googletest/install$(CXXNAMESUFFIX) -else -TESTDIR = -endif -ifneq ($(GTEST_ROOT),) -GTESTLIBDIR = $(GTEST_ROOT)/lib64/ -GTESTLIBS = $(GTESTLIBDIR)/libgtest.a $(GTESTLIBDIR)/libgtest_main.a -GTESTINC = -I$(GTEST_ROOT)/include -else -GTESTLIBDIR = -GTESTLIBS = -GTESTINC = -endif -###$(info GTEST_ROOT = $(GTEST_ROOT)) -###$(info LOCALGTEST = $(LOCALGTEST)) -###$(info TESTDIR = $(TESTDIR)) - -#------------------------------------------------------------------------------- - -#=== Configure the C++ compiler - -CXXFLAGS = $(OPTFLAGS) -std=c++17 $(INCFLAGS) -Wall -Wshadow -Wextra -ifeq ($(shell $(CXX) --version | grep ^nvc++),) -CXXFLAGS += -ffast-math # see issue #117 -endif -###CXXFLAGS+= -Ofast # performance is not different from --fast-math -###CXXFLAGS+= -g # FOR DEBUGGING ONLY - -# Optionally add debug 
flags to display the full list of flags (eg on Darwin) -###CXXFLAGS+= -v - -# Note: AR, CXX and FC are implicitly defined if not set externally -# See https://www.gnu.org/software/make/manual/html_node/Implicit-Variables.html - -#------------------------------------------------------------------------------- - -#=== Configure the CUDA compiler - -# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) -# This is because it is impossible to pass this to "CUFLAGS += -ccbin " below -ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside - $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") - override CUDA_HOME=disabled -endif - -# If CUDA_HOME is not set, try to set it from the location of nvcc -ifndef CUDA_HOME - CUDA_HOME = $(patsubst %%bin/nvcc,%%,$(shell which nvcc 2>/dev/null)) - $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") -endif - -# Set NVCC as $(CUDA_HOME)/bin/nvcc if it exists -ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) - NVCC = $(CUDA_HOME)/bin/nvcc - USE_NVTX ?=-DUSE_NVTX - # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html - # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ - # Default: use compute capability 70 for V100 (CERN lxbatch, CERN itscrd, Juwels Cluster). - # Embed device code for 70, and PTX for 70+. - # Export MADGRAPH_CUDA_ARCHITECTURE (comma-separated list) to use another value or list of values (see #533). - # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). - MADGRAPH_CUDA_ARCHITECTURE ?= 70 - ###CUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 - ###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 - comma:=, - CUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) - CUINC = -I$(CUDA_HOME)/include/ - CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here! - CUOPTFLAGS = -lineinfo - CUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math - ###CUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow - ###NVCC_VERSION = $(shell $(NVCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) - CUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h - # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) -else ifneq ($(origin REQUIRE_CUDA),undefined) - # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. 
for CI tests on GPU #443) - $(error No cuda installation found (set CUDA_HOME or make nvcc visible in PATH)) -else - # No cuda. Switch cuda compilation off and go to common random numbers in C++ - $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda) - override NVCC= - override USE_NVTX= - override CUINC= - override CURANDLIBFLAGS= -endif -export NVCC -export CUFLAGS - -# Set the host C++ compiler for nvcc via "-ccbin " -# (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) -CUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) - -# Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) -ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) -CUFLAGS += -allow-unsupported-compiler -endif - -#------------------------------------------------------------------------------- - -#=== Configure ccache for C++ and CUDA builds - -# Enable ccache if USECCACHE=1 -ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) - override CXX:=ccache $(CXX) -endif -#ifeq ($(USECCACHE)$(shell echo $(AR) | grep ccache),1) -# override AR:=ccache $(AR) -#endif -ifneq ($(NVCC),) - ifeq ($(USECCACHE)$(shell echo $(NVCC) | grep ccache),1) - override NVCC:=ccache $(NVCC) - endif -endif - -#------------------------------------------------------------------------------- - -#=== Configure PowerPC-specific compiler flags for C++ and CUDA - -# PowerPC-specific CXX compiler flags (being reviewed) -ifeq ($(UNAME_P),ppc64le) - CXXFLAGS+= -mcpu=power9 -mtune=power9 # gains ~2-3%% both for none and sse4 - # Throughput references without the extra flags below: none=1.41-1.42E6, sse4=2.15-2.19E6 - ###CXXFLAGS+= -DNO_WARN_X86_INTRINSICS # no change - ###CXXFLAGS+= -fpeel-loops # no change - ###CXXFLAGS+= -funroll-loops # gains ~1%% for none, loses ~1%% for sse4 - ###CXXFLAGS+= -ftree-vectorize # no change - ###CXXFLAGS+= -flto # would increase to none=4.08-4.12E6, sse4=4.99-5.03E6! -else - ###CXXFLAGS+= -flto # also on Intel this would increase throughputs by a factor 2 to 4... - ######CXXFLAGS+= -fno-semantic-interposition # no benefit (neither alone, nor combined with -flto) -endif - -# PowerPC-specific CUDA compiler flags (to be reviewed!) 
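-# (NB, added here only for illustration: nvcc does not parse host-compiler options directly, so host-only flags such as -mno-float128 below are forwarded one at a time via "-Xcompiler <flag>", as in the "###CUFLAGS += -Xcompiler -Wall" examples above)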
-ifeq ($(UNAME_P),ppc64le) - CUFLAGS+= -Xcompiler -mno-float128 -endif - -#------------------------------------------------------------------------------- - -#=== Configure defaults and check if user-defined choices exist for OMPFLAGS, AVX, FPTYPE, HELINL, HRDCOD, RNDGEN - -# Set the default OMPFLAGS choice -ifneq ($(shell $(CXX) --version | egrep '^Intel'),) -override OMPFLAGS = -fopenmp -###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without nvcc but not ok with nvcc before #578) -else ifneq ($(shell $(CXX) --version | egrep '^(clang)'),) -override OMPFLAGS = -fopenmp -###override OMPFLAGS = # disable OpenMP MT on clang (was not ok without or with nvcc before #578) -else ifneq ($(shell $(CXX) --version | egrep '^(Apple clang)'),) -override OMPFLAGS = -fopenmp # disable OpenMP MT on Apple clang (builds fail in the CI #578) -else -override OMPFLAGS = -fopenmp -###override OMPFLAGS = # disable OpenMP MT (default before #575) -endif - -# Set the default AVX (vectorization) choice -ifeq ($(AVX),) - ifeq ($(UNAME_P),ppc64le) - ###override AVX = none - override AVX = sse4 - else ifeq ($(UNAME_P),arm) - ###override AVX = none - override AVX = sse4 - else ifeq ($(wildcard /proc/cpuinfo),) - override AVX = none - $(warning Using AVX='$(AVX)' because host SIMD features cannot be read from /proc/cpuinfo) - else ifeq ($(shell grep -m1 -c avx512vl /proc/cpuinfo)$(shell $(CXX) --version | grep ^clang),1) - override AVX = 512y - ###$(info Using AVX='$(AVX)' as no user input exists) - else - override AVX = avx2 - ifneq ($(shell grep -m1 -c avx512vl /proc/cpuinfo),1) - $(warning Using AVX='$(AVX)' because host does not support avx512vl) - else - $(warning Using AVX='$(AVX)' because this is faster than avx512vl for clang) - endif - endif -else - ###$(info Using AVX='$(AVX)' according to user input) -endif - -# Set the default FPTYPE (floating point type) choice -ifeq ($(FPTYPE),) - override FPTYPE = d -endif - -# Set the default HELINL (inline helicities?) choice -ifeq ($(HELINL),) - override HELINL = 0 -endif - -# Set the default HRDCOD (hardcode cIPD physics parameters?) 
choice -ifeq ($(HRDCOD),) - override HRDCOD = 0 -endif - -# Set the default RNDGEN (random number generator) choice -ifeq ($(RNDGEN),) - ifeq ($(NVCC),) - override RNDGEN = hasNoCurand - else ifeq ($(RNDGEN),) - override RNDGEN = hasCurand - endif -endif - -# Export AVX, FPTYPE, HELINL, HRDCOD, RNDGEN, OMPFLAGS so that it is not necessary to pass them to the src Makefile too -export AVX -export FPTYPE -export HELINL -export HRDCOD -export RNDGEN -export OMPFLAGS - -#------------------------------------------------------------------------------- - -#=== Set the CUDA/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN - -# Set the build flags appropriate to OMPFLAGS -$(info OMPFLAGS=$(OMPFLAGS)) -CXXFLAGS += $(OMPFLAGS) - -# Set the build flags appropriate to each AVX choice (example: "make AVX=none") -# [NB MGONGPU_PVW512 is needed because "-mprefer-vector-width=256" is not exposed in a macro] -# [See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=96476] -$(info AVX=$(AVX)) -ifeq ($(UNAME_P),ppc64le) - ifeq ($(AVX),sse4) - override AVXFLAGS = -D__SSE4_2__ # Power9 VSX with 128 width (VSR registers) - else ifneq ($(AVX),none) - $(error Unknown AVX='$(AVX)': only 'none' and 'sse4' are supported on PowerPC for the moment) - endif -else ifeq ($(UNAME_P),arm) - ifeq ($(AVX),sse4) - override AVXFLAGS = -D__SSE4_2__ # ARM NEON with 128 width (Q/quadword registers) - else ifneq ($(AVX),none) - $(error Unknown AVX='$(AVX)': only 'none' and 'sse4' are supported on ARM for the moment) - endif -else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 - ifeq ($(AVX),none) - override AVXFLAGS = -mno-sse3 # no SIMD - else ifeq ($(AVX),sse4) - override AVXFLAGS = -mno-avx # SSE4.2 with 128 width (xmm registers) - else ifeq ($(AVX),avx2) - override AVXFLAGS = -march=haswell # AVX2 with 256 width (ymm registers) [DEFAULT for clang] - else ifeq ($(AVX),512y) - override AVXFLAGS = -march=skylake -mprefer-vector-width=256 # AVX512 with 256 width (ymm registers) [DEFAULT for gcc] - else ifeq ($(AVX),512z) - override AVXFLAGS = -march=skylake -DMGONGPU_PVW512 # AVX512 with 512 width (zmm registers) - else - $(error Unknown AVX='$(AVX)': only 'none', 'sse4', 'avx2', '512y' and '512z' are supported) - endif -else - ifeq ($(AVX),none) - override AVXFLAGS = -march=x86-64 # no SIMD (see #588) - else ifeq ($(AVX),sse4) - override AVXFLAGS = -march=nehalem # SSE4.2 with 128 width (xmm registers) - else ifeq ($(AVX),avx2) - override AVXFLAGS = -march=haswell # AVX2 with 256 width (ymm registers) [DEFAULT for clang] - else ifeq ($(AVX),512y) - override AVXFLAGS = -march=skylake-avx512 -mprefer-vector-width=256 # AVX512 with 256 width (ymm registers) [DEFAULT for gcc] - else ifeq ($(AVX),512z) - override AVXFLAGS = -march=skylake-avx512 -DMGONGPU_PVW512 # AVX512 with 512 width (zmm registers) - else - $(error Unknown AVX='$(AVX)': only 'none', 'sse4', 'avx2', '512y' and '512z' are supported) - endif -endif -# For the moment, use AVXFLAGS everywhere: eventually, use them only in encapsulated implementations? 
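-# (Worked example, for illustration: on x86-64 with gcc, "make AVX=avx2" maps to AVXFLAGS=-march=haswell, i.e. 256-bit ymm registers, while "make AVX=none" maps to -march=x86-64 with no SIMD, as per the choices above)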
-CXXFLAGS+= $(AVXFLAGS) - -# Set the build flags appropriate to each FPTYPE choice (example: "make FPTYPE=f") -$(info FPTYPE=$(FPTYPE)) -ifeq ($(FPTYPE),d) - CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE - CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE -else ifeq ($(FPTYPE),f) - CXXFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT - CUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT -else ifeq ($(FPTYPE),m) - CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT - CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT -else - $(error Unknown FPTYPE='$(FPTYPE)': only 'd', 'f' and 'm' are supported) -endif - -# Set the build flags appropriate to each HELINL choice (example: "make HELINL=1") -$(info HELINL=$(HELINL)) -ifeq ($(HELINL),1) - CXXFLAGS += -DMGONGPU_INLINE_HELAMPS - CUFLAGS += -DMGONGPU_INLINE_HELAMPS -else ifneq ($(HELINL),0) - $(error Unknown HELINL='$(HELINL)': only '0' and '1' are supported) -endif - -# Set the build flags appropriate to each HRDCOD choice (example: "make HRDCOD=1") -$(info HRDCOD=$(HRDCOD)) -ifeq ($(HRDCOD),1) - CXXFLAGS += -DMGONGPU_HARDCODE_PARAM - CUFLAGS += -DMGONGPU_HARDCODE_PARAM -else ifneq ($(HRDCOD),0) - $(error Unknown HRDCOD='$(HRDCOD)': only '0' and '1' are supported) -endif - -# Set the build flags appropriate to each RNDGEN choice (example: "make RNDGEN=hasNoCurand") -$(info RNDGEN=$(RNDGEN)) -ifeq ($(RNDGEN),hasNoCurand) - override CXXFLAGSCURAND = -DMGONGPU_HAS_NO_CURAND -else ifeq ($(RNDGEN),hasCurand) - override CXXFLAGSCURAND = -else - $(error Unknown RNDGEN='$(RNDGEN)': only 'hasCurand' and 'hasNoCurand' are supported) -endif - -#------------------------------------------------------------------------------- - -#=== Configure build directories and build lockfiles === - -# Build directory "short" tag (defines target and path to the optional build directory) -# (Rationale: keep directory names shorter, e.g. do not include random number generator choice) -override DIRTAG = $(AVX)_$(FPTYPE)_inl$(HELINL)_hrd$(HRDCOD) - -# Build lockfile "full" tag (defines full specification of build options that cannot be intermixed) -# (Rationale: avoid mixing of CUDA and no-CUDA environment builds with different random number generators) -override TAG = $(AVX)_$(FPTYPE)_inl$(HELINL)_hrd$(HRDCOD)_$(RNDGEN) - -# Build directory: current directory by default, or build.$(DIRTAG) if USEBUILDDIR==1 -ifeq ($(USEBUILDDIR),1) - override BUILDDIR = build.$(DIRTAG) - override LIBDIR = ../../lib/$(BUILDDIR) - override LIBDIRRPATH = '$$ORIGIN/../$(LIBDIR)' - $(info Building in BUILDDIR=$(BUILDDIR) for tag=$(TAG) (USEBUILDDIR is set = 1)) -else - override BUILDDIR = . 
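-  # (default layout, for clarity: objects are built in the current directory and the libraries in ../../lib)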
- override LIBDIR = ../../lib - override LIBDIRRPATH = '$$ORIGIN/$(LIBDIR)' - $(info Building in BUILDDIR=$(BUILDDIR) for tag=$(TAG) (USEBUILDDIR is not set)) -endif -###override INCDIR = ../../include -###$(info Building in BUILDDIR=$(BUILDDIR) for tag=$(TAG)) - -# On Linux, set rpath to LIBDIR to make it unnecessary to use LD_LIBRARY_PATH -# Use relative paths with respect to the executables or shared libraries ($ORIGIN on Linux) -# On Darwin, building libraries with absolute paths in LIBDIR makes this unnecessary -ifeq ($(UNAME_S),Darwin) - override CXXLIBFLAGSRPATH = - override CULIBFLAGSRPATH = - override CXXLIBFLAGSRPATH2 = - override CULIBFLAGSRPATH2 = -else - # RPATH to cuda/cpp libs when linking executables - override CXXLIBFLAGSRPATH = -Wl,-rpath,$(LIBDIRRPATH) - override CULIBFLAGSRPATH = -Xlinker -rpath,$(LIBDIRRPATH) - # RPATH to common lib when linking cuda/cpp libs - override CXXLIBFLAGSRPATH2 = -Wl,-rpath,'$$ORIGIN' - override CULIBFLAGSRPATH2 = -Xlinker -rpath,'$$ORIGIN' -endif - -# Setting LD_LIBRARY_PATH or DYLD_LIBRARY_PATH in the RUNTIME is no longer necessary (neither on Linux nor on Mac) -override RUNTIME = - -#=============================================================================== -#=== Makefile TARGETS and build rules below -#=============================================================================== - -cxx_main=$(BUILDDIR)/check.exe -fcxx_main=$(BUILDDIR)/fcheck.exe - -ifneq ($(NVCC),) -cu_main=$(BUILDDIR)/gcheck.exe -fcu_main=$(BUILDDIR)/fgcheck.exe -else -cu_main= -fcu_main= -endif - -testmain=$(BUILDDIR)/runTest.exe - -ifneq ($(GTESTLIBS),) -all.$(TAG): $(BUILDDIR)/.build.$(TAG) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cu_main) $(cxx_main) $(fcu_main) $(fcxx_main) $(testmain) -else -all.$(TAG): $(BUILDDIR)/.build.$(TAG) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cu_main) $(cxx_main) $(fcu_main) $(fcxx_main) -endif - -# Target (and build options): debug -MAKEDEBUG= -debug: OPTFLAGS = -g -O0 -debug: CUOPTFLAGS = -G -debug: MAKEDEBUG := debug -debug: all.$(TAG) - -# Target: tag-specific build lockfiles -override oldtagsb=`if [ -d $(BUILDDIR) ]; then find $(BUILDDIR) -maxdepth 1 -name '.build.*' ! -name '.build.$(TAG)' -exec echo $(shell pwd)/{} \; ; fi` -$(BUILDDIR)/.build.$(TAG): - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - @if [ "$(oldtagsb)" != "" ]; then echo "Cannot build for tag=$(TAG) as old builds exist for other tags:"; echo " $(oldtagsb)"; echo "Please run 'make clean' first\nIf 'make clean' is not enough: run 'make clean USEBUILDDIR=1 AVX=$(AVX) FPTYPE=$(FPTYPE)' or 'make cleanall'"; exit 1; fi - @touch $(BUILDDIR)/.build.$(TAG) - -# Generic target and build rules: objects from CUDA compilation -ifneq ($(NVCC),) -$(BUILDDIR)/%%.o : %%.cu *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c $< -o $@ - -$(BUILDDIR)/%%_cu.o : %%.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@ -endif - -# Generic target and build rules: objects from C++ compilation -# (NB do not include CUINC here! add it only for NVTX or curand #679) -$(BUILDDIR)/%%.o : %%.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) - @if [ ! 
-d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(CXX) $(CPPFLAGS) $(CXXFLAGS) -fPIC -c $< -o $@ - -# Apply special build flags only to CrossSectionKernel.cc and gCrossSectionKernel.cu (no fast math, see #117 and #516) -ifeq ($(shell $(CXX) --version | grep ^nvc++),) -$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS := $(filter-out -ffast-math,$(CXXFLAGS)) -$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math -ifneq ($(NVCC),) -$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -fno-fast-math -endif -endif - -# Apply special build flags only to check_sa.o and gcheck_sa.o (NVTX in timermap.h, #679) -$(BUILDDIR)/check_sa.o: CXXFLAGS += $(USE_NVTX) $(CUINC) -$(BUILDDIR)/gcheck_sa.o: CXXFLAGS += $(USE_NVTX) $(CUINC) - -# Apply special build flags only to check_sa and CurandRandomNumberKernel (curand headers, #679) -$(BUILDDIR)/check_sa.o: CXXFLAGS += $(CXXFLAGSCURAND) -$(BUILDDIR)/gcheck_sa.o: CXXFLAGS += $(CXXFLAGSCURAND) -$(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CXXFLAGSCURAND) -ifeq ($(RNDGEN),hasCurand) -$(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CUINC) -endif - -# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in nvcc with icx2023 (#592) -ifneq ($(shell $(CXX) --version | egrep '^(Intel)'),) -ifneq ($(NVCC),) -CUFLAGS += -Xcompiler -Wno-deprecated-builtins -endif -endif - -# Avoid clang warning "overriding '-ffp-contract=fast' option with '-ffp-contract=on'" (#516) -# This patch does remove the warning, but I prefer to keep it disabled for the moment... -###ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang|Intel)'),) -###$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -Wno-overriding-t-option -###ifneq ($(NVCC),) -###$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -Wno-overriding-t-option -###endif -###endif - -#### Apply special build flags only to CPPProcess.cc (-flto) -###$(BUILDDIR)/CPPProcess.o: CXXFLAGS += -flto - -#### Apply special build flags only to CPPProcess.cc (AVXFLAGS) -###$(BUILDDIR)/CPPProcess.o: CXXFLAGS += $(AVXFLAGS) - -#------------------------------------------------------------------------------- - -# Target (and build rules): common (src) library -commonlib : $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so - -$(LIBDIR)/lib$(MG5AMC_COMMONLIB).so: ../../src/*.h ../../src/*.cc $(BUILDDIR)/.build.$(TAG) - $(MAKE) -C ../../src $(MAKEDEBUG) -f $(CUDACPP_SRC_MAKEFILE) - -#------------------------------------------------------------------------------- - -processid_short=$(shell basename $(CURDIR) | awk -F_ '{print $$(NF-1)"_"$$NF}') -###$(info processid_short=$(processid_short)) - -MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp -cxx_objects_lib=$(BUILDDIR)/CPPProcess.o $(BUILDDIR)/MatrixElementKernels.o $(BUILDDIR)/BridgeKernels.o $(BUILDDIR)/CrossSectionKernels.o -cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel.o $(BUILDDIR)/RamboSamplingKernels.o - -ifneq ($(NVCC),) -MG5AMC_CULIB = mg5amc_$(processid_short)_cuda -cu_objects_lib=$(BUILDDIR)/gCPPProcess.o $(BUILDDIR)/gMatrixElementKernels.o $(BUILDDIR)/gBridgeKernels.o $(BUILDDIR)/gCrossSectionKernels.o -cu_objects_exe=$(BUILDDIR)/gCommonRandomNumberKernel.o $(BUILDDIR)/gRamboSamplingKernels.o -endif - -# Target (and build rules): C++ and CUDA shared libraries -$(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(BUILDDIR)/fbridge.o -$(LIBDIR)/lib$(MG5AMC_CXXLIB).so: cxx_objects_lib += $(BUILDDIR)/fbridge.o -$(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) - $(CXX) -shared -o $@ 
$(cxx_objects_lib) $(CXXLIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) ) - -ifneq ($(NVCC),) -$(LIBDIR)/lib$(MG5AMC_CULIB).so: $(BUILDDIR)/fbridge_cu.o -$(LIBDIR)/lib$(MG5AMC_CULIB).so: cu_objects_lib += $(BUILDDIR)/fbridge_cu.o -$(LIBDIR)/lib$(MG5AMC_CULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cu_objects_lib) - $(NVCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -endif - -#------------------------------------------------------------------------------- - -# Target (and build rules): Fortran include files -###$(INCDIR)/%%.inc : ../%%.inc -### @if [ ! -d $(INCDIR) ]; then echo "mkdir -p $(INCDIR)"; mkdir -p $(INCDIR); fi -### \cp $< $@ - -#------------------------------------------------------------------------------- - -# Target (and build rules): C++ and CUDA standalone executables -$(cxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(cxx_main): $(BUILDDIR)/check_sa.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o - $(CXX) -o $@ $(BUILDDIR)/check_sa.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CURANDLIBFLAGS) - -ifneq ($(NVCC),) -ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') -else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 -$(cu_main): LIBFLAGS += -L$(patsubst %%bin/nvc++,%%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc -endif -$(cu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(cu_main): $(BUILDDIR)/gcheck_sa.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o - $(NVCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) -endif - -#------------------------------------------------------------------------------- - -# Generic target and build rules: objects from Fortran compilation -$(BUILDDIR)/%%.o : %%.f *.inc - @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(FC) -I. -c $< -o $@ - -# Generic target and build rules: objects from Fortran compilation -###$(BUILDDIR)/%%.o : %%.f *.inc -### @if [ ! -d $(INCDIR) ]; then echo "mkdir -p $(INCDIR)"; mkdir -p $(INCDIR); fi -### @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi -### $(FC) -I. 
-I$(INCDIR) -c $< -o $@ - -# Target (and build rules): Fortran standalone executables -###$(BUILDDIR)/fcheck_sa.o : $(INCDIR)/fbridge.inc - -ifeq ($(UNAME_S),Darwin) -$(fcxx_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 -endif -$(fcxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(fcxx_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) - $(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) - -ifneq ($(NVCC),) -ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') -endif -ifeq ($(UNAME_S),Darwin) -$(fcu_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 -endif -$(fcu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(fcu_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) - $(NVCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) -endif - -#------------------------------------------------------------------------------- - -# Target (and build rules): test objects and test executable -$(BUILDDIR)/testxxx.o: $(GTESTLIBS) -$(BUILDDIR)/testxxx.o: INCFLAGS += $(GTESTINC) -$(BUILDDIR)/testxxx.o: testxxx_cc_ref.txt -$(testmain): $(BUILDDIR)/testxxx.o -$(testmain): cxx_objects_exe += $(BUILDDIR)/testxxx.o # Comment out this line to skip the C++ test of xxx functions - -ifneq ($(NVCC),) -$(BUILDDIR)/testxxx_cu.o: $(GTESTLIBS) -$(BUILDDIR)/testxxx_cu.o: INCFLAGS += $(GTESTINC) -$(BUILDDIR)/testxxx_cu.o: testxxx_cc_ref.txt -$(testmain): $(BUILDDIR)/testxxx_cu.o -$(testmain): cu_objects_exe += $(BUILDDIR)/testxxx_cu.o # Comment out this line to skip the CUDA test of xxx functions -endif - -$(BUILDDIR)/testmisc.o: $(GTESTLIBS) -$(BUILDDIR)/testmisc.o: INCFLAGS += $(GTESTINC) -$(testmain): $(BUILDDIR)/testmisc.o -$(testmain): cxx_objects_exe += $(BUILDDIR)/testmisc.o # Comment out this line to skip the C++ miscellaneous tests - -ifneq ($(NVCC),) -$(BUILDDIR)/testmisc_cu.o: $(GTESTLIBS) -$(BUILDDIR)/testmisc_cu.o: INCFLAGS += $(GTESTINC) -$(testmain): $(BUILDDIR)/testmisc_cu.o -$(testmain): cu_objects_exe += $(BUILDDIR)/testmisc_cu.o # Comment out this line to skip the CUDA miscellaneous tests -endif - -$(BUILDDIR)/runTest.o: $(GTESTLIBS) -$(BUILDDIR)/runTest.o: INCFLAGS += $(GTESTINC) -$(testmain): $(BUILDDIR)/runTest.o -$(testmain): cxx_objects_exe += $(BUILDDIR)/runTest.o - -ifneq ($(NVCC),) -$(BUILDDIR)/runTest_cu.o: $(GTESTLIBS) -$(BUILDDIR)/runTest_cu.o: INCFLAGS += $(GTESTINC) -ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') -else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 -$(testmain): LIBFLAGS += -L$(patsubst %%bin/nvc++,%%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc -endif -$(testmain): $(BUILDDIR)/runTest_cu.o 
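-# (NB, for clarity: "cu_objects_exe +=" is a target-specific variable append, so runTest_cu.o is linked into runTest.exe only when the enclosing "ifneq ($(NVCC),)" block is active)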
-$(testmain): cu_objects_exe += $(BUILDDIR)/runTest_cu.o -endif - -$(testmain): $(GTESTLIBS) -$(testmain): INCFLAGS += $(GTESTINC) -$(testmain): LIBFLAGS += -L$(GTESTLIBDIR) -lgtest -lgtest_main - -ifneq ($(OMPFLAGS),) -ifneq ($(shell $(CXX) --version | egrep '^Intel'),) -$(testmain): LIBFLAGS += -liomp5 # see #578 (not '-qopenmp -static-intel' as in https://stackoverflow.com/questions/45909648) -else ifneq ($(shell $(CXX) --version | egrep '^clang'),) -$(testmain): LIBFLAGS += -L $(shell dirname $(shell $(CXX) -print-file-name=libc++.so)) -lomp # see #604 -###else ifneq ($(shell $(CXX) --version | egrep '^Apple clang'),) -###$(testmain): LIBFLAGS += ???? # OMP is not supported yet by cudacpp for Apple clang (see #578 and #604) -else -$(testmain): LIBFLAGS += -lgomp -endif -endif - -ifeq ($(NVCC),) # link only runTest.o -$(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS) - $(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS) -else # link both runTest.o and runTest_cu.o -$(testmain): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH -$(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) $(GTESTLIBS) - $(NVCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) -lcuda -endif - -# Use flock (Linux only, no Mac) to allow 'make -j' if googletest has not yet been downloaded https://stackoverflow.com/a/32666215 -$(GTESTLIBS): -ifneq ($(shell which flock 2>/dev/null),) - flock $(BUILDDIR)/.make_test.lock $(MAKE) -C $(TESTDIR) -else - $(MAKE) -C $(TESTDIR) -endif - -#------------------------------------------------------------------------------- - -# Target: build all targets in all AVX modes (each AVX mode in a separate build directory) -# Split the avxall target into five separate targets to allow parallel 'make -j avxall' builds -# (Hack: add a fbridge.inc dependency to avxall, to ensure it is only copied once for all AVX modes) -avxnone: - @echo - $(MAKE) USEBUILDDIR=1 AVX=none -f $(CUDACPP_MAKEFILE) - -avxsse4: - @echo - $(MAKE) USEBUILDDIR=1 AVX=sse4 -f $(CUDACPP_MAKEFILE) - -avxavx2: - @echo - $(MAKE) USEBUILDDIR=1 AVX=avx2 -f $(CUDACPP_MAKEFILE) - -avx512y: - @echo - $(MAKE) USEBUILDDIR=1 AVX=512y -f $(CUDACPP_MAKEFILE) - -avx512z: - @echo - $(MAKE) USEBUILDDIR=1 AVX=512z -f $(CUDACPP_MAKEFILE) - -ifeq ($(UNAME_P),ppc64le) -###avxall: $(INCDIR)/fbridge.inc avxnone avxsse4 -avxall: avxnone avxsse4 -else ifeq ($(UNAME_P),arm) -###avxall: $(INCDIR)/fbridge.inc avxnone avxsse4 -avxall: avxnone avxsse4 -else -###avxall: $(INCDIR)/fbridge.inc avxnone avxsse4 avxavx2 avx512y avx512z -avxall: avxnone avxsse4 avxavx2 avx512y avx512z -endif - -#------------------------------------------------------------------------------- - -# Target: clean the builds -.PHONY: clean - -clean: -ifeq ($(USEBUILDDIR),1) - rm -rf $(BUILDDIR) -else - rm -f $(BUILDDIR)/.build.* $(BUILDDIR)/*.o $(BUILDDIR)/*.exe - rm -f $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(LIBDIR)/lib$(MG5AMC_CULIB).so -endif - $(MAKE) -C ../../src clean -f $(CUDACPP_SRC_MAKEFILE) -### rm -rf $(INCDIR) - -cleanall: - @echo - $(MAKE) USEBUILDDIR=0 clean -f $(CUDACPP_MAKEFILE) - @echo - $(MAKE) USEBUILDDIR=0 -C ../../src cleanall -f $(CUDACPP_SRC_MAKEFILE) - rm -rf build.* - -# Target: clean the builds as well as the gtest installation(s) -distclean: cleanall -ifneq ($(wildcard 
$(TESTDIRCOMMON)),) - $(MAKE) -C $(TESTDIRCOMMON) clean -endif - $(MAKE) -C $(TESTDIRLOCAL) clean - -#------------------------------------------------------------------------------- - -# Target: show system and compiler information -info: - @echo "" - @uname -spn # e.g. Linux nodename.cern.ch x86_64 -ifeq ($(UNAME_S),Darwin) - @sysctl -a | grep -i brand - @sysctl -a | grep machdep.cpu | grep features || true - @sysctl -a | grep hw.physicalcpu: - @sysctl -a | grep hw.logicalcpu: -else - @cat /proc/cpuinfo | grep "model name" | sort -u - @cat /proc/cpuinfo | grep "flags" | sort -u - @cat /proc/cpuinfo | grep "cpu cores" | sort -u - @cat /proc/cpuinfo | grep "physical id" | sort -u -endif - @echo "" -ifneq ($(shell which nvidia-smi 2>/dev/null),) - nvidia-smi -L - @echo "" -endif - @echo USECCACHE=$(USECCACHE) -ifeq ($(USECCACHE),1) - ccache --version | head -1 -endif - @echo "" - @echo NVCC=$(NVCC) -ifneq ($(NVCC),) - $(NVCC) --version -endif - @echo "" - @echo CXX=$(CXX) -ifneq ($(shell $(CXX) --version | grep ^clang),) - @echo $(CXX) -v - @$(CXX) -v |& egrep -v '(Found|multilib)' - @readelf -p .comment `$(CXX) -print-libgcc-file-name` |& grep 'GCC: (GNU)' | grep -v Warning | sort -u | awk '{print "GCC toolchain:",$$5}' -else - $(CXX) --version -endif - @echo "" - @echo FC=$(FC) - $(FC) --version - -#------------------------------------------------------------------------------- - -# Target: check (run the C++ test executable) -# [NB THIS IS WHAT IS USED IN THE GITHUB CI!] -ifneq ($(NVCC),) -check: runTest cmpFcheck cmpFGcheck -else -check: runTest cmpFcheck -endif - -# Target: runTest (run the C++ test executable runTest.exe) -runTest: all.$(TAG) - $(RUNTIME) $(BUILDDIR)/runTest.exe - -# Target: runCheck (run the C++ standalone executable check.exe, with a small number of events) -runCheck: all.$(TAG) - $(RUNTIME) $(BUILDDIR)/check.exe -p 2 32 2 - -# Target: runGcheck (run the CUDA standalone executable gcheck.exe, with a small number of events) -runGcheck: all.$(TAG) - $(RUNTIME) $(BUILDDIR)/gcheck.exe -p 2 32 2 - -# Target: runFcheck (run the Fortran standalone executable - with C++ MEs - fcheck.exe, with a small number of events) -runFcheck: all.$(TAG) - $(RUNTIME) $(BUILDDIR)/fcheck.exe 2 32 2 - -# Target: runFGcheck (run the Fortran standalone executable - with CUDA MEs - fgcheck.exe, with a small number of events) -runFGcheck: all.$(TAG) - $(RUNTIME) $(BUILDDIR)/fgcheck.exe 2 32 2 - -# Target: cmpFcheck (compare ME results from the C++ and Fortran with C++ MEs standalone executables, with a small number of events) -cmpFcheck: all.$(TAG) - @echo - @echo "$(BUILDDIR)/check.exe --common -p 2 32 2" - @echo "$(BUILDDIR)/fcheck.exe 2 32 2" - @me1=$(shell $(RUNTIME) $(BUILDDIR)/check.exe --common -p 2 32 2 | grep MeanMatrix | awk '{print $$4}'); me2=$(shell $(RUNTIME) $(BUILDDIR)/fcheck.exe 2 32 2 | grep Average | awk '{print $$4}'); echo "Avg ME (C++/C++) = $${me1}"; echo "Avg ME (F77/C++) = $${me2}"; if [ "$${me2}" == "NaN" ]; then echo "ERROR! Fortran calculation (F77/C++) returned NaN"; elif [ "$${me2}" == "" ]; then echo "ERROR! 
Fortran calculation (F77/C++) crashed"; else python3 -c "me1=$${me1}; me2=$${me2}; reldif=abs((me2-me1)/me1); print('Relative difference =', reldif); ok = reldif <= 2E-4; print ( '%%s (relative difference %%s 2E-4)' %% ( ('OK','<=') if ok else ('ERROR','>') ) ); import sys; sys.exit(0 if ok else 1)"; fi - -# Target: cmpFGcheck (compare ME results from the CUDA and Fortran with CUDA MEs standalone executables, with a small number of events) -cmpFGcheck: all.$(TAG) - @echo - @echo "$(BUILDDIR)/gcheck.exe --common -p 2 32 2" - @echo "$(BUILDDIR)/fgcheck.exe 2 32 2" - @me1=$(shell $(RUNTIME) $(BUILDDIR)/gcheck.exe --common -p 2 32 2 | grep MeanMatrix | awk '{print $$4}'); me2=$(shell $(RUNTIME) $(BUILDDIR)/fgcheck.exe 2 32 2 | grep Average | awk '{print $$4}'); echo "Avg ME (C++/CUDA) = $${me1}"; echo "Avg ME (F77/CUDA) = $${me2}"; if [ "$${me2}" == "NaN" ]; then echo "ERROR! Fortran calculation (F77/CUDA) crashed"; elif [ "$${me2}" == "" ]; then echo "ERROR! Fortran calculation (F77/CUDA) crashed"; else python3 -c "me1=$${me1}; me2=$${me2}; reldif=abs((me2-me1)/me1); print('Relative difference =', reldif); ok = reldif <= 2E-4; print ( '%%s (relative difference %%s 2E-4)' %% ( ('OK','<=') if ok else ('ERROR','>') ) ); import sys; sys.exit(0 if ok else 1)"; fi - -# Target: memcheck (run the CUDA standalone executable gcheck.exe with a small number of events through cuda-memcheck) -memcheck: all.$(TAG) - $(RUNTIME) $(CUDA_HOME)/bin/cuda-memcheck --check-api-memory-access yes --check-deprecated-instr yes --check-device-heap yes --demangle full --language c --leak-check full --racecheck-report all --report-api-errors all --show-backtrace yes --tool memcheck --track-unused-memory yes $(BUILDDIR)/gcheck.exe -p 2 32 2 - -#------------------------------------------------------------------------------- From cf8875b648f9524f646242b4a03dfb1bbb2828a9 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Wed, 24 Jan 2024 10:27:14 +0100 Subject: [PATCH 21/96] [jt774] (after merging upstream/master) fix CODEGEN cudacpp.mk: replace % by %% (code generation was failing) --- .../iolibs/template_files/gpu/cudacpp.mk | 22 +++++++++---------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk index 49a79b2674..011a5326ab 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk @@ -271,11 +271,11 @@ endif # PowerPC-specific CXX compiler flags (being reviewed) ifeq ($(UNAME_P),ppc64le) - CXXFLAGS+= -mcpu=power9 -mtune=power9 # gains ~2-3% both for none and sse4 + CXXFLAGS+= -mcpu=power9 -mtune=power9 # gains ~2-3%% both for none and sse4 # Throughput references without the extra flags below: none=1.41-1.42E6, sse4=2.15-2.19E6 ###CXXFLAGS+= -DNO_WARN_X86_INTRINSICS # no change ###CXXFLAGS+= -fpeel-loops # no change - ###CXXFLAGS+= -funroll-loops # gains ~1% for none, loses ~1% for sse4 + ###CXXFLAGS+= -funroll-loops # gains ~1%% for none, loses ~1%% for sse4 ###CXXFLAGS+= -ftree-vectorize # no change ###CXXFLAGS+= -flto # would increase to none=4.08-4.12E6, sse4=4.99-5.03E6! else @@ -558,7 +558,7 @@ $(BUILDDIR)/%%.o : %%.cu *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! 
-d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CUBUILDRULEFLAGS) $< -o $@ -$(BUILDDIR)/%_cu.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) +$(BUILDDIR)/%%_cu.o : %%.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CCBUILDRULEFLAGS) $< -o $@ endif @@ -566,7 +566,7 @@ endif # Generic target and build rules: objects from C++ compilation # (NB do not include CUINC here! add it only for NVTX or curand #679) -$(BUILDDIR)/%.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) +$(BUILDDIR)/%%.o : %%.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi $(CXX) $(CPPFLAGS) $(CXXFLAGS) -fPIC -c $< -o $@ @@ -656,7 +656,7 @@ endif #------------------------------------------------------------------------------- # Target (and build rules): Fortran include files -###$(INCDIR)/%.inc : ../%.inc +###$(INCDIR)/%%.inc : ../%%.inc ### @if [ ! -d $(INCDIR) ]; then echo "mkdir -p $(INCDIR)"; mkdir -p $(INCDIR); fi ### \cp $< $@ @@ -672,7 +672,7 @@ ifneq ($(shell $(CXX) --version | grep ^Intel),) $(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') $(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 -$(cu_main): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc +$(cu_main): LIBFLAGS += -L$(patsubst %%bin/nvc++,%%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(cu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(cu_main): $(BUILDDIR)/gcheck_sa.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o @@ -682,12 +682,12 @@ endif #------------------------------------------------------------------------------- # Generic target and build rules: objects from Fortran compilation -$(BUILDDIR)/%.o : %.f *.inc +$(BUILDDIR)/%%.o : %%.f *.inc @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi $(FC) -I. -c $< -o $@ # Generic target and build rules: objects from Fortran compilation -###$(BUILDDIR)/%.o : %.f *.inc +###$(BUILDDIR)/%%.o : %%.f *.inc ### @if [ ! -d $(INCDIR) ]; then echo "mkdir -p $(INCDIR)"; mkdir -p $(INCDIR); fi ### @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi ### $(FC) -I. 
-I$(INCDIR) -c $< -o $@ @@ -756,7 +756,7 @@ ifneq ($(shell $(CXX) --version | grep ^Intel),) $(testmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') $(testmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 -$(testmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc +$(testmain): LIBFLAGS += -L$(patsubst %%bin/nvc++,%%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(testmain): $(BUILDDIR)/runTest_cu.o $(testmain): cu_objects_exe += $(BUILDDIR)/runTest_cu.o @@ -946,14 +946,14 @@ cmpFcheck: all.$(TAG) @echo @echo "$(BUILDDIR)/check.exe --common -p 2 32 2" @echo "$(BUILDDIR)/fcheck.exe 2 32 2" - @me1=$(shell $(RUNTIME) $(BUILDDIR)/check.exe --common -p 2 32 2 | grep MeanMatrix | awk '{print $$4}'); me2=$(shell $(RUNTIME) $(BUILDDIR)/fcheck.exe 2 32 2 | grep Average | awk '{print $$4}'); echo "Avg ME (C++/C++) = $${me1}"; echo "Avg ME (F77/C++) = $${me2}"; if [ "$${me2}" == "NaN" ]; then echo "ERROR! Fortran calculation (F77/C++) returned NaN"; elif [ "$${me2}" == "" ]; then echo "ERROR! Fortran calculation (F77/C++) crashed"; else python3 -c "me1=$${me1}; me2=$${me2}; reldif=abs((me2-me1)/me1); print('Relative difference =', reldif); ok = reldif <= 2E-4; print ( '%s (relative difference %s 2E-4)' % ( ('OK','<=') if ok else ('ERROR','>') ) ); import sys; sys.exit(0 if ok else 1)"; fi + @me1=$(shell $(RUNTIME) $(BUILDDIR)/check.exe --common -p 2 32 2 | grep MeanMatrix | awk '{print $$4}'); me2=$(shell $(RUNTIME) $(BUILDDIR)/fcheck.exe 2 32 2 | grep Average | awk '{print $$4}'); echo "Avg ME (C++/C++) = $${me1}"; echo "Avg ME (F77/C++) = $${me2}"; if [ "$${me2}" == "NaN" ]; then echo "ERROR! Fortran calculation (F77/C++) returned NaN"; elif [ "$${me2}" == "" ]; then echo "ERROR! Fortran calculation (F77/C++) crashed"; else python3 -c "me1=$${me1}; me2=$${me2}; reldif=abs((me2-me1)/me1); print('Relative difference =', reldif); ok = reldif <= 2E-4; print ( '%%s (relative difference %%s 2E-4)' %% ( ('OK','<=') if ok else ('ERROR','>') ) ); import sys; sys.exit(0 if ok else 1)"; fi # Target: cmpFGcheck (compare ME results from the CUDA and Fortran with CUDA MEs standalone executables, with a small number of events) cmpFGcheck: all.$(TAG) @echo @echo "$(BUILDDIR)/gcheck.exe --common -p 2 32 2" @echo "$(BUILDDIR)/fgcheck.exe 2 32 2" - @me1=$(shell $(RUNTIME) $(BUILDDIR)/gcheck.exe --common -p 2 32 2 | grep MeanMatrix | awk '{print $$4}'); me2=$(shell $(RUNTIME) $(BUILDDIR)/fgcheck.exe 2 32 2 | grep Average | awk '{print $$4}'); echo "Avg ME (C++/CUDA) = $${me1}"; echo "Avg ME (F77/CUDA) = $${me2}"; if [ "$${me2}" == "NaN" ]; then echo "ERROR! Fortran calculation (F77/CUDA) crashed"; elif [ "$${me2}" == "" ]; then echo "ERROR! Fortran calculation (F77/CUDA) crashed"; else python3 -c "me1=$${me1}; me2=$${me2}; reldif=abs((me2-me1)/me1); print('Relative difference =', reldif); ok = reldif <= 2E-4; print ( '%s (relative difference %s 2E-4)' % ( ('OK','<=') if ok else ('ERROR','>') ) ); import sys; sys.exit(0 if ok else 1)"; fi + @me1=$(shell $(RUNTIME) $(BUILDDIR)/gcheck.exe --common -p 2 32 2 | grep MeanMatrix | awk '{print $$4}'); me2=$(shell $(RUNTIME) $(BUILDDIR)/fgcheck.exe 2 32 2 | grep Average | awk '{print $$4}'); echo "Avg ME (C++/CUDA) = $${me1}"; echo "Avg ME (F77/CUDA) = $${me2}"; if [ "$${me2}" == "NaN" ]; then echo "ERROR! 
Fortran calculation (F77/CUDA) crashed"; elif [ "$${me2}" == "" ]; then echo "ERROR! Fortran calculation (F77/CUDA) crashed"; else python3 -c "me1=$${me1}; me2=$${me2}; reldif=abs((me2-me1)/me1); print('Relative difference =', reldif); ok = reldif <= 2E-4; print ( '%%s (relative difference %%s 2E-4)' %% ( ('OK','<=') if ok else ('ERROR','>') ) ); import sys; sys.exit(0 if ok else 1)"; fi # Target: memcheck (run the CUDA standalone executable gcheck.exe with a small number of events through cuda-memcheck) memcheck: all.$(TAG) From e32bc4e6ea9ac0c3808c9644e5526c1b2bda3db2 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Wed, 24 Jan 2024 10:36:44 +0100 Subject: [PATCH 22/96] [jt774] (after merging upstream/master) fix clang formatting in CODEGEN (code generation was failing clang formatting checks) --- .../template_files/cpp_model_parameters_h.inc | 40 +++++++++---------- .../template_files/gpu/MemoryAccessMomenta.h | 21 +++++----- .../iolibs/template_files/gpu/check_sa.cc | 8 ++-- .../CUDACPP_SA_OUTPUT/model_handling.py | 6 +-- 4 files changed, 37 insertions(+), 38 deletions(-) diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/cpp_model_parameters_h.inc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/cpp_model_parameters_h.inc index 5ab7aa7abd..8b8797c04c 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/cpp_model_parameters_h.inc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/cpp_model_parameters_h.inc @@ -194,8 +194,8 @@ namespace mg5amcCpu %(dcoupsetdcoup)s } %(eftspecial2)s - return out; - } + return out; + } #ifdef MGONGPUCPP_GPUIMPL #pragma GCC diagnostic pop #pragma nv_diagnostic pop @@ -213,33 +213,33 @@ namespace mg5amcCpu //========================================================================== #ifdef MGONGPUCPP_GPUIMPL -namespace mg5amcGpu + namespace mg5amcGpu #else -namespace mg5amcCpu + namespace mg5amcCpu #endif -{ + { #pragma GCC diagnostic push #ifndef __clang__ #pragma GCC diagnostic ignored "-Wunused-but-set-variable" // e.g. <> #endif - // Compute the output couplings (e.g. gc10 and gc11) from the input gs - template - __device__ inline void - G2COUP( const fptype gs[], - fptype couplings[] ) - { - mgDebug( 0, __FUNCTION__ ); - using namespace Parameters_%(model_name)s_dependentCouplings; - const fptype_sv& gs_sv = G_ACCESS::kernelAccessConst( gs ); - DependentCouplings_sv couplings_sv = computeDependentCouplings_fromG( gs_sv ); + // Compute the output couplings (e.g. 
gc10 and gc11) from the input gs + template + __device__ inline void + G2COUP( const fptype gs[], + fptype couplings[] ) + { + mgDebug( 0, __FUNCTION__ ); + using namespace Parameters_%(model_name)s_dependentCouplings; + const fptype_sv& gs_sv = G_ACCESS::kernelAccessConst( gs ); + DependentCouplings_sv couplings_sv = computeDependentCouplings_fromG( gs_sv ); %(dcoupaccessbuffer)s%(dcoupkernelaccess)s%(dcoupcompute)s - mgDebug( 1, __FUNCTION__ ); - return; - } + mgDebug( 1, __FUNCTION__ ); + return; + } #pragma GCC diagnostic pop -} // end namespace mg5amcGpu/mg5amcCpu + } // end namespace mg5amcGpu/mg5amcCpu -//========================================================================== + //========================================================================== #endif // Parameters_%(model_name)s_H diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessMomenta.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessMomenta.h index d3f5a15bd2..86df5d5471 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessMomenta.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessMomenta.h @@ -27,18 +27,17 @@ namespace mg5amcCpu class MemoryAccessMomentaBase //_AOSOAv1 { public: - - // Number of Events Per Page in the momenta AOSOA memory buffer layout - // (these are all best kept as a compile-time constants: see issue #23) + // Number of Events Per Page in the momenta AOSOA memory buffer layout + // (these are all best kept as a compile-time constants: see issue #23) #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ - // ----------------------------------------------------------------------------------------------- - // --- GPUs: neppM is best set to a power of 2 times the number of fptype's in a 32-byte cacheline - // --- This is relevant to ensure coalesced access to momenta in global memory - // --- Note that neppR is hardcoded and may differ from neppM and neppV on some platforms - // ----------------------------------------------------------------------------------------------- - //static constexpr int neppM = 64/sizeof(fptype); // 2x 32-byte GPU cache lines (512 bits): 8 (DOUBLE) or 16 (FLOAT) - static constexpr int neppM = 32/sizeof(fptype); // (DEFAULT) 32-byte GPU cache line (256 bits): 4 (DOUBLE) or 8 (FLOAT) - //static constexpr int neppM = 1; // *** NB: this is equivalent to AOS *** (slower: 1.03E9 instead of 1.11E9 in eemumu) + // ----------------------------------------------------------------------------------------------- + // --- GPUs: neppM is best set to a power of 2 times the number of fptype's in a 32-byte cacheline + // --- This is relevant to ensure coalesced access to momenta in global memory + // --- Note that neppR is hardcoded and may differ from neppM and neppV on some platforms + // ----------------------------------------------------------------------------------------------- + //static constexpr int neppM = 64/sizeof(fptype); // 2x 32-byte GPU cache lines (512 bits): 8 (DOUBLE) or 16 (FLOAT) + static constexpr int neppM = 32/sizeof(fptype); // (DEFAULT) 32-byte GPU cache line (256 bits): 4 (DOUBLE) or 8 (FLOAT) + //static constexpr int neppM = 1; // *** NB: this is equivalent to AOS *** (slower: 1.03E9 instead of 1.11E9 in eemumu) #else // ----------------------------------------------------------------------------------------------- // --- CPUs: neppM is best set equal to 
the number of fptype's (neppV) in a vector register diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/check_sa.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/check_sa.cc index 7c93c07a1a..b9a05dea46 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/check_sa.cc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/check_sa.cc @@ -152,7 +152,7 @@ main( int argc, char** argv ) #ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) bool bridge = false; @@ -780,7 +780,7 @@ main( int argc, char** argv ) wrkflwtxt += "CPP:"; #endif // -- DOUBLE or FLOAT? -#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT /* clang-format off */ wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) #elif defined MGONGPU_FPTYPE_DOUBLE wrkflwtxt += "DBL+"; @@ -799,7 +799,7 @@ main( int argc, char** argv ) wrkflwtxt += "CXS:"; #else wrkflwtxt += "???:"; // no path to this statement -#endif +#endif /* clang-format on */ #elif defined __HIPCC__ #if defined MGONGPU_CUCXTYPE_CXSMPL wrkflwtxt += "CXS:"; @@ -1086,7 +1086,7 @@ main( int argc, char** argv ) #elif defined MGONGPU_CUCXTYPE_CXSMPL << "\"STD::COMPLEX\"," << std::endl #else - << "\"???\"," << std::endl // no path to this statement... + << "\"???\"," << std::endl // no path to this statement... 
#endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/model_handling.py b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/model_handling.py index 3e0ebe545f..b585102292 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/model_handling.py +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/model_handling.py @@ -859,11 +859,11 @@ def super_generate_parameters_class_files(self): replace_dict['dcoupsetdpar'] = ' ' + '\n'.join( dcoupsetdpar ) dcoupsetdcoup = [ ' ' + line.replace('constexpr cxsmpl ','out.').replace('mdl_complexi', 'cI') for line in self.write_hardcoded_parameters(list(self.coups_dep.values())).split('\n') if line != '' ] replace_dict['dcoupsetdcoup'] = ' ' + '\n'.join( dcoupsetdcoup ) - dcoupaccessbuffer = [ ' fptype* %ss = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_%s );'%( name, name ) for name in self.coups_dep ] + dcoupaccessbuffer = [ ' fptype* %ss = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_%s );'%( name, name ) for name in self.coups_dep ] replace_dict['dcoupaccessbuffer'] = '\n'.join( dcoupaccessbuffer ) + '\n' - dcoupkernelaccess = [ ' cxtype_sv_ref %ss_sv = C_ACCESS::kernelAccess( %ss );'%( name, name ) for name in self.coups_dep ] + dcoupkernelaccess = [ ' cxtype_sv_ref %ss_sv = C_ACCESS::kernelAccess( %ss );'%( name, name ) for name in self.coups_dep ] replace_dict['dcoupkernelaccess'] = '\n'.join( dcoupkernelaccess ) + '\n' - dcoupcompute = [ ' %ss_sv = couplings_sv.%s;'%( name, name ) for name in self.coups_dep ] + dcoupcompute = [ ' %ss_sv = couplings_sv.%s;'%( name, name ) for name in self.coups_dep ] replace_dict['dcoupcompute'] = '\n'.join( dcoupcompute ) # Special handling in EFT for fptype=float using SIMD dcoupoutfptypev2 = [ ' fptype_v %sr_v;\n fptype_v %si_v;'%(name,name) for name in self.coups_dep ] From def02b58e369c76e9f3b63b1991ca1ec8e148107 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Wed, 24 Jan 2024 11:01:40 +0100 Subject: [PATCH 23/96] [jt774] regenerate gg_tt.mad - the build fails MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit CUDA_HOME=none HIP_HOME=none make |& more ... ccache g++ -O3 -std=c++17 -I. 
-fPIC -Wall -Wshadow -Wextra -ffast-math -fopenmp -march=skylake-avx512 -mprefer-vector-width=256 -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE -DMGONGPU_HAS_NO_CURAND -fPIC -c Parameters_sm.cc -o Parameters_sm.o In file included from /usr/include/c++/11/locale:41, from /usr/include/c++/11/iomanip:43, from Parameters_sm.cc:17: /usr/include/c++/11/bits/locale_facets_nonio.h:59:39: error: ‘locale’ has not been declared 59 | struct __timepunct_cache : public locale::facet | ^~~~~~ --- .../gg_tt.mad/CODEGEN_mad_gg_tt_log.txt | 22 +- epochX/cudacpp/gg_tt.mad/COPYRIGHT | 1 + .../cudacpp/gg_tt.mad/SubProcesses/Bridge.h | 32 +-- .../gg_tt.mad/SubProcesses/BridgeKernels.cc | 9 +- .../gg_tt.mad/SubProcesses/BridgeKernels.h | 8 +- .../SubProcesses/CommonRandomNumberKernel.cc | 5 +- .../SubProcesses/CrossSectionKernels.cc | 7 +- .../SubProcesses/CrossSectionKernels.h | 6 +- .../gg_tt.mad/SubProcesses/CudaRuntime.h | 85 ------- .../SubProcesses/CurandRandomNumberKernel.cc | 12 +- .../gg_tt.mad/SubProcesses/EventStatistics.h | 4 +- .../gg_tt.mad/SubProcesses/MadgraphTest.h | 8 +- .../SubProcesses/MatrixElementKernels.cc | 26 +- .../SubProcesses/MatrixElementKernels.h | 8 +- .../SubProcesses/MemoryAccessAmplitudes.h | 2 +- .../SubProcesses/MemoryAccessCouplings.h | 2 +- .../SubProcesses/MemoryAccessCouplingsFixed.h | 2 +- .../SubProcesses/MemoryAccessDenominators.h | 2 +- .../gg_tt.mad/SubProcesses/MemoryAccessGs.h | 2 +- .../SubProcesses/MemoryAccessHelpers.h | 4 +- .../SubProcesses/MemoryAccessMatrixElements.h | 2 +- .../SubProcesses/MemoryAccessMomenta.h | 7 +- .../SubProcesses/MemoryAccessNumerators.h | 2 +- .../SubProcesses/MemoryAccessRandomNumbers.h | 4 +- .../SubProcesses/MemoryAccessVectors.h | 4 +- .../SubProcesses/MemoryAccessWavefunctions.h | 2 +- .../gg_tt.mad/SubProcesses/MemoryBuffers.h | 64 ++--- .../SubProcesses/P1_gg_ttx/CPPProcess.cc | 64 ++--- .../SubProcesses/P1_gg_ttx/CPPProcess.h | 10 +- .../SubProcesses/P1_gg_ttx/CudaRuntime.h | 1 - .../SubProcesses/P1_gg_ttx/check_sa.cc | 117 +++++---- .../SubProcesses/RamboSamplingKernels.cc | 20 +- .../SubProcesses/RamboSamplingKernels.h | 6 +- .../SubProcesses/RandomNumberKernels.h | 6 +- .../cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk | 233 +++++++++++------- .../cudacpp/gg_tt.mad/SubProcesses/fbridge.cc | 16 +- .../gg_tt.mad/SubProcesses/fsampler.cc | 8 +- .../cudacpp/gg_tt.mad/SubProcesses/runTest.cc | 12 +- .../gg_tt.mad/SubProcesses/testmisc.cc | 8 +- .../cudacpp/gg_tt.mad/SubProcesses/testxxx.cc | 14 +- epochX/cudacpp/gg_tt.mad/src/HelAmps_sm.h | 4 +- epochX/cudacpp/gg_tt.mad/src/Parameters_sm.cc | 4 +- epochX/cudacpp/gg_tt.mad/src/Parameters_sm.h | 58 +++-- epochX/cudacpp/gg_tt.mad/src/cudacpp_src.mk | 23 +- epochX/cudacpp/gg_tt.mad/src/mgOnGpuConfig.h | 76 ++++-- epochX/cudacpp/gg_tt.mad/src/mgOnGpuCxtypes.h | 28 +-- epochX/cudacpp/gg_tt.mad/src/mgOnGpuFptypes.h | 10 +- epochX/cudacpp/gg_tt.mad/src/mgOnGpuVectors.h | 20 +- epochX/cudacpp/gg_tt.mad/src/rambo.h | 8 +- 49 files changed, 562 insertions(+), 516 deletions(-) delete mode 100644 epochX/cudacpp/gg_tt.mad/SubProcesses/CudaRuntime.h delete mode 120000 epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CudaRuntime.h diff --git a/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt b/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt index a477013568..b56b36111b 100644 --- a/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt +++ b/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt @@ -52,7 +52,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default
text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt -No valid web browser found. Please set in ./input/mg5_configuration.txt +Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt.mg The import format was not given, so we guess it as command set stdout_level DEBUG @@ -62,7 +62,7 @@ generate g g > t t~ No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005816459655761719  +DEBUG: model prefixing takes 0.005602598190307617  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -175,7 +175,7 @@ INFO: Generating Helas calls for process: g g > t t~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t t~ @1 INFO: Creating files in directory P1_gg_ttx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -191,16 +191,16 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. INFO: Generating Feynman diagrams for Process: g g > t t~ WEIGHTED<=2 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttx Generated helas calls for 1 subprocesses (3 diagrams) in 0.006 s -Wrote files for 10 helas calls in 0.103 s +Wrote files for 10 helas calls in 0.106 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 2 routines in 0.155 s +ALOHA: aloha creates 2 routines in 0.152 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 4 routines in 0.135 s +ALOHA: aloha creates 4 routines in 0.148 s VVV1 FFV1 FFV1 @@ -237,9 +237,9 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. quit -real 0m1.729s -user 0m1.515s -sys 0m0.204s +real 0m1.778s +user 0m1.548s +sys 0m0.220s Code generation completed in 2 seconds ************************************************************ * * @@ -266,7 +266,7 @@ INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amc INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards/me5_configuration.txt Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt -No valid web browser found. Please set in ./input/mg5_configuration.txt +Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards run quit INFO: @@ -296,7 +296,7 @@ INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amc INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards/me5_configuration.txt Using default text editor "vi". 
Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt -No valid web browser found. Please set in ./input/mg5_configuration.txt +Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards param quit INFO: diff --git a/epochX/cudacpp/gg_tt.mad/COPYRIGHT b/epochX/cudacpp/gg_tt.mad/COPYRIGHT index a134b5fef9..84a883fbb0 100644 --- a/epochX/cudacpp/gg_tt.mad/COPYRIGHT +++ b/epochX/cudacpp/gg_tt.mad/COPYRIGHT @@ -15,6 +15,7 @@ The full development team currently includes the following authors : Stephan Hageboeck (CERN) Olivier Mattelaer (Universite Catholique de Louvain, original author) Stefan Roiser (CERN, original author) + Joergen Teig (CERN) Andrea Valassi (CERN, original author) Zenny Wettersten (CERN) See https://github.com/madgraph5/madgraph4gpu for more details. For the full diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/Bridge.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/Bridge.h index bf8b5e024d..89437b4c42 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/Bridge.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef BRIDGE_H #define BRIDGE_H 1 @@ -23,7 +23,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +83,7 @@ namespace mg5amcCpu Bridge& operator=( const Bridge& ) = delete; Bridge& operator=( Bridge&& ) = delete; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL /** * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != gpublocks*gputhreads * (this is needed for BridgeKernel tests rather than for actual production use in Fortran) @@ -150,7 +150,7 @@ namespace mg5amcCpu unsigned int m_nevt; // number of events int m_nGoodHel; // the number of good helicities (-1 initially when they have not yet been calculated) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL int m_gputhreads; // number of gpu threads (default set from number of events, can be modified) int m_gpublocks; // number of gpu blocks (default set from number of events, can be modified) DeviceBuffer m_devMomentaF; @@ -187,12 +187,12 @@ namespace mg5amcCpu // Forward declare transposition methods // -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL template void hst_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); @@ -209,7 +209,7 @@ namespace mg5amcCpu Bridge::Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ) : m_nevt( nevtF ) , m_nGoodHel( -1 ) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL , m_gputhreads( 256 ) // default number of gpu threads , m_gpublocks( m_nevt / m_gputhreads ) // this ensures m_nevt <= m_gpublocks*m_gputhreads , m_devMomentaF( m_nevt ) @@ -233,7 +233,7 @@ namespace mg5amcCpu { if( nparF != CPPProcess::npar ) throw std::runtime_error( "Bridge constructor: npar mismatch" ); if( np4F != CPPProcess::np4 ) throw std::runtime_error( "Bridge constructor: np4 mismatch" ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL 
if( ( m_nevt < s_gputhreadsmin ) || ( m_nevt % s_gputhreadsmin != 0 ) ) throw std::runtime_error( "Bridge constructor: nevt should be a multiple of " + std::to_string( s_gputhreadsmin ) ); while( m_nevt != m_gpublocks * m_gputhreads ) @@ -249,7 +249,7 @@ namespace mg5amcCpu #else std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate is called from several Fortran threads? @@ -262,7 +262,7 @@ namespace mg5amcCpu process.initProc( paramCard ); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void Bridge::set_gpugrid( const int gpublocks, const int gputhreads ) { @@ -276,7 +276,7 @@ namespace mg5amcCpu } #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void Bridge::gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -291,14 +291,14 @@ namespace mg5amcCpu constexpr int neppM = MemoryAccessMomenta::neppM; if constexpr( neppM == 1 && std::is_same_v ) { - checkCuda( cudaMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), gpuMemcpyHostToDevice ); } else { - checkCuda( cudaMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), gpuMemcpyHostToDevice ); const int thrPerEvt = CPPProcess::npar * CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 event per thread) //const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... this seems slower - dev_transposeMomentaF2C<<>>( m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); + gpuLaunchKernel( dev_transposeMomentaF2C, m_gpublocks * thrPerEvt, m_gputhreads, m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); } if constexpr( std::is_same_v ) { @@ -341,7 +341,7 @@ namespace mg5amcCpu } #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL template void Bridge::cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -396,7 +396,7 @@ namespace mg5amcCpu // - C++ array: momenta[npagM][npar][np4][neppM] with nevt=npagM*neppM (AOSOA) // -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ) { diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/BridgeKernels.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/BridgeKernels.cc index d58066c9c1..eaf4037a24 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/BridgeKernels.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/BridgeKernels.cc @@ -1,17 +1,18 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
#include "BridgeKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMomenta.h" #include //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -45,7 +46,7 @@ namespace mg5amcCpu //============================================================================ -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu { @@ -96,7 +97,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/BridgeKernels.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/BridgeKernels.h index 15eb4bff4d..3efef8ce97 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/BridgeKernels.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/BridgeKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef BRIDGEKERNELS_H #define BRIDGEKERNELS_H 1 @@ -12,7 +12,7 @@ #include "MatrixElementKernels.h" #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -49,7 +49,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a CPU host class BridgeKernelHost final : public BridgeKernelBase { @@ -89,7 +89,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a GPU device class BridgeKernelDevice : public BridgeKernelBase { diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/CommonRandomNumberKernel.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/CommonRandomNumberKernel.cc index 985b39f576..010bc4cbd0 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/CommonRandomNumberKernel.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/CommonRandomNumberKernel.cc @@ -1,15 +1,16 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "CommonRandomNumbers.h" +#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/CrossSectionKernels.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/CrossSectionKernels.cc index 0b355a3c8d..c15b39844d 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/CrossSectionKernels.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/CrossSectionKernels.cc @@ -1,10 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. 
Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "CrossSectionKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessWeights.h" #include "MemoryBuffers.h" @@ -77,7 +78,7 @@ debug_me_is_abnormal( const fptype& me, size_t ievtALL ) //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -185,7 +186,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/CrossSectionKernels.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/CrossSectionKernels.h index 7933ca4bbf..4d9659e04e 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/CrossSectionKernels.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/CrossSectionKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef CROSSSECTIONKERNELS_H #define CROSSSECTIONKERNELS_H 1 @@ -13,7 +13,7 @@ //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -96,7 +96,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating the calculation of event statistics on a GPU device class CrossSectionKernelDevice : public CrossSectionKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/CudaRuntime.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/CudaRuntime.h deleted file mode 100644 index 64ce52f4b3..0000000000 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/CudaRuntime.h +++ /dev/null @@ -1,85 +0,0 @@ -// Copyright (C) 2020-2023 CERN and UCLouvain. -// Licensed under the GNU Lesser General Public License (version 3 or later). -// Created by: S. Roiser (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. - -#ifndef MG5AMC_CUDARUNTIME_H -#define MG5AMC_CUDARUNTIME_H 1 - -// MG5AMC on GPU uses the CUDA runtime API, not the lower level CUDA driver API -// See https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html#driver-vs-runtime-api - -#include -#include - -//-------------------------------------------------------------------------- - -// See https://stackoverflow.com/a/14038590 -#ifdef __CUDACC__ /* clang-format off */ -#define checkCuda( code ) { assertCuda( code, __FILE__, __LINE__ ); } -inline void assertCuda( cudaError_t code, const char* file, int line, bool abort = true ) -{ - if( code != cudaSuccess ) - { - printf( "ERROR! 
assertCuda: '%s' (%d) in %s:%d\n", cudaGetErrorString( code ), code, file, line ); - if( abort ) assert( code == cudaSuccess ); - } -} -#endif /* clang-format on */ - -//-------------------------------------------------------------------------- - -#ifdef __CUDACC__ -namespace mg5amcGpu -{ - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - // *** FIXME! This will all need to be designed differently when going to multi-GPU nodes! *** - struct CudaRuntime final - { - CudaRuntime( const bool debug = true ) - : m_debug( debug ) { setUp( m_debug ); } - ~CudaRuntime() { tearDown( m_debug ); } - CudaRuntime( const CudaRuntime& ) = delete; - CudaRuntime( CudaRuntime&& ) = delete; - CudaRuntime& operator=( const CudaRuntime& ) = delete; - CudaRuntime& operator=( CudaRuntime&& ) = delete; - bool m_debug; - - // Set up CUDA application - // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** - // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization - static void setUp( const bool debug = true ) - { - // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization - // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! - /* - // [We initially added cudaFree(0) to "ease profile analysis" only because it shows up as a big recognizable block!] - // No explicit initialization is needed: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#initialization - // It is not clear what cudaFree(0) does at all: https://stackoverflow.com/questions/69967813/ - if ( debug ) std::cout << "__CudaRuntime: calling cudaFree(0)" << std::endl; - checkCuda( cudaFree( 0 ) ); // SLOW! - */ - // Replace cudaFree(0) by cudaSetDevice(0), even if it is not really needed either - // (but see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs) - if( debug ) std::cout << "__CudaRuntime: calling cudaSetDevice(0)" << std::endl; - checkCuda( cudaSetDevice( 0 ) ); // SLOW! - } - - // Tear down CUDA application (call cudaDeviceReset) - // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** - // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck - // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking - static void tearDown( const bool debug = true ) - { - if( debug ) std::cout << "__CudaRuntime: calling cudaDeviceReset()" << std::endl; - checkCuda( cudaDeviceReset() ); - } - }; - -} -#endif - -//-------------------------------------------------------------------------- - -#endif // MG5AMC_CUDARUNTIME_H diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/CurandRandomNumberKernel.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/CurandRandomNumberKernel.cc index eb56333b03..08a16f6f2c 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/CurandRandomNumberKernel.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/CurandRandomNumberKernel.cc @@ -1,9 +1,9 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. 
Valassi (2021-2023) for the MG5aMC CUDACPP plugin. -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" @@ -22,7 +22,7 @@ inline void assertCurand( curandStatus_t code, const char *file, int line, bool } #endif /* clang-format on */ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -36,7 +36,7 @@ namespace mg5amcCpu { if( m_isOnDevice ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !m_rnarray.isOnDevice() ) throw std::runtime_error( "CurandRandomNumberKernel on device with a host random number array" ); #else @@ -114,7 +114,7 @@ namespace mg5amcCpu /* printf( "\nCurandRandomNumberKernel::generateRnarray size = %d\n", (int)m_rnarray.size() ); fptype* data = m_rnarray.data(); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( m_rnarray.isOnDevice() ) { data = new fptype[m_rnarray.size()](); @@ -123,7 +123,7 @@ namespace mg5amcCpu #endif for( int i = 0; i < ( (int)m_rnarray.size() / 4 ); i++ ) printf( "[%4d] %f %f %f %f\n", i * 4, data[i * 4], data[i * 4 + 2], data[i * 4 + 2], data[i * 4 + 3] ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( m_rnarray.isOnDevice() ) delete[] data; #endif */ diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/EventStatistics.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/EventStatistics.h index 48b51e0a49..b425a5bade 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/EventStatistics.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/EventStatistics.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef EventStatistics_H #define EventStatistics_H 1 @@ -16,7 +16,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MadgraphTest.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/MadgraphTest.h index ef40624c88..a64c05c26a 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MadgraphTest.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MadgraphTest.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Dec 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #ifndef MADGRAPHTEST_H_ #define MADGRAPHTEST_H_ 1 @@ -22,7 +22,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; @@ -201,7 +201,7 @@ class MadgraphTest : public testing::TestWithParam // Since we link both the CPU-only and GPU tests into the same executable, we prevent // a multiply defined symbol by only compiling this in the non-CUDA phase: -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL /// Compare momenta and matrix elements. 
/// This uses an implementation of TestDriverBase to run a madgraph workflow, @@ -307,6 +307,6 @@ TEST_P( MadgraphTest, CompareMomentaAndME ) } } -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL #endif /* MADGRAPHTEST_H_ */ diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.cc index 74b5239ebf..81699dfea9 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.cc @@ -1,12 +1,12 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "MatrixElementKernels.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" // Includes the abstraction for Nvidia/AMD compilation #include "MemoryAccessMomenta.h" #include "MemoryBuffers.h" @@ -14,7 +14,7 @@ //============================================================================ -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu { @@ -150,7 +150,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { @@ -209,13 +209,13 @@ namespace mg5amcGpu PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); DeviceBufferHelicityMask devIsGoodHel( ncomb ); // ... 0d1. Compute good helicity mask on the device - computeDependentCouplings<<>>( m_gs.data(), m_couplings.data() ); + gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - sigmaKin_getGoodHel<<>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); + gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); #else - sigmaKin_getGoodHel<<>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); + gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); #endif - checkCuda( cudaPeekAtLastError() ); + checkGpu( gpuPeekAtLastError() ); // ... 0d2. Copy back good helicity mask to the host copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); // ... 0d3. 
Copy back good helicity list to constant memory on the device @@ -226,19 +226,19 @@ namespace mg5amcGpu void MatrixElementKernelDevice::computeMatrixElements( const unsigned int channelId ) { - computeDependentCouplings<<>>( m_gs.data(), m_couplings.data() ); + gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); #ifndef MGONGPU_NSIGHT_DEBUG constexpr unsigned int sharedMemSize = 0; #else constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - sigmaKin<<>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); + gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); #else - sigmaKin<<>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); + gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); #endif - checkCuda( cudaPeekAtLastError() ); - checkCuda( cudaDeviceSynchronize() ); + checkGpu( gpuPeekAtLastError() ); + checkGpu( gpuDeviceSynchronize() ); } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.h index 23e84757a2..72bd8f195b 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
#ifndef MATRIXELEMENTKERNELS_H #define MATRIXELEMENTKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -81,7 +81,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a CPU host class MatrixElementKernelHost final : public MatrixElementKernelBase, public NumberOfEvents { @@ -130,7 +130,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a GPU device class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessAmplitudes.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessAmplitudes.h index 573b3bbbc9..ffb76e93de 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessAmplitudes.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessAmplitudes.h @@ -15,7 +15,7 @@ #define MGONGPU_TRIVIAL_AMPLITUDES 1 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessCouplings.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessCouplings.h index 35a3af42e0..3afdf3e554 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessCouplings.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessCouplings.h @@ -15,7 +15,7 @@ #include "MemoryBuffers.h" // for HostBufferCouplings::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessCouplingsFixed.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessCouplingsFixed.h index dc0d93afff..ffcdf4dbef 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessCouplingsFixed.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessCouplingsFixed.h @@ -14,7 +14,7 @@ //#include "MemoryAccessHelpers.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessDenominators.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessDenominators.h index 3bce635718..66f2d32a6b 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessDenominators.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessDenominators.h @@ -10,7 +10,7 @@ #include "MemoryAccessGs.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessGs.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessGs.h index 31311aa375..4c726b30f3 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessGs.h +++ 
b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessGs.h @@ -13,7 +13,7 @@ #include "MemoryBuffers.h" // for HostBufferMatrixElements::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessHelpers.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessHelpers.h index c82a6c7635..db73e4e064 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessHelpers.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessHelpers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessHelpers_H #define MemoryAccessHelpers_H 1 @@ -105,7 +105,7 @@ class KernelAccessHelper : public MemoryAccessHelper } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid //printf( "kernelAccessRecord: ievt=%d threadId=%d\n", ievt, threadIdx.x ); return T::ieventAccessRecord( buffer, ievt ); // NB fptype and fptype_sv coincide for CUDA diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessMatrixElements.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessMatrixElements.h index f32e6fea5b..3741011971 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessMatrixElements.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessMatrixElements.h @@ -13,7 +13,7 @@ #include "MemoryBuffers.h" // for HostBufferMatrixElements::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessMomenta.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessMomenta.h index 29266de32c..86df5d5471 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessMomenta.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessMomenta.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#ifndef MemoryAccessMomenta_H #define MemoryAccessMomenta_H 1 @@ -13,7 +13,7 @@ #include "MemoryAccessVectors.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -27,10 +27,9 @@ namespace mg5amcCpu class MemoryAccessMomentaBase //_AOSOAv1 { public: - // Number of Events Per Page in the momenta AOSOA memory buffer layout // (these are all best kept as a compile-time constants: see issue #23) -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ // ----------------------------------------------------------------------------------------------- // --- GPUs: neppM is best set to a power of 2 times the number of fptype's in a 32-byte cacheline // --- This is relevant to ensure coalesced access to momenta in global memory diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessNumerators.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessNumerators.h index b152183b28..18991f4fa6 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessNumerators.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessNumerators.h @@ -10,7 +10,7 @@ #include "MemoryAccessGs.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessRandomNumbers.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessRandomNumbers.h index e2988d39f3..40cb089135 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessRandomNumbers.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessRandomNumbers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessRandomNumbers_H #define MemoryAccessRandomNumbers_H 1 @@ -11,7 +11,7 @@ #include "CPPProcess.h" #include "MemoryAccessHelpers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessVectors.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessVectors.h index e9b197368e..08faccff0f 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessVectors.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessVectors.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
#ifndef MemoryAccessVectors_H #define MemoryAccessVectors_H 1 @@ -10,7 +10,7 @@ #include "mgOnGpuVectors.h" -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu // this is only needed for CPU SIMD vectorization { diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessWavefunctions.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessWavefunctions.h index 5428aaf933..33bef4559e 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessWavefunctions.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessWavefunctions.h @@ -15,7 +15,7 @@ #define MGONGPU_TRIVIAL_WAVEFUNCTIONS 1 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryBuffers.h index 3093e6ed18..7756a71621 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryBuffers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021, based on earlier work by S. Hageboeck) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryBuffers_H #define MemoryBuffers_H 1 @@ -11,12 +11,12 @@ #include "mgOnGpuCxtypes.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "Parameters_sm.h" #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -87,7 +87,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL constexpr bool HostBufferALIGNED = false; // ismisaligned=false constexpr bool HostBufferMISALIGNED = true; // ismisaligned=true @@ -119,7 +119,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer template class PinnedHostBufferBase : public BufferBase @@ -128,18 +128,18 @@ namespace mg5amcCpu PinnedHostBufferBase( const size_t size ) : BufferBase( size, false ) { - checkCuda( cudaMallocHost( &( this->m_data ), this->bytes() ) ); + gpuMallocHost( &( this->m_data ), this->bytes() ); } virtual ~PinnedHostBufferBase() { - checkCuda( cudaFreeHost( this->m_data ) ); + gpuFreeHost( this->m_data ); } }; #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer template class DeviceBufferBase : public BufferBase @@ -148,18 +148,18 @@ namespace mg5amcCpu DeviceBufferBase( const size_t size ) : BufferBase( size, true ) { - checkCuda( cudaMalloc( &( this->m_data ), this->bytes() ) ); + gpuMalloc( &( this->m_data ), this->bytes() ); } virtual ~DeviceBufferBase() { - checkCuda( cudaFree( this->m_data ) ); + gpuFree( this->m_data ); } }; #endif //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for a given number of events template class HostBuffer 
: public HostBufferBase, virtual private NumberOfEvents @@ -175,7 +175,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer for a given number of events template class PinnedHostBuffer : public PinnedHostBufferBase, virtual private NumberOfEvents @@ -191,7 +191,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer for a given number of events template class DeviceBuffer : public DeviceBufferBase, virtual private NumberOfEvents @@ -213,7 +213,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta random numbers constexpr size_t sizePerEventRndNumMomenta = MemoryBuffers::np4 * MemoryBuffers::nparf; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta random numbers typedef HostBuffer HostBufferRndNumMomenta; #else @@ -232,7 +232,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer with ONE fptype per event constexpr size_t sizePerEventOneFp = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer with ONE fptype per event typedef HostBuffer HostBufferOneFp; #else @@ -257,7 +257,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for Gs constexpr size_t sizePerEventGs = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferGs; #else @@ -276,7 +276,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for numerators constexpr size_t sizePerEventNumerators = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferNumerators; #else @@ -296,7 +296,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for denominators constexpr size_t sizePerEventDenominators = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferDenominators; #else @@ -315,7 +315,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for random numbers constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferCouplings; #else @@ -333,7 +333,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta constexpr size_t sizePerEventMomenta = MemoryBuffers::np4 * MemoryBuffers::npar; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta typedef HostBuffer HostBufferMomenta; //typedef HostBuffer HostBufferMomenta; // TEST MISALIGNMENT! 
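The hunks in this MemoryBuffers.h diff swap every checkCuda( cudaMalloc/cudaMallocHost/cudaFree/cudaFreeHost/cudaMemcpy ) call for gpuMalloc, gpuMallocHost, gpuFree, gpuFreeHost and gpuMemcpy. Those wrappers are defined in the new GpuAbstraction.h header, whose body is not reproduced in these hunks; the sketch below is only a minimal guess at their shape, assuming the usual one-to-one mapping between the CUDA and HIP runtime APIs, and assuming the error check is folded into each wrapper, which would explain why the call sites above drop checkCuda.

    #include <cassert>
    #include <cstdio>

    #if defined( __HIPCC__ ) // HIP build: map the gpu* names onto the HIP runtime API
    #include "hip/hip_runtime.h"
    #define gpuError_t hipError_t
    #define gpuSuccess hipSuccess
    #define gpuGetErrorString hipGetErrorString
    #define gpuPeekAtLastError hipPeekAtLastError
    #define gpuDeviceSynchronize hipDeviceSynchronize
    #define gpuMemcpyHostToDevice hipMemcpyHostToDevice
    #define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost
    #else // CUDA build: map the same names onto the CUDA runtime API
    #include <cuda_runtime.h>
    #define gpuError_t cudaError_t
    #define gpuSuccess cudaSuccess
    #define gpuGetErrorString cudaGetErrorString
    #define gpuPeekAtLastError cudaPeekAtLastError
    #define gpuDeviceSynchronize cudaDeviceSynchronize
    #define gpuMemcpyHostToDevice cudaMemcpyHostToDevice
    #define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost
    #endif

    // Same error-check idiom as the assertCuda helper deleted from CudaRuntime.h
    #define checkGpu( code ) { assertGpu( code, __FILE__, __LINE__ ); }
    inline void assertGpu( gpuError_t code, const char* file, int line, bool abort = true )
    {
      if( code != gpuSuccess )
      {
        printf( "ERROR! assertGpu: '%s' (%d) in %s:%d\n", gpuGetErrorString( code ), code, file, line );
        if( abort ) assert( code == gpuSuccess );
      }
    }

    // Memory helpers embed the check, so call sites no longer wrap them in checkCuda
    #if defined( __HIPCC__ )
    #define gpuMallocHost( ptr, bytes ) checkGpu( hipHostMalloc( ptr, bytes ) ) // nb hipHostMalloc, not the deprecated hipMallocHost
    #define gpuMalloc( ptr, bytes ) checkGpu( hipMalloc( ptr, bytes ) )
    #define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) )
    #define gpuFree( ptr ) checkGpu( hipFree( ptr ) )
    #define gpuMemcpy( dst, src, bytes, dir ) checkGpu( hipMemcpy( dst, src, bytes, dir ) )
    #else
    #define gpuMallocHost( ptr, bytes ) checkGpu( cudaMallocHost( ptr, bytes ) )
    #define gpuMalloc( ptr, bytes ) checkGpu( cudaMalloc( ptr, bytes ) )
    #define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) )
    #define gpuFree( ptr ) checkGpu( cudaFree( ptr ) )
    #define gpuMemcpy( dst, src, bytes, dir ) checkGpu( cudaMemcpy( dst, src, bytes, dir ) )
    #endif

Only the names and the call-site behaviour are evidenced by the hunks themselves; the exact definitions in the patch may differ in detail.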
@@ -352,7 +352,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for sampling weights constexpr size_t sizePerEventWeights = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for sampling weights typedef HostBuffer HostBufferWeights; #else @@ -370,7 +370,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for matrix elements constexpr size_t sizePerEventMatrixElements = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for matrix elements typedef HostBuffer HostBufferMatrixElements; #else @@ -385,7 +385,7 @@ namespace mg5amcCpu // A base class encapsulating a memory buffer for the helicity mask typedef BufferBase BufferHelicityMask; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for the helicity mask typedef HostBufferBase HostBufferHelicityMask; #else @@ -403,7 +403,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for wavefunctions constexpr size_t sizePerEventWavefunctions = MemoryBuffers::nw6 * MemoryBuffers::nx2; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for wavefunctions typedef HostBuffer HostBufferWavefunctions; #else @@ -421,7 +421,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity random numbers constexpr size_t sizePerEventRndNumHelicity = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity random numbers typedef HostBuffer HostBufferRndNumHelicity; #else @@ -439,7 +439,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color random numbers constexpr size_t sizePerEventRndNumColor = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color random numbers typedef HostBuffer HostBufferRndNumColor; #else @@ -457,7 +457,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity selection constexpr size_t sizePerEventSelectedHelicity = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity selection typedef HostBuffer HostBufferSelectedHelicity; #else @@ -475,7 +475,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color selection constexpr size_t sizePerEventSelectedColor = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color selection typedef HostBuffer HostBufferSelectedColor; #else @@ -487,7 +487,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy { @@ -504,13 +504,13 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyHostToDevice ); } #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void copyHostFromDevice( Tdst& dst, const Tsrc& src ) // keep the same order 
of arguments as in memcpy { @@ -527,7 +527,7 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyDeviceToHost ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyDeviceToHost ); } #endif diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc index 18052b6676..e167c60e14 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -46,7 +45,7 @@ // Class member functions for calculating the matrix elements for // Process: g g > t t~ WEIGHTED<=2 @1 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -80,7 +79,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -90,7 +89,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -118,13 +117,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -151,7 +150,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -187,7 +186,7 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity < nParity; 
++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings @@ -200,8 +199,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -283,6 +284,8 @@ namespace mg5amcCpu #endif jamp_sv[1] -= amp_sv[0]; +#include "GpuAbstraction.h" + // *** COLOR CHOICE BELOW *** // Store the leading color flows for choice of color if( jamp2_sv ) // disable color choice if nullptr @@ -302,7 +305,7 @@ namespace mg5amcCpu { 16, -2 }, { -2, 16 } }; // 2-D array[2][2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -359,7 +362,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -418,7 +421,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -465,8 +468,8 @@ namespace mg5amcCpu { 1, 1, -1, -1 }, { 1, 1, 1, 1 }, { 1, 1, 1, -1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -506,9 +509,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -544,7 +547,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] 
// [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -609,12 +612,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -635,7 +638,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -761,9 +764,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -787,7 +790,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -807,7 +810,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 256 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -821,9 +824,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -851,7 +857,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -1061,7 +1067,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.h index 3ebd92c038..4a88a07226 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -107,7 +107,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -120,7 +120,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -150,7 +150,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CudaRuntime.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/check_sa.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/check_sa.cc index 3fbf0ffbee..b9a05dea46 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/check_sa.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
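Note on the gpu* memory-copy calls above: the old code wrapped every CUDA runtime call in an explicit check, e.g. checkCuda( cudaMemcpyToSymbol( ... ) ), while the new code calls gpuMemcpyToSymbol( ... ) bare, which suggests that the error check is folded into the abstraction macro itself. The new GpuRuntime.h and GpuAbstraction.h are not shown in these hunks, so the following is only a sketch of such a checked wrapper, assuming a CUDA backend and reusing the error-handling style of the old CudaRuntime.h (the names assertGpu and gpuError_t are assumptions here):

// Hypothetical sketch, NOT the actual GpuAbstraction.h/GpuRuntime.h added by this patch:
// fold the old checkCuda()-style error check into the gpu* abstraction layer.
#include <cassert>
#include <cstdio>
#include <cuda_runtime.h> // CUDA backend assumed in this sketch (a HIP branch would alias hip* instead)
#define gpuError_t cudaError_t
#define gpuSuccess cudaSuccess
#define gpuGetErrorString cudaGetErrorString
#define checkGpu( code ) assertGpu( code, __FILE__, __LINE__ )
inline void assertGpu( gpuError_t code, const char* file, int line, bool abort = true )
{
  if( code != gpuSuccess )
  {
    printf( "ERROR! assertGpu: '%s' (%d) in %s:%d\n", gpuGetErrorString( code ), code, file, line );
    if( abort ) assert( code == gpuSuccess );
  }
}
// A checked alias like the one used in the hunks above could then be defined as e.g.
// #define gpuMemcpyToSymbol( dst, src, bytes ) checkGpu( cudaMemcpyToSymbol( dst, src, bytes ) )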
//========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -65,7 +66,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -96,7 +97,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -134,9 +135,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784) + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) +#elif defined __HIPCC__ +#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random #elif defined __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU if build has curand + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #endif @@ -146,10 +149,10 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) bool bridge = false; @@ -177,7 +180,7 @@ main( int argc, char** argv ) else if( arg == "--curdev" ) { #ifndef __CUDACC__ - throw std::runtime_error( "CurandDevice is not supported on CPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" ); #elif defined MGONGPU_HAS_NO_CURAND throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" ); #else @@ -198,7 +201,7 @@ main( int argc, char** argv ) } else if( arg == "--rmbdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -272,13 +275,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? 
// [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -296,14 +299,14 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL - // --- 00. Initialise cuda - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - const std::string cdinKey = "00 CudaInit"; + // --- 00. Initialise GPU + // Instantiate a GpuRuntime at the beginning of the application's main. + // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. + const std::string cdinKey = "00 GpuInit"; timermap.start( cdinKey ); - CudaRuntime cudaRuntime( debug ); + GpuRuntime GpuRuntime( debug ); #endif // --- 0a. Initialise physics process @@ -325,7 +328,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -333,7 +336,7 @@ #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -341,7 +344,7 @@ #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -349,7 +352,7 @@ #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -366,7 +369,7 @@ } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -375,7 +378,7 @@ // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -384,7 +387,7 @@ // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -392,7 +395,7 @@ #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -400,7 +403,7 @@ #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -438,7 +441,7 @@ const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR!
CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } @@ -450,7 +453,7 @@ } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -461,7 +464,7 @@ std::unique_ptr<MatrixElementKernelBase> pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -469,7 +472,7 @@ } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -511,7 +514,7 @@ prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -543,7 +546,7 @@ prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -588,7 +591,7 @@ dynamic_cast<BridgeKernelBase*>( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -617,7 +620,7 @@ wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -758,8 +761,10 @@ rndgentxt = "CURAND HOST"; else if( rndgen == RandomNumberMode::CurandDevice ) rndgentxt = "CURAND DEVICE"; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif @@ -767,13 +772,15 @@ // Workflow description summary std::string wrkflwtxt; // -- CUDA or C++? -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; #endif // -- DOUBLE or FLOAT? -#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT /* clang-format off */ wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) #elif defined MGONGPU_FPTYPE_DOUBLE wrkflwtxt += "DBL+"; @@ -783,7 +790,7 @@ wrkflwtxt += "???+"; // no path to this statement #endif // -- CUCOMPLEX or THRUST or STD complex numbers?
-#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #if defined MGONGPU_CUCXTYPE_CUCOMPLEX wrkflwtxt += "CUX:"; #elif defined MGONGPU_CUCXTYPE_THRUST @@ -792,6 +799,12 @@ main( int argc, char** argv ) wrkflwtxt += "CXS:"; #else wrkflwtxt += "???:"; // no path to this statement +#endif /* clang-format on */ +#elif defined __HIPCC__ +#if defined MGONGPU_CUCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement #endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX @@ -818,7 +831,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -874,7 +887,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -893,8 +906,10 @@ main( int argc, char** argv ) #endif // Dump all configuration parameters and all results std::cout << std::string( SEP79, '*' ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -921,21 +936,22 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? " == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -966,7 +982,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1062,14 +1078,15 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1077,7 +1094,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? 
" == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/RamboSamplingKernels.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/RamboSamplingKernels.cc index da68aa9255..79abbcc4f8 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/RamboSamplingKernels.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/RamboSamplingKernels.cc @@ -1,11 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "RamboSamplingKernels.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessRandomNumbers.h" #include "MemoryAccessWeights.h" @@ -14,7 +14,7 @@ #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -92,7 +92,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingKernelDevice::RamboSamplingKernelDevice( const fptype energy, // input: energy const BufferRndNumMomenta& rndmom, // input: random numbers in [0,1] BufferMomenta& momenta, // output: momenta @@ -135,7 +135,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaInitialDevice( const fptype energy, fptype* momenta ) @@ -147,17 +147,17 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaInitial() { - getMomentaInitialDevice<<>>( m_energy, m_momenta.data() ); + gpuLaunchKernel( getMomentaInitialDevice, m_gpublocks, m_gputhreads, m_energy, m_momenta.data() ); } #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaFinalDevice( const fptype energy, const fptype* rndmom, @@ -171,11 +171,11 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaFinal() { - getMomentaFinalDevice<<>>( m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); + gpuLaunchKernel( getMomentaFinalDevice, m_gpublocks, m_gputhreads, m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); } #endif diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/RamboSamplingKernels.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/RamboSamplingKernels.h index 184089efd7..7c214cd74b 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/RamboSamplingKernels.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/RamboSamplingKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#ifndef RAMBOSAMPLINGKERNELS_H #define RAMBOSAMPLINGKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating RAMBO phase space sampling on a GPU device class RamboSamplingKernelDevice final : public SamplingKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/RandomNumberKernels.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/RandomNumberKernels.h index 188a72c2c9..21d63beeac 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/RandomNumberKernels.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/RandomNumberKernels.h @@ -1,14 +1,14 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef RANDOMNUMBERKERNELS_H #define RANDOMNUMBERKERNELS_H 1 #include "mgOnGpuConfig.h" -// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when __CUDACC__ is not defined +// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when MGONGPUCPP_GPUIMPL is not defined #ifndef MGONGPU_HAS_NO_CURAND //#include "curand.h" struct curandGenerator_st; // forward definition from curand.h @@ -16,7 +16,7 @@ struct curandGenerator_st; // forward definition from curand.h #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk index 509307506b..24f2d49d80 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ # Copyright (C) 2020-2023 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +# Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts @@ -42,10 +42,10 @@ endif #------------------------------------------------------------------------------- -#=== Configure common compiler flags for C++ and CUDA +#=== Configure common compiler flags for C++ and CUDA/HIP INCFLAGS = -I. 
-OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here +OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here # Dependency on src directory MG5AMC_COMMONLIB = mg5amc_common @@ -121,24 +121,46 @@ endif #------------------------------------------------------------------------------- -#=== Configure the CUDA compiler +#=== Configure the GPU compiler (CUDA or HIP) -# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) -# This is because it is impossible to pass this to "CUFLAGS += -ccbin <host-compiler>" below +# FIXME! (AV 24.01.2024) +# In the current implementation (without separate builds for C++ and CUDA/HIP), we first check for nvcc and hipcc in CUDA_HOME and HIP_HOME. +# If CUDA_HOME or HIP_HOME are not set, try to determine them from the path to nvcc and hipcc. +# While convoluted, this is currently necessary to allow disabling CUDA/HIP builds by setting CUDA_HOME or HIP_HOME to invalid paths. +# This will (probably?) be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775). + +# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA and HIP builds (issue #505) +# This is because it is impossible to pass this to "GPUFLAGS += -ccbin <host-compiler>" below ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache <host-compiler>" from outside - $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") + $(warning CUDA and HIP builds are not supported for multi-word CXX "$(CXX)") override CUDA_HOME=disabled + override HIP_HOME=disabled endif -# If CUDA_HOME is not set, try to set it from the location of nvcc +# If CUDA_HOME is not set, try to set it from the path to nvcc ifndef CUDA_HOME CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null)) $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") endif -# Set NVCC as $(CUDA_HOME)/bin/nvcc if it exists +# If HIP_HOME is not set, try to set it from the path to hipcc +ifndef HIP_HOME + HIP_HOME = $(patsubst %bin/hipcc,%,$(HIP_COMPILER_PATH)) + $(warning HIP_HOME was not set: using "$(HIP_HOME)") +endif + +# FIXME! (AV 24.01.2024) +# In the current implementation (without separate builds for C++ and CUDA/HIP), +# builds are performed for HIP only if CUDA is not found in the path. +# If both CUDA and HIP are installed, HIP builds can be triggered by unsetting CUDA_HOME. +# This will be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775). + +#--- Option 1: CUDA exists -> use CUDA + +# Set GPUCC as $(CUDA_HOME)/bin/nvcc if it exists ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) - NVCC = $(CUDA_HOME)/bin/nvcc + + GPUCC = $(CUDA_HOME)/bin/nvcc USE_NVTX ?=-DUSE_NVTX # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ @@ -158,41 +180,77 @@ ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here!
endif CUOPTFLAGS = -lineinfo - CUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math - ###CUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow - ###NVCC_VERSION = $(shell $(NVCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) - CUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h + GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + ###GPUCC_VERSION = $(shell $(GPUCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) + GPUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + CUBUILDRULEFLAGS = -Xcompiler -fPIC -c + CCBUILDRULEFLAGS = -Xcompiler -fPIC -c -x cu + CUDATESTFLAGS = -lcuda + + # Set the host C++ compiler for GPUCC via "-ccbin <host-compiler>" + # (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) + GPUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) + + # Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) + ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) + GPUFLAGS += -allow-unsupported-compiler + endif + else ifneq ($(origin REQUIRE_CUDA),undefined) + # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) - $(error No cuda installation found (set CUDA_HOME or make nvcc visible in PATH)) + $(error No cuda installation found (set CUDA_HOME or make GPUCC visible in PATH)) + +#--- Option 2: CUDA does not exist, HIP exists -> use HIP + +# Set GPUCC as $(HIP_HOME)/bin/hipcc if it exists +else ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),) + + GPUCC = $(HIP_HOME)/bin/hipcc + #USE_NVTX ?=-DUSE_NVTX # should maybe find something equivalent to this in HIP? + HIPARCHFLAGS = -target x86_64-linux-gnu --offload-arch=gfx90a + HIPINC = -I$(HIP_HOME)/include/ + # Note: -DHIP_FAST_MATH is equivalent to -use_fast_math in HIP + # (but only for single precision; see line 208 of https://rocm-developer-tools.github.io/HIP/hcc__detail_2math__functions_8h_source.html) + GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -DHIP_FAST_MATH -DHIP_PLATFORM=amd -fPIC + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + GPUFLAGS += -std=c++17 + ###GPUFLAGS+= --maxrregcount 255 # (AV: is this option valid on HIP and meaningful on AMD GPUs?)
+ CUBUILDRULEFLAGS = -fPIC -c + CCBUILDRULEFLAGS = -fPIC -c + +else ifneq ($(origin REQUIRE_HIP),undefined) + + # If REQUIRE_HIP is set but no HIP is found, stop here (e.g. for CI tests on GPU #443) + $(error No hip installation found (set HIP_HOME or make GPUCC visible in PATH)) + +#--- Option 3: CUDA does not exist, HIP does not exist -> switch off both CUDA and HIP + else - # No cuda. Switch cuda compilation off and go to common random numbers in C++ + + # No nvcc and no hipcc: switch CUDA and HIP compilation off and go to common random numbers in C++ $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda) + $(warning HIP_HOME is not set or is invalid: export HIP_HOME to compile with hip) + override GPUCC= override USE_NVTX= override CUINC= override CURANDLIBFLAGS= -endif -export NVCC -export CUFLAGS -# Set the host C++ compiler for nvcc via "-ccbin <host-compiler>" -# (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) -CUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) -# Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) -ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) -CUFLAGS += -allow-unsupported-compiler endif +# Export GPUCC (so that it can also be used in cudacpp_src.mk?) +export GPUCC +export GPUFLAGS + #------------------------------------------------------------------------------- -#=== Configure ccache for C++ and CUDA builds +#=== Configure ccache for C++ and CUDA/HIP builds # Enable ccache if USECCACHE=1 ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) @@ -201,15 +259,15 @@ endif #ifeq ($(USECCACHE)$(shell echo $(AR) | grep ccache),1) # override AR:=ccache $(AR) #endif -ifneq ($(NVCC),) - ifeq ($(USECCACHE)$(shell echo $(NVCC) | grep ccache),1) - override NVCC:=ccache $(NVCC) +ifneq ($(GPUCC),) - ifeq ($(USECCACHE)$(shell echo $(GPUCC) | grep ccache),1) + override GPUCC:=ccache $(GPUCC) endif endif #------------------------------------------------------------------------------- -#=== Configure PowerPC-specific compiler flags for C++ and CUDA +#=== Configure PowerPC-specific compiler flags for C++ and CUDA/HIP # PowerPC-specific CXX compiler flags (being reviewed) ifeq ($(UNAME_P),ppc64le) @@ -225,9 +283,9 @@ else ######CXXFLAGS+= -fno-semantic-interposition # no benefit (neither alone, nor combined with -flto) endif -# PowerPC-specific CUDA compiler flags (to be reviewed!) +# PowerPC-specific CUDA/HIP compiler flags (to be reviewed!)
ifeq ($(UNAME_P),ppc64le) - CUFLAGS+= -Xcompiler -mno-float128 + GPUFLAGS+= -Xcompiler -mno-float128 endif #------------------------------------------------------------------------------- @@ -237,7 +295,7 @@ endif # Set the default OMPFLAGS choice ifneq ($(shell $(CXX) --version | egrep '^Intel'),) override OMPFLAGS = -fopenmp -###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without nvcc but not ok with nvcc before #578) +###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without GPUCC but not ok with GPUCC before #578) else ifneq ($(shell $(CXX) --version | egrep '^(clang)'),) override OMPFLAGS = -fopenmp ###override OMPFLAGS = # disable OpenMP MT on clang (was not ok without or with nvcc before #578) @@ -293,7 +351,10 @@ endif # Set the default RNDGEN (random number generator) choice ifeq ($(RNDGEN),) - ifeq ($(NVCC),) + ifeq ($(GPUCC),) + override RNDGEN = hasNoCurand + # Edgecase for HIP compilation + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) override RNDGEN = hasNoCurand else ifeq ($(RNDGEN),) override RNDGEN = hasCurand @@ -310,7 +371,7 @@ export OMPFLAGS #------------------------------------------------------------------------------- -#=== Set the CUDA/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN +#=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN # Set the build flags appropriate to OMPFLAGS $(info OMPFLAGS=$(OMPFLAGS)) @@ -368,13 +429,13 @@ CXXFLAGS+= $(AVXFLAGS) $(info FPTYPE=$(FPTYPE)) ifeq ($(FPTYPE),d) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE - CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE else ifeq ($(FPTYPE),f) CXXFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT - CUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT else ifeq ($(FPTYPE),m) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT - CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT else $(error Unknown FPTYPE='$(FPTYPE)': only 'd', 'f' and 'm' are supported) endif @@ -383,7 +444,7 @@ endif $(info HELINL=$(HELINL)) ifeq ($(HELINL),1) CXXFLAGS += -DMGONGPU_INLINE_HELAMPS - CUFLAGS += -DMGONGPU_INLINE_HELAMPS + GPUFLAGS += -DMGONGPU_INLINE_HELAMPS else ifneq ($(HELINL),0) $(error Unknown HELINL='$(HELINL)': only '0' and '1' are supported) endif @@ -392,7 +453,7 @@ endif $(info HRDCOD=$(HRDCOD)) ifeq ($(HRDCOD),1) CXXFLAGS += -DMGONGPU_HARDCODE_PARAM - CUFLAGS += -DMGONGPU_HARDCODE_PARAM + GPUFLAGS += -DMGONGPU_HARDCODE_PARAM else ifneq ($(HRDCOD),0) $(error Unknown HRDCOD='$(HRDCOD)': only '0' and '1' are supported) endif @@ -444,11 +505,11 @@ ifeq ($(UNAME_S),Darwin) override CULIBFLAGSRPATH2 = else # RPATH to cuda/cpp libs when linking executables - override CXXLIBFLAGSRPATH = -Wl,-rpath,$(LIBDIRRPATH) - override CULIBFLAGSRPATH = -Xlinker -rpath,$(LIBDIRRPATH) + override CXXLIBFLAGSRPATH = -Wl,-rpath=$(LIBDIRRPATH) + override CULIBFLAGSRPATH = -Xlinker -rpath=$(LIBDIRRPATH) # RPATH to common lib when linking cuda/cpp libs - override CXXLIBFLAGSRPATH2 = -Wl,-rpath,'$$ORIGIN' - override CULIBFLAGSRPATH2 = -Xlinker -rpath,'$$ORIGIN' + override CXXLIBFLAGSRPATH2 = -Wl,-rpath='$$ORIGIN' + override CULIBFLAGSRPATH2 = -Xlinker -rpath='$$ORIGIN' endif # Setting LD_LIBRARY_PATH or DYLD_LIBRARY_PATH in the RUNTIME is no longer necessary 
(neither on Linux nor on Mac) @@ -461,7 +522,7 @@ override RUNTIME = cxx_main=$(BUILDDIR)/check.exe fcxx_main=$(BUILDDIR)/fcheck.exe -ifneq ($(NVCC),) +ifneq ($(GPUCC),) cu_main=$(BUILDDIR)/gcheck.exe fcu_main=$(BUILDDIR)/fgcheck.exe else @@ -492,15 +553,16 @@ $(BUILDDIR)/.build.$(TAG): @touch $(BUILDDIR)/.build.$(TAG) # Generic target and build rules: objects from CUDA compilation -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/%.o : %.cu *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CUBUILDRULEFLAGS) $< -o $@ $(BUILDDIR)/%_cu.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CCBUILDRULEFLAGS) $< -o $@ endif +# -x cu in line above # Generic target and build rules: objects from C++ compilation # (NB do not include CUINC here! add it only for NVTX or curand #679) @@ -509,11 +571,14 @@ $(BUILDDIR)/%.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) $(CXX) $(CPPFLAGS) $(CXXFLAGS) -fPIC -c $< -o $@ # Apply special build flags only to CrossSectionKernel.cc and gCrossSectionKernel.cu (no fast math, see #117 and #516) +# Added edgecase for HIP compilation ifeq ($(shell $(CXX) --version | grep ^nvc++),) -$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS := $(filter-out -ffast-math,$(CXXFLAGS)) $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math -ifneq ($(NVCC),) -$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -fno-fast-math +$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math +ifeq ($(findstring nvcc,$(GPUCC)),nvcc) + $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -fno-fast-math +else + $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -fno-fast-math endif endif @@ -530,10 +595,10 @@ ifeq ($(RNDGEN),hasCurand) $(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CUINC) endif -# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in nvcc with icx2023 (#592) +# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in GPUCC with icx2023 (#592) ifneq ($(shell $(CXX) --version | egrep '^(Intel)'),) -ifneq ($(NVCC),) -CUFLAGS += -Xcompiler -Wno-deprecated-builtins +ifneq ($(GPUCC),) +GPUFLAGS += -Wno-deprecated-builtins endif endif @@ -541,8 +606,8 @@ endif # This patch does remove the warning, but I prefer to keep it disabled for the moment... 
###ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang|Intel)'),) ###$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -Wno-overriding-t-option -###ifneq ($(NVCC),) -###$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -Wno-overriding-t-option +###ifneq ($(GPUCC),) +###$(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -Wno-overriding-t-option ###endif ###endif @@ -569,7 +634,7 @@ MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp cxx_objects_lib=$(BUILDDIR)/CPPProcess.o $(BUILDDIR)/MatrixElementKernels.o $(BUILDDIR)/BridgeKernels.o $(BUILDDIR)/CrossSectionKernels.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel.o $(BUILDDIR)/RamboSamplingKernels.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) MG5AMC_CULIB = mg5amc_$(processid_short)_cuda cu_objects_lib=$(BUILDDIR)/gCPPProcess.o $(BUILDDIR)/gMatrixElementKernels.o $(BUILDDIR)/gBridgeKernels.o $(BUILDDIR)/gCrossSectionKernels.o cu_objects_exe=$(BUILDDIR)/gCommonRandomNumberKernel.o $(BUILDDIR)/gRamboSamplingKernels.o @@ -581,11 +646,11 @@ $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: cxx_objects_lib += $(BUILDDIR)/fbridge.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(CXX) -shared -o $@ $(cxx_objects_lib) $(CXXLIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: cu_objects_lib += $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cu_objects_lib) - $(NVCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) endif #------------------------------------------------------------------------------- @@ -602,16 +667,16 @@ $(cxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PAT $(cxx_main): $(BUILDDIR)/check_sa.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CXX) -o $@ $(BUILDDIR)/check_sa.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CURANDLIBFLAGS) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(cu_main): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(cu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(cu_main): $(BUILDDIR)/gcheck_sa.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o - $(NVCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) + $(GPUCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) endif 
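Throughout the source-file hunks in this patch, the compile-time switch moves from __CUDACC__ (defined only by nvcc) to MGONGPUCPP_GPUIMPL, a plugin-defined macro covering both CUDA and HIP, matching the single GPUCC compiler variable introduced in the makefile above. The macro is presumably defined in mgOnGpuConfig.h along the lines of the following sketch (an assumption based on how the macro is used, not the actual hunk):

// Hypothetical sketch of the backend switch in mgOnGpuConfig.h: one macro,
// MGONGPUCPP_GPUIMPL, is set when compiling with either device compiler, so a
// single "#ifdef MGONGPUCPP_GPUIMPL" replaces the older CUDA-only "#ifdef __CUDACC__".
#if defined __CUDACC__ || defined __HIPCC__
#define MGONGPUCPP_GPUIMPL 1
#endif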
#------------------------------------------------------------------------------- @@ -637,17 +702,17 @@ $(fcxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PA $(fcxx_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') endif ifeq ($(UNAME_S),Darwin) $(fcu_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(fcu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(fcu_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) - $(NVCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) + $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) endif #------------------------------------------------------------------------------- @@ -659,7 +724,7 @@ $(BUILDDIR)/testxxx.o: testxxx_cc_ref.txt $(testmain): $(BUILDDIR)/testxxx.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testxxx.o # Comment out this line to skip the C++ test of xxx functions -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testxxx_cu.o: $(GTESTLIBS) $(BUILDDIR)/testxxx_cu.o: INCFLAGS += $(GTESTINC) $(BUILDDIR)/testxxx_cu.o: testxxx_cc_ref.txt @@ -672,7 +737,7 @@ $(BUILDDIR)/testmisc.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testmisc.o # Comment out this line to skip the C++ miscellaneous tests -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testmisc_cu.o: $(GTESTLIBS) $(BUILDDIR)/testmisc_cu.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc_cu.o @@ -684,12 +749,12 @@ $(BUILDDIR)/runTest.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/runTest.o $(testmain): cxx_objects_exe += $(BUILDDIR)/runTest.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/runTest_cu.o: $(GTESTLIBS) $(BUILDDIR)/runTest_cu.o: INCFLAGS += $(GTESTINC) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(testmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif @@ -713,14 +778,14 @@ $(testmain): LIBFLAGS += -lgomp endif endif -ifeq 
($(NVCC),) # link only runTest.o +ifeq ($(GPUCC),) # link only runTest.o $(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS) $(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS) else # link both runTest.o and runTest_cu.o $(testmain): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) $(GTESTLIBS) - $(NVCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) -lcuda + $(GPUCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS) endif # Use target gtestlibs to build only googletest @@ -829,9 +894,9 @@ ifeq ($(USECCACHE),1) ccache --version | head -1 endif @echo "" - @echo NVCC=$(NVCC) -ifneq ($(NVCC),) - $(NVCC) --version + @echo GPUCC=$(GPUCC) +ifneq ($(GPUCC),) + $(GPUCC) --version endif @echo "" @echo CXX=$(CXX) @@ -850,7 +915,7 @@ endif # Target: check (run the C++ test executable) # [NB THIS IS WHAT IS USED IN THE GITHUB CI!] -ifneq ($(NVCC),) +ifneq ($(GPUCC),) check: runTest cmpFcheck cmpFGcheck else check: runTest cmpFcheck diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/fbridge.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/fbridge.cc index 2d2b36d560..22ce3f5115 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/fbridge.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/fbridge.cc @@ -1,11 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Oct 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "Bridge.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" extern "C" { @@ -22,7 +22,7 @@ extern "C" * Using the same Fortran MadEvent code, linking to the hetrerogeneous library would allow access to both CPU and GPU implementations. * The specific heterogeneous configuration (how many GPUs, how many threads on each CPU, etc) could be loaded in CUDA/C++ from a data file. 
*/ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -46,8 +46,8 @@ */ void fbridgecreate_( CppObjectInFortran** ppbridge, const int* pnevtF, const int* pnparF, const int* pnp4F ) { -#ifdef __CUDACC__ - CudaRuntime::setUp(); +#ifdef MGONGPUCPP_GPUIMPL + GpuRuntime::setUp(); #endif // (NB: CPPProcess::initProc no longer needs to be executed here because it is called in the Bridge constructor) // FIXME: disable OMP in Bridge when called from Fortran @@ -65,8 +65,8 @@ extern "C" */ Bridge<FORTRANFPTYPE>* pbridge = dynamic_cast<Bridge<FORTRANFPTYPE>*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgedelete_: invalid Bridge address" ); delete pbridge; -#ifdef __CUDACC__ - CudaRuntime::tearDown(); +#ifdef MGONGPUCPP_GPUIMPL + GpuRuntime::tearDown(); #endif } @@ -96,7 +96,7 @@ extern "C" { Bridge<FORTRANFPTYPE>* pbridge = dynamic_cast<Bridge<FORTRANFPTYPE>*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgesequence_: invalid Bridge address" ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Use the device/GPU implementation in the CUDA library // (there is also a host implementation in this library) pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, *pchannelId, mes, selhel, selcol ); diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/fsampler.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/fsampler.cc index 2fb445372d..3743934f41 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/fsampler.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/fsampler.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Feb 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "mgOnGpuConfig.h" @@ -13,7 +13,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -40,7 +40,7 @@ namespace mg5amcCpu private: const int m_nevt; // The number of events in each iteration int m_iiter; // The iteration counter (for random number seeding) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta m_hstRndmom; // Memory buffers for random numbers HostBufferMomenta m_hstMomenta; // Memory buffers for momenta HostBufferWeights m_hstWeights; // Memory buffers for sampling weights @@ -105,7 +105,7 @@ namespace mg5amcCpu extern "C" { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/runTest.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/runTest.cc index d4a760a71b..de327f2321 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/runTest.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/runTest.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
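The fbridge.cc hunks above pair GpuRuntime::setUp() in fbridgecreate_ with GpuRuntime::tearDown() in fbridgedelete_, and check_sa.cc instantiates a GpuRuntime object on the stack instead of the old CudaRuntime. The new GpuRuntime.h is not shown in these hunks; a minimal sketch of the RAII pattern it presumably implements (the method names are taken from the call sites, the bodies are assumptions) is:

// Hypothetical sketch, NOT the actual GpuRuntime.h added by this patch:
// an RAII wrapper tying GPU initialisation and reset to object lifetime, so
// that C++ main programs and the Fortran bridge share the same setUp/tearDown.
#include "GpuAbstraction.h" // assumed to provide checkGpu, gpuSetDevice, gpuDeviceReset aliases
struct GpuRuntime final
{
  GpuRuntime( const bool debug = true ) : m_debug( debug ) { setUp( m_debug ); }
  ~GpuRuntime() { tearDown( m_debug ); }
  GpuRuntime( const GpuRuntime& ) = delete;
  GpuRuntime& operator=( const GpuRuntime& ) = delete;
  // For CUDA: choose device #0 up front (see the check_sa.cc comment above)
  static void setUp( const bool /*debug*/ = true ) { checkGpu( gpuSetDevice( 0 ) ); }
  // Reset the device on exit: needed by cuda-memcheck --leak-check full (see runTest.cc below)
  static void tearDown( const bool /*debug*/ = true ) { checkGpu( gpuDeviceReset() ); }
  const bool m_debug;
};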
//---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -18,7 +18,7 @@ #include "RandomNumberKernels.h" #include "epoch_process_id.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -35,7 +35,7 @@ struct CUDA_CPU_TestBase : public TestDriverBase : TestDriverBase( npar, refFileName ) {} }; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL struct CPUTest : public CUDA_CPU_TestBase { // Struct data members (process, and memory structures for random numbers, momenta, matrix elements and weights on host and device) @@ -119,7 +119,7 @@ struct CPUTest : public CUDA_CPU_TestBase }; #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL struct CUDATest : public CUDA_CPU_TestBase { // Reset the device when our test goes out of scope. Note that this should happen after @@ -128,7 +128,7 @@ struct CUDATest : public CUDA_CPU_TestBase { ~DeviceReset() { - checkCuda( cudaDeviceReset() ); // this is needed by cuda-memcheck --leak-check full + checkGpu( gpuDeviceReset() ); // this is needed by cuda-memcheck --leak-check full } } deviceResetter; @@ -256,7 +256,7 @@ INSTANTIATE_TEST_SUITE_P( prefix, \ test_suite_name, \ testing::Values( new CUDATest( MG_EPOCH_REFERENCE_FILE_NAME ) ) ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL MG_INSTANTIATE_TEST_SUITE_GPU( XTESTID_GPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); #else MG_INSTANTIATE_TEST_SUITE_CPU( XTESTID_CPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/testmisc.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/testmisc.cc index 895d6eeb56..ba9e59a8a3 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/testmisc.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/testmisc.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*misc to run only testmisc.cc tests //---------------------------------------------------------------------------- @@ -17,7 +17,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_MISC #else #define TESTID( s ) s##_CPU_MISC @@ -26,7 +26,7 @@ #define XTESTID( s ) TESTID( s ) // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -59,7 +59,7 @@ namespace mg5amcCpu TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testmisc ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/testxxx.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/testxxx.cc index 3361fe5aa9..e5167de00c 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/testxxx.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/testxxx.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. 
Valassi (Apr 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -24,7 +24,7 @@ #include #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_XXX #else #define TESTID( s ) s##_CPU_XXX @@ -32,7 +32,7 @@ #define XTESTID( s ) TESTID( s ) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -42,7 +42,7 @@ namespace mg5amcCpu int FPEhandlerIevt = -1; inline void FPEhandler( int sig ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL std::cerr << "Floating Point Exception (GPU): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; #else std::cerr << "Floating Point Exception (CPU neppV=" << neppV << "): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; @@ -53,7 +53,7 @@ namespace mg5amcCpu TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -77,7 +77,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) assert( nevt % neppM == 0 ); // nevt must be a multiple of neppM assert( nevt % neppV == 0 ); // nevt must be a multiple of neppV // Fill in the input momenta -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL mg5amcGpu::PinnedHostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] #else mg5amcCpu::HostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] @@ -322,7 +322,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { for( int ievt = 0; ievt < nevt; ievt++ ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_tt.mad/src/HelAmps_sm.h b/epochX/cudacpp/gg_tt.mad/src/HelAmps_sm.h index 55f43bb43a..add8fce575 100644 --- a/epochX/cudacpp/gg_tt.mad/src/HelAmps_sm.h +++ b/epochX/cudacpp/gg_tt.mad/src/HelAmps_sm.h @@ -5,7 +5,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -28,7 +28,7 @@ //#include //#include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.mad/src/Parameters_sm.cc b/epochX/cudacpp/gg_tt.mad/src/Parameters_sm.cc index a9bc93ff98..c5dd6e7e4c 100644 --- a/epochX/cudacpp/gg_tt.mad/src/Parameters_sm.cc +++ b/epochX/cudacpp/gg_tt.mad/src/Parameters_sm.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. 
Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -17,7 +17,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_tt.mad/src/Parameters_sm.h b/epochX/cudacpp/gg_tt.mad/src/Parameters_sm.h index 932f123fea..06fc44c44c 100644 --- a/epochX/cudacpp/gg_tt.mad/src/Parameters_sm.h +++ b/epochX/cudacpp/gg_tt.mad/src/Parameters_sm.h @@ -27,7 +27,7 @@ #include "read_slha.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu #include // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -217,7 +217,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -236,7 +236,7 @@ namespace mg5amcCpu #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wunused-variable" // e.g. <> #pragma GCC diagnostic ignored "-Wunused-parameter" // e.g. <> -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma nv_diagnostic push #pragma nv_diag_suppress 177 // e.g. <> #endif @@ -263,7 +263,7 @@ namespace mg5amcCpu // End SM implementation - no special handling of vectors of floats as in EFT (#439) return out; } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma GCC diagnostic pop #pragma nv_diagnostic pop #endif @@ -279,33 +279,39 @@ namespace mg5amcCpu //========================================================================== +#ifdef MGONGPUCPP_GPUIMPL + namespace mg5amcGpu +#else + namespace mg5amcCpu +#endif + { #pragma GCC diagnostic push #ifndef __clang__ #pragma GCC diagnostic ignored "-Wunused-but-set-variable" // e.g. <> #endif - // Compute the output couplings (e.g. gc10 and gc11) from the input gs - template - __device__ inline void - G2COUP( const fptype gs[], - fptype couplings[] ) - { - mgDebug( 0, __FUNCTION__ ); - using namespace Parameters_sm_dependentCouplings; - const fptype_sv& gs_sv = G_ACCESS::kernelAccessConst( gs ); - DependentCouplings_sv couplings_sv = computeDependentCouplings_fromG( gs_sv ); - fptype* GC_10s = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_10 ); - fptype* GC_11s = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_11 ); - cxtype_sv_ref GC_10s_sv = C_ACCESS::kernelAccess( GC_10s ); - cxtype_sv_ref GC_11s_sv = C_ACCESS::kernelAccess( GC_11s ); - GC_10s_sv = couplings_sv.GC_10; - GC_11s_sv = couplings_sv.GC_11; - mgDebug( 1, __FUNCTION__ ); - return; - } + // Compute the output couplings (e.g. 
gc10 and gc11) from the input gs + template<class G_ACCESS, class C_ACCESS> + __device__ inline void + G2COUP( const fptype gs[], + fptype couplings[] ) + { + mgDebug( 0, __FUNCTION__ ); + using namespace Parameters_sm_dependentCouplings; + const fptype_sv& gs_sv = G_ACCESS::kernelAccessConst( gs ); + DependentCouplings_sv couplings_sv = computeDependentCouplings_fromG( gs_sv ); + fptype* GC_10s = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_10 ); + fptype* GC_11s = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_11 ); + cxtype_sv_ref GC_10s_sv = C_ACCESS::kernelAccess( GC_10s ); + cxtype_sv_ref GC_11s_sv = C_ACCESS::kernelAccess( GC_11s ); + GC_10s_sv = couplings_sv.GC_10; + GC_11s_sv = couplings_sv.GC_11; + mgDebug( 1, __FUNCTION__ ); + return; + } #pragma GCC diagnostic pop -} // end namespace mg5amcGpu/mg5amcCpu + } // end namespace mg5amcGpu/mg5amcCpu -//========================================================================== + //========================================================================== #endif // Parameters_sm_H diff --git a/epochX/cudacpp/gg_tt.mad/src/cudacpp_src.mk b/epochX/cudacpp/gg_tt.mad/src/cudacpp_src.mk index d4cc628aec..159e19a46d 100644 --- a/epochX/cudacpp/gg_tt.mad/src/cudacpp_src.mk +++ b/epochX/cudacpp/gg_tt.mad/src/cudacpp_src.mk @@ -19,7 +19,7 @@ SHELL := /bin/bash #=== Configure common compiler flags for CUDA and C++ INCFLAGS = -I. -OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here +OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here #------------------------------------------------------------------------------- @@ -45,13 +45,13 @@ endif #------------------------------------------------------------------------------- -#=== Configure the CUDA compiler (note: NVCC is already exported including ccache) +#=== Configure the CUDA compiler (note: GPUCC is already exported including ccache) -###$(info NVCC=$(NVCC)) +###$(info GPUCC=$(GPUCC)) #------------------------------------------------------------------------------- -#=== Configure ccache for C++ builds (note: NVCC is already exported including ccache) +#=== Configure ccache for C++ builds (note: GPUCC is already exported including ccache) # Enable ccache if USECCACHE=1 ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) @@ -92,6 +92,13 @@ endif ###$(info OMPFLAGS=$(OMPFLAGS)) CXXFLAGS += $(OMPFLAGS) +# Add the correct -fPIC and compile-only flags when compiling with nvcc (CUDA) or hipcc (HIP) +ifeq ($(findstring nvcc,$(GPUCC)),nvcc) + GPUFLAGS += -Xcompiler -fPIC -c -x cu +else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) + GPUFLAGS += -fPIC -c +endif + # Set the build flags appropriate to each AVX choice (example: "make AVX=none") # [NB MGONGPU_PVW512 is needed because "-mprefer-vector-width=256" is not exposed in a macro] # [See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=96476] @@ -253,20 +260,20 @@ $(BUILDDIR)/%.o : %.cc *.h $(BUILDDIR)/.build.$(TAG) # Generic target and build rules: objects from CUDA compilation $(BUILDDIR)/%_cu.o : %.cc *.h $(BUILDDIR)/.build.$(TAG) @if [ !
-d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $< -o $@ #------------------------------------------------------------------------------- cxx_objects=$(addprefix $(BUILDDIR)/, Parameters_sm.o read_slha.o) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) cu_objects=$(addprefix $(BUILDDIR)/, Parameters_sm_cu.o) endif # Target (and build rules): common (src) library -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) $(cu_objects) @if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi - $(NVCC) -shared -o $@ $(cxx_objects) $(cu_objects) $(LDFLAGS) + $(GPUCC) -shared -o $@ $(cxx_objects) $(cu_objects) $(LDFLAGS) else $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) @if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi diff --git a/epochX/cudacpp/gg_tt.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_tt.mad/src/mgOnGpuConfig.h index 80032e528b..d9af210552 100644 --- a/epochX/cudacpp/gg_tt.mad/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/gg_tt.mad/src/mgOnGpuConfig.h @@ -1,21 +1,37 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUCONFIG_H #define MGONGPUCONFIG_H 1 +#include "GpuRuntime.h" // Includes the GPU abstraction + // HARDCODED AT CODE GENERATION TIME: DO NOT MODIFY (#473) // There are two different code bases for standalone_cudacpp (without multichannel) and madevent+cudacpp (with multichannel) #define MGONGPU_SUPPORTS_MULTICHANNEL 1 +// Is this a GPU (CUDA, HIP) or CPU implementation? +#ifdef __CUDACC__ +#define MGONGPUCPP_GPUIMPL cuda +#elif defined __HIPCC__ +#define MGONGPUCPP_GPUIMPL hip +#include "hip/hip_runtime.h" +#else +#undef MGONGPUCPP_GPUIMPL +#endif + // ** NB1 Throughputs (e.g. 6.8E8) are events/sec for "./gcheck.exe -p 65536 128 12" // ** NB2 Baseline on b7g47n0004 fluctuates (probably depends on load on other VMs) // Choose if curand is supported for generating random numbers +// For HIP, by default, do not use curand (common random numbers will be used instead) // For both CUDA and C++, by default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_CURAND -// (there exist CUDA installations, e.g. using the HPC package, which do not include curand - see PR #784) +// (there exist CUDA installations, e.g. using the HPC package, which do not include curand - see PR #784 and #785) +#if defined __HIPCC__ +#define MGONGPU_HAS_NO_CURAND 1 +#else //#ifdef __CUDACC__ //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 @@ -23,6 +39,7 @@ //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 //#endif +#endif // Choose floating point precision (for everything but color algebra #537) // If one of these macros has been set from outside with e.g. 
-DMGONGPU_FPTYPE_FLOAT, nothing happens (issue #167) @@ -54,23 +71,28 @@ //#undef MGONGPU_HARDCODE_PARAM // default ////#define MGONGPU_HARDCODE_PARAM 1 -// Complex type in c++: std::complex or cxsmpl (CHOOSE ONLY ONE) -#ifndef __CUDACC__ -//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) -#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) -#endif - -// Complex type in cuda: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) +// Complex type in CUDA: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) #ifdef __CUDACC__ #define MGONGPU_CUCXTYPE_THRUST 1 // default (~1.15E9/double, ~3.2E9/float) //#define MGONGPU_CUCXTYPE_CUCOMPLEX 1 // ~10 percent slower (1.03E9/double, ~2.8E9/float) //#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) + +// Complex type in HIP: cxsmpl (ONLY ONE OPTION POSSIBLE) +#elif defined __HIPCC__ +#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) + +// Complex type in C++: std::complex or cxsmpl (CHOOSE ONLY ONE) +#else +//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) +#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif -// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ -#undef MGONGPU_NSIGHT_DEBUG // default +#undef MGONGPU_NSIGHT_DEBUG // default in CUDA //#define MGONGPU_NSIGHT_DEBUG 1 +#else +#undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif // SANITY CHECKS (floating point precision for everything but color algebra #537) @@ -86,17 +108,21 @@ #error You cannot use double precision for color algebra and single precision elsewhere #endif -// SANITY CHECKS (c++ complex number implementation) -#ifndef __CUDACC__ -#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL +// SANITY CHECKS (CUDA complex number implementation) +#ifdef __CUDACC__ +#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX for CUDA +#elif defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CXSMPL for CUDA +#elif defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE OF MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL for CUDA #endif #endif -// SANITY CHECKS (cuda complex number implementation) -#ifdef __CUDACC__ -#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL +// SANITY CHECKS (C++ complex number implementation) +#ifndef MGONGPUCPP_GPUIMPL +#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL for C++ #endif #endif @@ -134,7 +160,7 @@ namespace mgOnGpu // Alignment requirement for using reinterpret_cast with SIMD vectorized code // (using reinterpret_cast with non 
aligned memory may lead to segmentation faults!) // Only needed for C++ code but can be enforced also in NVCC builds of C++ code using CUDA>=11.2 and C++17 (#318, #319, #333) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL constexpr int cppAlign = 64; // alignment requirement for SIMD vectorization (64-byte i.e. 512-bit) #endif @@ -145,7 +171,7 @@ using mgOnGpu::fptype; using mgOnGpu::fptype2; // C++ SIMD vectorization width (this will be used to set neppV) -#ifdef __CUDACC__ // CUDA implementation has no SIMD +#ifdef MGONGPUCPP_GPUIMPL // CUDA and HIP implementations have no SIMD #undef MGONGPU_CPPSIMD #elif defined __AVX512VL__ && defined MGONGPU_PVW512 // C++ "512z" AVX512 with 512 width (512-bit ie 64-byte): 8 (DOUBLE) or 16 (FLOAT) #ifdef MGONGPU_FPTYPE_DOUBLE @@ -175,9 +201,9 @@ using mgOnGpu::fptype2; #undef MGONGPU_CPPSIMD #endif -// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation // Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) -#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */ +#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */ #define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; #define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } #define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } @@ -189,8 +215,8 @@ using mgOnGpu::fptype2; #define mgDebugFinalise() { /*noop*/ } #endif /* clang-format on */ -// Define empty CUDA declaration specifiers for C++ -#ifndef __CUDACC__ +// Define empty CUDA/HIP declaration specifiers for C++ +#ifndef MGONGPUCPP_GPUIMPL #define __global__ #define __host__ #define __device__ diff --git a/epochX/cudacpp/gg_tt.mad/src/mgOnGpuCxtypes.h b/epochX/cudacpp/gg_tt.mad/src/mgOnGpuCxtypes.h index ca9a9f00c0..5532e22fa1 100644 --- a/epochX/cudacpp/gg_tt.mad/src/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/gg_tt.mad/src/mgOnGpuCxtypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022, based on earlier work by D. Smith) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
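Taken together, the mgOnGpuConfig.h hunks above reduce every backend question to one macro: MGONGPUCPP_GPUIMPL is defined under nvcc (__CUDACC__) and hipcc (__HIPCC__) and undefined otherwise, and in the hunks shown here only its definedness is ever tested. A minimal sketch of the resulting consumer-side idiom (illustrative names, not taken from this patch):

// One source file, three backends (CUDA, HIP, plain C++)
#include "mgOnGpuConfig.h" // defines or undefines MGONGPUCPP_GPUIMPL
#ifdef MGONGPUCPP_GPUIMPL
namespace mg5amcGpu // GPU build (CUDA or HIP)
#else
namespace mg5amcCpu // CPU build (scalar or SIMD C++)
#endif
{
  // __host__ and __device__ are real qualifiers on GPU builds and empty macros
  // on C++ builds (see the "Define empty CUDA/HIP declaration specifiers" hunk above)
  __host__ __device__ inline fptype fpsum( const fptype a, const fptype b ) { return a + b; }
}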
#ifndef MGONGPUCXTYPES_H #define MGONGPUCXTYPES_H 1 @@ -19,7 +19,7 @@ #include // Complex type in cuda: thrust or cucomplex or cxsmpl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #if defined MGONGPU_CUCXTYPE_THRUST #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wtautological-compare" // for icpx2021/clang13 (https://stackoverflow.com/a/15864661) @@ -82,7 +82,7 @@ namespace mgOnGpu /* clang-format off */ using mgOnGpu::cxsmpl; // Printout to stream for user defined types -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -92,7 +92,7 @@ namespace mg5amcCpu inline __host__ std::ostream& operator<<( std::ostream& out, const cxsmpl& c ) { - out << std::complex( c.real(), c.imag() ); + out << std::complex( c.real(), c.imag() ); return out; } @@ -215,14 +215,14 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu #endif { // --- Type definitions (complex type: cxtype) -#ifdef __CUDACC__ // cuda +#ifdef MGONGPUCPP_GPUIMPL // cuda #if defined MGONGPU_CUCXTYPE_THRUST typedef thrust::complex cxtype; #elif defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -255,7 +255,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -307,7 +307,7 @@ namespace mg5amcCpu //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust +#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust //------------------------------ // CUDA - using thrust::complex @@ -343,11 +343,11 @@ namespace mg5amcCpu return c; } -#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST +#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex +#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex //------------------------------ // CUDA - using cuComplex @@ -562,11 +562,11 @@ namespace mg5amcCpu return cxmake( c.real(), c.imag() ); } -#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX //========================================================================== -#if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex +#if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex //------------------------------ // C++ - using std::complex @@ -610,7 +610,7 @@ namespace mg5amcCpu } #endif -#endif // #if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX +#endif // #if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX //========================================================================== @@ -633,7 +633,7 @@ namespace mg5amcCpu //========================================================================== // 
NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.mad/src/mgOnGpuFptypes.h b/epochX/cudacpp/gg_tt.mad/src/mgOnGpuFptypes.h index 905c97d700..83a46c1d4e 100644 --- a/epochX/cudacpp/gg_tt.mad/src/mgOnGpuFptypes.h +++ b/epochX/cudacpp/gg_tt.mad/src/mgOnGpuFptypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUFPTYPES_H #define MGONGPUFPTYPES_H 1 @@ -20,7 +20,7 @@ namespace mg5amcCpu { //========================================================================== -#ifdef __CUDACC__ // cuda +#ifdef MGONGPUCPP_GPUIMPL // cuda //------------------------------ // Floating point types - Cuda @@ -64,11 +64,11 @@ namespace mg5amcCpu #endif } -#endif // #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //------------------------------ // Floating point types - C++ @@ -92,7 +92,7 @@ namespace mg5amcCpu return std::sqrt( f ); } -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== diff --git a/epochX/cudacpp/gg_tt.mad/src/mgOnGpuVectors.h b/epochX/cudacpp/gg_tt.mad/src/mgOnGpuVectors.h index e1299ba81e..dd8b83752d 100644 --- a/epochX/cudacpp/gg_tt.mad/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/gg_tt.mad/src/mgOnGpuVectors.h @@ -9,6 +9,8 @@ #include "mgOnGpuCxtypes.h" #include "mgOnGpuFptypes.h" +#include "GpuAbstraction.h" + #include //========================================================================== @@ -32,7 +34,7 @@ #endif // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -131,7 +133,7 @@ namespace mg5amcCpu #endif #endif -#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef __CUDACC__) +#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef MGONGPUCPP_GPUIMPL) const int neppV = 1; @@ -153,13 +155,13 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu #endif { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Printout to stream for user defined types @@ -805,11 +807,11 @@ namespace mg5amcCpu #endif // #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //------------------------------ // Vector types - CUDA @@ -853,12 +855,12 @@ namespace mg5amcCpu return mask; } -#endif // #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL 
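The mgOnGpuFptypes.h and mgOnGpuVectors.h hunks above only rename the preprocessor guards; underneath sits a simple per-backend math dispatch. A sketch of the idea for an fpsqrt-style helper, whose C++ branch (std::sqrt) is quoted above; the GPU branch here is an illustration, not a quote from the file:

// Backend- and precision-aware sqrt in the style of mgOnGpuFptypes.h
#ifdef MGONGPUCPP_GPUIMPL
inline __host__ __device__ fptype fpsqrt( const fptype& f )
{
#if defined MGONGPU_FPTYPE_FLOAT
  return sqrtf( f ); // single-precision CUDA/HIP math function
#else
  return sqrt( f ); // double-precision CUDA/HIP math function
#endif
}
#else
inline fptype fpsqrt( const fptype& f )
{
  return std::sqrt( f ); // std::sqrt picks the float or double overload
}
#endif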
//========================================================================== // Scalar-or-vector types: scalar in CUDA, vector or scalar in C++ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL typedef bool bool_sv; typedef fptype fptype_sv; typedef fptype2 fptype2_sv; @@ -879,7 +881,7 @@ namespace mg5amcCpu #endif // Scalar-or-vector zeros: scalar in CUDA, vector or scalar in C++ -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ inline __host__ __device__ cxtype cxzero_sv(){ return cxtype( 0, 0 ); } #elif defined MGONGPU_CPPSIMD inline cxtype_v cxzero_sv() { return cxtype_v(); } // RRRR=0000 IIII=0000 diff --git a/epochX/cudacpp/gg_tt.mad/src/rambo.h b/epochX/cudacpp/gg_tt.mad/src/rambo.h index e02ea52496..cd7e1008ea 100644 --- a/epochX/cudacpp/gg_tt.mad/src/rambo.h +++ b/epochX/cudacpp/gg_tt.mad/src/rambo.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -18,7 +18,7 @@ #include // Simplified rambo version for 2 to N (with N>=2) processes with massless particles -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +83,7 @@ namespace mg5amcCpu static bool first = true; if( first ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if constexpr( M_ACCESS::isOnDevice() ) // avoid { const int ievt0 = 0; @@ -166,7 +166,7 @@ namespace mg5amcCpu wt = po2log; if( nparf != 2 ) wt = ( 2. * nparf - 4. ) * log( energy ) + z[nparf - 1]; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // issue warnings if weight is too small or too large static int iwarn[5] = { 0, 0, 0, 0, 0 }; if( wt < -180. ) From d4200cf4eaa9b21ba9ec7df1cccf24a03efbd5f8 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Wed, 24 Jan 2024 11:21:06 +0100 Subject: [PATCH 24/96] Revert "[jt774] regenerate gg_tt.mad - the build fails" This reverts commit def02b58e369c76e9f3b63b1991ca1ec8e148107. 
--- .../gg_tt.mad/CODEGEN_mad_gg_tt_log.txt | 22 +- epochX/cudacpp/gg_tt.mad/COPYRIGHT | 1 - .../cudacpp/gg_tt.mad/SubProcesses/Bridge.h | 32 +-- .../gg_tt.mad/SubProcesses/BridgeKernels.cc | 9 +- .../gg_tt.mad/SubProcesses/BridgeKernels.h | 8 +- .../SubProcesses/CommonRandomNumberKernel.cc | 5 +- .../SubProcesses/CrossSectionKernels.cc | 7 +- .../SubProcesses/CrossSectionKernels.h | 6 +- .../gg_tt.mad/SubProcesses/CudaRuntime.h | 85 +++++++ .../SubProcesses/CurandRandomNumberKernel.cc | 12 +- .../gg_tt.mad/SubProcesses/EventStatistics.h | 4 +- .../gg_tt.mad/SubProcesses/MadgraphTest.h | 8 +- .../SubProcesses/MatrixElementKernels.cc | 26 +- .../SubProcesses/MatrixElementKernels.h | 8 +- .../SubProcesses/MemoryAccessAmplitudes.h | 2 +- .../SubProcesses/MemoryAccessCouplings.h | 2 +- .../SubProcesses/MemoryAccessCouplingsFixed.h | 2 +- .../SubProcesses/MemoryAccessDenominators.h | 2 +- .../gg_tt.mad/SubProcesses/MemoryAccessGs.h | 2 +- .../SubProcesses/MemoryAccessHelpers.h | 4 +- .../SubProcesses/MemoryAccessMatrixElements.h | 2 +- .../SubProcesses/MemoryAccessMomenta.h | 7 +- .../SubProcesses/MemoryAccessNumerators.h | 2 +- .../SubProcesses/MemoryAccessRandomNumbers.h | 4 +- .../SubProcesses/MemoryAccessVectors.h | 4 +- .../SubProcesses/MemoryAccessWavefunctions.h | 2 +- .../gg_tt.mad/SubProcesses/MemoryBuffers.h | 64 ++--- .../SubProcesses/P1_gg_ttx/CPPProcess.cc | 64 +++-- .../SubProcesses/P1_gg_ttx/CPPProcess.h | 10 +- .../SubProcesses/P1_gg_ttx/CudaRuntime.h | 1 + .../SubProcesses/P1_gg_ttx/check_sa.cc | 117 ++++----- .../SubProcesses/RamboSamplingKernels.cc | 20 +- .../SubProcesses/RamboSamplingKernels.h | 6 +- .../SubProcesses/RandomNumberKernels.h | 6 +- .../cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk | 233 +++++++----------- .../cudacpp/gg_tt.mad/SubProcesses/fbridge.cc | 16 +- .../gg_tt.mad/SubProcesses/fsampler.cc | 8 +- .../cudacpp/gg_tt.mad/SubProcesses/runTest.cc | 12 +- .../gg_tt.mad/SubProcesses/testmisc.cc | 8 +- .../cudacpp/gg_tt.mad/SubProcesses/testxxx.cc | 14 +- epochX/cudacpp/gg_tt.mad/src/HelAmps_sm.h | 4 +- epochX/cudacpp/gg_tt.mad/src/Parameters_sm.cc | 4 +- epochX/cudacpp/gg_tt.mad/src/Parameters_sm.h | 58 ++--- epochX/cudacpp/gg_tt.mad/src/cudacpp_src.mk | 23 +- epochX/cudacpp/gg_tt.mad/src/mgOnGpuConfig.h | 76 ++---- epochX/cudacpp/gg_tt.mad/src/mgOnGpuCxtypes.h | 28 +-- epochX/cudacpp/gg_tt.mad/src/mgOnGpuFptypes.h | 10 +- epochX/cudacpp/gg_tt.mad/src/mgOnGpuVectors.h | 20 +- epochX/cudacpp/gg_tt.mad/src/rambo.h | 8 +- 49 files changed, 516 insertions(+), 562 deletions(-) create mode 100644 epochX/cudacpp/gg_tt.mad/SubProcesses/CudaRuntime.h create mode 120000 epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CudaRuntime.h diff --git a/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt b/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt index b56b36111b..a477013568 100644 --- a/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt +++ b/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt @@ -52,7 +52,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt -Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt +No valid web browser found. 
Please set in ./input/mg5_configuration.txt import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt.mg The import format was not given, so we guess it as command set stdout_level DEBUG @@ -62,7 +62,7 @@ generate g g > t t~ No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005602598190307617  +DEBUG: model prefixing takes 0.005816459655761719  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -175,7 +175,7 @@ INFO: Generating Helas calls for process: g g > t t~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t t~ @1 INFO: Creating files in directory P1_gg_ttx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -191,16 +191,16 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. INFO: Generating Feynman diagrams for Process: g g > t t~ WEIGHTED<=2 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttx Generated helas calls for 1 subprocesses (3 diagrams) in 0.006 s -Wrote files for 10 helas calls in 0.106 s +Wrote files for 10 helas calls in 0.103 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 2 routines in 0.152 s +ALOHA: aloha creates 2 routines in 0.155 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 4 routines in 0.148 s +ALOHA: aloha creates 4 routines in 0.135 s VVV1 FFV1 FFV1 @@ -237,9 +237,9 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. quit -real 0m1.778s -user 0m1.548s -sys 0m0.220s +real 0m1.729s +user 0m1.515s +sys 0m0.204s Code generation completed in 2 seconds ************************************************************ * * @@ -266,7 +266,7 @@ INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amc INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards/me5_configuration.txt Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt -Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt +No valid web browser found. Please set in ./input/mg5_configuration.txt treatcards run quit INFO: @@ -296,7 +296,7 @@ INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amc INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards/me5_configuration.txt Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt -Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt +No valid web browser found. 
Please set in ./input/mg5_configuration.txt treatcards param quit INFO: diff --git a/epochX/cudacpp/gg_tt.mad/COPYRIGHT b/epochX/cudacpp/gg_tt.mad/COPYRIGHT index 84a883fbb0..a134b5fef9 100644 --- a/epochX/cudacpp/gg_tt.mad/COPYRIGHT +++ b/epochX/cudacpp/gg_tt.mad/COPYRIGHT @@ -15,7 +15,6 @@ The full development team currently includes the following authors : Stephan Hageboeck (CERN) Olivier Mattelaer (Universite Catholique de Louvain, original author) Stefan Roiser (CERN, original author) - Joergen Teig (CERN) Andrea Valassi (CERN, original author) Zenny Wettersten (CERN) See https://github.com/madgraph5/madgraph4gpu for more details. For the full diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/Bridge.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/Bridge.h index 89437b4c42..bf8b5e024d 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/Bridge.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef BRIDGE_H #define BRIDGE_H 1 @@ -23,7 +23,7 @@ #include #include -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +83,7 @@ namespace mg5amcCpu Bridge& operator=( const Bridge& ) = delete; Bridge& operator=( Bridge&& ) = delete; -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ /** * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != gpublocks*gputhreads * (this is needed for BridgeKernel tests rather than for actual production use in Fortran) @@ -150,7 +150,7 @@ namespace mg5amcCpu unsigned int m_nevt; // number of events int m_nGoodHel; // the number of good helicities (-1 initially when they have not yet been calculated) -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ int m_gputhreads; // number of gpu threads (default set from number of events, can be modified) int m_gpublocks; // number of gpu blocks (default set from number of events, can be modified) DeviceBuffer m_devMomentaF; @@ -187,12 +187,12 @@ namespace mg5amcCpu // Forward declare transposition methods // -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); -#endif // MGONGPUCPP_GPUIMPL +#endif // __CUDACC__ template void hst_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); @@ -209,7 +209,7 @@ namespace mg5amcCpu Bridge::Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ) : m_nevt( nevtF ) , m_nGoodHel( -1 ) -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ , m_gputhreads( 256 ) // default number of gpu threads , m_gpublocks( m_nevt / m_gputhreads ) // this ensures m_nevt <= m_gpublocks*m_gputhreads , m_devMomentaF( m_nevt ) @@ -233,7 +233,7 @@ namespace mg5amcCpu { if( nparF != CPPProcess::npar ) throw std::runtime_error( "Bridge constructor: npar mismatch" ); if( np4F != CPPProcess::np4 ) throw std::runtime_error( "Bridge constructor: np4 mismatch" ); -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ if( ( m_nevt < s_gputhreadsmin ) || ( m_nevt % s_gputhreadsmin != 0 ) ) throw std::runtime_error( "Bridge constructor: nevt should be a multiple of " + std::to_string( s_gputhreadsmin ) ); while( m_nevt != m_gpublocks * m_gputhreads ) @@ -249,7 
+249,7 @@ namespace mg5amcCpu #else std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); -#endif // MGONGPUCPP_GPUIMPL +#endif // __CUDACC__ // Create a process object, read param card and set parameters // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate is called from several Fortran threads? @@ -262,7 +262,7 @@ namespace mg5amcCpu process.initProc( paramCard ); } -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ template void Bridge::set_gpugrid( const int gpublocks, const int gputhreads ) { @@ -276,7 +276,7 @@ namespace mg5amcCpu } #endif -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ template void Bridge::gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -291,14 +291,14 @@ namespace mg5amcCpu constexpr int neppM = MemoryAccessMomenta::neppM; if constexpr( neppM == 1 && std::is_same_v ) { - gpuMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), gpuMemcpyHostToDevice ); + checkCuda( cudaMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), cudaMemcpyHostToDevice ) ); } else { - gpuMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), gpuMemcpyHostToDevice ); + checkCuda( cudaMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), cudaMemcpyHostToDevice ) ); const int thrPerEvt = CPPProcess::npar * CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 event per thread) //const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... this seems slower - gpuLaunchKernel( dev_transposeMomentaF2C, m_gpublocks * thrPerEvt, m_gputhreads, m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); + dev_transposeMomentaF2C<<>>( m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); } if constexpr( std::is_same_v ) { @@ -341,7 +341,7 @@ namespace mg5amcCpu } #endif -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ template void Bridge::cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -396,7 +396,7 @@ namespace mg5amcCpu // - C++ array: momenta[npagM][npar][np4][neppM] with nevt=npagM*neppM (AOSOA) // -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ) { diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/BridgeKernels.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/BridgeKernels.cc index eaf4037a24..d58066c9c1 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/BridgeKernels.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/BridgeKernels.cc @@ -1,18 +1,17 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
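Note on the Bridge.h hunks above: the revert replaces the portable gpuMemcpy/gpuLaunchKernel spellings with their explicit CUDA forms. The GpuAbstraction.h that provided those wrappers is not shown in this revert, so the following is only a plausible minimal definition (an assumption, not the actual file):

// Hypothetical sketch: hide the CUDA/HIP copy and launch syntax behind macros
#ifdef __CUDACC__
#define gpuMemcpyHostToDevice cudaMemcpyHostToDevice
#define gpuMemcpy( dst, src, bytes, dir ) checkGpu( cudaMemcpy( dst, src, bytes, dir ) )
#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<( blocks ), ( threads )>>>( __VA_ARGS__ )
#elif defined __HIPCC__
#define gpuMemcpyHostToDevice hipMemcpyHostToDevice
#define gpuMemcpy( dst, src, bytes, dir ) checkGpu( hipMemcpy( dst, src, bytes, dir ) )
// hipcc also accepts the <<<...>>> syntax; hipLaunchKernelGGL is the portable alternative
#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<( blocks ), ( threads )>>>( __VA_ARGS__ )
#endif

Under definitions of this kind, the forward patch's gpuLaunchKernel( dev_transposeMomentaF2C, m_gpublocks * thrPerEvt, m_gputhreads, ... ) expands to exactly the dev_transposeMomentaF2C<<<...>>> launch that this revert restores.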
#include "BridgeKernels.h" -#include "GpuAbstraction.h" #include "MemoryAccessMomenta.h" #include //============================================================================ -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -46,7 +45,7 @@ namespace mg5amcCpu //============================================================================ -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ namespace mg5amcCpu { @@ -97,7 +96,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu { diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/BridgeKernels.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/BridgeKernels.h index 3efef8ce97..15eb4bff4d 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/BridgeKernels.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/BridgeKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef BRIDGEKERNELS_H #define BRIDGEKERNELS_H 1 @@ -12,7 +12,7 @@ #include "MatrixElementKernels.h" #include "MemoryBuffers.h" -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -49,7 +49,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A Bridge wrapper class encapsulating matrix element calculations on a CPU host class BridgeKernelHost final : public BridgeKernelBase { @@ -89,7 +89,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ // A Bridge wrapper class encapsulating matrix element calculations on a GPU device class BridgeKernelDevice : public BridgeKernelBase { diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/CommonRandomNumberKernel.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/CommonRandomNumberKernel.cc index 010bc4cbd0..985b39f576 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/CommonRandomNumberKernel.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/CommonRandomNumberKernel.cc @@ -1,16 +1,15 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "CommonRandomNumbers.h" -#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" #include -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/CrossSectionKernels.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/CrossSectionKernels.cc index c15b39844d..0b355a3c8d 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/CrossSectionKernels.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/CrossSectionKernels.cc @@ -1,11 +1,10 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. 
Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "CrossSectionKernels.h" -#include "GpuAbstraction.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessWeights.h" #include "MemoryBuffers.h" @@ -78,7 +77,7 @@ debug_me_is_abnormal( const fptype& me, size_t ievtALL ) //============================================================================ -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -186,7 +185,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu { diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/CrossSectionKernels.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/CrossSectionKernels.h index 4d9659e04e..7933ca4bbf 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/CrossSectionKernels.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/CrossSectionKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef CROSSSECTIONKERNELS_H #define CROSSSECTIONKERNELS_H 1 @@ -13,7 +13,7 @@ //============================================================================ -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -96,7 +96,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- /* -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ // A class encapsulating the calculation of event statistics on a GPU device class CrossSectionKernelDevice : public CrossSectionKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/CudaRuntime.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/CudaRuntime.h new file mode 100644 index 0000000000..64ce52f4b3 --- /dev/null +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/CudaRuntime.h @@ -0,0 +1,85 @@ +// Copyright (C) 2020-2023 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: S. Roiser (Jul 2020) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. + +#ifndef MG5AMC_CUDARUNTIME_H +#define MG5AMC_CUDARUNTIME_H 1 + +// MG5AMC on GPU uses the CUDA runtime API, not the lower level CUDA driver API +// See https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html#driver-vs-runtime-api + +#include +#include + +//-------------------------------------------------------------------------- + +// See https://stackoverflow.com/a/14038590 +#ifdef __CUDACC__ /* clang-format off */ +#define checkCuda( code ) { assertCuda( code, __FILE__, __LINE__ ); } +inline void assertCuda( cudaError_t code, const char* file, int line, bool abort = true ) +{ + if( code != cudaSuccess ) + { + printf( "ERROR! 
assertCuda: '%s' (%d) in %s:%d\n", cudaGetErrorString( code ), code, file, line ); + if( abort ) assert( code == cudaSuccess ); + } +} +#endif /* clang-format on */ + +//-------------------------------------------------------------------------- + +#ifdef __CUDACC__ +namespace mg5amcGpu +{ + // Instantiate a CudaRuntime at the beginning of the application's main to + // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor + // *** FIXME! This will all need to be designed differently when going to multi-GPU nodes! *** + struct CudaRuntime final + { + CudaRuntime( const bool debug = true ) + : m_debug( debug ) { setUp( m_debug ); } + ~CudaRuntime() { tearDown( m_debug ); } + CudaRuntime( const CudaRuntime& ) = delete; + CudaRuntime( CudaRuntime&& ) = delete; + CudaRuntime& operator=( const CudaRuntime& ) = delete; + CudaRuntime& operator=( CudaRuntime&& ) = delete; + bool m_debug; + + // Set up CUDA application + // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** + // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization + static void setUp( const bool debug = true ) + { + // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization + // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! + /* + // [We initially added cudaFree(0) to "ease profile analysis" only because it shows up as a big recognizable block!] + // No explicit initialization is needed: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#initialization + // It is not clear what cudaFree(0) does at all: https://stackoverflow.com/questions/69967813/ + if ( debug ) std::cout << "__CudaRuntime: calling cudaFree(0)" << std::endl; + checkCuda( cudaFree( 0 ) ); // SLOW! + */ + // Replace cudaFree(0) by cudaSetDevice(0), even if it is not really needed either + // (but see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs) + if( debug ) std::cout << "__CudaRuntime: calling cudaSetDevice(0)" << std::endl; + checkCuda( cudaSetDevice( 0 ) ); // SLOW! + } + + // Tear down CUDA application (call cudaDeviceReset) + // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** + // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck + // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking + static void tearDown( const bool debug = true ) + { + if( debug ) std::cout << "__CudaRuntime: calling cudaDeviceReset()" << std::endl; + checkCuda( cudaDeviceReset() ); + } + }; + +} +#endif + +//-------------------------------------------------------------------------- + +#endif // MG5AMC_CUDARUNTIME_H diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/CurandRandomNumberKernel.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/CurandRandomNumberKernel.cc index 08a16f6f2c..eb56333b03 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/CurandRandomNumberKernel.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/CurandRandomNumberKernel.cc @@ -1,9 +1,9 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A.
Valassi (2021-2023) for the MG5aMC CUDACPP plugin. -#include "GpuRuntime.h" +#include "CudaRuntime.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" @@ -22,7 +22,7 @@ inline void assertCurand( curandStatus_t code, const char *file, int line, bool } #endif /* clang-format on */ -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -36,7 +36,7 @@ namespace mg5amcCpu { if( m_isOnDevice ) { -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ if( !m_rnarray.isOnDevice() ) throw std::runtime_error( "CurandRandomNumberKernel on device with a host random number array" ); #else @@ -114,7 +114,7 @@ namespace mg5amcCpu /* printf( "\nCurandRandomNumberKernel::generateRnarray size = %d\n", (int)m_rnarray.size() ); fptype* data = m_rnarray.data(); -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ if( m_rnarray.isOnDevice() ) { data = new fptype[m_rnarray.size()](); @@ -123,7 +123,7 @@ namespace mg5amcCpu #endif for( int i = 0; i < ( (int)m_rnarray.size() / 4 ); i++ ) printf( "[%4d] %f %f %f %f\n", i * 4, data[i * 4], data[i * 4 + 2], data[i * 4 + 2], data[i * 4 + 3] ); -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ if( m_rnarray.isOnDevice() ) delete[] data; #endif */ diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/EventStatistics.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/EventStatistics.h index b425a5bade..48b51e0a49 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/EventStatistics.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/EventStatistics.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef EventStatistics_H #define EventStatistics_H 1 @@ -16,7 +16,7 @@ #include #include -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MadgraphTest.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/MadgraphTest.h index a64c05c26a..ef40624c88 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MadgraphTest.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MadgraphTest.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Dec 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #ifndef MADGRAPHTEST_H_ #define MADGRAPHTEST_H_ 1 @@ -22,7 +22,7 @@ #include #include -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; @@ -201,7 +201,7 @@ class MadgraphTest : public testing::TestWithParam // Since we link both the CPU-only and GPU tests into the same executable, we prevent // a multiply defined symbol by only compiling this in the non-CUDA phase: -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ /// Compare momenta and matrix elements. 
/// This uses an implementation of TestDriverBase to run a madgraph workflow, @@ -307,6 +307,6 @@ TEST_P( MadgraphTest, CompareMomentaAndME ) } } -#endif // MGONGPUCPP_GPUIMPL +#endif // __CUDACC__ #endif /* MADGRAPHTEST_H_ */ diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.cc index 81699dfea9..74b5239ebf 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.cc @@ -1,12 +1,12 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "MatrixElementKernels.h" #include "CPPProcess.h" -#include "GpuRuntime.h" // Includes the abstraction for Nvidia/AMD compilation +#include "CudaRuntime.h" #include "MemoryAccessMomenta.h" #include "MemoryBuffers.h" @@ -14,7 +14,7 @@ //============================================================================ -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ namespace mg5amcCpu { @@ -150,7 +150,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu { @@ -209,13 +209,13 @@ namespace mg5amcGpu PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); DeviceBufferHelicityMask devIsGoodHel( ncomb ); // ... 0d1. Compute good helicity mask on the device - gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); + computeDependentCouplings<<>>( m_gs.data(), m_couplings.data() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel<<>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); #else - gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); + sigmaKin_getGoodHel<<>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); #endif - checkGpu( gpuPeekAtLastError() ); + checkCuda( cudaPeekAtLastError() ); // ... 0d2. Copy back good helicity mask to the host copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); // ... 0d3. 
Copy back good helicity list to constant memory on the device @@ -226,19 +226,19 @@ namespace mg5amcGpu void MatrixElementKernelDevice::computeMatrixElements( const unsigned int channelId ) { - gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); + computeDependentCouplings<<>>( m_gs.data(), m_couplings.data() ); #ifndef MGONGPU_NSIGHT_DEBUG constexpr unsigned int sharedMemSize = 0; #else constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin<<>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); #else - gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); + sigmaKin<<>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); #endif - checkGpu( gpuPeekAtLastError() ); - checkGpu( gpuDeviceSynchronize() ); + checkCuda( cudaPeekAtLastError() ); + checkCuda( cudaDeviceSynchronize() ); } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.h index 72bd8f195b..23e84757a2 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
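For context on the two launch spellings in the hunks above: gpuLaunchKernel and gpuLaunchKernelSharedMem can be realized as thin variadic wrappers over the native CUDA triple-chevron launch. A minimal sketch consistent with the call sites in this patch (the plugin's actual abstraction header may differ in detail):

  #include <cuda_runtime.h>

  // Launch kernel<<<blocks, threads>>>( args... ) with default shared memory and stream
  #define gpuLaunchKernel( kernel, blocks, threads, ... ) \
    kernel<<<( blocks ), ( threads )>>>( __VA_ARGS__ )

  // Same, but with an explicit dynamic shared memory size in bytes
  #define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) \
    kernel<<<( blocks ), ( threads ), ( sharedMem )>>>( __VA_ARGS__ )

Both spellings compile to the same launch on NVidia GPUs; the macro indirection merely leaves room for a HIP back-end to expand to hipLaunchKernelGGL instead.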
#ifndef MATRIXELEMENTKERNELS_H #define MATRIXELEMENTKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -81,7 +81,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating matrix element calculations on a CPU host class MatrixElementKernelHost final : public MatrixElementKernelBase, public NumberOfEvents { @@ -130,7 +130,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ // A class encapsulating matrix element calculations on a GPU device class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessAmplitudes.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessAmplitudes.h index ffb76e93de..573b3bbbc9 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessAmplitudes.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessAmplitudes.h @@ -15,7 +15,7 @@ #define MGONGPU_TRIVIAL_AMPLITUDES 1 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessCouplings.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessCouplings.h index 3afdf3e554..35a3af42e0 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessCouplings.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessCouplings.h @@ -15,7 +15,7 @@ #include "MemoryBuffers.h" // for HostBufferCouplings::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessCouplingsFixed.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessCouplingsFixed.h index ffcdf4dbef..dc0d93afff 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessCouplingsFixed.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessCouplingsFixed.h @@ -14,7 +14,7 @@ //#include "MemoryAccessHelpers.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessDenominators.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessDenominators.h index 66f2d32a6b..3bce635718 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessDenominators.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessDenominators.h @@ -10,7 +10,7 @@ #include "MemoryAccessGs.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessGs.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessGs.h index 4c726b30f3..31311aa375 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessGs.h +++ 
b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessGs.h @@ -13,7 +13,7 @@ #include "MemoryBuffers.h" // for HostBufferMatrixElements::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessHelpers.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessHelpers.h index db73e4e064..c82a6c7635 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessHelpers.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessHelpers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessHelpers_H #define MemoryAccessHelpers_H 1 @@ -105,7 +105,7 @@ class KernelAccessHelper : public MemoryAccessHelper } else { -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid //printf( "kernelAccessRecord: ievt=%d threadId=%d\n", ievt, threadIdx.x ); return T::ieventAccessRecord( buffer, ievt ); // NB fptype and fptype_sv coincide for CUDA diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessMatrixElements.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessMatrixElements.h index 3741011971..f32e6fea5b 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessMatrixElements.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessMatrixElements.h @@ -13,7 +13,7 @@ #include "MemoryBuffers.h" // for HostBufferMatrixElements::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessMomenta.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessMomenta.h index 86df5d5471..29266de32c 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessMomenta.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessMomenta.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
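The '#ifdef __CUDACC__ / namespace mg5amcGpu / #else / namespace mg5amcCpu' preamble restored throughout these headers is the dual-namespace idiom behind the recurring "(see #318 and #725)" note: the same sources are compiled twice, once by nvcc and once as plain C++, and each phase lands in its own namespace, so identically named types with build-specific definitions can be linked into a single executable (as the MadgraphTest.h hunk earlier relies on). A skeletal illustration:

  #ifdef __CUDACC__
  namespace mg5amcGpu
  #else
  namespace mg5amcCpu
  #endif
  {
    // Same spelling in both builds, different definition per compilation phase;
    // callers disambiguate explicitly, e.g. mg5amcGpu::CPPProcess vs mg5amcCpu::CPPProcess.
    class CPPProcess;
  }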
#ifndef MemoryAccessMomenta_H #define MemoryAccessMomenta_H 1 @@ -13,7 +13,7 @@ #include "MemoryAccessVectors.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -27,9 +27,10 @@ namespace mg5amcCpu class MemoryAccessMomentaBase //_AOSOAv1 { public: + // Number of Events Per Page in the momenta AOSOA memory buffer layout // (these are all best kept as a compile-time constants: see issue #23) -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifdef __CUDACC__ /* clang-format off */ // ----------------------------------------------------------------------------------------------- // --- GPUs: neppM is best set to a power of 2 times the number of fptype's in a 32-byte cacheline // --- This is relevant to ensure coalesced access to momenta in global memory diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessNumerators.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessNumerators.h index 18991f4fa6..b152183b28 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessNumerators.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessNumerators.h @@ -10,7 +10,7 @@ #include "MemoryAccessGs.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessRandomNumbers.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessRandomNumbers.h index 40cb089135..e2988d39f3 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessRandomNumbers.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessRandomNumbers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessRandomNumbers_H #define MemoryAccessRandomNumbers_H 1 @@ -11,7 +11,7 @@ #include "CPPProcess.h" #include "MemoryAccessHelpers.h" -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessVectors.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessVectors.h index 08faccff0f..e9b197368e 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessVectors.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessVectors.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
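To make the neppM comments above concrete: the momenta buffer is an AOSOA in which each "page" holds neppM events, and an event index splits into a page number plus an offset within the page. An illustrative index computation, assuming the dimension order AOSOA[npagM][npar][np4][neppM] (a sketch, not the plugin's actual accessor code):

  // Flat offset of component ip4 of particle ipar for event ievt
  inline int momentaIndex( const int ievt, const int ipar, const int ip4,
                           const int neppM, const int npar, const int np4 )
  {
    const int ipagM = ievt / neppM; // AOSOA page containing this event
    const int ieppM = ievt % neppM; // slot of this event within its page
    return ( ( ipagM * npar + ipar ) * np4 + ip4 ) * neppM + ieppM;
  }

Because ieppM runs fastest, each momentum component of the neppM events in a page is stored contiguously: consecutive GPU threads read consecutive fptype's (coalesced global memory access), and on CPUs the same layout maps directly onto SIMD vectors.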
#ifndef MemoryAccessVectors_H #define MemoryAccessVectors_H 1 @@ -10,7 +10,7 @@ #include "mgOnGpuVectors.h" -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ namespace mg5amcCpu // this is only needed for CPU SIMD vectorization { diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessWavefunctions.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessWavefunctions.h index 33bef4559e..5428aaf933 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessWavefunctions.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessWavefunctions.h @@ -15,7 +15,7 @@ #define MGONGPU_TRIVIAL_WAVEFUNCTIONS 1 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryBuffers.h index 7756a71621..3093e6ed18 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryBuffers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021, based on earlier work by S. Hageboeck) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryBuffers_H #define MemoryBuffers_H 1 @@ -11,12 +11,12 @@ #include "mgOnGpuCxtypes.h" #include "CPPProcess.h" -#include "GpuRuntime.h" +#include "CudaRuntime.h" #include "Parameters_sm.h" #include -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -87,7 +87,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ constexpr bool HostBufferALIGNED = false; // ismisaligned=false constexpr bool HostBufferMISALIGNED = true; // ismisaligned=true @@ -119,7 +119,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ // A class encapsulating a CUDA pinned host buffer template class PinnedHostBufferBase : public BufferBase @@ -128,18 +128,18 @@ namespace mg5amcCpu PinnedHostBufferBase( const size_t size ) : BufferBase( size, false ) { - gpuMallocHost( &( this->m_data ), this->bytes() ); + checkCuda( cudaMallocHost( &( this->m_data ), this->bytes() ) ); } virtual ~PinnedHostBufferBase() { - gpuFreeHost( this->m_data ); + checkCuda( cudaFreeHost( this->m_data ) ); } }; #endif //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ // A class encapsulating a CUDA device buffer template class DeviceBufferBase : public BufferBase @@ -148,18 +148,18 @@ namespace mg5amcCpu DeviceBufferBase( const size_t size ) : BufferBase( size, true ) { - gpuMalloc( &( this->m_data ), this->bytes() ); + checkCuda( cudaMalloc( &( this->m_data ), this->bytes() ) ); } virtual ~DeviceBufferBase() { - gpuFree( this->m_data ); + checkCuda( cudaFree( this->m_data ) ); } }; #endif //-------------------------------------------------------------------------- -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for a given number of events template class HostBuffer 
: public HostBufferBase, virtual private NumberOfEvents @@ -175,7 +175,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ // A class encapsulating a CUDA pinned host buffer for a given number of events template class PinnedHostBuffer : public PinnedHostBufferBase, virtual private NumberOfEvents @@ -191,7 +191,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ // A class encapsulating a CUDA device buffer for a given number of events template class DeviceBuffer : public DeviceBufferBase, virtual private NumberOfEvents @@ -213,7 +213,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta random numbers constexpr size_t sizePerEventRndNumMomenta = MemoryBuffers::np4 * MemoryBuffers::nparf; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for momenta random numbers typedef HostBuffer HostBufferRndNumMomenta; #else @@ -232,7 +232,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer with ONE fptype per event constexpr size_t sizePerEventOneFp = 1; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer with ONE fptype per event typedef HostBuffer HostBufferOneFp; #else @@ -257,7 +257,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for Gs constexpr size_t sizePerEventGs = 1; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferGs; #else @@ -276,7 +276,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for numerators constexpr size_t sizePerEventNumerators = 1; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for numerators typedef HostBuffer HostBufferNumerators; #else @@ -296,7 +296,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for denominators constexpr size_t sizePerEventDenominators = 1; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for denominators typedef HostBuffer HostBufferDenominators; #else @@ -315,7 +315,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for couplings constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for couplings typedef HostBuffer HostBufferCouplings; #else @@ -333,7 +333,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta constexpr size_t sizePerEventMomenta = MemoryBuffers::np4 * MemoryBuffers::npar; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for momenta typedef HostBuffer HostBufferMomenta; //typedef HostBuffer HostBufferMomenta; // TEST MISALIGNMENT!
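The checkCuda(...) wrapper reinstated around cudaMallocHost, cudaMalloc and their free counterparts above comes from CudaRuntime.h; by analogy with the assertCurand helper visible at the top of this patch, it is presumably the usual assert-on-error idiom. A self-contained sketch (the assertCuda name and signature are assumptions):

  #include <cassert>
  #include <cstdio>
  #include <cuda_runtime.h>

  #define checkCuda( code ) assertCuda( code, __FILE__, __LINE__ )

  inline void assertCuda( cudaError_t code, const char* file, int line, bool abort = true )
  {
    if( code != cudaSuccess )
    {
      printf( "ERROR! assertCuda: '%s' (%d) in %s:%d\n", cudaGetErrorString( code ), code, file, line );
      if( abort ) assert( code == cudaSuccess );
    }
  }

Wrapping every CUDA runtime call this way turns silently dropped error codes into immediate, localized failures, which matters in particular for the RAII buffer destructors above, where a returned error would otherwise be discarded.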
@@ -352,7 +352,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for sampling weights constexpr size_t sizePerEventWeights = 1; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for sampling weights typedef HostBuffer HostBufferWeights; #else @@ -370,7 +370,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for matrix elements constexpr size_t sizePerEventMatrixElements = 1; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for matrix elements typedef HostBuffer HostBufferMatrixElements; #else @@ -385,7 +385,7 @@ namespace mg5amcCpu // A base class encapsulating a memory buffer for the helicity mask typedef BufferBase BufferHelicityMask; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for the helicity mask typedef HostBufferBase HostBufferHelicityMask; #else @@ -403,7 +403,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for wavefunctions constexpr size_t sizePerEventWavefunctions = MemoryBuffers::nw6 * MemoryBuffers::nx2; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for wavefunctions typedef HostBuffer HostBufferWavefunctions; #else @@ -421,7 +421,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity random numbers constexpr size_t sizePerEventRndNumHelicity = 1; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for helicity random numbers typedef HostBuffer HostBufferRndNumHelicity; #else @@ -439,7 +439,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color random numbers constexpr size_t sizePerEventRndNumColor = 1; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for color random numbers typedef HostBuffer HostBufferRndNumColor; #else @@ -457,7 +457,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity selection constexpr size_t sizePerEventSelectedHelicity = 1; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for helicity selection typedef HostBuffer HostBufferSelectedHelicity; #else @@ -475,7 +475,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color selection constexpr size_t sizePerEventSelectedColor = 1; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // A class encapsulating a C++ host buffer for color selection typedef HostBuffer HostBufferSelectedColor; #else @@ -487,7 +487,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ template void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy { @@ -504,13 +504,13 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is not a pinned host array - gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyHostToDevice ); + checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyHostToDevice ) ); } #endif //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ template void copyHostFromDevice( Tdst& dst, const Tsrc& src ) // keep the same order
of arguments as in memcpy { @@ -527,7 +527,7 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is not a pinned host array - gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyDeviceToHost ); + checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyDeviceToHost ) ); } #endif diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc index e167c60e14..18052b6676 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -16,6 +16,7 @@ #include "mgOnGpuConfig.h" +#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -45,7 +46,7 @@ // Class member functions for calculating the matrix elements for // Process: g g > t t~ WEIGHTED<=2 @1 -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -79,7 +80,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -89,7 +90,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -117,13 +118,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -150,7 +151,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -186,7 +187,7 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity < nParity;
++iParity ) { // START LOOP ON IPARITY -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings @@ -199,10 +200,8 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events -#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop -#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -284,8 +283,6 @@ namespace mg5amcCpu #endif jamp_sv[1] -= amp_sv[0]; -#include "GpuAbstraction.h" - // *** COLOR CHOICE BELOW *** // Store the leading color flows for choice of color if( jamp2_sv ) // disable color choice if nullptr @@ -305,7 +302,7 @@ namespace mg5amcCpu { 16, -2 }, { -2, 16 } }; // 2-D array[2][2] -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -362,7 +359,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -421,7 +418,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -468,8 +465,8 @@ namespace mg5amcCpu { 1, 1, -1, -1 }, { 1, 1, 1, 1 }, { 1, 1, 1, -1 } }; -#ifdef MGONGPUCPP_GPUIMPL - gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); +#ifdef __CUDACC__ + checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -509,9 +506,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef MGONGPUCPP_GPUIMPL - gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); - //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 +#ifdef __CUDACC__ + checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); + //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -547,7 +544,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] + // [Use __NVCC__ instead of __CUDACC__ here!] 
// [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -612,12 +609,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -638,7 +635,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifdef __CUDACC__ /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -764,9 +761,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef MGONGPUCPP_GPUIMPL - gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); - gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); +#ifdef __CUDACC__ + checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); + checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -790,7 +787,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: color selection[nevt] -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -810,7 +807,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 256 }; // assume nprocesses == 1 (#272 and #343) -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ // Remember: in CUDA this is a kernel for one event, in C++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -824,12 +821,9 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines - -#include "GpuAbstraction.h" - // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -857,7 +851,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ +#ifdef __CUDACC__ // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -1067,7 +1061,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ allMEs[ievt] /= helcolDenominators[0]; #ifdef
MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.h index 4a88a07226..3ebd92c038 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -107,7 +107,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -120,7 +120,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifdef __CUDACC__ /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -150,7 +150,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#ifdef __CUDACC__ /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CudaRuntime.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CudaRuntime.h new file mode 120000 index 0000000000..ce9e1a487a --- /dev/null +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CudaRuntime.h @@ -0,0 +1 @@ +../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/check_sa.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/check_sa.cc index b9a05dea46..3fbf0ffbee 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/check_sa.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
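A quick check of the hardcoded averaging factor in the sigmaKin epilogue above (allMEs[ievt] /= helcolDenominators[0], with constexpr int helcolDenominators[1] = { 256 }): for the g g initial state of this P1_gg_ttx process, each incoming gluon is averaged over 2 helicity states and 8 colour states, so

  averaging denominator = (2 × 8) × (2 × 8) = 16 × 16 = 256

which matches the "Denominators: spins, colors and identical particles" comment, with no additional identical-particle factor entering for this process.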
//========================================================================== #include "mgOnGpuConfig.h" @@ -12,7 +12,6 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" -#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -66,7 +65,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -97,7 +96,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -135,11 +134,9 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) -#elif defined __HIPCC__ -#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784) #elif defined __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU if build has curand #else RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #endif @@ -149,10 +146,10 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) bool bridge = false; @@ -180,7 +177,7 @@ main( int argc, char** argv ) else if( arg == "--curdev" ) { #ifndef __CUDACC__ - throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs" ); #elif defined MGONGPU_HAS_NO_CURAND throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" ); #else @@ -201,7 +198,7 @@ main( int argc, char** argv ) } else if( arg == "--rmbdev" ) { -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -275,13 +272,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? 
// [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -299,14 +296,14 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ - // --- 00. Initialise GPU - // Instantiate a GpuRuntime at the beginning of the application's main. - // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. - const std::string cdinKey = "00 GpuInit"; + // --- 00. Initialise cuda + // Instantiate a CudaRuntime at the beginning of the application's main to + // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor + const std::string cdinKey = "00 CudaInit"; timermap.start( cdinKey ); - GpuRuntime GpuRuntime( debug ); + CudaRuntime cudaRuntime( debug ); #endif // --- 0a. Initialise physics process @@ -328,7 +325,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -336,7 +333,7 @@ main( int argc, char** argv ) // Memory buffers for sampling weights -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -344,7 +341,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -352,7 +349,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -369,7 +366,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -378,7 +375,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -387,7 +384,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -395,7 +392,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -403,7 +400,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -441,7 +438,7 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR!
CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) #endif } @@ -453,7 +450,7 @@ main( int argc, char** argv ) } else { -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -464,7 +461,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -472,7 +469,7 @@ main( int argc, char** argv ) } else { -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -514,7 +511,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -546,7 +543,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -591,7 +588,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -620,7 +617,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ if( !bridge ) { // --- 3b. CopyDToH MEs @@ -761,10 +758,8 @@ main( int argc, char** argv ) rndgentxt = "CURAND HOST"; else if( rndgen == RandomNumberMode::CurandDevice ) rndgentxt = "CURAND DEVICE"; -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ rndgentxt += " (CUDA code)"; -#elif defined __HIPCC__ - rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif @@ -772,15 +767,13 @@ main( int argc, char** argv ) // Workflow description summary std::string wrkflwtxt; // -- CUDA or C++? -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ wrkflwtxt += "CUD:"; -#elif defined __HIPCC__ - wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; #endif // -- DOUBLE or FLOAT? -#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT /* clang-format off */ +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) #elif defined MGONGPU_FPTYPE_DOUBLE wrkflwtxt += "DBL+"; @@ -790,7 +783,7 @@ main( int argc, char** argv ) wrkflwtxt += "???+"; // no path to this statement #endif // -- CUCOMPLEX or THRUST or STD complex numbers? 
-#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX wrkflwtxt += "CUX:"; #elif defined MGONGPU_CUCXTYPE_THRUST @@ -799,12 +792,6 @@ main( int argc, char** argv ) wrkflwtxt += "CXS:"; #else wrkflwtxt += "???:"; // no path to this statement -#endif /* clang-format on */ -#elif defined __HIPCC__ -#if defined MGONGPU_CUCXTYPE_CXSMPL - wrkflwtxt += "CXS:"; -#else - wrkflwtxt += "???:"; // no path to this statement #endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX @@ -831,7 +818,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -887,7 +874,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -906,10 +893,8 @@ main( int argc, char** argv ) #endif // Dump all configuration parameters and all results std::cout << std::string( SEP79, '*' ) << std::endl -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" -#elif defined __HIPCC__ - << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -936,22 +921,21 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#elif defined MGONGPU_CUCXTYPE_CXSMPL - << "Complex type = STD::COMPLEX" << std::endl +#endif #else - << "Complex type = ???" << std::endl // no path to this statement... + << "Complex type = STD::COMPLEX" << std::endl #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? " == AOS" : "" ) << std::endl -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -982,7 +966,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1078,15 +1062,14 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#elif defined MGONGPU_CUCXTYPE_CXSMPL - << "\"STD::COMPLEX\"," << std::endl +#endif #else - << "\"???\"," << std::endl // no path to this statement... + << "\"STD::COMPLEX\"," << std::endl #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1094,7 +1077,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? 
" == AOS" : "" ) << ", " << std::endl -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/RamboSamplingKernels.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/RamboSamplingKernels.cc index 79abbcc4f8..da68aa9255 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/RamboSamplingKernels.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/RamboSamplingKernels.cc @@ -1,11 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "RamboSamplingKernels.h" -#include "GpuRuntime.h" +#include "CudaRuntime.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessRandomNumbers.h" #include "MemoryAccessWeights.h" @@ -14,7 +14,7 @@ #include -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -92,7 +92,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ RamboSamplingKernelDevice::RamboSamplingKernelDevice( const fptype energy, // input: energy const BufferRndNumMomenta& rndmom, // input: random numbers in [0,1] BufferMomenta& momenta, // output: momenta @@ -135,7 +135,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ __global__ void getMomentaInitialDevice( const fptype energy, fptype* momenta ) @@ -147,17 +147,17 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ void RamboSamplingKernelDevice::getMomentaInitial() { - gpuLaunchKernel( getMomentaInitialDevice, m_gpublocks, m_gputhreads, m_energy, m_momenta.data() ); + getMomentaInitialDevice<<>>( m_energy, m_momenta.data() ); } #endif //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ __global__ void getMomentaFinalDevice( const fptype energy, const fptype* rndmom, @@ -171,11 +171,11 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ void RamboSamplingKernelDevice::getMomentaFinal() { - gpuLaunchKernel( getMomentaFinalDevice, m_gpublocks, m_gputhreads, m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); + getMomentaFinalDevice<<>>( m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); } #endif diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/RamboSamplingKernels.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/RamboSamplingKernels.h index 7c214cd74b..184089efd7 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/RamboSamplingKernels.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/RamboSamplingKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#ifndef RAMBOSAMPLINGKERNELS_H #define RAMBOSAMPLINGKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ // A class encapsulating RAMBO phase space sampling on a GPU device class RamboSamplingKernelDevice final : public SamplingKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/RandomNumberKernels.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/RandomNumberKernels.h index 21d63beeac..188a72c2c9 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/RandomNumberKernels.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/RandomNumberKernels.h @@ -1,14 +1,14 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef RANDOMNUMBERKERNELS_H #define RANDOMNUMBERKERNELS_H 1 #include "mgOnGpuConfig.h" -// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when MGONGPUCPP_GPUIMPL is not defined +// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when __CUDACC__ is not defined #ifndef MGONGPU_HAS_NO_CURAND //#include "curand.h" struct curandGenerator_st; // forward definition from curand.h @@ -16,7 +16,7 @@ struct curandGenerator_st; // forward definition from curand.h #include "MemoryBuffers.h" -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk index 24f2d49d80..509307506b 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ # Copyright (C) 2020-2023 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +# Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts @@ -42,10 +42,10 @@ endif #------------------------------------------------------------------------------- -#=== Configure common compiler flags for C++ and CUDA/HIP +#=== Configure common compiler flags for C++ and CUDA INCFLAGS = -I. -OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here +OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here # Dependency on src directory MG5AMC_COMMONLIB = mg5amc_common @@ -121,46 +121,24 @@ endif #------------------------------------------------------------------------------- -#=== Configure the GPU compiler (CUDA or HIP) +#=== Configure the CUDA compiler -# FIXME! 
(AV 24.01.2024) -# In the current implementation (without separate builds for C++ and CUDA/HIP), we first check for cudacc and hipcc in CUDA_HOME and HIP_HOME. -# If CUDA_HOME or HIP_HOME are not set, try to determine them from the path to cudacc and hipcc. -# While convoluted, this is currently necessary to allow disabling CUDA/HIP builds by setting CUDA_HOME or HIP_HOME to invalid paths. -# This will (probably?) be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775). - -# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA and HIP builds (issue #505) -# This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below +# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) +# This is because it is impossible to pass this to "CUFLAGS += -ccbin " below ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside - $(warning CUDA and HIP builds are not supported for multi-word CXX "$(CXX)") + $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") override CUDA_HOME=disabled - override HIP_HOME=disabled endif -# If CUDA_HOME is not set, try to set it from the path to nvcc +# If CUDA_HOME is not set, try to set it from the location of nvcc ifndef CUDA_HOME CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null)) $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") endif -# If HIP_HOME is not set, try to set it from the path to hipcc -ifndef HIP_HOME - HIP_HOME = $(patsubst %bin/hipcc,%,$(HIP_COMPILER_PATH)) - $(warning HIP_HOME was not set: using "$(HIP_HOME)") -endif - -# FIXME! (AV 24.01.2024) -# In the current implementation (without separate builds for C++ and CUDA/HIP), -# builds are performed for HIP only if CUDA is not found in the path. -# If both CUDA and HIP are installed, HIP builds can be triggered by unsetting CUDA_HOME. -# This will be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775). - -#--- Option 1: CUDA exists -> use CUDA - -# Set GPUCC as $(CUDA_HOME)/bin/nvcc if it exists +# Set NVCC as $(CUDA_HOME)/bin/nvcc if it exists ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) - - GPUCC = $(CUDA_HOME)/bin/nvcc + NVCC = $(CUDA_HOME)/bin/nvcc USE_NVTX ?=-DUSE_NVTX # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ @@ -180,77 +158,41 @@ ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here! 
endif CUOPTFLAGS = -lineinfo - GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math - ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow - ###GPUCC_VERSION = $(shell $(GPUCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) - GPUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h + CUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + ###CUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + ###NVCC_VERSION = $(shell $(NVCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) + CUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) - ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) - CUBUILDRULEFLAGS = -Xcompiler -fPIC -c - CCBUILDRULEFLAGS = -Xcompiler -fPIC -c -x cu - CUDATESTFLAGS = -lcuda - - # Set the host C++ compiler for GPUCC via "-ccbin " - # (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) - GPUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) - - # Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) - ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) - GPUFLAGS += -allow-unsupported-compiler - endif - + ###CUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) + ###CUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) + ###CUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) + ###CUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) else ifneq ($(origin REQUIRE_CUDA),undefined) - # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) - $(error No cuda installation found (set CUDA_HOME or make GPUCC visible in PATH)) - -#--- Option 2: CUDA does not exist, HIP exists -> use HIP - -# Set GPUCC as $(HIP_HOME)/bin/hipcc if it exists -else ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),) - - GPUCC = $(HIP_HOME)/bin/hipcc - #USE_NVTX ?=-DUSE_NVTX # should maybe find something equivalent to this in HIP? - HIPARCHFLAGS = -target x86_64-linux-gnu --offload-arch=gfx90a - HIPINC = -I$(HIP_HOME)/include/ - # Note: -DHIP_FAST_MATH is equivalent to -use_fast_math in HIP - # (but only for single precision line 208: https://rocm-developer-tools.github.io/HIP/hcc__detail_2math__functions_8h_source.html) - GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -DHIP_FAST_MATH -DHIP_PLATFORM=amd -fPIC - ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow - GPUFLAGS += -std=c++17 - ###GPUFLAGS+= --maxrregcount 255 # (AV: is this option valid on HIP and meaningful on AMD GPUs?) 
- CUBUILDRULEFLAGS = -fPIC -c - CCBUILDRULEFLAGS = -fPIC -c - -else ifneq ($(origin REQUIRE_HIP),undefined) - - # If REQUIRE_HIP is set but no HIP is found, stop here (e.g. for CI tests on GPU #443) - $(error No hip installation found (set HIP_HOME or make GPUCC visible in PATH)) - -#--- Option 3: CUDA does not exist, HIP does not exist -> switch off both CUDA and HIP - + $(error No cuda installation found (set CUDA_HOME or make nvcc visible in PATH)) else - - # No cudacc and no hipcc: switch CUDA and HIP compilation off and go to common random numbers in C++ + # No cuda. Switch cuda compilation off and go to common random numbers in C++ $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda) - $(warning HIP_HOME is not set or is invalid: export HIP_HOME to compile with hip) - override GPUCC= + override NVCC= override USE_NVTX= override CUINC= override CURANDLIBFLAGS= - endif +export NVCC +export CUFLAGS -# Export GPUCC (so that it can also be used in cudacpp_src.mk?) -export GPUCC -export GPUFLAGS +# Set the host C++ compiler for nvcc via "-ccbin " +# (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) +CUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) + +# Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) +ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) +CUFLAGS += -allow-unsupported-compiler +endif #------------------------------------------------------------------------------- -#=== Configure ccache for C++ and CUDA/HIP builds +#=== Configure ccache for C++ and CUDA builds # Enable ccache if USECCACHE=1 ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) @@ -259,15 +201,15 @@ endif #ifeq ($(USECCACHE)$(shell echo $(AR) | grep ccache),1) # override AR:=ccache $(AR) #endif -ifneq ($(GPUCC),) - ifeq ($(USECCACHE)$(shell echo $(GPUCC) | grep ccache),1) - override GPUCC:=ccache $(GPUCC) +ifneq ($(NVCC),) + ifeq ($(USECCACHE)$(shell echo $(NVCC) | grep ccache),1) + override NVCC:=ccache $(NVCC) endif endif #------------------------------------------------------------------------------- -#=== Configure PowerPC-specific compiler flags for C++ and CUDA/HIP +#=== Configure PowerPC-specific compiler flags for C++ and CUDA # PowerPC-specific CXX compiler flags (being reviewed) ifeq ($(UNAME_P),ppc64le) @@ -283,9 +225,9 @@ else ######CXXFLAGS+= -fno-semantic-interposition # no benefit (neither alone, nor combined with -flto) endif -# PowerPC-specific CUDA/HIP compiler flags (to be reviewed!) +# PowerPC-specific CUDA compiler flags (to be reviewed!) 
ifeq ($(UNAME_P),ppc64le) - GPUFLAGS+= -Xcompiler -mno-float128 + CUFLAGS+= -Xcompiler -mno-float128 endif #------------------------------------------------------------------------------- @@ -295,7 +237,7 @@ endif # Set the default OMPFLAGS choice ifneq ($(shell $(CXX) --version | egrep '^Intel'),) override OMPFLAGS = -fopenmp -###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without GPUCC but not ok with GPUCC before #578) +###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without nvcc but not ok with nvcc before #578) else ifneq ($(shell $(CXX) --version | egrep '^(clang)'),) override OMPFLAGS = -fopenmp ###override OMPFLAGS = # disable OpenMP MT on clang (was not ok without or with nvcc before #578) @@ -351,10 +293,7 @@ endif # Set the default RNDGEN (random number generator) choice ifeq ($(RNDGEN),) - ifeq ($(GPUCC),) - override RNDGEN = hasNoCurand - # Edgecase for HIP compilation - else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) + ifeq ($(NVCC),) override RNDGEN = hasNoCurand else ifeq ($(RNDGEN),) override RNDGEN = hasCurand @@ -371,7 +310,7 @@ export OMPFLAGS #------------------------------------------------------------------------------- -#=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN +#=== Set the CUDA/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN # Set the build flags appropriate to OMPFLAGS $(info OMPFLAGS=$(OMPFLAGS)) @@ -429,13 +368,13 @@ CXXFLAGS+= $(AVXFLAGS) $(info FPTYPE=$(FPTYPE)) ifeq ($(FPTYPE),d) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE - GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE + CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE else ifeq ($(FPTYPE),f) CXXFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT - GPUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT + CUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT else ifeq ($(FPTYPE),m) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT - GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT + CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT else $(error Unknown FPTYPE='$(FPTYPE)': only 'd', 'f' and 'm' are supported) endif @@ -444,7 +383,7 @@ endif $(info HELINL=$(HELINL)) ifeq ($(HELINL),1) CXXFLAGS += -DMGONGPU_INLINE_HELAMPS - GPUFLAGS += -DMGONGPU_INLINE_HELAMPS + CUFLAGS += -DMGONGPU_INLINE_HELAMPS else ifneq ($(HELINL),0) $(error Unknown HELINL='$(HELINL)': only '0' and '1' are supported) endif @@ -453,7 +392,7 @@ endif $(info HRDCOD=$(HRDCOD)) ifeq ($(HRDCOD),1) CXXFLAGS += -DMGONGPU_HARDCODE_PARAM - GPUFLAGS += -DMGONGPU_HARDCODE_PARAM + CUFLAGS += -DMGONGPU_HARDCODE_PARAM else ifneq ($(HRDCOD),0) $(error Unknown HRDCOD='$(HRDCOD)': only '0' and '1' are supported) endif @@ -505,11 +444,11 @@ ifeq ($(UNAME_S),Darwin) override CULIBFLAGSRPATH2 = else # RPATH to cuda/cpp libs when linking executables - override CXXLIBFLAGSRPATH = -Wl,-rpath=$(LIBDIRRPATH) - override CULIBFLAGSRPATH = -Xlinker -rpath=$(LIBDIRRPATH) + override CXXLIBFLAGSRPATH = -Wl,-rpath,$(LIBDIRRPATH) + override CULIBFLAGSRPATH = -Xlinker -rpath,$(LIBDIRRPATH) # RPATH to common lib when linking cuda/cpp libs - override CXXLIBFLAGSRPATH2 = -Wl,-rpath='$$ORIGIN' - override CULIBFLAGSRPATH2 = -Xlinker -rpath='$$ORIGIN' + override CXXLIBFLAGSRPATH2 = -Wl,-rpath,'$$ORIGIN' + override CULIBFLAGSRPATH2 = -Xlinker -rpath,'$$ORIGIN' endif # Setting LD_LIBRARY_PATH or DYLD_LIBRARY_PATH in the RUNTIME is no longer necessary 
(neither on Linux nor on Mac) @@ -522,7 +461,7 @@ override RUNTIME = cxx_main=$(BUILDDIR)/check.exe fcxx_main=$(BUILDDIR)/fcheck.exe -ifneq ($(GPUCC),) +ifneq ($(NVCC),) cu_main=$(BUILDDIR)/gcheck.exe fcu_main=$(BUILDDIR)/fgcheck.exe else @@ -553,16 +492,15 @@ $(BUILDDIR)/.build.$(TAG): @touch $(BUILDDIR)/.build.$(TAG) # Generic target and build rules: objects from CUDA compilation -ifneq ($(GPUCC),) +ifneq ($(NVCC),) $(BUILDDIR)/%.o : %.cu *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CUBUILDRULEFLAGS) $< -o $@ + $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c $< -o $@ $(BUILDDIR)/%_cu.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CCBUILDRULEFLAGS) $< -o $@ + $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@ endif -# -x cu in line above # Generic target and build rules: objects from C++ compilation # (NB do not include CUINC here! add it only for NVTX or curand #679) @@ -571,14 +509,11 @@ $(BUILDDIR)/%.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) $(CXX) $(CPPFLAGS) $(CXXFLAGS) -fPIC -c $< -o $@ # Apply special build flags only to CrossSectionKernel.cc and gCrossSectionKernel.cu (no fast math, see #117 and #516) -# Added edgecase for HIP compilation ifeq ($(shell $(CXX) --version | grep ^nvc++),) +$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS := $(filter-out -ffast-math,$(CXXFLAGS)) $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math -$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math -ifeq ($(findstring nvcc,$(GPUCC)),nvcc) - $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -fno-fast-math -else - $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -fno-fast-math +ifneq ($(NVCC),) +$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -fno-fast-math endif endif @@ -595,10 +530,10 @@ ifeq ($(RNDGEN),hasCurand) $(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CUINC) endif -# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in GPUCC with icx2023 (#592) +# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in nvcc with icx2023 (#592) ifneq ($(shell $(CXX) --version | egrep '^(Intel)'),) -ifneq ($(GPUCC),) -GPUFLAGS += -Wno-deprecated-builtins +ifneq ($(NVCC),) +CUFLAGS += -Xcompiler -Wno-deprecated-builtins endif endif @@ -606,8 +541,8 @@ endif # This patch does remove the warning, but I prefer to keep it disabled for the moment... 
###ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang|Intel)'),) ###$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -Wno-overriding-t-option -###ifneq ($(GPUCC),) -###$(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -Wno-overriding-t-option +###ifneq ($(NVCC),) +###$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -Wno-overriding-t-option ###endif ###endif @@ -634,7 +569,7 @@ MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp cxx_objects_lib=$(BUILDDIR)/CPPProcess.o $(BUILDDIR)/MatrixElementKernels.o $(BUILDDIR)/BridgeKernels.o $(BUILDDIR)/CrossSectionKernels.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel.o $(BUILDDIR)/RamboSamplingKernels.o -ifneq ($(GPUCC),) +ifneq ($(NVCC),) MG5AMC_CULIB = mg5amc_$(processid_short)_cuda cu_objects_lib=$(BUILDDIR)/gCPPProcess.o $(BUILDDIR)/gMatrixElementKernels.o $(BUILDDIR)/gBridgeKernels.o $(BUILDDIR)/gCrossSectionKernels.o cu_objects_exe=$(BUILDDIR)/gCommonRandomNumberKernel.o $(BUILDDIR)/gRamboSamplingKernels.o @@ -646,11 +581,11 @@ $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: cxx_objects_lib += $(BUILDDIR)/fbridge.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(CXX) -shared -o $@ $(cxx_objects_lib) $(CXXLIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -ifneq ($(GPUCC),) +ifneq ($(NVCC),) $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: cu_objects_lib += $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cu_objects_lib) - $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(NVCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) endif #------------------------------------------------------------------------------- @@ -667,16 +602,16 @@ $(cxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PAT $(cxx_main): $(BUILDDIR)/check_sa.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CXX) -o $@ $(BUILDDIR)/check_sa.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CURANDLIBFLAGS) -ifneq ($(GPUCC),) +ifneq ($(NVCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') -$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') +$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') +$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(cu_main): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(cu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(cu_main): $(BUILDDIR)/gcheck_sa.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o - $(GPUCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) + $(NVCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) endif 
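A note on the no-fast-math exemption applied above to CrossSectionKernels: -ffast-math implies -ffinite-math-only, which lets the compiler assume that NaN never occurs and so fold away NaN guards of the kind needed when accumulating cross sections from possibly abnormal matrix elements (see #117 and #516 above). A minimal standalone illustration of the failure mode (not taken from the repository):

// fastmath_demo.cc: compile with and without -ffast-math to compare.
#include <cmath>
#include <cstdio>
int main()
{
  const double me = std::sqrt( -1. ); // NaN, standing in for an abnormal matrix element
  // Under -ffast-math the compiler may fold "me != me" to false, silently
  // accumulating the NaN; with -fno-fast-math the guard behaves as written.
  if( me != me )
    printf( "skipping abnormal (NaN) matrix element\n" );
  else
    printf( "accumulating me = %f\n", me );
  return 0;
}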
#------------------------------------------------------------------------------- @@ -702,17 +637,17 @@ $(fcxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PA $(fcxx_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) -ifneq ($(GPUCC),) +ifneq ($(NVCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') -$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') +$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') +$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') endif ifeq ($(UNAME_S),Darwin) $(fcu_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(fcu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(fcu_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) - $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) + $(NVCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) endif #------------------------------------------------------------------------------- @@ -724,7 +659,7 @@ $(BUILDDIR)/testxxx.o: testxxx_cc_ref.txt $(testmain): $(BUILDDIR)/testxxx.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testxxx.o # Comment out this line to skip the C++ test of xxx functions -ifneq ($(GPUCC),) +ifneq ($(NVCC),) $(BUILDDIR)/testxxx_cu.o: $(GTESTLIBS) $(BUILDDIR)/testxxx_cu.o: INCFLAGS += $(GTESTINC) $(BUILDDIR)/testxxx_cu.o: testxxx_cc_ref.txt @@ -737,7 +672,7 @@ $(BUILDDIR)/testmisc.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testmisc.o # Comment out this line to skip the C++ miscellaneous tests -ifneq ($(GPUCC),) +ifneq ($(NVCC),) $(BUILDDIR)/testmisc_cu.o: $(GTESTLIBS) $(BUILDDIR)/testmisc_cu.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc_cu.o @@ -749,12 +684,12 @@ $(BUILDDIR)/runTest.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/runTest.o $(testmain): cxx_objects_exe += $(BUILDDIR)/runTest.o -ifneq ($(GPUCC),) +ifneq ($(NVCC),) $(BUILDDIR)/runTest_cu.o: $(GTESTLIBS) $(BUILDDIR)/runTest_cu.o: INCFLAGS += $(GTESTINC) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') -$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') +$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') +$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(testmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif @@ -778,14 +713,14 @@ $(testmain): LIBFLAGS += -lgomp endif endif -ifeq 
($(GPUCC),) # link only runTest.o
+ifeq ($(NVCC),) # link only runTest.o
 $(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH
 $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS)
 	$(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS)
 else # link both runTest.o and runTest_cu.o
 $(testmain): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH
 $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) $(GTESTLIBS)
-	$(GPUCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS)
+	$(NVCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) -lcuda
 endif

 # Use target gtestlibs to build only googletest
@@ -894,9 +829,9 @@ ifeq ($(USECCACHE),1)
 	ccache --version | head -1
 endif
 	@echo ""
-	@echo GPUCC=$(GPUCC)
-ifneq ($(GPUCC),)
-	$(GPUCC) --version
+	@echo NVCC=$(NVCC)
+ifneq ($(NVCC),)
+	$(NVCC) --version
 endif
 	@echo ""
 	@echo CXX=$(CXX)
@@ -915,7 +850,7 @@ endif

 # Target: check (run the C++ test executable)
 # [NB THIS IS WHAT IS USED IN THE GITHUB CI!]
-ifneq ($(GPUCC),)
+ifneq ($(NVCC),)
 check: runTest cmpFcheck cmpFGcheck
 else
 check: runTest cmpFcheck
diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/fbridge.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/fbridge.cc
index 22ce3f5115..2d2b36d560 100644
--- a/epochX/cudacpp/gg_tt.mad/SubProcesses/fbridge.cc
+++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/fbridge.cc
@@ -1,11 +1,11 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: S. Roiser (Oct 2021) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.

 #include "Bridge.h"
 #include "CPPProcess.h"
-#include "GpuRuntime.h"
+#include "CudaRuntime.h"

 extern "C"
 {
@@ -22,7 +22,7 @@ extern "C"
  * Using the same Fortran MadEvent code, linking to the heterogeneous library would allow access to both CPU and GPU implementations.
  * The specific heterogeneous configuration (how many GPUs, how many threads on each CPU, etc) could be loaded in CUDA/C++ from a data file.
*/ -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -46,8 +46,8 @@ extern "C" */ void fbridgecreate_( CppObjectInFortran** ppbridge, const int* pnevtF, const int* pnparF, const int* pnp4F ) { -#ifdef MGONGPUCPP_GPUIMPL - GpuRuntime::setUp(); +#ifdef __CUDACC__ + CudaRuntime::setUp(); #endif // (NB: CPPProcess::initProc no longer needs to be executed here because it is called in the Bridge constructor) // FIXME: disable OMP in Bridge when called from Fortran @@ -65,8 +65,8 @@ extern "C" Bridge* pbridge = dynamic_cast*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgedelete_: invalid Bridge address" ); delete pbridge; -#ifdef MGONGPUCPP_GPUIMPL - GpuRuntime::tearDown(); +#ifdef __CUDACC__ + CudaRuntime::tearDown(); #endif } @@ -96,7 +96,7 @@ extern "C" { Bridge* pbridge = dynamic_cast*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgesequence_: invalid Bridge address" ); -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ // Use the device/GPU implementation in the CUDA library // (there is also a host implementation in this library) pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, *pchannelId, mes, selhel, selcol ); diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/fsampler.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/fsampler.cc index 3743934f41..2fb445372d 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/fsampler.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/fsampler.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Feb 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "mgOnGpuConfig.h" @@ -13,7 +13,7 @@ //-------------------------------------------------------------------------- -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -40,7 +40,7 @@ namespace mg5amcCpu private: const int m_nevt; // The number of events in each iteration int m_iiter; // The iteration counter (for random number seeding) -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ HostBufferRndNumMomenta m_hstRndmom; // Memory buffers for random numbers HostBufferMomenta m_hstMomenta; // Memory buffers for momenta HostBufferWeights m_hstWeights; // Memory buffers for sampling weights @@ -105,7 +105,7 @@ namespace mg5amcCpu extern "C" { -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/runTest.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/runTest.cc index de327f2321..d4a760a71b 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/runTest.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/runTest.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
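The fbridge.cc changes above keep the long-standing pattern in which Fortran owns only an opaque handle to a C++ Bridge object, created by fbridgecreate_ and validated via dynamic_cast before use. A minimal sketch of that pattern with hypothetical names (sketchcreate_/sketchdelete_ and HandleBase are illustrative, not the repository API):

#include <stdexcept>

struct HandleBase { virtual ~HandleBase() {} }; // polymorphic base, enables dynamic_cast
struct BridgeSketch : HandleBase
{
  explicit BridgeSketch( int nevt ) : m_nevt( nevt ) {}
  int m_nevt;
};

extern "C"
{
  void sketchcreate_( HandleBase** pphandle, const int* pnevt )
  {
    *pphandle = new BridgeSketch( *pnevt ); // Fortran stores *pphandle as an opaque integer
  }
  void sketchdelete_( HandleBase** pphandle )
  {
    BridgeSketch* p = dynamic_cast<BridgeSketch*>( *pphandle ); // validate the handle
    if( p == 0 ) throw std::runtime_error( "sketchdelete_: invalid handle" );
    delete p;
  }
}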
//---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -18,7 +18,7 @@ #include "RandomNumberKernels.h" #include "epoch_process_id.h" -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -35,7 +35,7 @@ struct CUDA_CPU_TestBase : public TestDriverBase : TestDriverBase( npar, refFileName ) {} }; -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ struct CPUTest : public CUDA_CPU_TestBase { // Struct data members (process, and memory structures for random numbers, momenta, matrix elements and weights on host and device) @@ -119,7 +119,7 @@ struct CPUTest : public CUDA_CPU_TestBase }; #endif -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ struct CUDATest : public CUDA_CPU_TestBase { // Reset the device when our test goes out of scope. Note that this should happen after @@ -128,7 +128,7 @@ struct CUDATest : public CUDA_CPU_TestBase { ~DeviceReset() { - checkGpu( gpuDeviceReset() ); // this is needed by cuda-memcheck --leak-check full + checkCuda( cudaDeviceReset() ); // this is needed by cuda-memcheck --leak-check full } } deviceResetter; @@ -256,7 +256,7 @@ INSTANTIATE_TEST_SUITE_P( prefix, \ test_suite_name, \ testing::Values( new CUDATest( MG_EPOCH_REFERENCE_FILE_NAME ) ) ); -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ MG_INSTANTIATE_TEST_SUITE_GPU( XTESTID_GPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); #else MG_INSTANTIATE_TEST_SUITE_CPU( XTESTID_CPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/testmisc.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/testmisc.cc index ba9e59a8a3..895d6eeb56 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/testmisc.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/testmisc.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*misc to run only testmisc.cc tests //---------------------------------------------------------------------------- @@ -17,7 +17,7 @@ #include #include -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ #define TESTID( s ) s##_GPU_MISC #else #define TESTID( s ) s##_CPU_MISC @@ -26,7 +26,7 @@ #define XTESTID( s ) TESTID( s ) // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -59,7 +59,7 @@ namespace mg5amcCpu TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testmisc ) { -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/testxxx.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/testxxx.cc index e5167de00c..3361fe5aa9 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/testxxx.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/testxxx.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. 
Valassi (Apr 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -24,7 +24,7 @@ #include #include #include -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ #define TESTID( s ) s##_GPU_XXX #else #define TESTID( s ) s##_CPU_XXX @@ -32,7 +32,7 @@ #define XTESTID( s ) TESTID( s ) -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -42,7 +42,7 @@ namespace mg5amcCpu int FPEhandlerIevt = -1; inline void FPEhandler( int sig ) { -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ std::cerr << "Floating Point Exception (GPU): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; #else std::cerr << "Floating Point Exception (CPU neppV=" << neppV << "): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; @@ -53,7 +53,7 @@ namespace mg5amcCpu TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -77,7 +77,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) assert( nevt % neppM == 0 ); // nevt must be a multiple of neppM assert( nevt % neppV == 0 ); // nevt must be a multiple of neppV // Fill in the input momenta -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ mg5amcGpu::PinnedHostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] #else mg5amcCpu::HostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] @@ -322,7 +322,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { for( int ievt = 0; ievt < nevt; ievt++ ) { -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_tt.mad/src/HelAmps_sm.h b/epochX/cudacpp/gg_tt.mad/src/HelAmps_sm.h index add8fce575..55f43bb43a 100644 --- a/epochX/cudacpp/gg_tt.mad/src/HelAmps_sm.h +++ b/epochX/cudacpp/gg_tt.mad/src/HelAmps_sm.h @@ -5,7 +5,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -28,7 +28,7 @@ //#include //#include -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.mad/src/Parameters_sm.cc b/epochX/cudacpp/gg_tt.mad/src/Parameters_sm.cc index c5dd6e7e4c..a9bc93ff98 100644 --- a/epochX/cudacpp/gg_tt.mad/src/Parameters_sm.cc +++ b/epochX/cudacpp/gg_tt.mad/src/Parameters_sm.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. 
Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -17,7 +17,7 @@ #include #include -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_tt.mad/src/Parameters_sm.h b/epochX/cudacpp/gg_tt.mad/src/Parameters_sm.h index 06fc44c44c..932f123fea 100644 --- a/epochX/cudacpp/gg_tt.mad/src/Parameters_sm.h +++ b/epochX/cudacpp/gg_tt.mad/src/Parameters_sm.h @@ -27,7 +27,7 @@ #include "read_slha.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu #include // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -217,7 +217,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -236,7 +236,7 @@ namespace mg5amcCpu #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wunused-variable" // e.g. <> #pragma GCC diagnostic ignored "-Wunused-parameter" // e.g. <> -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ #pragma nv_diagnostic push #pragma nv_diag_suppress 177 // e.g. <> #endif @@ -263,7 +263,7 @@ namespace mg5amcCpu // End SM implementation - no special handling of vectors of floats as in EFT (#439) return out; } -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ #pragma GCC diagnostic pop #pragma nv_diagnostic pop #endif @@ -279,39 +279,33 @@ namespace mg5amcCpu //========================================================================== -#ifdef MGONGPUCPP_GPUIMPL - namespace mg5amcGpu -#else - namespace mg5amcCpu -#endif - { #pragma GCC diagnostic push #ifndef __clang__ #pragma GCC diagnostic ignored "-Wunused-but-set-variable" // e.g. <> #endif - // Compute the output couplings (e.g. gc10 and gc11) from the input gs - template - __device__ inline void - G2COUP( const fptype gs[], - fptype couplings[] ) - { - mgDebug( 0, __FUNCTION__ ); - using namespace Parameters_sm_dependentCouplings; - const fptype_sv& gs_sv = G_ACCESS::kernelAccessConst( gs ); - DependentCouplings_sv couplings_sv = computeDependentCouplings_fromG( gs_sv ); - fptype* GC_10s = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_10 ); - fptype* GC_11s = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_11 ); - cxtype_sv_ref GC_10s_sv = C_ACCESS::kernelAccess( GC_10s ); - cxtype_sv_ref GC_11s_sv = C_ACCESS::kernelAccess( GC_11s ); - GC_10s_sv = couplings_sv.GC_10; - GC_11s_sv = couplings_sv.GC_11; - mgDebug( 1, __FUNCTION__ ); - return; - } + // Compute the output couplings (e.g. 
gc10 and gc11) from the input gs + template + __device__ inline void + G2COUP( const fptype gs[], + fptype couplings[] ) + { + mgDebug( 0, __FUNCTION__ ); + using namespace Parameters_sm_dependentCouplings; + const fptype_sv& gs_sv = G_ACCESS::kernelAccessConst( gs ); + DependentCouplings_sv couplings_sv = computeDependentCouplings_fromG( gs_sv ); + fptype* GC_10s = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_10 ); + fptype* GC_11s = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_GC_11 ); + cxtype_sv_ref GC_10s_sv = C_ACCESS::kernelAccess( GC_10s ); + cxtype_sv_ref GC_11s_sv = C_ACCESS::kernelAccess( GC_11s ); + GC_10s_sv = couplings_sv.GC_10; + GC_11s_sv = couplings_sv.GC_11; + mgDebug( 1, __FUNCTION__ ); + return; + } #pragma GCC diagnostic pop - } // end namespace mg5amcGpu/mg5amcCpu +} // end namespace mg5amcGpu/mg5amcCpu - //========================================================================== +//========================================================================== #endif // Parameters_sm_H diff --git a/epochX/cudacpp/gg_tt.mad/src/cudacpp_src.mk b/epochX/cudacpp/gg_tt.mad/src/cudacpp_src.mk index 159e19a46d..d4cc628aec 100644 --- a/epochX/cudacpp/gg_tt.mad/src/cudacpp_src.mk +++ b/epochX/cudacpp/gg_tt.mad/src/cudacpp_src.mk @@ -19,7 +19,7 @@ SHELL := /bin/bash #=== Configure common compiler flags for CUDA and C++ INCFLAGS = -I. -OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here +OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here #------------------------------------------------------------------------------- @@ -45,13 +45,13 @@ endif #------------------------------------------------------------------------------- -#=== Configure the CUDA compiler (note: GPUCC is already exported including ccache) +#=== Configure the CUDA compiler (note: NVCC is already exported including ccache) -###$(info GPUCC=$(GPUCC)) +###$(info NVCC=$(NVCC)) #------------------------------------------------------------------------------- -#=== Configure ccache for C++ builds (note: GPUCC is already exported including ccache) +#=== Configure ccache for C++ builds (note: NVCC is already exported including ccache) # Enable ccache if USECCACHE=1 ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) @@ -92,13 +92,6 @@ endif ###$(info OMPFLAGS=$(OMPFLAGS)) CXXFLAGS += $(OMPFLAGS) -# Add correct -DHIP_LATFORM when compiling for HIP -ifeq ($(findstring nvcc,$(GPUCC)),nvcc) - GPUFLAGS += -Xcompiler -fPIC -c -x cu -else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) - GPUFLAGS += -fPIC -c -endif - # Set the build flags appropriate to each AVX choice (example: "make AVX=none") # [NB MGONGPU_PVW512 is needed because "-mprefer-vector-width=256" is not exposed in a macro] # [See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=96476] @@ -260,20 +253,20 @@ $(BUILDDIR)/%.o : %.cc *.h $(BUILDDIR)/.build.$(TAG) # Generic target and build rules: objects from CUDA compilation $(BUILDDIR)/%_cu.o : %.cc *.h $(BUILDDIR)/.build.$(TAG) @if [ ! 
-d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $< -o $@ + $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@ #------------------------------------------------------------------------------- cxx_objects=$(addprefix $(BUILDDIR)/, Parameters_sm.o read_slha.o) -ifneq ($(GPUCC),) +ifneq ($(NVCC),) cu_objects=$(addprefix $(BUILDDIR)/, Parameters_sm_cu.o) endif # Target (and build rules): common (src) library -ifneq ($(GPUCC),) +ifneq ($(NVCC),) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) $(cu_objects) @if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi - $(GPUCC) -shared -o $@ $(cxx_objects) $(cu_objects) $(LDFLAGS) + $(NVCC) -shared -o $@ $(cxx_objects) $(cu_objects) $(LDFLAGS) else $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) @if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi diff --git a/epochX/cudacpp/gg_tt.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_tt.mad/src/mgOnGpuConfig.h index d9af210552..80032e528b 100644 --- a/epochX/cudacpp/gg_tt.mad/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/gg_tt.mad/src/mgOnGpuConfig.h @@ -1,37 +1,21 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUCONFIG_H #define MGONGPUCONFIG_H 1 -#include "GpuRuntime.h" // Includes the GPU abstraction - // HARDCODED AT CODE GENERATION TIME: DO NOT MODIFY (#473) // There are two different code bases for standalone_cudacpp (without multichannel) and madevent+cudacpp (with multichannel) #define MGONGPU_SUPPORTS_MULTICHANNEL 1 -// Is this a GPU (CUDA, HIP) or CPU implementation? -#ifdef __CUDACC__ -#define MGONGPUCPP_GPUIMPL cuda -#elif defined __HIPCC__ -#define MGONGPUCPP_GPUIMPL hip -#include "hip/hip_runtime.h" -#else -#undef MGONGPUCPP_GPUIMPL -#endif - // ** NB1 Throughputs (e.g. 6.8E8) are events/sec for "./gcheck.exe -p 65536 128 12" // ** NB2 Baseline on b7g47n0004 fluctuates (probably depends on load on other VMs) // Choose if curand is supported for generating random numbers -// For HIP, by default, do not use curand (common random numbers will be used instead) // For both CUDA and C++, by default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_CURAND -// (there exist CUDA installations, e.g. using the HPC package, which do not include curand - see PR #784 and #785) -#if defined __HIPCC__ -#define MGONGPU_HAS_NO_CURAND 1 -#else +// (there exist CUDA installations, e.g. using the HPC package, which do not include curand - see PR #784) //#ifdef __CUDACC__ //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 @@ -39,7 +23,6 @@ //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 //#endif -#endif // Choose floating point precision (for everything but color algebra #537) // If one of these macros has been set from outside with e.g. 
-DMGONGPU_FPTYPE_FLOAT, nothing happens (issue #167) @@ -71,28 +54,23 @@ //#undef MGONGPU_HARDCODE_PARAM // default ////#define MGONGPU_HARDCODE_PARAM 1 -// Complex type in CUDA: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) +// Complex type in c++: std::complex or cxsmpl (CHOOSE ONLY ONE) +#ifndef __CUDACC__ +//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) +#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) +#endif + +// Complex type in cuda: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) #ifdef __CUDACC__ #define MGONGPU_CUCXTYPE_THRUST 1 // default (~1.15E9/double, ~3.2E9/float) //#define MGONGPU_CUCXTYPE_CUCOMPLEX 1 // ~10 percent slower (1.03E9/double, ~2.8E9/float) //#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) - -// Complex type in HIP: cxsmpl (ONLY ONE OPTION POSSIBLE) -#elif defined __HIPCC__ -#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) - -// Complex type in C++: std::complex or cxsmpl (CHOOSE ONLY ONE) -#else -//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) -#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif -// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ -#undef MGONGPU_NSIGHT_DEBUG // default in CUDA +#undef MGONGPU_NSIGHT_DEBUG // default //#define MGONGPU_NSIGHT_DEBUG 1 -#else -#undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif // SANITY CHECKS (floating point precision for everything but color algebra #537) @@ -108,21 +86,17 @@ #error You cannot use double precision for color algebra and single precision elsewhere #endif -// SANITY CHECKS (CUDA complex number implementation) -#ifdef __CUDACC__ -#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX for CUDA -#elif defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CXSMPL for CUDA -#elif defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE OF MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL for CUDA +// SANITY CHECKS (c++ complex number implementation) +#ifndef __CUDACC__ +#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL #endif #endif -// SANITY CHECKS (C++ complex number implementation) -#ifndef MGONGPUCPP_GPUIMPL -#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL for C++ +// SANITY CHECKS (cuda complex number implementation) +#ifdef __CUDACC__ +#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL #endif #endif @@ -160,7 +134,7 @@ namespace mgOnGpu // Alignment requirement for using reinterpret_cast with SIMD vectorized code // (using reinterpret_cast with non 
aligned memory may lead to segmentation faults!) // Only needed for C++ code but can be enforced also in NVCC builds of C++ code using CUDA>=11.2 and C++17 (#318, #319, #333) -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ constexpr int cppAlign = 64; // alignment requirement for SIMD vectorization (64-byte i.e. 512-bit) #endif @@ -171,7 +145,7 @@ using mgOnGpu::fptype; using mgOnGpu::fptype2; // C++ SIMD vectorization width (this will be used to set neppV) -#ifdef MGONGPUCPP_GPUIMPL // CUDA and HIP implementations have no SIMD +#ifdef __CUDACC__ // CUDA implementation has no SIMD #undef MGONGPU_CPPSIMD #elif defined __AVX512VL__ && defined MGONGPU_PVW512 // C++ "512z" AVX512 with 512 width (512-bit ie 64-byte): 8 (DOUBLE) or 16 (FLOAT) #ifdef MGONGPU_FPTYPE_DOUBLE @@ -201,9 +175,9 @@ using mgOnGpu::fptype2; #undef MGONGPU_CPPSIMD #endif -// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation // Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) -#if defined __CUDA__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */ +#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */ #define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; #define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } #define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } @@ -215,8 +189,8 @@ using mgOnGpu::fptype2; #define mgDebugFinalise() { /*noop*/ } #endif /* clang-format on */ -// Define empty CUDA/HIP declaration specifiers for C++ -#ifndef MGONGPUCPP_GPUIMPL +// Define empty CUDA declaration specifiers for C++ +#ifndef __CUDACC__ #define __global__ #define __host__ #define __device__ diff --git a/epochX/cudacpp/gg_tt.mad/src/mgOnGpuCxtypes.h b/epochX/cudacpp/gg_tt.mad/src/mgOnGpuCxtypes.h index 5532e22fa1..ca9a9f00c0 100644 --- a/epochX/cudacpp/gg_tt.mad/src/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/gg_tt.mad/src/mgOnGpuCxtypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022, based on earlier work by D. Smith) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
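The empty declaration specifiers defined above in mgOnGpuConfig.h (__global__, __host__ and __device__ expanding to nothing when __CUDACC__ is unset) are what let a single source tree be compiled both by nvcc and by a plain C++ compiler. A standalone sketch of the idiom (assumed names, not repository code):

#ifndef __CUDACC__
#define __global__
#define __host__
#define __device__
#endif

__host__ __device__ inline double square( const double x ) { return x * x; }

// A CUDA kernel under nvcc, an ordinary function under g++ or clang++
__global__ void squareAll( const double* in, double* out, const int nevt )
{
#ifdef __CUDACC__
  const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // one GPU thread per event
  if( ievt < nevt ) out[ievt] = square( in[ievt] );
#else
  for( int ievt = 0; ievt < nevt; ievt++ ) out[ievt] = square( in[ievt] ); // event loop on CPU
#endif
}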
#ifndef MGONGPUCXTYPES_H #define MGONGPUCXTYPES_H 1 @@ -19,7 +19,7 @@ #include // Complex type in cuda: thrust or cucomplex or cxsmpl -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_THRUST #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wtautological-compare" // for icpx2021/clang13 (https://stackoverflow.com/a/15864661) @@ -82,7 +82,7 @@ namespace mgOnGpu /* clang-format off */ using mgOnGpu::cxsmpl; // Printout to stream for user defined types -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -92,7 +92,7 @@ namespace mg5amcCpu inline __host__ std::ostream& operator<<( std::ostream& out, const cxsmpl& c ) { - out << std::complex( c.real(), c.imag() ); + out << std::complex( c.real(), c.imag() ); return out; } @@ -215,14 +215,14 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu #endif { // --- Type definitions (complex type: cxtype) -#ifdef MGONGPUCPP_GPUIMPL // cuda +#ifdef __CUDACC__ // cuda #if defined MGONGPU_CUCXTYPE_THRUST typedef thrust::complex cxtype; #elif defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -255,7 +255,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -307,7 +307,7 @@ namespace mg5amcCpu //========================================================================== -#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust +#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust //------------------------------ // CUDA - using thrust::complex @@ -343,11 +343,11 @@ namespace mg5amcCpu return c; } -#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST +#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST //========================================================================== -#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex +#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex //------------------------------ // CUDA - using cuComplex @@ -562,11 +562,11 @@ namespace mg5amcCpu return cxmake( c.real(), c.imag() ); } -#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX //========================================================================== -#if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex +#if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex //------------------------------ // C++ - using std::complex @@ -610,7 +610,7 @@ namespace mg5amcCpu } #endif -#endif // #if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX +#endif // #if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX //========================================================================== @@ -633,7 +633,7 @@ namespace mg5amcCpu //========================================================================== // 
NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.mad/src/mgOnGpuFptypes.h b/epochX/cudacpp/gg_tt.mad/src/mgOnGpuFptypes.h index 83a46c1d4e..905c97d700 100644 --- a/epochX/cudacpp/gg_tt.mad/src/mgOnGpuFptypes.h +++ b/epochX/cudacpp/gg_tt.mad/src/mgOnGpuFptypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUFPTYPES_H #define MGONGPUFPTYPES_H 1 @@ -20,7 +20,7 @@ namespace mg5amcCpu { //========================================================================== -#ifdef MGONGPUCPP_GPUIMPL // cuda +#ifdef __CUDACC__ // cuda //------------------------------ // Floating point types - Cuda @@ -64,11 +64,11 @@ namespace mg5amcCpu #endif } -#endif // #ifdef MGONGPUCPP_GPUIMPL +#endif // #ifdef __CUDACC__ //========================================================================== -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ //------------------------------ // Floating point types - C++ @@ -92,7 +92,7 @@ namespace mg5amcCpu return std::sqrt( f ); } -#endif // #ifndef MGONGPUCPP_GPUIMPL +#endif // #ifndef __CUDACC__ //========================================================================== diff --git a/epochX/cudacpp/gg_tt.mad/src/mgOnGpuVectors.h b/epochX/cudacpp/gg_tt.mad/src/mgOnGpuVectors.h index dd8b83752d..e1299ba81e 100644 --- a/epochX/cudacpp/gg_tt.mad/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/gg_tt.mad/src/mgOnGpuVectors.h @@ -9,8 +9,6 @@ #include "mgOnGpuCxtypes.h" #include "mgOnGpuFptypes.h" -#include "GpuAbstraction.h" - #include //========================================================================== @@ -34,7 +32,7 @@ #endif // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu @@ -133,7 +131,7 @@ namespace mg5amcCpu #endif #endif -#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef MGONGPUCPP_GPUIMPL) +#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef __CUDACC__) const int neppV = 1; @@ -155,13 +153,13 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ namespace mg5amcGpu #else namespace mg5amcCpu #endif { -#ifndef MGONGPUCPP_GPUIMPL +#ifndef __CUDACC__ // Printout to stream for user defined types @@ -807,11 +805,11 @@ namespace mg5amcCpu #endif // #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT -#endif // #ifndef MGONGPUCPP_GPUIMPL +#endif // #ifndef __CUDACC__ //========================================================================== -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ //------------------------------ // Vector types - CUDA @@ -855,12 +853,12 @@ namespace mg5amcCpu return mask; } -#endif // #ifdef MGONGPUCPP_GPUIMPL +#endif // #ifdef __CUDACC__ 
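As background for the scalar-or-vector (_sv) typedefs below: in CUDA builds one GPU thread computes one event, so the _sv types are plain scalars, while in SIMD C++ builds they are compiler vector extension types spanning neppV events, and the same arithmetic source line serves both cases. A minimal sketch under an assumed width of 4 doubles (illustrative only, not the repository definitions):

typedef double fptype;
#ifdef __CUDACC__
typedef fptype fptype_sv; // scalar: one event per GPU thread
#else
typedef fptype fptype_sv __attribute__( ( vector_size( 32 ) ) ); // 4 doubles: 4 events per SIMD operation
#endif

// One source line, one event (CUDA) or four events (vectorized C++) at a time
inline fptype_sv kineticEnergy( const fptype_sv p0, const fptype_sv mass )
{
  return p0 - mass;
}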
//==========================================================================

 // Scalar-or-vector types: scalar in CUDA, vector or scalar in C++
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
   typedef bool bool_sv;
   typedef fptype fptype_sv;
   typedef fptype2 fptype2_sv;
@@ -881,7 +879,7 @@ namespace mg5amcCpu
 #endif

   // Scalar-or-vector zeros: scalar in CUDA, vector or scalar in C++
-#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */
+#ifdef __CUDACC__ /* clang-format off */
   inline __host__ __device__ cxtype cxzero_sv(){ return cxtype( 0, 0 ); }
 #elif defined MGONGPU_CPPSIMD
   inline cxtype_v cxzero_sv() { return cxtype_v(); } // RRRR=0000 IIII=0000
diff --git a/epochX/cudacpp/gg_tt.mad/src/rambo.h b/epochX/cudacpp/gg_tt.mad/src/rambo.h
index cd7e1008ea..e02ea52496 100644
--- a/epochX/cudacpp/gg_tt.mad/src/rambo.h
+++ b/epochX/cudacpp/gg_tt.mad/src/rambo.h
@@ -4,7 +4,7 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
 //==========================================================================

 #include "mgOnGpuConfig.h"
@@ -18,7 +18,7 @@
 #include

 // Simplified rambo version for 2 to N (with N>=2) processes with massless particles
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
 namespace mg5amcGpu
 #else
 namespace mg5amcCpu
@@ -83,7 +83,7 @@ namespace mg5amcCpu
       static bool first = true;
       if( first )
       {
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__
         if constexpr( M_ACCESS::isOnDevice() ) // avoid
         {
           const int ievt0 = 0;
@@ -166,7 +166,7 @@ namespace mg5amcCpu
     wt = po2log;
     if( nparf != 2 ) wt = ( 2. * nparf - 4. ) * log( energy ) + z[nparf - 1];
-#ifndef MGONGPUCPP_GPUIMPL
+#ifndef __CUDACC__
     // issue warnings if weight is too small or too large
     static int iwarn[5] = { 0, 0, 0, 0, 0 };
     if( wt < -180. )

From ce629c2b7b101b1c3b6e943eaf30c689ffd3d5bb Mon Sep 17 00:00:00 2001
From: Andrea Valassi
Date: Wed, 24 Jan 2024 17:14:51 +0100
Subject: [PATCH 25/96] [jt774] remove '#include "GpuAbstraction.h"' from
 CODEGEN mgOnGpuVectors.h and process_matrix.inc as in branch jthip24

These are changes that in that branch I included in commit
6e90139833db998d1f6b2546d16c33c357804b24 (Tue Jul 18 18:25:34 2023 +0200),
which consisted of a backport to CODEGEN of earlier changes in ggttggg.mad.
--- .../madgraph/iolibs/template_files/gpu/mgOnGpuVectors.h | 2 -- .../madgraph/iolibs/template_files/gpu/process_matrix.inc | 2 -- 2 files changed, 4 deletions(-) diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuVectors.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuVectors.h index dd8b83752d..cdae04326b 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuVectors.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuVectors.h @@ -9,8 +9,6 @@ #include "mgOnGpuCxtypes.h" #include "mgOnGpuFptypes.h" -#include "GpuAbstraction.h" - #include //========================================================================== diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_matrix.inc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_matrix.inc index 84e324a679..960f029d8d 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_matrix.inc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_matrix.inc @@ -7,8 +7,6 @@ ! Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. !========================================================================== -#include "GpuAbstraction.h" - // *** COLOR CHOICE BELOW *** // Store the leading color flows for choice of color if( jamp2_sv ) // disable color choice if nullptr From 7363e1fde1cda981c64746eb095abd94425d1266 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Tue, 18 Jul 2023 19:06:32 +0200 Subject: [PATCH 26/96] [jthip] in CODEGEN, remove the copying to src of GpuRuntime.h and GpuAbstraction.h --- epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/output.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/output.py b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/output.py index 3e1d68d1fb..c89295c01f 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/output.py +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/output.py @@ -88,7 +88,7 @@ class PLUGIN_ProcessExporter(PLUGIN_export_cpp.ProcessExporterGPU): 'CMake': [s+'CMake/Compilers.txt', s+'CMake/Platforms.txt', s+'CMake/Macros.txt'], 'src': [s+'gpu/rambo.h', s+'read_slha.h', s+'read_slha.cc', s+'gpu/mgOnGpuFptypes.h', s+'gpu/mgOnGpuCxtypes.h', s+'gpu/mgOnGpuVectors.h', - s+'CMake/src/CMakeLists.txt', s+'gpu/GpuRuntime.h', s+'gpu/GpuAbstraction.h'], + s+'CMake/src/CMakeLists.txt' ], 'SubProcesses': [s+'gpu/nvtx.h', s+'gpu/timer.h', s+'gpu/timermap.h', s+'gpu/ompnumthreads.h', s+'gpu/GpuRuntime.h', s+'gpu/GpuAbstraction.h', s+'gpu/MemoryAccessHelpers.h', s+'gpu/MemoryAccessVectors.h', From 47e2b8f71ddd18df57e3b01be8b8cfeba8b8ad0a Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Wed, 24 Jan 2024 17:22:32 +0100 Subject: [PATCH 27/96] [jt774] in CODEGEN mgOnGpuFptypes.h, replace one more __CUDACC__ by MGONGPUCPP_GPUIMPL... 
not clear why this was not done yet In branch jthip24, this is coming from Jorgen's commit 6741186a50be8c16a09630a959a6327d2b4a7a8a (Thu Jul 13 15:15:41 2023 +0200) which includes many such changes --- .../madgraph/iolibs/template_files/gpu/mgOnGpuFptypes.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuFptypes.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuFptypes.h index 83a46c1d4e..fa3a02664b 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuFptypes.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuFptypes.h @@ -12,7 +12,7 @@ #include // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // cuda namespace mg5amcGpu #else namespace mg5amcCpu From 721652e3d25b4332df673f92134f11a759dc5454 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Tue, 18 Jul 2023 18:11:04 +0200 Subject: [PATCH 28/96] [jt774] cherry-pick commit 1b5c0fdff ([jthip] backport to CODEGEN from ggttgg.mad on Tue Jul 18 18:11:04 2023 +0200) Fix conflicts: epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/cpp_model_parameters_h.inc epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/check_sa.cc epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuConfig.h NB: this is very strange, because this same commit 1b5c0fdff is already included in the jt774 branch earlier on... --- .../iolibs/template_files/gpu/check_sa.cc | 18 +++++++++++------- .../iolibs/template_files/gpu/mgOnGpuConfig.h | 3 --- 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/check_sa.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/check_sa.cc index b9a05dea46..c336edb68a 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/check_sa.cc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/check_sa.cc @@ -761,7 +761,7 @@ main( int argc, char** argv ) rndgentxt = "CURAND HOST"; else if( rndgen == RandomNumberMode::CurandDevice ) rndgentxt = "CURAND DEVICE"; -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ rndgentxt += " (CUDA code)"; #elif defined __HIPCC__ rndgentxt += " (HIP code)"; @@ -771,8 +771,8 @@ main( int argc, char** argv ) // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? -#ifdef MGONGPUCPP_GPUIMPL + // -- CUDA or HIP or C++? +#ifdef __CUDACC__ wrkflwtxt += "CUD:"; #elif defined __HIPCC__ wrkflwtxt += "HIP:"; @@ -790,7 +790,7 @@ main( int argc, char** argv ) wrkflwtxt += "???+"; // no path to this statement #endif // -- CUCOMPLEX or THRUST or STD complex numbers? 
-#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX wrkflwtxt += "CUX:"; #elif defined MGONGPU_CUCXTYPE_THRUST @@ -806,6 +806,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_CUCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -906,7 +912,7 @@ main( int argc, char** argv ) #endif // Dump all configuration parameters and all results std::cout << std::string( SEP79, '*' ) << std::endl -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" #elif defined __HIPCC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" @@ -936,7 +942,6 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef MGONGPUCPP_GPUIMPL #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST @@ -1078,7 +1083,6 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef MGONGPUCPP_GPUIMPL #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuConfig.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuConfig.h index bca351fa89..46a8f0efc0 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuConfig.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuConfig.h @@ -6,8 +6,6 @@ #ifndef MGONGPUCONFIG_H #define MGONGPUCONFIG_H 1 -#include "GpuRuntime.h" // Includes the GPU abstraction - // HARDCODED AT CODE GENERATION TIME: DO NOT MODIFY (#473) // There are two different code bases for standalone_cudacpp (without multichannel) and madevent+cudacpp (with multichannel) %(mgongpu_supports_multichannel)s @@ -17,7 +15,6 @@ #define MGONGPUCPP_GPUIMPL cuda #elif defined __HIPCC__ #define MGONGPUCPP_GPUIMPL hip -#include "hip/hip_runtime.h" #else #undef MGONGPUCPP_GPUIMPL #endif From 71a9ece3cf70058569457f44f95f50b16ba51421 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Wed, 24 Jan 2024 16:35:03 +0100 Subject: [PATCH 29/96] [jthip24] (after merging upstream/master) fix clang formatting in CODEGEN (code generation was failing clang formatting checks) This is a cherry-pick of f44a9c77344c1dd2f18c08e48715fe723a32e588 from jthip24 --- .../madgraph/iolibs/template_files/gpu/check_sa.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/check_sa.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/check_sa.cc index c336edb68a..748ecad0cc 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/check_sa.cc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/check_sa.cc @@ -778,7 +778,7 @@ main( int argc, char** argv ) wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; -#endif +#endif /* clang-format off */ // -- DOUBLE or FLOAT? 
#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT /* clang-format off */ wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) @@ -788,7 +788,7 @@ main( int argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif +#endif /* clang-format on */ // -- CUCOMPLEX or THRUST or STD complex numbers? #ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX From 809001313092940ef7799b9eec7d119cb229e82e Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Thu, 25 Jan 2024 17:09:21 +0100 Subject: [PATCH 30/96] [jt774] copy CODEGEN check_sa.cc from jthip24 as-is (currently f44a9c77344c1dd2f18c08e48715fe723a32e588) Code generation is now succeeding (it was previously failing in clang-format) git checkout f44a9c77344c1dd2f18c08e48715fe723a32e588 CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/check_sa.cc --- .../madgraph/iolibs/template_files/gpu/check_sa.cc | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/check_sa.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/check_sa.cc index 748ecad0cc..7cac5ab47b 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/check_sa.cc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/check_sa.cc @@ -780,7 +780,7 @@ main( int argc, char** argv ) wrkflwtxt += "CPP:"; #endif /* clang-format off */ // -- DOUBLE or FLOAT? -#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT /* clang-format off */ +#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) #elif defined MGONGPU_FPTYPE_DOUBLE wrkflwtxt += "DBL+"; @@ -799,12 +799,6 @@ main( int argc, char** argv ) wrkflwtxt += "CXS:"; #else wrkflwtxt += "???:"; // no path to this statement -#endif /* clang-format on */ -#elif defined __HIPCC__ -#if defined MGONGPU_CUCXTYPE_CXSMPL - wrkflwtxt += "CXS:"; -#else - wrkflwtxt += "???:"; // no path to this statement #endif #elif defined __HIPCC__ #if defined MGONGPU_CUCXTYPE_CXSMPL @@ -1090,7 +1084,7 @@ main( int argc, char** argv ) #elif defined MGONGPU_CUCXTYPE_CXSMPL << "\"STD::COMPLEX\"," << std::endl #else - << "\"???\"," << std::endl // no path to this statement... + << "\"???\"," << std::endl // no path to this statement... 
#endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" From 9a8c86c144cdba81eea5dd6f111eb40d90cbd537 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Thu, 25 Jan 2024 17:14:11 +0100 Subject: [PATCH 31/96] [jt774] add one empty line in CODEGEN MemoryAccessMomenta.h as in jthip24 (I accidentally removed it in commit e32bc4e6ea9ac0c3808c9644e5526c1b2bda3db2 on Wed Jan 24 10:36:44 2024 +0100) --- .../madgraph/iolibs/template_files/gpu/MemoryAccessMomenta.h | 1 + 1 file changed, 1 insertion(+) diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessMomenta.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessMomenta.h index 86df5d5471..3be229d392 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessMomenta.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessMomenta.h @@ -27,6 +27,7 @@ namespace mg5amcCpu class MemoryAccessMomentaBase //_AOSOAv1 { public: + // Number of Events Per Page in the momenta AOSOA memory buffer layout // (these are all best kept as a compile-time constants: see issue #23) #ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ From 0a069c7e4f1c90dd5fd285b4621f766f3f6c230a Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Thu, 25 Jan 2024 16:46:19 +0100 Subject: [PATCH 32/96] [jthip24] Remove hip_runtime.h from CODEGEN mgOnGpuConfig.h and add it back to GpuAbstraction.h Revert "[CODEGEN] Added HIP runtime include in mgOnGpuConfig.h in codegen" This reverts Jorgen's commit 35913a385f9961f4ca8e67aabaa37940149c5aa5 (2023-07-13 15:15:41 +0200) This is a cherry-pick in jt774 of d1131c1acbbb9688e0d1224abf228f5f074501cf from jthip24 --- .../madgraph/iolibs/template_files/gpu/GpuAbstraction.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/GpuAbstraction.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/GpuAbstraction.h index 9c467b1e04..6a7d9c05c0 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/GpuAbstraction.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/GpuAbstraction.h @@ -39,6 +39,8 @@ #elif defined __HIPCC__ +#include "hip/hip_runtime.h" + #define gpuError_t hipError_t #define gpuPeekAtLastError hipPeekAtLastError #define gpuGetErrorString hipGetErrorString From 89170af9217e16fc74acbbdc705dc92855251bf6 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Thu, 25 Jan 2024 17:29:59 +0100 Subject: [PATCH 33/96] [jt774] copy CODEGEN cpp_model_parameters_h.inc from jthip24 as-is (currently af0f0d4458fd5089ff47188b5631e6aa8e1014f3) This is meant to fix the build of the code generated ggtt.mad (it was failing before in jt774 while it succeeds in jthip24) However code generation is now failing in clang formatting git checkout af0f0d4458fd5089ff47188b5631e6aa8e1014f3 CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/cpp_model_parameters_h.inc --- .../template_files/cpp_model_parameters_h.inc | 36 ++++++++----------- 1 file changed, 15 insertions(+), 21 deletions(-) diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/cpp_model_parameters_h.inc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/cpp_model_parameters_h.inc index 8b8797c04c..94b8dd6444 100644 --- 
a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/cpp_model_parameters_h.inc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/cpp_model_parameters_h.inc @@ -212,34 +212,28 @@ namespace mg5amcCpu //========================================================================== -#ifdef MGONGPUCPP_GPUIMPL - namespace mg5amcGpu -#else - namespace mg5amcCpu -#endif - { #pragma GCC diagnostic push #ifndef __clang__ #pragma GCC diagnostic ignored "-Wunused-but-set-variable" // e.g. <> #endif - // Compute the output couplings (e.g. gc10 and gc11) from the input gs - template - __device__ inline void - G2COUP( const fptype gs[], - fptype couplings[] ) - { - mgDebug( 0, __FUNCTION__ ); - using namespace Parameters_%(model_name)s_dependentCouplings; - const fptype_sv& gs_sv = G_ACCESS::kernelAccessConst( gs ); - DependentCouplings_sv couplings_sv = computeDependentCouplings_fromG( gs_sv ); + // Compute the output couplings (e.g. gc10 and gc11) from the input gs + template + __device__ inline void + G2COUP( const fptype gs[], + fptype couplings[] ) + { + mgDebug( 0, __FUNCTION__ ); + using namespace Parameters_%(model_name)s_dependentCouplings; + const fptype_sv& gs_sv = G_ACCESS::kernelAccessConst( gs ); + DependentCouplings_sv couplings_sv = computeDependentCouplings_fromG( gs_sv ); %(dcoupaccessbuffer)s%(dcoupkernelaccess)s%(dcoupcompute)s - mgDebug( 1, __FUNCTION__ ); - return; - } + mgDebug( 1, __FUNCTION__ ); + return; + } #pragma GCC diagnostic pop - } // end namespace mg5amcGpu/mg5amcCpu +} // end namespace mg5amcGpu/mg5amcCpu - //========================================================================== +//========================================================================== #endif // Parameters_%(model_name)s_H From 69d5ed60ba3f202b4350f0e1bc312125b9746c9e Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Thu, 25 Jan 2024 17:34:17 +0100 Subject: [PATCH 34/96] [jt774] fix clang formatting in CODEGEN model_handling.py after the previous commit (undo the changes here from commit e32bc4e6ea9ac0c3808c9644e5526c1b2bda3db2 on Wed Jan 24 10:36:44 2024 +0100) The code generated ggtt.mad is now finally succeeding in jt774, as it does in jthip24. 
HOWEVER the two code branches are not identical yet, there is still a minor difference in makefiles --- .../CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/model_handling.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/model_handling.py b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/model_handling.py index b585102292..3e0ebe545f 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/model_handling.py +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/model_handling.py @@ -859,11 +859,11 @@ def super_generate_parameters_class_files(self): replace_dict['dcoupsetdpar'] = ' ' + '\n'.join( dcoupsetdpar ) dcoupsetdcoup = [ ' ' + line.replace('constexpr cxsmpl ','out.').replace('mdl_complexi', 'cI') for line in self.write_hardcoded_parameters(list(self.coups_dep.values())).split('\n') if line != '' ] replace_dict['dcoupsetdcoup'] = ' ' + '\n'.join( dcoupsetdcoup ) - dcoupaccessbuffer = [ ' fptype* %ss = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_%s );'%( name, name ) for name in self.coups_dep ] + dcoupaccessbuffer = [ ' fptype* %ss = C_ACCESS::idcoupAccessBuffer( couplings, idcoup_%s );'%( name, name ) for name in self.coups_dep ] replace_dict['dcoupaccessbuffer'] = '\n'.join( dcoupaccessbuffer ) + '\n' - dcoupkernelaccess = [ ' cxtype_sv_ref %ss_sv = C_ACCESS::kernelAccess( %ss );'%( name, name ) for name in self.coups_dep ] + dcoupkernelaccess = [ ' cxtype_sv_ref %ss_sv = C_ACCESS::kernelAccess( %ss );'%( name, name ) for name in self.coups_dep ] replace_dict['dcoupkernelaccess'] = '\n'.join( dcoupkernelaccess ) + '\n' - dcoupcompute = [ ' %ss_sv = couplings_sv.%s;'%( name, name ) for name in self.coups_dep ] + dcoupcompute = [ ' %ss_sv = couplings_sv.%s;'%( name, name ) for name in self.coups_dep ] replace_dict['dcoupcompute'] = '\n'.join( dcoupcompute ) # Special handling in EFT for fptype=float using SIMD dcoupoutfptypev2 = [ ' fptype_v %sr_v;\n fptype_v %si_v;'%(name,name) for name in self.coups_dep ] From d43cfeb5d02ded294cc291bcb6a9b58fec379287 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Thu, 25 Jan 2024 17:43:48 +0100 Subject: [PATCH 35/96] [jt774] copy CODEGEN cudacpp.mk from jthip24 as-is (currently 4ba21335e6ae4a0b1ea379cfdf565d72030f7a2e) git checkout 4ba21335e6ae4a0b1ea379cfdf565d72030f7a2e CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk NB: CODEGEN in jt774 is now identical to that in jthip24! --- .../madgraph/iolibs/template_files/gpu/cudacpp.mk | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk index 011a5326ab..dbca8e330f 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk @@ -180,7 +180,8 @@ ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here! 
endif CUOPTFLAGS = -lineinfo - GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + ###GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + GPUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow ###GPUCC_VERSION = $(shell $(GPUCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) GPUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h @@ -573,7 +574,7 @@ $(BUILDDIR)/%%.o : %%.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) # Apply special build flags only to CrossSectionKernel.cc and gCrossSectionKernel.cu (no fast math, see #117 and #516) # Added edgecase for HIP compilation ifeq ($(shell $(CXX) --version | grep ^nvc++),) -$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math +$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS := $(filter-out -ffast-math,$(CXXFLAGS)) $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math ifeq ($(findstring nvcc,$(GPUCC)),nvcc) $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -fno-fast-math From 5e424fe41675a179291eb9b2d8d8277131c01fe5 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Thu, 25 Jan 2024 17:46:29 +0100 Subject: [PATCH 36/96] [jt774] regenerate ggtt.mad - add add the previously absent GpuAbstraction.h and GpuRuntime.h files and symlinks --- .../gg_tt.mad/CODEGEN_mad_gg_tt_log.txt | 22 +- epochX/cudacpp/gg_tt.mad/COPYRIGHT | 1 + .../cudacpp/gg_tt.mad/SubProcesses/Bridge.h | 32 +-- .../gg_tt.mad/SubProcesses/BridgeKernels.cc | 9 +- .../gg_tt.mad/SubProcesses/BridgeKernels.h | 8 +- .../SubProcesses/CommonRandomNumberKernel.cc | 5 +- .../SubProcesses/CrossSectionKernels.cc | 7 +- .../SubProcesses/CrossSectionKernels.h | 6 +- .../SubProcesses/CurandRandomNumberKernel.cc | 12 +- .../gg_tt.mad/SubProcesses/EventStatistics.h | 4 +- .../gg_tt.mad/SubProcesses/GpuAbstraction.h | 71 ++++++ .../{CudaRuntime.h => GpuRuntime.h} | 54 ++-- .../gg_tt.mad/SubProcesses/MadgraphTest.h | 8 +- .../SubProcesses/MatrixElementKernels.cc | 26 +- .../SubProcesses/MatrixElementKernels.h | 8 +- .../SubProcesses/MemoryAccessAmplitudes.h | 2 +- .../SubProcesses/MemoryAccessCouplings.h | 2 +- .../SubProcesses/MemoryAccessCouplingsFixed.h | 2 +- .../SubProcesses/MemoryAccessDenominators.h | 2 +- .../gg_tt.mad/SubProcesses/MemoryAccessGs.h | 2 +- .../SubProcesses/MemoryAccessHelpers.h | 4 +- .../SubProcesses/MemoryAccessMatrixElements.h | 2 +- .../SubProcesses/MemoryAccessMomenta.h | 6 +- .../SubProcesses/MemoryAccessNumerators.h | 2 +- .../SubProcesses/MemoryAccessRandomNumbers.h | 4 +- .../SubProcesses/MemoryAccessVectors.h | 4 +- .../SubProcesses/MemoryAccessWavefunctions.h | 2 +- .../gg_tt.mad/SubProcesses/MemoryBuffers.h | 64 ++--- .../SubProcesses/P1_gg_ttx/CPPProcess.cc | 62 ++--- .../SubProcesses/P1_gg_ttx/CPPProcess.h | 10 +- .../SubProcesses/P1_gg_ttx/CudaRuntime.h | 1 - .../SubProcesses/P1_gg_ttx/GpuAbstraction.h | 1 + .../SubProcesses/P1_gg_ttx/GpuRuntime.h | 1 + .../SubProcesses/P1_gg_ttx/check_sa.cc | 111 +++++---- .../SubProcesses/RamboSamplingKernels.cc | 20 +- .../SubProcesses/RamboSamplingKernels.h | 6 +- .../SubProcesses/RandomNumberKernels.h | 6 +- .../cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk | 232 +++++++++++------- .../cudacpp/gg_tt.mad/SubProcesses/fbridge.cc | 16 +- .../gg_tt.mad/SubProcesses/fsampler.cc | 8 +- 
.../cudacpp/gg_tt.mad/SubProcesses/runTest.cc | 12 +- .../gg_tt.mad/SubProcesses/testmisc.cc | 8 +- .../cudacpp/gg_tt.mad/SubProcesses/testxxx.cc | 14 +- epochX/cudacpp/gg_tt.mad/src/HelAmps_sm.h | 4 +- epochX/cudacpp/gg_tt.mad/src/Parameters_sm.cc | 4 +- epochX/cudacpp/gg_tt.mad/src/Parameters_sm.h | 10 +- epochX/cudacpp/gg_tt.mad/src/cudacpp_src.mk | 23 +- epochX/cudacpp/gg_tt.mad/src/mgOnGpuConfig.h | 73 ++++-- epochX/cudacpp/gg_tt.mad/src/mgOnGpuCxtypes.h | 28 +-- epochX/cudacpp/gg_tt.mad/src/mgOnGpuFptypes.h | 12 +- epochX/cudacpp/gg_tt.mad/src/mgOnGpuVectors.h | 18 +- epochX/cudacpp/gg_tt.mad/src/rambo.h | 8 +- 52 files changed, 625 insertions(+), 434 deletions(-) create mode 100644 epochX/cudacpp/gg_tt.mad/SubProcesses/GpuAbstraction.h rename epochX/cudacpp/gg_tt.mad/SubProcesses/{CudaRuntime.h => GpuRuntime.h} (62%) delete mode 120000 epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CudaRuntime.h create mode 120000 epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/GpuAbstraction.h create mode 120000 epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/GpuRuntime.h diff --git a/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt b/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt index a477013568..360771ac98 100644 --- a/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt +++ b/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt @@ -52,7 +52,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt -No valid web browser found. Please set in ./input/mg5_configuration.txt +Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt.mg The import format was not given, so we guess it as command set stdout_level DEBUG @@ -62,7 +62,7 @@ generate g g > t t~ No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005816459655761719  +DEBUG: model prefixing takes 0.005492210388183594  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -175,7 +175,7 @@ INFO: Generating Helas calls for process: g g > t t~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t t~ @1 INFO: Creating files in directory P1_gg_ttx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -191,16 +191,16 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
INFO: Generating Feynman diagrams for Process: g g > t t~ WEIGHTED<=2 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttx Generated helas calls for 1 subprocesses (3 diagrams) in 0.006 s -Wrote files for 10 helas calls in 0.103 s +Wrote files for 10 helas calls in 0.101 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 2 routines in 0.155 s +ALOHA: aloha creates 2 routines in 0.145 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 4 routines in 0.135 s +ALOHA: aloha creates 4 routines in 0.131 s VVV1 FFV1 FFV1 @@ -237,9 +237,9 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. quit -real 0m1.729s -user 0m1.515s -sys 0m0.204s +real 0m1.713s +user 0m1.482s +sys 0m0.227s Code generation completed in 2 seconds ************************************************************ * * @@ -266,7 +266,7 @@ INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amc INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards/me5_configuration.txt Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt -No valid web browser found. Please set in ./input/mg5_configuration.txt +Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards run quit INFO: @@ -296,7 +296,7 @@ INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amc INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards/me5_configuration.txt Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt -No valid web browser found. Please set in ./input/mg5_configuration.txt +Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards param quit INFO: diff --git a/epochX/cudacpp/gg_tt.mad/COPYRIGHT b/epochX/cudacpp/gg_tt.mad/COPYRIGHT index a134b5fef9..84a883fbb0 100644 --- a/epochX/cudacpp/gg_tt.mad/COPYRIGHT +++ b/epochX/cudacpp/gg_tt.mad/COPYRIGHT @@ -15,6 +15,7 @@ The full development team currently includes the following authors : Stephan Hageboeck (CERN) Olivier Mattelaer (Universite Catholique de Louvain, original author) Stefan Roiser (CERN, original author) + Joergen Teig (CERN) Andrea Valassi (CERN, original author) Zenny Wettersten (CERN) See https://github.com/madgraph5/madgraph4gpu for more details. For the full diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/Bridge.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/Bridge.h index bf8b5e024d..89437b4c42 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/Bridge.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. 
Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef BRIDGE_H #define BRIDGE_H 1 @@ -23,7 +23,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +83,7 @@ namespace mg5amcCpu Bridge& operator=( const Bridge& ) = delete; Bridge& operator=( Bridge&& ) = delete; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL /** * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != gpublocks*gputhreads * (this is needed for BridgeKernel tests rather than for actual production use in Fortran) @@ -150,7 +150,7 @@ namespace mg5amcCpu unsigned int m_nevt; // number of events int m_nGoodHel; // the number of good helicities (-1 initially when they have not yet been calculated) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL int m_gputhreads; // number of gpu threads (default set from number of events, can be modified) int m_gpublocks; // number of gpu blocks (default set from number of events, can be modified) DeviceBuffer m_devMomentaF; @@ -187,12 +187,12 @@ namespace mg5amcCpu // Forward declare transposition methods // -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL template void hst_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); @@ -209,7 +209,7 @@ namespace mg5amcCpu Bridge::Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ) : m_nevt( nevtF ) , m_nGoodHel( -1 ) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL , m_gputhreads( 256 ) // default number of gpu threads , m_gpublocks( m_nevt / m_gputhreads ) // this ensures m_nevt <= m_gpublocks*m_gputhreads , m_devMomentaF( m_nevt ) @@ -233,7 +233,7 @@ namespace mg5amcCpu { if( nparF != CPPProcess::npar ) throw std::runtime_error( "Bridge constructor: npar mismatch" ); if( np4F != CPPProcess::np4 ) throw std::runtime_error( "Bridge constructor: np4 mismatch" ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( ( m_nevt < s_gputhreadsmin ) || ( m_nevt % s_gputhreadsmin != 0 ) ) throw std::runtime_error( "Bridge constructor: nevt should be a multiple of " + std::to_string( s_gputhreadsmin ) ); while( m_nevt != m_gpublocks * m_gputhreads ) @@ -249,7 +249,7 @@ namespace mg5amcCpu #else std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate is called from several Fortran threads? 
@@ -262,7 +262,7 @@ namespace mg5amcCpu process.initProc( paramCard ); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void Bridge::set_gpugrid( const int gpublocks, const int gputhreads ) { @@ -276,7 +276,7 @@ namespace mg5amcCpu } #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void Bridge::gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -291,14 +291,14 @@ namespace mg5amcCpu constexpr int neppM = MemoryAccessMomenta::neppM; if constexpr( neppM == 1 && std::is_same_v ) { - checkCuda( cudaMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), gpuMemcpyHostToDevice ); } else { - checkCuda( cudaMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), gpuMemcpyHostToDevice ); const int thrPerEvt = CPPProcess::npar * CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 event per thread) //const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... this seems slower - dev_transposeMomentaF2C<<>>( m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); + gpuLaunchKernel( dev_transposeMomentaF2C, m_gpublocks * thrPerEvt, m_gputhreads, m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); } if constexpr( std::is_same_v ) { @@ -341,7 +341,7 @@ namespace mg5amcCpu } #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL template void Bridge::cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -396,7 +396,7 @@ namespace mg5amcCpu // - C++ array: momenta[npagM][npar][np4][neppM] with nevt=npagM*neppM (AOSOA) // -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ) { diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/BridgeKernels.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/BridgeKernels.cc index d58066c9c1..eaf4037a24 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/BridgeKernels.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/BridgeKernels.cc @@ -1,17 +1,18 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "BridgeKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMomenta.h" #include //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -45,7 +46,7 @@ namespace mg5amcCpu //============================================================================ -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu { @@ -96,7 +97,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/BridgeKernels.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/BridgeKernels.h index 15eb4bff4d..3efef8ce97 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/BridgeKernels.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/BridgeKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. 
// Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef BRIDGEKERNELS_H #define BRIDGEKERNELS_H 1 @@ -12,7 +12,7 @@ #include "MatrixElementKernels.h" #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -49,7 +49,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a CPU host class BridgeKernelHost final : public BridgeKernelBase { @@ -89,7 +89,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a GPU device class BridgeKernelDevice : public BridgeKernelBase { diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/CommonRandomNumberKernel.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/CommonRandomNumberKernel.cc index 985b39f576..010bc4cbd0 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/CommonRandomNumberKernel.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/CommonRandomNumberKernel.cc @@ -1,15 +1,16 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "CommonRandomNumbers.h" +#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/CrossSectionKernels.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/CrossSectionKernels.cc index 0b355a3c8d..c15b39844d 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/CrossSectionKernels.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/CrossSectionKernels.cc @@ -1,10 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
#include "CrossSectionKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessWeights.h" #include "MemoryBuffers.h" @@ -77,7 +78,7 @@ debug_me_is_abnormal( const fptype& me, size_t ievtALL ) //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -185,7 +186,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/CrossSectionKernels.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/CrossSectionKernels.h index 7933ca4bbf..4d9659e04e 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/CrossSectionKernels.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/CrossSectionKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef CROSSSECTIONKERNELS_H #define CROSSSECTIONKERNELS_H 1 @@ -13,7 +13,7 @@ //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -96,7 +96,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating the calculation of event statistics on a GPU device class CrossSectionKernelDevice : public CrossSectionKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/CurandRandomNumberKernel.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/CurandRandomNumberKernel.cc index eb56333b03..08a16f6f2c 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/CurandRandomNumberKernel.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/CurandRandomNumberKernel.cc @@ -1,9 +1,9 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
-#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" @@ -22,7 +22,7 @@ inline void assertCurand( curandStatus_t code, const char *file, int line, bool } #endif /* clang-format on */ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -36,7 +36,7 @@ namespace mg5amcCpu { if( m_isOnDevice ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !m_rnarray.isOnDevice() ) throw std::runtime_error( "CurandRandomNumberKernel on device with a host random number array" ); #else @@ -114,7 +114,7 @@ namespace mg5amcCpu /* printf( "\nCurandRandomNumberKernel::generateRnarray size = %d\n", (int)m_rnarray.size() ); fptype* data = m_rnarray.data(); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( m_rnarray.isOnDevice() ) { data = new fptype[m_rnarray.size()](); @@ -123,7 +123,7 @@ namespace mg5amcCpu #endif for( int i = 0; i < ( (int)m_rnarray.size() / 4 ); i++ ) printf( "[%4d] %f %f %f %f\n", i * 4, data[i * 4], data[i * 4 + 2], data[i * 4 + 2], data[i * 4 + 3] ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( m_rnarray.isOnDevice() ) delete[] data; #endif */ diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/EventStatistics.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/EventStatistics.h index 48b51e0a49..b425a5bade 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/EventStatistics.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/EventStatistics.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef EventStatistics_H #define EventStatistics_H 1 @@ -16,7 +16,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/GpuAbstraction.h new file mode 100644 index 0000000000..6a7d9c05c0 --- /dev/null +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/GpuAbstraction.h @@ -0,0 +1,71 @@ +// Copyright (C) 2020-2023 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: J. Teig (Jul 2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
+ +#ifndef MG5AMC_GPUABSTRACTION_H +#define MG5AMC_GPUABSTRACTION_H 1 + +#include + +//-------------------------------------------------------------------------- + +#ifdef __CUDACC__ + +#define gpuError_t cudaError_t +#define gpuPeekAtLastError cudaPeekAtLastError +#define gpuGetErrorString cudaGetErrorString +#define gpuSuccess cudaSuccess + +#define gpuMallocHost( ptr, size ) checkGpu( cudaMallocHost( ptr, size ) ) +#define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) ) + +#define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( cudaMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemcpyHostToDevice cudaMemcpyHostToDevice +#define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost +#define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( cudaMemcpyToSymbol( type1, type2, size ) ) + +#define gpuFree( ptr ) checkGpu( cudaFree( ptr ) ) +#define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) ) + +#define gpuSetDevice cudaSetDevice +#define gpuDeviceSynchronize cudaDeviceSynchronize +#define gpuDeviceReset cudaDeviceReset + +#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) +#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_ARGS__ ) + +//-------------------------------------------------------------------------- + +#elif defined __HIPCC__ + +#include "hip/hip_runtime.h" + +#define gpuError_t hipError_t +#define gpuPeekAtLastError hipPeekAtLastError +#define gpuGetErrorString hipGetErrorString +#define gpuSuccess hipSuccess + +#define gpuMallocHost( ptr, size ) checkGpu( hipHostMalloc( ptr, size ) ) // HostMalloc better +#define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) ) + +#define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( hipMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemcpyHostToDevice hipMemcpyHostToDevice +#define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost +#define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( hipMemcpyToSymbol( type1, type2, size ) ) + +#define gpuFree( ptr ) checkGpu( hipFree( ptr ) ) +#define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) ) + +#define gpuSetDevice hipSetDevice +#define gpuDeviceSynchronize hipDeviceSynchronize +#define gpuDeviceReset hipDeviceReset + +#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) +#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_ARGS__ ) + +//-------------------------------------------------------------------------- + +#endif + +#endif // MG5AMC_GPUABSTRACTION_H diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/CudaRuntime.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/GpuRuntime.h similarity index 62% rename from epochX/cudacpp/gg_tt.mad/SubProcesses/CudaRuntime.h rename to epochX/cudacpp/gg_tt.mad/SubProcesses/GpuRuntime.h index 64ce52f4b3..93579ef08b 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/CudaRuntime.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/GpuRuntime.h @@ -1,49 +1,50 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). -// Created by: S. Roiser (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Created by: J. Teig (Jun 2023, based on earlier work by S. Roiser) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
-#ifndef MG5AMC_CUDARUNTIME_H -#define MG5AMC_CUDARUNTIME_H 1 +#ifndef MG5AMC_GPURUNTIME_H +#define MG5AMC_GPURUNTIME_H 1 // MG5AMC on GPU uses the CUDA runtime API, not the lower level CUDA driver API // See https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html#driver-vs-runtime-api -#include +#include "GpuAbstraction.h" + #include //-------------------------------------------------------------------------- // See https://stackoverflow.com/a/14038590 -#ifdef __CUDACC__ /* clang-format off */ -#define checkCuda( code ) { assertCuda( code, __FILE__, __LINE__ ); } -inline void assertCuda( cudaError_t code, const char* file, int line, bool abort = true ) +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#define checkGpu( code ) { assertGpu( code, __FILE__, __LINE__ ); } +inline void assertGpu( gpuError_t code, const char* file, int line, bool abort = true ) { - if( code != cudaSuccess ) + if( code != gpuSuccess ) { - printf( "ERROR! assertCuda: '%s' (%d) in %s:%d\n", cudaGetErrorString( code ), code, file, line ); - if( abort ) assert( code == cudaSuccess ); + printf( "ERROR! assertGpu: '%s' (%d) in %s:%d\n", gpuGetErrorString( code ), code, file, line ); + if( abort ) assert( code == gpuSuccess ); } } #endif /* clang-format on */ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor + // Instantiate a GpuRuntime at the beginnining of the application's main to + // invoke gpuSetDevice(0) in the constructor and book a gpuDeviceReset() call in the destructor // *** FIXME! This will all need to be designed differently when going to multi-GPU nodes! *** - struct CudaRuntime final + struct GpuRuntime final { - CudaRuntime( const bool debug = true ) + GpuRuntime( const bool debug = true ) : m_debug( debug ) { setUp( m_debug ); } - ~CudaRuntime() { tearDown( m_debug ); } - CudaRuntime( const CudaRuntime& ) = delete; - CudaRuntime( CudaRuntime&& ) = delete; - CudaRuntime& operator=( const CudaRuntime& ) = delete; - CudaRuntime& operator=( CudaRuntime&& ) = delete; + ~GpuRuntime() { tearDown( m_debug ); } + GpuRuntime( const GpuRuntime& ) = delete; + GpuRuntime( GpuRuntime&& ) = delete; + GpuRuntime& operator=( const GpuRuntime& ) = delete; + GpuRuntime& operator=( GpuRuntime&& ) = delete; bool m_debug; // Set up CUDA application @@ -62,8 +63,8 @@ namespace mg5amcGpu */ // Replace cudaFree(0) by cudaSetDevice(0), even if it is not really needed either // (but see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs) - if( debug ) std::cout << "__CudaRuntime: calling cudaSetDevice(0)" << std::endl; - checkCuda( cudaSetDevice( 0 ) ); // SLOW! + if( debug ) std::cout << "__GpuRuntime: calling GpuSetDevice(0)" << std::endl; + checkGpu( gpuSetDevice( 0 ) ); // SLOW! 
} // Tear down CUDA application (call cudaDeviceReset) @@ -72,14 +73,13 @@ namespace mg5amcGpu // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking static void tearDown( const bool debug = true ) { - if( debug ) std::cout << "__CudaRuntime: calling cudaDeviceReset()" << std::endl; - checkCuda( cudaDeviceReset() ); + if( debug ) std::cout << "__GpuRuntime: calling GpuDeviceReset()" << std::endl; + checkGpu( gpuDeviceReset() ); } }; - } #endif //-------------------------------------------------------------------------- -#endif // MG5AMC_CUDARUNTIME_H +#endif // MG5AMC_GPURUNTIME_H diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MadgraphTest.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/MadgraphTest.h index ef40624c88..a64c05c26a 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MadgraphTest.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MadgraphTest.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Dec 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #ifndef MADGRAPHTEST_H_ #define MADGRAPHTEST_H_ 1 @@ -22,7 +22,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; @@ -201,7 +201,7 @@ class MadgraphTest : public testing::TestWithParam // Since we link both the CPU-only and GPU tests into the same executable, we prevent // a multiply defined symbol by only compiling this in the non-CUDA phase: -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL /// Compare momenta and matrix elements. /// This uses an implementation of TestDriverBase to run a madgraph workflow, @@ -307,6 +307,6 @@ TEST_P( MadgraphTest, CompareMomentaAndME ) } } -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL #endif /* MADGRAPHTEST_H_ */ diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.cc index 74b5239ebf..81699dfea9 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.cc @@ -1,12 +1,12 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "MatrixElementKernels.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" // Includes the abstraction for Nvidia/AMD compilation #include "MemoryAccessMomenta.h" #include "MemoryBuffers.h" @@ -14,7 +14,7 @@ //============================================================================ -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu { @@ -150,7 +150,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { @@ -209,13 +209,13 @@ namespace mg5amcGpu PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); DeviceBufferHelicityMask devIsGoodHel( ncomb ); // ... 0d1. 
Compute good helicity mask on the device - computeDependentCouplings<<>>( m_gs.data(), m_couplings.data() ); + gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - sigmaKin_getGoodHel<<>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); + gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); #else - sigmaKin_getGoodHel<<>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); + gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); #endif - checkCuda( cudaPeekAtLastError() ); + checkGpu( gpuPeekAtLastError() ); // ... 0d2. Copy back good helicity mask to the host copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); // ... 0d3. Copy back good helicity list to constant memory on the device @@ -226,19 +226,19 @@ namespace mg5amcGpu void MatrixElementKernelDevice::computeMatrixElements( const unsigned int channelId ) { - computeDependentCouplings<<>>( m_gs.data(), m_couplings.data() ); + gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); #ifndef MGONGPU_NSIGHT_DEBUG constexpr unsigned int sharedMemSize = 0; #else constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - sigmaKin<<>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); + gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); #else - sigmaKin<<>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); + gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); #endif - checkCuda( cudaPeekAtLastError() ); - checkCuda( cudaDeviceSynchronize() ); + checkGpu( gpuPeekAtLastError() ); + checkGpu( gpuDeviceSynchronize() ); } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.h index 23e84757a2..72bd8f195b 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
#ifndef MATRIXELEMENTKERNELS_H #define MATRIXELEMENTKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -81,7 +81,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a CPU host class MatrixElementKernelHost final : public MatrixElementKernelBase, public NumberOfEvents { @@ -130,7 +130,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a GPU device class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessAmplitudes.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessAmplitudes.h index 573b3bbbc9..ffb76e93de 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessAmplitudes.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessAmplitudes.h @@ -15,7 +15,7 @@ #define MGONGPU_TRIVIAL_AMPLITUDES 1 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessCouplings.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessCouplings.h index 35a3af42e0..3afdf3e554 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessCouplings.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessCouplings.h @@ -15,7 +15,7 @@ #include "MemoryBuffers.h" // for HostBufferCouplings::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessCouplingsFixed.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessCouplingsFixed.h index dc0d93afff..ffcdf4dbef 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessCouplingsFixed.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessCouplingsFixed.h @@ -14,7 +14,7 @@ //#include "MemoryAccessHelpers.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessDenominators.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessDenominators.h index 3bce635718..66f2d32a6b 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessDenominators.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessDenominators.h @@ -10,7 +10,7 @@ #include "MemoryAccessGs.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessGs.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessGs.h index 31311aa375..4c726b30f3 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessGs.h +++ 
b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessGs.h @@ -13,7 +13,7 @@ #include "MemoryBuffers.h" // for HostBufferMatrixElements::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessHelpers.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessHelpers.h index c82a6c7635..db73e4e064 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessHelpers.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessHelpers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessHelpers_H #define MemoryAccessHelpers_H 1 @@ -105,7 +105,7 @@ class KernelAccessHelper : public MemoryAccessHelper } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid //printf( "kernelAccessRecord: ievt=%d threadId=%d\n", ievt, threadIdx.x ); return T::ieventAccessRecord( buffer, ievt ); // NB fptype and fptype_sv coincide for CUDA diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessMatrixElements.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessMatrixElements.h index f32e6fea5b..3741011971 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessMatrixElements.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessMatrixElements.h @@ -13,7 +13,7 @@ #include "MemoryBuffers.h" // for HostBufferMatrixElements::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessMomenta.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessMomenta.h index 29266de32c..3be229d392 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessMomenta.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessMomenta.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#ifndef MemoryAccessMomenta_H #define MemoryAccessMomenta_H 1 @@ -13,7 +13,7 @@ #include "MemoryAccessVectors.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -30,7 +30,7 @@ namespace mg5amcCpu // Number of Events Per Page in the momenta AOSOA memory buffer layout // (these are all best kept as a compile-time constants: see issue #23) -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ // ----------------------------------------------------------------------------------------------- // --- GPUs: neppM is best set to a power of 2 times the number of fptype's in a 32-byte cacheline // --- This is relevant to ensure coalesced access to momenta in global memory diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessNumerators.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessNumerators.h index b152183b28..18991f4fa6 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessNumerators.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessNumerators.h @@ -10,7 +10,7 @@ #include "MemoryAccessGs.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessRandomNumbers.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessRandomNumbers.h index e2988d39f3..40cb089135 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessRandomNumbers.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessRandomNumbers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessRandomNumbers_H #define MemoryAccessRandomNumbers_H 1 @@ -11,7 +11,7 @@ #include "CPPProcess.h" #include "MemoryAccessHelpers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessVectors.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessVectors.h index e9b197368e..08faccff0f 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessVectors.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessVectors.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
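All of the one-line guard changes in these headers follow the same pattern: the compiler-specific #ifdef __CUDACC__ becomes #ifdef MGONGPUCPP_GPUIMPL, a single flag meaning "this is a GPU build" for either vendor. A plausible sketch of how that flag can be derived (an assumption for illustration; the actual definition is part of the new GPU abstraction headers):

  // Sketch: one vendor-neutral flag for "compiling for a GPU"
  #if defined __CUDACC__ || defined __HIPCC__
  #define MGONGPUCPP_GPUIMPL 1
  #endif

This keeps the CPU/GPU split in the sources unchanged while making the GPU branch reachable from both nvcc and hipcc builds.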
#ifndef MemoryAccessVectors_H #define MemoryAccessVectors_H 1 @@ -10,7 +10,7 @@ #include "mgOnGpuVectors.h" -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu // this is only needed for CPU SIMD vectorization { diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessWavefunctions.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessWavefunctions.h index 5428aaf933..33bef4559e 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessWavefunctions.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessWavefunctions.h @@ -15,7 +15,7 @@ #define MGONGPU_TRIVIAL_WAVEFUNCTIONS 1 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryBuffers.h index 3093e6ed18..7756a71621 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryBuffers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021, based on earlier work by S. Hageboeck) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryBuffers_H #define MemoryBuffers_H 1 @@ -11,12 +11,12 @@ #include "mgOnGpuCxtypes.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "Parameters_sm.h" #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -87,7 +87,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL constexpr bool HostBufferALIGNED = false; // ismisaligned=false constexpr bool HostBufferMISALIGNED = true; // ismisaligned=true @@ -119,7 +119,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer template class PinnedHostBufferBase : public BufferBase @@ -128,18 +128,18 @@ namespace mg5amcCpu PinnedHostBufferBase( const size_t size ) : BufferBase( size, false ) { - checkCuda( cudaMallocHost( &( this->m_data ), this->bytes() ) ); + gpuMallocHost( &( this->m_data ), this->bytes() ); } virtual ~PinnedHostBufferBase() { - checkCuda( cudaFreeHost( this->m_data ) ); + gpuFreeHost( this->m_data ); } }; #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer template class DeviceBufferBase : public BufferBase @@ -148,18 +148,18 @@ namespace mg5amcCpu DeviceBufferBase( const size_t size ) : BufferBase( size, true ) { - checkCuda( cudaMalloc( &( this->m_data ), this->bytes() ) ); + gpuMalloc( &( this->m_data ), this->bytes() ); } virtual ~DeviceBufferBase() { - checkCuda( cudaFree( this->m_data ) ); + gpuFree( this->m_data ); } }; #endif //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for a given number of events template class HostBuffer 
: public HostBufferBase, virtual private NumberOfEvents @@ -175,7 +175,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer for a given number of events template class PinnedHostBuffer : public PinnedHostBufferBase, virtual private NumberOfEvents @@ -191,7 +191,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer for a given number of events template class DeviceBuffer : public DeviceBufferBase, virtual private NumberOfEvents @@ -213,7 +213,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta random numbers constexpr size_t sizePerEventRndNumMomenta = MemoryBuffers::np4 * MemoryBuffers::nparf; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta random numbers typedef HostBuffer HostBufferRndNumMomenta; #else @@ -232,7 +232,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer with ONE fptype per event constexpr size_t sizePerEventOneFp = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer with ONE fptype per event typedef HostBuffer HostBufferOneFp; #else @@ -257,7 +257,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for Gs constexpr size_t sizePerEventGs = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferGs; #else @@ -276,7 +276,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for numerators constexpr size_t sizePerEventNumerators = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferNumerators; #else @@ -296,7 +296,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for denominators constexpr size_t sizePerEventDenominators = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferDenominators; #else @@ -315,7 +315,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for random numbers constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferCouplings; #else @@ -333,7 +333,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta constexpr size_t sizePerEventMomenta = MemoryBuffers::np4 * MemoryBuffers::npar; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta typedef HostBuffer HostBufferMomenta; //typedef HostBuffer HostBufferMomenta; // TEST MISALIGNMENT! 
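Note how the buffer classes above drop the explicit checkCuda( cudaMallocHost( ... ) ) wrapping: error checking is folded into the gpuMallocHost / gpuMalloc / gpuFree wrappers themselves, and the same holds for the gpuMemcpy helpers further down. A sketch of the CUDA side of this mapping (illustration only; the HIP side would substitute the corresponding hip* calls):

  // Error checking is built into each wrapper, so call sites no longer
  // need an explicit checkCuda()/checkGpu() around allocations and copies.
  #define gpuMallocHost( ptr, size ) checkGpu( cudaMallocHost( ptr, size ) )
  #define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) )
  #define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) )
  #define gpuFree( ptr ) checkGpu( cudaFree( ptr ) )
  #define gpuMemcpy( dst, src, bytes, kind ) checkGpu( cudaMemcpy( dst, src, bytes, kind ) )
  #define gpuMemcpyHostToDevice cudaMemcpyHostToDevice
  #define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost
  #define gpuMemcpyToSymbol( symbol, src, bytes ) checkGpu( cudaMemcpyToSymbol( symbol, src, bytes ) )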
@@ -352,7 +352,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for sampling weights constexpr size_t sizePerEventWeights = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for sampling weights typedef HostBuffer HostBufferWeights; #else @@ -370,7 +370,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for matrix elements constexpr size_t sizePerEventMatrixElements = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for matrix elements typedef HostBuffer HostBufferMatrixElements; #else @@ -385,7 +385,7 @@ namespace mg5amcCpu // A base class encapsulating a memory buffer for the helicity mask typedef BufferBase BufferHelicityMask; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for the helicity mask typedef HostBufferBase HostBufferHelicityMask; #else @@ -403,7 +403,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for wavefunctions constexpr size_t sizePerEventWavefunctions = MemoryBuffers::nw6 * MemoryBuffers::nx2; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for wavefunctions typedef HostBuffer HostBufferWavefunctions; #else @@ -421,7 +421,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity random numbers constexpr size_t sizePerEventRndNumHelicity = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity random numbers typedef HostBuffer HostBufferRndNumHelicity; #else @@ -439,7 +439,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color random numbers constexpr size_t sizePerEventRndNumColor = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color random numbers typedef HostBuffer HostBufferRndNumColor; #else @@ -457,7 +457,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity selection constexpr size_t sizePerEventSelectedHelicity = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity selection typedef HostBuffer HostBufferSelectedHelicity; #else @@ -475,7 +475,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color selection constexpr size_t sizePerEventSelectedColor = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color selection typedef HostBuffer HostBufferSelectedColor; #else @@ -487,7 +487,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy { @@ -504,13 +504,13 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyHostToDevice ); } #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void copyHostFromDevice( Tdst& dst, const Tsrc& src ) // keep the same order 
of arguments as in memcpy { @@ -527,7 +527,7 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyDeviceToHost ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyDeviceToHost ); } #endif diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc index 18052b6676..f20c229897 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -46,7 +45,7 @@ // Class member functions for calculating the matrix elements for // Process: g g > t t~ WEIGHTED<=2 @1 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -80,7 +79,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -90,7 +89,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -118,13 +117,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -151,7 +150,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -187,7 +186,7 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity < nParity; 
++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings @@ -200,8 +199,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -302,7 +303,7 @@ namespace mg5amcCpu { 16, -2 }, { -2, 16 } }; // 2-D array[2][2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -359,7 +360,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -418,7 +419,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -465,8 +466,8 @@ namespace mg5amcCpu { 1, 1, -1, -1 }, { 1, 1, 1, 1 }, { 1, 1, 1, -1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -506,9 +507,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -544,7 +545,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] 
// [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -609,12 +610,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -635,7 +636,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -761,9 +762,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -787,7 +788,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -807,7 +808,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 256 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -821,9 +822,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -851,7 +855,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -1061,7 +1065,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.h index 3ebd92c038..4a88a07226 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -107,7 +107,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -120,7 +120,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -150,7 +150,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CudaRuntime.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/GpuAbstraction.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/GpuAbstraction.h new file mode 120000 index 0000000000..72054e19ba --- /dev/null +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/GpuAbstraction.h @@ -0,0 +1 @@ +../GpuAbstraction.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/GpuRuntime.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/GpuRuntime.h new file mode 120000 index 0000000000..3920e83be4 --- /dev/null +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/GpuRuntime.h @@ -0,0 +1 @@ +../GpuRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/check_sa.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/check_sa.cc index 3fbf0ffbee..7cac5ab47b 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/check_sa.cc +++ 
b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -65,7 +66,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -96,7 +97,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -134,9 +135,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784) + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) +#elif defined __HIPCC__ +#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random #elif defined __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU if build has curand + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #endif @@ -146,10 +149,10 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) 
bool bridge = false; @@ -177,7 +180,7 @@ main( int argc, char** argv ) else if( arg == "--curdev" ) { #ifndef __CUDACC__ - throw std::runtime_error( "CurandDevice is not supported on CPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" ); #elif defined MGONGPU_HAS_NO_CURAND throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" ); #else @@ -198,7 +201,7 @@ main( int argc, char** argv ) } else if( arg == "--rmbdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -272,13 +275,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -296,14 +299,14 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL - // --- 00. Initialise cuda - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - const std::string cdinKey = "00 CudaInit"; + // --- 00. Initialise GPU + // Instantiate a GpuRuntime at the beginning of the application's main. + // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. + const std::string cdinKey = "00 GpuInit"; timermap.start( cdinKey ); - CudaRuntime cudaRuntime( debug ); + GpuRuntime gpuRuntime( debug ); #endif // --- 0a. 
Initialise physics process @@ -325,7 +328,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -333,7 +336,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -341,7 +344,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -349,7 +352,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -366,7 +369,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -375,7 +378,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -384,7 +387,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -392,7 +395,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -400,7 +403,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -438,7 +441,7 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! 
CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } @@ -450,7 +453,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -461,7 +464,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -469,7 +472,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -511,7 +514,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -543,7 +546,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -588,7 +591,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -617,7 +620,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -760,18 +763,22 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; -#endif +#endif /* clang-format off */ // -- DOUBLE or FLOAT? #if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) @@ -781,7 +788,7 @@ main( int argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif +#endif /* clang-format on */ // -- CUCOMPLEX or THRUST or STD complex numbers? 
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -793,6 +800,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_CUCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -818,7 +831,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -874,7 +887,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -895,6 +908,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -921,21 +936,21 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? " == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -966,7 +981,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1062,14 +1077,14 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1077,7 +1092,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? 
" == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/RamboSamplingKernels.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/RamboSamplingKernels.cc index da68aa9255..79abbcc4f8 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/RamboSamplingKernels.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/RamboSamplingKernels.cc @@ -1,11 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "RamboSamplingKernels.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessRandomNumbers.h" #include "MemoryAccessWeights.h" @@ -14,7 +14,7 @@ #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -92,7 +92,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingKernelDevice::RamboSamplingKernelDevice( const fptype energy, // input: energy const BufferRndNumMomenta& rndmom, // input: random numbers in [0,1] BufferMomenta& momenta, // output: momenta @@ -135,7 +135,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaInitialDevice( const fptype energy, fptype* momenta ) @@ -147,17 +147,17 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaInitial() { - getMomentaInitialDevice<<>>( m_energy, m_momenta.data() ); + gpuLaunchKernel( getMomentaInitialDevice, m_gpublocks, m_gputhreads, m_energy, m_momenta.data() ); } #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaFinalDevice( const fptype energy, const fptype* rndmom, @@ -171,11 +171,11 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaFinal() { - getMomentaFinalDevice<<>>( m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); + gpuLaunchKernel( getMomentaFinalDevice, m_gpublocks, m_gputhreads, m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); } #endif diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/RamboSamplingKernels.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/RamboSamplingKernels.h index 184089efd7..7c214cd74b 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/RamboSamplingKernels.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/RamboSamplingKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#ifndef RAMBOSAMPLINGKERNELS_H #define RAMBOSAMPLINGKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating RAMBO phase space sampling on a GPU device class RamboSamplingKernelDevice final : public SamplingKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/RandomNumberKernels.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/RandomNumberKernels.h index 188a72c2c9..21d63beeac 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/RandomNumberKernels.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/RandomNumberKernels.h @@ -1,14 +1,14 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef RANDOMNUMBERKERNELS_H #define RANDOMNUMBERKERNELS_H 1 #include "mgOnGpuConfig.h" -// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when __CUDACC__ is not defined +// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when MGONGPUCPP_GPUIMPL is not defined #ifndef MGONGPU_HAS_NO_CURAND //#include "curand.h" struct curandGenerator_st; // forward definition from curand.h @@ -16,7 +16,7 @@ struct curandGenerator_st; // forward definition from curand.h #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk index 509307506b..f2cfa349da 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ # Copyright (C) 2020-2023 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +# Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts @@ -42,10 +42,10 @@ endif #------------------------------------------------------------------------------- -#=== Configure common compiler flags for C++ and CUDA +#=== Configure common compiler flags for C++ and CUDA/HIP INCFLAGS = -I. 
-OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here +OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here # Dependency on src directory MG5AMC_COMMONLIB = mg5amc_common @@ -121,24 +121,46 @@ endif #------------------------------------------------------------------------------- -#=== Configure the CUDA compiler +#=== Configure the GPU compiler (CUDA or HIP) -# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) -# This is because it is impossible to pass this to "CUFLAGS += -ccbin " below +# FIXME! (AV 24.01.2024) +# In the current implementation (without separate builds for C++ and CUDA/HIP), we first check for nvcc and hipcc in CUDA_HOME and HIP_HOME. +# If CUDA_HOME or HIP_HOME are not set, try to determine them from the path to nvcc and hipcc. +# While convoluted, this is currently necessary to allow disabling CUDA/HIP builds by setting CUDA_HOME or HIP_HOME to invalid paths. +# This will (probably?) be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775). + +# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA and HIP builds (issue #505) +# This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside - $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") + $(warning CUDA and HIP builds are not supported for multi-word CXX "$(CXX)") override CUDA_HOME=disabled + override HIP_HOME=disabled endif -# If CUDA_HOME is not set, try to set it from the location of nvcc +# If CUDA_HOME is not set, try to set it from the path to nvcc ifndef CUDA_HOME CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null)) $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") endif -# Set NVCC as $(CUDA_HOME)/bin/nvcc if it exists +# If HIP_HOME is not set, try to set it from the path to hipcc +ifndef HIP_HOME + HIP_HOME = $(patsubst %bin/hipcc,%,$(HIP_COMPILER_PATH)) + $(warning HIP_HOME was not set: using "$(HIP_HOME)") +endif + +# FIXME! (AV 24.01.2024) +# In the current implementation (without separate builds for C++ and CUDA/HIP), +# builds are performed for HIP only if CUDA is not found in the path. +# If both CUDA and HIP are installed, HIP builds can be triggered by unsetting CUDA_HOME. +# This will be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775). + +#--- Option 1: CUDA exists -> use CUDA + +# Set GPUCC as $(CUDA_HOME)/bin/nvcc if it exists ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) - NVCC = $(CUDA_HOME)/bin/nvcc + + GPUCC = $(CUDA_HOME)/bin/nvcc USE_NVTX ?=-DUSE_NVTX # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ @@ -158,41 +180,78 @@ ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here! 
endif CUOPTFLAGS = -lineinfo - CUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math - ###CUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow - ###NVCC_VERSION = $(shell $(NVCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) - CUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h + ###GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + GPUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + ###GPUCC_VERSION = $(shell $(GPUCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) + GPUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + CUBUILDRULEFLAGS = -Xcompiler -fPIC -c + CCBUILDRULEFLAGS = -Xcompiler -fPIC -c -x cu + CUDATESTFLAGS = -lcuda + + # Set the host C++ compiler for GPUCC via "-ccbin " + # (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) + GPUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) + + # Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) + ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) + GPUFLAGS += -allow-unsupported-compiler + endif + else ifneq ($(origin REQUIRE_CUDA),undefined) + # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) - $(error No cuda installation found (set CUDA_HOME or make nvcc visible in PATH)) + $(error No cuda installation found (set CUDA_HOME or make GPUCC visible in PATH)) + +#--- Option 2: CUDA does not exist, HIP exists -> use HIP + +# Set GPUCC as $(HIP_HOME)/bin/hipcc if it exists +else ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),) + + GPUCC = $(HIP_HOME)/bin/hipcc + #USE_NVTX ?=-DUSE_NVTX # should maybe find something equivalent to this in HIP? 
+ HIPARCHFLAGS = -target x86_64-linux-gnu --offload-arch=gfx90a + HIPINC = -I$(HIP_HOME)/include/ + # Note: -DHIP_FAST_MATH is equivalent to -use_fast_math in HIP + # (but only for single precision; see line 208 of https://rocm-developer-tools.github.io/HIP/hcc__detail_2math__functions_8h_source.html) + GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -DHIP_FAST_MATH -DHIP_PLATFORM=amd -fPIC + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + GPUFLAGS += -std=c++17 + ###GPUFLAGS+= --maxrregcount 255 # (AV: is this option valid on HIP and meaningful on AMD GPUs?) + CUBUILDRULEFLAGS = -fPIC -c + CCBUILDRULEFLAGS = -fPIC -c + +else ifneq ($(origin REQUIRE_HIP),undefined) + + # If REQUIRE_HIP is set but no HIP is found, stop here (e.g. for CI tests on GPU #443) + $(error No hip installation found (set HIP_HOME or make GPUCC visible in PATH)) + +#--- Option 3: CUDA does not exist, HIP does not exist -> switch off both CUDA and HIP + else - # No cuda. Switch cuda compilation off and go to common random numbers in C++ + + # No nvcc and no hipcc: switch CUDA and HIP compilation off and go to common random numbers in C++ $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda) - override NVCC= + $(warning HIP_HOME is not set or is invalid: export HIP_HOME to compile with hip) + override GPUCC= override USE_NVTX= override CUINC= override CURANDLIBFLAGS= -endif -export NVCC -export CUFLAGS - -# Set the host C++ compiler for nvcc via "-ccbin " -# (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) -CUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) -# Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) -ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) -CUFLAGS += -allow-unsupported-compiler endif +# Export GPUCC (so that it can also be used in cudacpp_src.mk?) +export GPUCC +export GPUFLAGS + #------------------------------------------------------------------------------- -#=== Configure ccache for C++ and CUDA builds +#=== Configure ccache for C++ and CUDA/HIP builds # Enable ccache if USECCACHE=1 ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) @@ -201,15 +260,15 @@ endif #ifeq ($(USECCACHE)$(shell echo $(AR) | grep ccache),1) # override AR:=ccache $(AR) #endif -ifneq ($(NVCC),) - ifeq ($(USECCACHE)$(shell echo $(NVCC) | grep ccache),1) - override NVCC:=ccache $(NVCC) +ifneq ($(GPUCC),) + ifeq ($(USECCACHE)$(shell echo $(GPUCC) | grep ccache),1) + override GPUCC:=ccache $(GPUCC) endif endif #------------------------------------------------------------------------------- -#=== Configure PowerPC-specific compiler flags for C++ and CUDA +#=== Configure PowerPC-specific compiler flags for C++ and CUDA/HIP # PowerPC-specific CXX compiler flags (being reviewed) ifeq ($(UNAME_P),ppc64le) @@ -225,9 +284,9 @@ else ######CXXFLAGS+= -fno-semantic-interposition # no benefit (neither alone, nor combined with -flto) endif -# PowerPC-specific CUDA compiler flags (to be reviewed!) +# PowerPC-specific CUDA/HIP compiler flags (to be reviewed!)
ifeq ($(UNAME_P),ppc64le) - CUFLAGS+= -Xcompiler -mno-float128 + GPUFLAGS+= -Xcompiler -mno-float128 endif #------------------------------------------------------------------------------- @@ -237,7 +296,7 @@ endif # Set the default OMPFLAGS choice ifneq ($(shell $(CXX) --version | egrep '^Intel'),) override OMPFLAGS = -fopenmp -###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without nvcc but not ok with nvcc before #578) +###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without GPUCC but not ok with GPUCC before #578) else ifneq ($(shell $(CXX) --version | egrep '^(clang)'),) override OMPFLAGS = -fopenmp ###override OMPFLAGS = # disable OpenMP MT on clang (was not ok without or with nvcc before #578) @@ -293,7 +352,10 @@ endif # Set the default RNDGEN (random number generator) choice ifeq ($(RNDGEN),) - ifeq ($(NVCC),) + ifeq ($(GPUCC),) + override RNDGEN = hasNoCurand + # Edge case for HIP compilation (hipcc has no curand) + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) override RNDGEN = hasNoCurand else ifeq ($(RNDGEN),) override RNDGEN = hasCurand @@ -310,7 +372,7 @@ export OMPFLAGS #------------------------------------------------------------------------------- -#=== Set the CUDA/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN +#=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN # Set the build flags appropriate to OMPFLAGS $(info OMPFLAGS=$(OMPFLAGS)) @@ -368,13 +430,13 @@ CXXFLAGS+= $(AVXFLAGS) $(info FPTYPE=$(FPTYPE)) ifeq ($(FPTYPE),d) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE - CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE else ifeq ($(FPTYPE),f) CXXFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT - CUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT else ifeq ($(FPTYPE),m) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT - CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT else $(error Unknown FPTYPE='$(FPTYPE)': only 'd', 'f' and 'm' are supported) endif @@ -383,7 +445,7 @@ endif $(info HELINL=$(HELINL)) ifeq ($(HELINL),1) CXXFLAGS += -DMGONGPU_INLINE_HELAMPS - CUFLAGS += -DMGONGPU_INLINE_HELAMPS + GPUFLAGS += -DMGONGPU_INLINE_HELAMPS else ifneq ($(HELINL),0) $(error Unknown HELINL='$(HELINL)': only '0' and '1' are supported) endif @@ -392,7 +454,7 @@ endif $(info HRDCOD=$(HRDCOD)) ifeq ($(HRDCOD),1) CXXFLAGS += -DMGONGPU_HARDCODE_PARAM - CUFLAGS += -DMGONGPU_HARDCODE_PARAM + GPUFLAGS += -DMGONGPU_HARDCODE_PARAM else ifneq ($(HRDCOD),0) $(error Unknown HRDCOD='$(HRDCOD)': only '0' and '1' are supported) endif @@ -444,11 +506,11 @@ ifeq ($(UNAME_S),Darwin) override CULIBFLAGSRPATH2 = else # RPATH to cuda/cpp libs when linking executables - override CXXLIBFLAGSRPATH = -Wl,-rpath,$(LIBDIRRPATH) - override CULIBFLAGSRPATH = -Xlinker -rpath,$(LIBDIRRPATH) + override CXXLIBFLAGSRPATH = -Wl,-rpath=$(LIBDIRRPATH) + override CULIBFLAGSRPATH = -Xlinker -rpath=$(LIBDIRRPATH) # RPATH to common lib when linking cuda/cpp libs - override CXXLIBFLAGSRPATH2 = -Wl,-rpath,'$$ORIGIN' - override CULIBFLAGSRPATH2 = -Xlinker -rpath,'$$ORIGIN' + override CXXLIBFLAGSRPATH2 = -Wl,-rpath='$$ORIGIN' + override CULIBFLAGSRPATH2 = -Xlinker -rpath='$$ORIGIN' endif # Setting LD_LIBRARY_PATH or DYLD_LIBRARY_PATH in the RUNTIME is no longer necessary
(neither on Linux nor on Mac) @@ -461,7 +523,7 @@ override RUNTIME = cxx_main=$(BUILDDIR)/check.exe fcxx_main=$(BUILDDIR)/fcheck.exe -ifneq ($(NVCC),) +ifneq ($(GPUCC),) cu_main=$(BUILDDIR)/gcheck.exe fcu_main=$(BUILDDIR)/fgcheck.exe else @@ -492,15 +554,16 @@ $(BUILDDIR)/.build.$(TAG): @touch $(BUILDDIR)/.build.$(TAG) # Generic target and build rules: objects from CUDA compilation -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/%.o : %.cu *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CUBUILDRULEFLAGS) $< -o $@ $(BUILDDIR)/%_cu.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CCBUILDRULEFLAGS) $< -o $@ endif +# NB: with nvcc, the '-x cu' needed by the %_cu.o rule above is included via CCBUILDRULEFLAGS (see the illustrative expansion below) # Generic target and build rules: objects from C++ compilation # (NB do not include CUINC here! add it only for NVTX or curand #679) @@ -509,11 +572,14 @@ $(BUILDDIR)/%.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) $(CXX) $(CPPFLAGS) $(CXXFLAGS) -fPIC -c $< -o $@ # Apply special build flags only to CrossSectionKernel.cc and gCrossSectionKernel.cu (no fast math, see #117 and #516) +# An edge case was added for HIP compilation (hipcc takes -fno-fast-math directly, without -Xcompiler) ifeq ($(shell $(CXX) --version | grep ^nvc++),) $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS := $(filter-out -ffast-math,$(CXXFLAGS)) $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math -ifneq ($(NVCC),) -$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -fno-fast-math +ifeq ($(findstring nvcc,$(GPUCC)),nvcc) + $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -fno-fast-math +else + $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -fno-fast-math endif endif @@ -530,10 +596,10 @@ ifeq ($(RNDGEN),hasCurand) $(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CUINC) endif -# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in nvcc with icx2023 (#592) +# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in GPUCC with icx2023 (#592) ifneq ($(shell $(CXX) --version | egrep '^(Intel)'),) -ifneq ($(NVCC),) -CUFLAGS += -Xcompiler -Wno-deprecated-builtins +ifneq ($(GPUCC),) +GPUFLAGS += -Wno-deprecated-builtins endif endif @@ -541,8 +607,8 @@ endif # This patch does remove the warning, but I prefer to keep it disabled for the moment...
###ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang|Intel)'),) ###$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -Wno-overriding-t-option -###ifneq ($(NVCC),) -###$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -Wno-overriding-t-option +###ifneq ($(GPUCC),) +###$(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -Wno-overriding-t-option ###endif ###endif @@ -569,7 +635,7 @@ MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp cxx_objects_lib=$(BUILDDIR)/CPPProcess.o $(BUILDDIR)/MatrixElementKernels.o $(BUILDDIR)/BridgeKernels.o $(BUILDDIR)/CrossSectionKernels.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel.o $(BUILDDIR)/RamboSamplingKernels.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) MG5AMC_CULIB = mg5amc_$(processid_short)_cuda cu_objects_lib=$(BUILDDIR)/gCPPProcess.o $(BUILDDIR)/gMatrixElementKernels.o $(BUILDDIR)/gBridgeKernels.o $(BUILDDIR)/gCrossSectionKernels.o cu_objects_exe=$(BUILDDIR)/gCommonRandomNumberKernel.o $(BUILDDIR)/gRamboSamplingKernels.o @@ -581,11 +647,11 @@ $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: cxx_objects_lib += $(BUILDDIR)/fbridge.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(CXX) -shared -o $@ $(cxx_objects_lib) $(CXXLIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: cu_objects_lib += $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cu_objects_lib) - $(NVCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) endif #------------------------------------------------------------------------------- @@ -602,16 +668,16 @@ $(cxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PAT $(cxx_main): $(BUILDDIR)/check_sa.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CXX) -o $@ $(BUILDDIR)/check_sa.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CURANDLIBFLAGS) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(cu_main): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(cu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(cu_main): $(BUILDDIR)/gcheck_sa.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o - $(NVCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) + $(GPUCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) endif 
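# Illustrative expansion of the generic GPU build rules above (file names are
# examples only): with nvcc, the %.o-from-%.cu and %_cu.o-from-%.cc rules run
#   $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) -Xcompiler -fPIC -c gCPPProcess.cu -o gCPPProcess.o
#   $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) -Xcompiler -fPIC -c -x cu fbridge.cc -o fbridge_cu.o
# where '-x cu' tells nvcc to treat a .cc file as a CUDA source, while with
# hipcc both CUBUILDRULEFLAGS and CCBUILDRULEFLAGS reduce to plain '-fPIC -c'
# because hipcc compiles the .cc sources directly.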
#------------------------------------------------------------------------------- @@ -637,17 +703,17 @@ $(fcxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PA $(fcxx_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') endif ifeq ($(UNAME_S),Darwin) $(fcu_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(fcu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(fcu_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) - $(NVCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) + $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) endif #------------------------------------------------------------------------------- @@ -659,7 +725,7 @@ $(BUILDDIR)/testxxx.o: testxxx_cc_ref.txt $(testmain): $(BUILDDIR)/testxxx.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testxxx.o # Comment out this line to skip the C++ test of xxx functions -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testxxx_cu.o: $(GTESTLIBS) $(BUILDDIR)/testxxx_cu.o: INCFLAGS += $(GTESTINC) $(BUILDDIR)/testxxx_cu.o: testxxx_cc_ref.txt @@ -672,7 +738,7 @@ $(BUILDDIR)/testmisc.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testmisc.o # Comment out this line to skip the C++ miscellaneous tests -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testmisc_cu.o: $(GTESTLIBS) $(BUILDDIR)/testmisc_cu.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc_cu.o @@ -684,12 +750,12 @@ $(BUILDDIR)/runTest.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/runTest.o $(testmain): cxx_objects_exe += $(BUILDDIR)/runTest.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/runTest_cu.o: $(GTESTLIBS) $(BUILDDIR)/runTest_cu.o: INCFLAGS += $(GTESTINC) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(testmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif @@ -713,14 +779,14 @@ $(testmain): LIBFLAGS += -lgomp endif endif -ifeq 
($(NVCC),) # link only runTest.o +ifeq ($(GPUCC),) # link only runTest.o $(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS) $(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS) else # link both runTest.o and runTest_cu.o $(testmain): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) $(GTESTLIBS) - $(NVCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) -lcuda + $(GPUCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS) endif # Use target gtestlibs to build only googletest @@ -829,9 +895,9 @@ ifeq ($(USECCACHE),1) ccache --version | head -1 endif @echo "" - @echo NVCC=$(NVCC) -ifneq ($(NVCC),) - $(NVCC) --version + @echo GPUCC=$(GPUCC) +ifneq ($(GPUCC),) + $(GPUCC) --version endif @echo "" @echo CXX=$(CXX) @@ -850,7 +916,7 @@ endif # Target: check (run the C++ test executable) # [NB THIS IS WHAT IS USED IN THE GITHUB CI!] -ifneq ($(NVCC),) +ifneq ($(GPUCC),) check: runTest cmpFcheck cmpFGcheck else check: runTest cmpFcheck diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/fbridge.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/fbridge.cc index 2d2b36d560..22ce3f5115 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/fbridge.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/fbridge.cc @@ -1,11 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Oct 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "Bridge.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" extern "C" { @@ -22,7 +22,7 @@ extern "C" * Using the same Fortran MadEvent code, linking to the heterogeneous library would allow access to both CPU and GPU implementations. * The specific heterogeneous configuration (how many GPUs, how many threads on each CPU, etc) could be loaded in CUDA/C++ from a data file.
*/ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -46,8 +46,8 @@ extern "C" */ void fbridgecreate_( CppObjectInFortran** ppbridge, const int* pnevtF, const int* pnparF, const int* pnp4F ) { -#ifdef __CUDACC__ - CudaRuntime::setUp(); +#ifdef MGONGPUCPP_GPUIMPL + GpuRuntime::setUp(); #endif // (NB: CPPProcess::initProc no longer needs to be executed here because it is called in the Bridge constructor) // FIXME: disable OMP in Bridge when called from Fortran @@ -65,8 +65,8 @@ extern "C" Bridge* pbridge = dynamic_cast*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgedelete_: invalid Bridge address" ); delete pbridge; -#ifdef __CUDACC__ - CudaRuntime::tearDown(); +#ifdef MGONGPUCPP_GPUIMPL + GpuRuntime::tearDown(); #endif } @@ -96,7 +96,7 @@ extern "C" { Bridge* pbridge = dynamic_cast*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgesequence_: invalid Bridge address" ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Use the device/GPU implementation in the CUDA library // (there is also a host implementation in this library) pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, *pchannelId, mes, selhel, selcol ); diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/fsampler.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/fsampler.cc index 2fb445372d..3743934f41 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/fsampler.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/fsampler.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Feb 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "mgOnGpuConfig.h" @@ -13,7 +13,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -40,7 +40,7 @@ namespace mg5amcCpu private: const int m_nevt; // The number of events in each iteration int m_iiter; // The iteration counter (for random number seeding) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta m_hstRndmom; // Memory buffers for random numbers HostBufferMomenta m_hstMomenta; // Memory buffers for momenta HostBufferWeights m_hstWeights; // Memory buffers for sampling weights @@ -105,7 +105,7 @@ namespace mg5amcCpu extern "C" { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/runTest.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/runTest.cc index d4a760a71b..de327f2321 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/runTest.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/runTest.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
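// Minimal sketch of the Fortran-facing lifecycle exposed by fbridge.cc above
// (variable names are hypothetical and the trailing arguments of
// fbridgesequence_ are omitted); GpuRuntime::setUp and GpuRuntime::tearDown
// bracket the Bridge lifetime only in GPU (MGONGPUCPP_GPUIMPL) builds:
//   CppObjectInFortran* pbridge = nullptr;
//   fbridgecreate_( &pbridge, &nevt, &npar, &np4 ); // GpuRuntime::setUp, then new Bridge
//   fbridgesequence_( &pbridge, ... );              // compute matrix elements on GPU or CPU
//   fbridgedelete_( &pbridge );                     // delete Bridge, then GpuRuntime::tearDown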
//---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -18,7 +18,7 @@ #include "RandomNumberKernels.h" #include "epoch_process_id.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -35,7 +35,7 @@ struct CUDA_CPU_TestBase : public TestDriverBase : TestDriverBase( npar, refFileName ) {} }; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL struct CPUTest : public CUDA_CPU_TestBase { // Struct data members (process, and memory structures for random numbers, momenta, matrix elements and weights on host and device) @@ -119,7 +119,7 @@ struct CPUTest : public CUDA_CPU_TestBase }; #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL struct CUDATest : public CUDA_CPU_TestBase { // Reset the device when our test goes out of scope. Note that this should happen after @@ -128,7 +128,7 @@ struct CUDATest : public CUDA_CPU_TestBase { ~DeviceReset() { - checkCuda( cudaDeviceReset() ); // this is needed by cuda-memcheck --leak-check full + checkGpu( gpuDeviceReset() ); // this is needed by cuda-memcheck --leak-check full } } deviceResetter; @@ -256,7 +256,7 @@ INSTANTIATE_TEST_SUITE_P( prefix, \ test_suite_name, \ testing::Values( new CUDATest( MG_EPOCH_REFERENCE_FILE_NAME ) ) ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL MG_INSTANTIATE_TEST_SUITE_GPU( XTESTID_GPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); #else MG_INSTANTIATE_TEST_SUITE_CPU( XTESTID_CPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/testmisc.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/testmisc.cc index 895d6eeb56..ba9e59a8a3 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/testmisc.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/testmisc.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*misc to run only testmisc.cc tests //---------------------------------------------------------------------------- @@ -17,7 +17,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_MISC #else #define TESTID( s ) s##_CPU_MISC @@ -26,7 +26,7 @@ #define XTESTID( s ) TESTID( s ) // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -59,7 +59,7 @@ namespace mg5amcCpu TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testmisc ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/testxxx.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/testxxx.cc index 3361fe5aa9..e5167de00c 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/testxxx.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/testxxx.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. 
Valassi (Apr 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -24,7 +24,7 @@ #include #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_XXX #else #define TESTID( s ) s##_CPU_XXX @@ -32,7 +32,7 @@ #define XTESTID( s ) TESTID( s ) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -42,7 +42,7 @@ namespace mg5amcCpu int FPEhandlerIevt = -1; inline void FPEhandler( int sig ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL std::cerr << "Floating Point Exception (GPU): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; #else std::cerr << "Floating Point Exception (CPU neppV=" << neppV << "): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; @@ -53,7 +53,7 @@ namespace mg5amcCpu TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -77,7 +77,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) assert( nevt % neppM == 0 ); // nevt must be a multiple of neppM assert( nevt % neppV == 0 ); // nevt must be a multiple of neppV // Fill in the input momenta -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL mg5amcGpu::PinnedHostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] #else mg5amcCpu::HostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] @@ -322,7 +322,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { for( int ievt = 0; ievt < nevt; ievt++ ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_tt.mad/src/HelAmps_sm.h b/epochX/cudacpp/gg_tt.mad/src/HelAmps_sm.h index 55f43bb43a..add8fce575 100644 --- a/epochX/cudacpp/gg_tt.mad/src/HelAmps_sm.h +++ b/epochX/cudacpp/gg_tt.mad/src/HelAmps_sm.h @@ -5,7 +5,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -28,7 +28,7 @@ //#include //#include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.mad/src/Parameters_sm.cc b/epochX/cudacpp/gg_tt.mad/src/Parameters_sm.cc index a9bc93ff98..c5dd6e7e4c 100644 --- a/epochX/cudacpp/gg_tt.mad/src/Parameters_sm.cc +++ b/epochX/cudacpp/gg_tt.mad/src/Parameters_sm.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. 
Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -17,7 +17,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_tt.mad/src/Parameters_sm.h b/epochX/cudacpp/gg_tt.mad/src/Parameters_sm.h index 932f123fea..5f2f4391b9 100644 --- a/epochX/cudacpp/gg_tt.mad/src/Parameters_sm.h +++ b/epochX/cudacpp/gg_tt.mad/src/Parameters_sm.h @@ -27,7 +27,7 @@ #include "read_slha.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu #include // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -217,7 +217,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -236,7 +236,7 @@ namespace mg5amcCpu #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wunused-variable" // e.g. <> #pragma GCC diagnostic ignored "-Wunused-parameter" // e.g. <> -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma nv_diagnostic push #pragma nv_diag_suppress 177 // e.g. <> #endif @@ -263,7 +263,7 @@ namespace mg5amcCpu // End SM implementation - no special handling of vectors of floats as in EFT (#439) return out; } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma GCC diagnostic pop #pragma nv_diagnostic pop #endif diff --git a/epochX/cudacpp/gg_tt.mad/src/cudacpp_src.mk b/epochX/cudacpp/gg_tt.mad/src/cudacpp_src.mk index d4cc628aec..159e19a46d 100644 --- a/epochX/cudacpp/gg_tt.mad/src/cudacpp_src.mk +++ b/epochX/cudacpp/gg_tt.mad/src/cudacpp_src.mk @@ -19,7 +19,7 @@ SHELL := /bin/bash #=== Configure common compiler flags for CUDA and C++ INCFLAGS = -I. 
-OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here +OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here #------------------------------------------------------------------------------- @@ -45,13 +45,13 @@ endif #------------------------------------------------------------------------------- -#=== Configure the CUDA compiler (note: NVCC is already exported including ccache) +#=== Configure the GPU compiler (CUDA or HIP) (note: GPUCC is already exported including ccache) -###$(info NVCC=$(NVCC)) +###$(info GPUCC=$(GPUCC)) #------------------------------------------------------------------------------- -#=== Configure ccache for C++ builds (note: NVCC is already exported including ccache) +#=== Configure ccache for C++ builds (note: GPUCC is already exported including ccache) # Enable ccache if USECCACHE=1 ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) @@ -92,6 +92,13 @@ endif ###$(info OMPFLAGS=$(OMPFLAGS)) CXXFLAGS += $(OMPFLAGS) +# Add the correct build rule flags for the GPU compiler (nvcc needs '-x cu' to compile .cc sources as CUDA; hipcc does not) +ifeq ($(findstring nvcc,$(GPUCC)),nvcc) + GPUFLAGS += -Xcompiler -fPIC -c -x cu +else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) + GPUFLAGS += -fPIC -c +endif + # Set the build flags appropriate to each AVX choice (example: "make AVX=none") # [NB MGONGPU_PVW512 is needed because "-mprefer-vector-width=256" is not exposed in a macro] # [See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=96476] @@ -253,20 +260,20 @@ $(BUILDDIR)/%.o : %.cc *.h $(BUILDDIR)/.build.$(TAG) # Generic target and build rules: objects from CUDA compilation $(BUILDDIR)/%_cu.o : %.cc *.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $< -o $@ #------------------------------------------------------------------------------- cxx_objects=$(addprefix $(BUILDDIR)/, Parameters_sm.o read_slha.o) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) cu_objects=$(addprefix $(BUILDDIR)/, Parameters_sm_cu.o) endif # Target (and build rules): common (src) library -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) $(cu_objects) @if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi - $(NVCC) -shared -o $@ $(cxx_objects) $(cu_objects) $(LDFLAGS) + $(GPUCC) -shared -o $@ $(cxx_objects) $(cu_objects) $(LDFLAGS) else $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) @if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi diff --git a/epochX/cudacpp/gg_tt.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_tt.mad/src/mgOnGpuConfig.h index 80032e528b..55d03f1252 100644 --- a/epochX/cudacpp/gg_tt.mad/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/gg_tt.mad/src/mgOnGpuConfig.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
#ifndef MGONGPUCONFIG_H #define MGONGPUCONFIG_H 1 @@ -10,12 +10,25 @@ // There are two different code bases for standalone_cudacpp (without multichannel) and madevent+cudacpp (with multichannel) #define MGONGPU_SUPPORTS_MULTICHANNEL 1 +// Is this a GPU (CUDA, HIP) or CPU implementation? +#ifdef __CUDACC__ +#define MGONGPUCPP_GPUIMPL cuda +#elif defined __HIPCC__ +#define MGONGPUCPP_GPUIMPL hip +#else +#undef MGONGPUCPP_GPUIMPL +#endif + // ** NB1 Throughputs (e.g. 6.8E8) are events/sec for "./gcheck.exe -p 65536 128 12" // ** NB2 Baseline on b7g47n0004 fluctuates (probably depends on load on other VMs) // Choose if curand is supported for generating random numbers +// For HIP, by default, do not use curand (common random numbers will be used instead) // For both CUDA and C++, by default, use curand, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_CURAND -// (there exist CUDA installations, e.g. using the HPC package, which do not include curand - see PR #784) +// (there exist CUDA installations, e.g. using the HPC package, which do not include curand - see PR #784 and #785) +#if defined __HIPCC__ +#define MGONGPU_HAS_NO_CURAND 1 +#else //#ifdef __CUDACC__ //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 @@ -23,6 +36,7 @@ //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 //#endif +#endif // Choose floating point precision (for everything but color algebra #537) // If one of these macros has been set from outside with e.g. -DMGONGPU_FPTYPE_FLOAT, nothing happens (issue #167) @@ -54,23 +68,28 @@ //#undef MGONGPU_HARDCODE_PARAM // default ////#define MGONGPU_HARDCODE_PARAM 1 -// Complex type in c++: std::complex or cxsmpl (CHOOSE ONLY ONE) -#ifndef __CUDACC__ -//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) -#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) -#endif - -// Complex type in cuda: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) +// Complex type in CUDA: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) #ifdef __CUDACC__ #define MGONGPU_CUCXTYPE_THRUST 1 // default (~1.15E9/double, ~3.2E9/float) //#define MGONGPU_CUCXTYPE_CUCOMPLEX 1 // ~10 percent slower (1.03E9/double, ~2.8E9/float) //#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) + +// Complex type in HIP: cxsmpl (ONLY ONE OPTION POSSIBLE) +#elif defined __HIPCC__ +#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) + +// Complex type in C++: std::complex or cxsmpl (CHOOSE ONLY ONE) +#else +//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) +#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif -// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ -#undef MGONGPU_NSIGHT_DEBUG // default +#undef MGONGPU_NSIGHT_DEBUG // default in CUDA //#define MGONGPU_NSIGHT_DEBUG 1 +#else +#undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif // SANITY CHECKS (floating point precision for everything but color algebra #537) @@ -86,17 +105,21 @@ #error You cannot use double precision for color algebra and single precision elsewhere #endif -// SANITY CHECKS (c++ complex number implementation) -#ifndef __CUDACC__ -#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and
defined MGONGPU_CPPCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL +// SANITY CHECKS (CUDA complex number implementation) +#ifdef __CUDACC__ +#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX for CUDA +#elif defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CXSMPL for CUDA +#elif defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL for CUDA #endif #endif -// SANITY CHECKS (cuda complex number implementation) -#ifdef __CUDACC__ -#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL +// SANITY CHECKS (C++ complex number implementation) +#ifndef MGONGPUCPP_GPUIMPL +#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL for C++ #endif #endif @@ -134,7 +157,7 @@ namespace mgOnGpu // Alignment requirement for using reinterpret_cast with SIMD vectorized code // (using reinterpret_cast with non aligned memory may lead to segmentation faults!) // Only needed for C++ code but can be enforced also in NVCC builds of C++ code using CUDA>=11.2 and C++17 (#318, #319, #333) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL constexpr int cppAlign = 64; // alignment requirement for SIMD vectorization (64-byte i.e.
512-bit) #endif @@ -145,7 +168,7 @@ using mgOnGpu::fptype; using mgOnGpu::fptype2; // C++ SIMD vectorization width (this will be used to set neppV) -#ifdef __CUDACC__ // CUDA implementation has no SIMD +#ifdef MGONGPUCPP_GPUIMPL // CUDA and HIP implementations have no SIMD #undef MGONGPU_CPPSIMD #elif defined __AVX512VL__ && defined MGONGPU_PVW512 // C++ "512z" AVX512 with 512 width (512-bit ie 64-byte): 8 (DOUBLE) or 16 (FLOAT) #ifdef MGONGPU_FPTYPE_DOUBLE @@ -175,9 +198,9 @@ using mgOnGpu::fptype2; #undef MGONGPU_CPPSIMD #endif -// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation // Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) -#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */ +#if defined MGONGPUCPP_GPUIMPL && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */ #define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; #define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } #define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } @@ -189,8 +212,8 @@ using mgOnGpu::fptype2; #define mgDebugFinalise() { /*noop*/ } #endif /* clang-format on */ -// Define empty CUDA declaration specifiers for C++ -#ifndef __CUDACC__ +// Define empty CUDA/HIP declaration specifiers for C++ +#ifndef MGONGPUCPP_GPUIMPL #define __global__ #define __host__ #define __device__ diff --git a/epochX/cudacpp/gg_tt.mad/src/mgOnGpuCxtypes.h b/epochX/cudacpp/gg_tt.mad/src/mgOnGpuCxtypes.h index ca9a9f00c0..5532e22fa1 100644 --- a/epochX/cudacpp/gg_tt.mad/src/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/gg_tt.mad/src/mgOnGpuCxtypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022, based on earlier work by D. Smith) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
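// Minimal sketch (hypothetical function, for illustration only) of what the
// machinery above enables: thanks to the MGONGPUCPP_GPUIMPL switch and to the
// empty __global__/__host__/__device__ macros defined for C++-only builds, the
// same decorated source compiles unchanged with nvcc (CUDA), hipcc (HIP) or a
// plain C++ compiler:
//   #include "mgOnGpuConfig.h"
//   __host__ __device__ inline fptype fpsquare( const fptype x ) { return x * x; }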
#ifndef MGONGPUCXTYPES_H #define MGONGPUCXTYPES_H 1 @@ -19,7 +19,7 @@ #include // Complex type in cuda: thrust or cucomplex or cxsmpl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #if defined MGONGPU_CUCXTYPE_THRUST #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wtautological-compare" // for icpx2021/clang13 (https://stackoverflow.com/a/15864661) @@ -82,7 +82,7 @@ namespace mgOnGpu /* clang-format off */ using mgOnGpu::cxsmpl; // Printout to stream for user defined types -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -92,7 +92,7 @@ namespace mg5amcCpu inline __host__ std::ostream& operator<<( std::ostream& out, const cxsmpl& c ) { - out << std::complex( c.real(), c.imag() ); + out << std::complex( c.real(), c.imag() ); return out; } @@ -215,14 +215,14 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu #endif { // --- Type definitions (complex type: cxtype) -#ifdef __CUDACC__ // cuda +#ifdef MGONGPUCPP_GPUIMPL // cuda #if defined MGONGPU_CUCXTYPE_THRUST typedef thrust::complex cxtype; #elif defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -255,7 +255,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -307,7 +307,7 @@ namespace mg5amcCpu //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust +#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust //------------------------------ // CUDA - using thrust::complex @@ -343,11 +343,11 @@ namespace mg5amcCpu return c; } -#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST +#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex +#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex //------------------------------ // CUDA - using cuComplex @@ -562,11 +562,11 @@ namespace mg5amcCpu return cxmake( c.real(), c.imag() ); } -#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX //========================================================================== -#if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex +#if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex //------------------------------ // C++ - using std::complex @@ -610,7 +610,7 @@ namespace mg5amcCpu } #endif -#endif // #if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX +#endif // #if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX //========================================================================== @@ -633,7 +633,7 @@ namespace mg5amcCpu //========================================================================== // 
NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.mad/src/mgOnGpuFptypes.h b/epochX/cudacpp/gg_tt.mad/src/mgOnGpuFptypes.h index 905c97d700..fa3a02664b 100644 --- a/epochX/cudacpp/gg_tt.mad/src/mgOnGpuFptypes.h +++ b/epochX/cudacpp/gg_tt.mad/src/mgOnGpuFptypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUFPTYPES_H #define MGONGPUFPTYPES_H 1 @@ -12,7 +12,7 @@ #include // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // cuda namespace mg5amcGpu #else namespace mg5amcCpu @@ -20,7 +20,7 @@ namespace mg5amcCpu { //========================================================================== -#ifdef __CUDACC__ // cuda +#ifdef MGONGPUCPP_GPUIMPL // cuda //------------------------------ // Floating point types - Cuda @@ -64,11 +64,11 @@ namespace mg5amcCpu #endif } -#endif // #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //------------------------------ // Floating point types - C++ @@ -92,7 +92,7 @@ namespace mg5amcCpu return std::sqrt( f ); } -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== diff --git a/epochX/cudacpp/gg_tt.mad/src/mgOnGpuVectors.h b/epochX/cudacpp/gg_tt.mad/src/mgOnGpuVectors.h index e1299ba81e..cdae04326b 100644 --- a/epochX/cudacpp/gg_tt.mad/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/gg_tt.mad/src/mgOnGpuVectors.h @@ -32,7 +32,7 @@ #endif // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -131,7 +131,7 @@ namespace mg5amcCpu #endif #endif -#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef __CUDACC__) +#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef MGONGPUCPP_GPUIMPL) const int neppV = 1; @@ -153,13 +153,13 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu #endif { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Printout to stream for user defined types @@ -805,11 +805,11 @@ namespace mg5amcCpu #endif // #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //------------------------------ // Vector types - CUDA @@ -853,12 +853,12 @@ namespace mg5amcCpu return mask; } -#endif 
// #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== // Scalar-or-vector types: scalar in CUDA, vector or scalar in C++ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL typedef bool bool_sv; typedef fptype fptype_sv; typedef fptype2 fptype2_sv; @@ -879,7 +879,7 @@ namespace mg5amcCpu #endif // Scalar-or-vector zeros: scalar in CUDA, vector or scalar in C++ -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ inline __host__ __device__ cxtype cxzero_sv(){ return cxtype( 0, 0 ); } #elif defined MGONGPU_CPPSIMD inline cxtype_v cxzero_sv() { return cxtype_v(); } // RRRR=0000 IIII=0000 diff --git a/epochX/cudacpp/gg_tt.mad/src/rambo.h b/epochX/cudacpp/gg_tt.mad/src/rambo.h index e02ea52496..cd7e1008ea 100644 --- a/epochX/cudacpp/gg_tt.mad/src/rambo.h +++ b/epochX/cudacpp/gg_tt.mad/src/rambo.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -18,7 +18,7 @@ #include // Simplified rambo version for 2 to N (with N>=2) processes with massless particles -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +83,7 @@ namespace mg5amcCpu static bool first = true; if( first ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if constexpr( M_ACCESS::isOnDevice() ) // avoid { const int ievt0 = 0; @@ -166,7 +166,7 @@ namespace mg5amcCpu wt = po2log; if( nparf != 2 ) wt = ( 2. * nparf - 4. ) * log( energy ) + z[nparf - 1]; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // issue warnings if weight is too small or too large static int iwarn[5] = { 0, 0, 0, 0, 0 }; if( wt < -180. 
) From 464703b6f6e96f7b3585663e41ec435b709e2cc5 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Thu, 25 Jan 2024 18:08:40 +0100 Subject: [PATCH 37/96] [jt774] *** COMPLETE SYNC OF JTHIP24 AND JT774 *** regenerate all processes - add to repo Gpu*.h when missing *** NB Now all processes in the repo are the same as in jthip24 (including codegen logs as I copied those of jt774 to jthip24) *** *** NB Now jthip24 is identical to jt774, except that jthip24 also contains extra files in .github/workflows and in tools for CI and profiling *** --- .../ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt | 22 +- epochX/cudacpp/ee_mumu.mad/COPYRIGHT | 1 + .../cudacpp/ee_mumu.mad/SubProcesses/Bridge.h | 32 +-- .../ee_mumu.mad/SubProcesses/BridgeKernels.cc | 9 +- .../ee_mumu.mad/SubProcesses/BridgeKernels.h | 8 +- .../SubProcesses/CommonRandomNumberKernel.cc | 5 +- .../SubProcesses/CrossSectionKernels.cc | 7 +- .../SubProcesses/CrossSectionKernels.h | 6 +- .../SubProcesses/CurandRandomNumberKernel.cc | 12 +- .../SubProcesses/EventStatistics.h | 4 +- .../ee_mumu.mad/SubProcesses/GpuAbstraction.h | 71 ++++++ .../{CudaRuntime.h => GpuRuntime.h} | 54 ++-- .../ee_mumu.mad/SubProcesses/MadgraphTest.h | 8 +- .../SubProcesses/MatrixElementKernels.cc | 26 +- .../SubProcesses/MatrixElementKernels.h | 8 +- .../SubProcesses/MemoryAccessAmplitudes.h | 2 +- .../SubProcesses/MemoryAccessCouplings.h | 2 +- .../SubProcesses/MemoryAccessCouplingsFixed.h | 2 +- .../SubProcesses/MemoryAccessDenominators.h | 2 +- .../ee_mumu.mad/SubProcesses/MemoryAccessGs.h | 2 +- .../SubProcesses/MemoryAccessHelpers.h | 4 +- .../SubProcesses/MemoryAccessMatrixElements.h | 2 +- .../SubProcesses/MemoryAccessMomenta.h | 6 +- .../SubProcesses/MemoryAccessNumerators.h | 2 +- .../SubProcesses/MemoryAccessRandomNumbers.h | 4 +- .../SubProcesses/MemoryAccessVectors.h | 4 +- .../SubProcesses/MemoryAccessWavefunctions.h | 2 +- .../ee_mumu.mad/SubProcesses/MemoryBuffers.h | 64 ++--- .../SubProcesses/P1_epem_mupmum/CPPProcess.cc | 62 ++--- .../SubProcesses/P1_epem_mupmum/CPPProcess.h | 10 +- .../SubProcesses/P1_epem_mupmum/CudaRuntime.h | 1 - .../P1_epem_mupmum/GpuAbstraction.h | 1 + .../SubProcesses/P1_epem_mupmum/GpuRuntime.h | 1 + .../SubProcesses/P1_epem_mupmum/check_sa.cc | 111 +++++---- .../SubProcesses/RamboSamplingKernels.cc | 20 +- .../SubProcesses/RamboSamplingKernels.h | 6 +- .../SubProcesses/RandomNumberKernels.h | 6 +- .../ee_mumu.mad/SubProcesses/cudacpp.mk | 232 +++++++++++------- .../ee_mumu.mad/SubProcesses/fbridge.cc | 16 +- .../ee_mumu.mad/SubProcesses/fsampler.cc | 8 +- .../ee_mumu.mad/SubProcesses/runTest.cc | 12 +- .../ee_mumu.mad/SubProcesses/testmisc.cc | 8 +- .../ee_mumu.mad/SubProcesses/testxxx.cc | 14 +- epochX/cudacpp/ee_mumu.mad/src/HelAmps_sm.h | 4 +- .../cudacpp/ee_mumu.mad/src/Parameters_sm.cc | 4 +- .../cudacpp/ee_mumu.mad/src/Parameters_sm.h | 10 +- epochX/cudacpp/ee_mumu.mad/src/cudacpp_src.mk | 23 +- .../cudacpp/ee_mumu.mad/src/mgOnGpuConfig.h | 73 ++++-- .../cudacpp/ee_mumu.mad/src/mgOnGpuCxtypes.h | 28 +-- .../cudacpp/ee_mumu.mad/src/mgOnGpuFptypes.h | 12 +- .../cudacpp/ee_mumu.mad/src/mgOnGpuVectors.h | 18 +- epochX/cudacpp/ee_mumu.mad/src/rambo.h | 8 +- .../CODEGEN_cudacpp_ee_mumu_log.txt | 10 +- epochX/cudacpp/ee_mumu.sa/COPYRIGHT | 1 + .../cudacpp/ee_mumu.sa/SubProcesses/Bridge.h | 32 +-- .../ee_mumu.sa/SubProcesses/BridgeKernels.cc | 9 +- .../ee_mumu.sa/SubProcesses/BridgeKernels.h | 8 +- .../SubProcesses/CommonRandomNumberKernel.cc | 5 +- .../SubProcesses/CrossSectionKernels.cc | 7 +- 
.../SubProcesses/CrossSectionKernels.h | 6 +- .../SubProcesses/CurandRandomNumberKernel.cc | 12 +- .../ee_mumu.sa/SubProcesses/EventStatistics.h | 4 +- .../ee_mumu.sa/SubProcesses/GpuAbstraction.h | 71 ++++++ .../SubProcesses/GpuRuntime.h} | 54 ++-- .../ee_mumu.sa/SubProcesses/MadgraphTest.h | 8 +- .../SubProcesses/MatrixElementKernels.cc | 26 +- .../SubProcesses/MatrixElementKernels.h | 8 +- .../SubProcesses/MemoryAccessAmplitudes.h | 2 +- .../SubProcesses/MemoryAccessCouplings.h | 2 +- .../SubProcesses/MemoryAccessCouplingsFixed.h | 2 +- .../SubProcesses/MemoryAccessDenominators.h | 2 +- .../ee_mumu.sa/SubProcesses/MemoryAccessGs.h | 2 +- .../SubProcesses/MemoryAccessHelpers.h | 4 +- .../SubProcesses/MemoryAccessMatrixElements.h | 2 +- .../SubProcesses/MemoryAccessMomenta.h | 6 +- .../SubProcesses/MemoryAccessNumerators.h | 2 +- .../SubProcesses/MemoryAccessRandomNumbers.h | 4 +- .../SubProcesses/MemoryAccessVectors.h | 4 +- .../SubProcesses/MemoryAccessWavefunctions.h | 2 +- .../ee_mumu.sa/SubProcesses/MemoryBuffers.h | 64 ++--- .../P1_Sigma_sm_epem_mupmum/CPPProcess.cc | 62 ++--- .../P1_Sigma_sm_epem_mupmum/CPPProcess.h | 10 +- .../P1_Sigma_sm_epem_mupmum/CudaRuntime.h | 1 - .../P1_Sigma_sm_epem_mupmum/GpuAbstraction.h | 1 + .../P1_Sigma_sm_epem_mupmum/GpuRuntime.h | 1 + .../P1_Sigma_sm_epem_mupmum/check_sa.cc | 111 +++++---- .../SubProcesses/RamboSamplingKernels.cc | 20 +- .../SubProcesses/RamboSamplingKernels.h | 6 +- .../SubProcesses/RandomNumberKernels.h | 6 +- .../ee_mumu.sa/SubProcesses/cudacpp.mk | 232 +++++++++++------- .../ee_mumu.sa/SubProcesses/fbridge.cc | 16 +- .../ee_mumu.sa/SubProcesses/fsampler.cc | 8 +- .../ee_mumu.sa/SubProcesses/runTest.cc | 12 +- .../ee_mumu.sa/SubProcesses/testmisc.cc | 8 +- .../ee_mumu.sa/SubProcesses/testxxx.cc | 14 +- epochX/cudacpp/ee_mumu.sa/src/HelAmps_sm.h | 4 +- .../cudacpp/ee_mumu.sa/src/Parameters_sm.cc | 4 +- epochX/cudacpp/ee_mumu.sa/src/Parameters_sm.h | 10 +- epochX/cudacpp/ee_mumu.sa/src/cudacpp_src.mk | 23 +- epochX/cudacpp/ee_mumu.sa/src/mgOnGpuConfig.h | 73 ++++-- .../cudacpp/ee_mumu.sa/src/mgOnGpuCxtypes.h | 28 +-- .../cudacpp/ee_mumu.sa/src/mgOnGpuFptypes.h | 12 +- .../cudacpp/ee_mumu.sa/src/mgOnGpuVectors.h | 18 +- epochX/cudacpp/ee_mumu.sa/src/rambo.h | 8 +- .../gg_tt.mad/CODEGEN_mad_gg_tt_log.txt | 14 +- .../gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt | 12 +- epochX/cudacpp/gg_tt.sa/COPYRIGHT | 1 + epochX/cudacpp/gg_tt.sa/SubProcesses/Bridge.h | 32 +-- .../gg_tt.sa/SubProcesses/BridgeKernels.cc | 9 +- .../gg_tt.sa/SubProcesses/BridgeKernels.h | 8 +- .../SubProcesses/CommonRandomNumberKernel.cc | 5 +- .../SubProcesses/CrossSectionKernels.cc | 7 +- .../SubProcesses/CrossSectionKernels.h | 6 +- .../SubProcesses/CurandRandomNumberKernel.cc | 12 +- .../gg_tt.sa/SubProcesses/EventStatistics.h | 4 +- .../gg_tt.sa/SubProcesses/GpuAbstraction.h | 71 ++++++ .../SubProcesses/GpuRuntime.h} | 54 ++-- .../gg_tt.sa/SubProcesses/MadgraphTest.h | 8 +- .../SubProcesses/MatrixElementKernels.cc | 26 +- .../SubProcesses/MatrixElementKernels.h | 8 +- .../SubProcesses/MemoryAccessAmplitudes.h | 2 +- .../SubProcesses/MemoryAccessCouplings.h | 2 +- .../SubProcesses/MemoryAccessCouplingsFixed.h | 2 +- .../SubProcesses/MemoryAccessDenominators.h | 2 +- .../gg_tt.sa/SubProcesses/MemoryAccessGs.h | 2 +- .../SubProcesses/MemoryAccessHelpers.h | 4 +- .../SubProcesses/MemoryAccessMatrixElements.h | 2 +- .../SubProcesses/MemoryAccessMomenta.h | 6 +- .../SubProcesses/MemoryAccessNumerators.h | 2 +- .../SubProcesses/MemoryAccessRandomNumbers.h | 4 +- 
.../SubProcesses/MemoryAccessVectors.h | 4 +- .../SubProcesses/MemoryAccessWavefunctions.h | 2 +- .../gg_tt.sa/SubProcesses/MemoryBuffers.h | 64 ++--- .../P1_Sigma_sm_gg_ttx/CPPProcess.cc | 62 ++--- .../P1_Sigma_sm_gg_ttx/CPPProcess.h | 10 +- .../P1_Sigma_sm_gg_ttx/CudaRuntime.h | 1 - .../P1_Sigma_sm_gg_ttx/GpuAbstraction.h | 1 + .../P1_Sigma_sm_gg_ttx/GpuRuntime.h | 1 + .../P1_Sigma_sm_gg_ttx/check_sa.cc | 111 +++++---- .../SubProcesses/RamboSamplingKernels.cc | 20 +- .../SubProcesses/RamboSamplingKernels.h | 6 +- .../SubProcesses/RandomNumberKernels.h | 6 +- .../cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk | 232 +++++++++++------- .../cudacpp/gg_tt.sa/SubProcesses/fbridge.cc | 16 +- .../cudacpp/gg_tt.sa/SubProcesses/fsampler.cc | 8 +- .../cudacpp/gg_tt.sa/SubProcesses/runTest.cc | 12 +- .../cudacpp/gg_tt.sa/SubProcesses/testmisc.cc | 8 +- .../cudacpp/gg_tt.sa/SubProcesses/testxxx.cc | 14 +- epochX/cudacpp/gg_tt.sa/src/HelAmps_sm.h | 4 +- epochX/cudacpp/gg_tt.sa/src/Parameters_sm.cc | 4 +- epochX/cudacpp/gg_tt.sa/src/Parameters_sm.h | 10 +- epochX/cudacpp/gg_tt.sa/src/cudacpp_src.mk | 23 +- epochX/cudacpp/gg_tt.sa/src/mgOnGpuConfig.h | 73 ++++-- epochX/cudacpp/gg_tt.sa/src/mgOnGpuCxtypes.h | 28 +-- epochX/cudacpp/gg_tt.sa/src/mgOnGpuFptypes.h | 12 +- epochX/cudacpp/gg_tt.sa/src/mgOnGpuVectors.h | 18 +- epochX/cudacpp/gg_tt.sa/src/rambo.h | 8 +- .../gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt | 28 +-- epochX/cudacpp/gg_tt01g.mad/COPYRIGHT | 1 + .../gg_tt01g.mad/SubProcesses/Bridge.h | 32 +-- .../SubProcesses/BridgeKernels.cc | 9 +- .../gg_tt01g.mad/SubProcesses/BridgeKernels.h | 8 +- .../SubProcesses/CommonRandomNumberKernel.cc | 5 +- .../SubProcesses/CrossSectionKernels.cc | 7 +- .../SubProcesses/CrossSectionKernels.h | 6 +- .../SubProcesses/CurandRandomNumberKernel.cc | 12 +- .../SubProcesses/EventStatistics.h | 4 +- .../SubProcesses/GpuAbstraction.h | 71 ++++++ .../{CudaRuntime.h => GpuRuntime.h} | 54 ++-- .../gg_tt01g.mad/SubProcesses/MadgraphTest.h | 8 +- .../SubProcesses/MatrixElementKernels.cc | 26 +- .../SubProcesses/MatrixElementKernels.h | 8 +- .../SubProcesses/MemoryAccessAmplitudes.h | 2 +- .../SubProcesses/MemoryAccessCouplings.h | 2 +- .../SubProcesses/MemoryAccessCouplingsFixed.h | 2 +- .../SubProcesses/MemoryAccessDenominators.h | 2 +- .../SubProcesses/MemoryAccessGs.h | 2 +- .../SubProcesses/MemoryAccessHelpers.h | 4 +- .../SubProcesses/MemoryAccessMatrixElements.h | 2 +- .../SubProcesses/MemoryAccessMomenta.h | 6 +- .../SubProcesses/MemoryAccessNumerators.h | 2 +- .../SubProcesses/MemoryAccessRandomNumbers.h | 4 +- .../SubProcesses/MemoryAccessVectors.h | 4 +- .../SubProcesses/MemoryAccessWavefunctions.h | 2 +- .../gg_tt01g.mad/SubProcesses/MemoryBuffers.h | 64 ++--- .../SubProcesses/P1_gg_ttx/CPPProcess.cc | 62 ++--- .../SubProcesses/P1_gg_ttx/CPPProcess.h | 10 +- .../SubProcesses/P1_gg_ttx/CudaRuntime.h | 1 - .../SubProcesses/P1_gg_ttx/GpuAbstraction.h | 1 + .../SubProcesses/P1_gg_ttx/GpuRuntime.h | 1 + .../SubProcesses/P1_gg_ttx/check_sa.cc | 111 +++++---- .../SubProcesses/P2_gg_ttxg/CPPProcess.cc | 62 ++--- .../SubProcesses/P2_gg_ttxg/CPPProcess.h | 10 +- .../SubProcesses/P2_gg_ttxg/CudaRuntime.h | 1 - .../SubProcesses/P2_gg_ttxg/GpuAbstraction.h | 1 + .../SubProcesses/P2_gg_ttxg/GpuRuntime.h | 1 + .../SubProcesses/P2_gg_ttxg/check_sa.cc | 111 +++++---- .../SubProcesses/RamboSamplingKernels.cc | 20 +- .../SubProcesses/RamboSamplingKernels.h | 6 +- .../SubProcesses/RandomNumberKernels.h | 6 +- .../gg_tt01g.mad/SubProcesses/cudacpp.mk | 232 +++++++++++------- 
.../gg_tt01g.mad/SubProcesses/fbridge.cc | 16 +- .../gg_tt01g.mad/SubProcesses/fsampler.cc | 8 +- .../gg_tt01g.mad/SubProcesses/runTest.cc | 12 +- .../gg_tt01g.mad/SubProcesses/testmisc.cc | 8 +- .../gg_tt01g.mad/SubProcesses/testxxx.cc | 14 +- epochX/cudacpp/gg_tt01g.mad/src/HelAmps_sm.h | 4 +- .../cudacpp/gg_tt01g.mad/src/Parameters_sm.cc | 4 +- .../cudacpp/gg_tt01g.mad/src/Parameters_sm.h | 10 +- .../cudacpp/gg_tt01g.mad/src/cudacpp_src.mk | 23 +- .../cudacpp/gg_tt01g.mad/src/mgOnGpuConfig.h | 73 ++++-- .../cudacpp/gg_tt01g.mad/src/mgOnGpuCxtypes.h | 28 +-- .../cudacpp/gg_tt01g.mad/src/mgOnGpuFptypes.h | 12 +- .../cudacpp/gg_tt01g.mad/src/mgOnGpuVectors.h | 18 +- epochX/cudacpp/gg_tt01g.mad/src/rambo.h | 8 +- .../gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt | 22 +- epochX/cudacpp/gg_ttg.mad/COPYRIGHT | 1 + .../cudacpp/gg_ttg.mad/SubProcesses/Bridge.h | 32 +-- .../gg_ttg.mad/SubProcesses/BridgeKernels.cc | 9 +- .../gg_ttg.mad/SubProcesses/BridgeKernels.h | 8 +- .../SubProcesses/CommonRandomNumberKernel.cc | 5 +- .../SubProcesses/CrossSectionKernels.cc | 7 +- .../SubProcesses/CrossSectionKernels.h | 6 +- .../gg_ttg.mad/SubProcesses/CudaRuntime.h | 85 ------- .../SubProcesses/CurandRandomNumberKernel.cc | 12 +- .../gg_ttg.mad/SubProcesses/EventStatistics.h | 4 +- .../gg_ttg.mad/SubProcesses/GpuAbstraction.h | 71 ++++++ .../gg_ttg.mad/SubProcesses/GpuRuntime.h | 85 +++++++ .../gg_ttg.mad/SubProcesses/MadgraphTest.h | 8 +- .../SubProcesses/MatrixElementKernels.cc | 26 +- .../SubProcesses/MatrixElementKernels.h | 8 +- .../SubProcesses/MemoryAccessAmplitudes.h | 2 +- .../SubProcesses/MemoryAccessCouplings.h | 2 +- .../SubProcesses/MemoryAccessCouplingsFixed.h | 2 +- .../SubProcesses/MemoryAccessDenominators.h | 2 +- .../gg_ttg.mad/SubProcesses/MemoryAccessGs.h | 2 +- .../SubProcesses/MemoryAccessHelpers.h | 4 +- .../SubProcesses/MemoryAccessMatrixElements.h | 2 +- .../SubProcesses/MemoryAccessMomenta.h | 6 +- .../SubProcesses/MemoryAccessNumerators.h | 2 +- .../SubProcesses/MemoryAccessRandomNumbers.h | 4 +- .../SubProcesses/MemoryAccessVectors.h | 4 +- .../SubProcesses/MemoryAccessWavefunctions.h | 2 +- .../gg_ttg.mad/SubProcesses/MemoryBuffers.h | 64 ++--- .../SubProcesses/P1_gg_ttxg/CPPProcess.cc | 62 ++--- .../SubProcesses/P1_gg_ttxg/CPPProcess.h | 10 +- .../SubProcesses/P1_gg_ttxg/CudaRuntime.h | 1 - .../SubProcesses/P1_gg_ttxg/GpuAbstraction.h | 1 + .../SubProcesses/P1_gg_ttxg/GpuRuntime.h | 1 + .../SubProcesses/P1_gg_ttxg/check_sa.cc | 111 +++++---- .../SubProcesses/RamboSamplingKernels.cc | 20 +- .../SubProcesses/RamboSamplingKernels.h | 6 +- .../SubProcesses/RandomNumberKernels.h | 6 +- .../gg_ttg.mad/SubProcesses/cudacpp.mk | 232 +++++++++++------- .../gg_ttg.mad/SubProcesses/fbridge.cc | 16 +- .../gg_ttg.mad/SubProcesses/fsampler.cc | 8 +- .../gg_ttg.mad/SubProcesses/runTest.cc | 12 +- .../gg_ttg.mad/SubProcesses/testmisc.cc | 8 +- .../gg_ttg.mad/SubProcesses/testxxx.cc | 14 +- epochX/cudacpp/gg_ttg.mad/src/HelAmps_sm.h | 4 +- .../cudacpp/gg_ttg.mad/src/Parameters_sm.cc | 4 +- epochX/cudacpp/gg_ttg.mad/src/Parameters_sm.h | 10 +- epochX/cudacpp/gg_ttg.mad/src/cudacpp_src.mk | 23 +- epochX/cudacpp/gg_ttg.mad/src/mgOnGpuConfig.h | 73 ++++-- .../cudacpp/gg_ttg.mad/src/mgOnGpuCxtypes.h | 28 +-- .../cudacpp/gg_ttg.mad/src/mgOnGpuFptypes.h | 12 +- .../cudacpp/gg_ttg.mad/src/mgOnGpuVectors.h | 18 +- epochX/cudacpp/gg_ttg.mad/src/rambo.h | 8 +- .../gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt | 18 +- epochX/cudacpp/gg_ttg.sa/COPYRIGHT | 1 + .../cudacpp/gg_ttg.sa/SubProcesses/Bridge.h | 32 +-- 
.../gg_ttg.sa/SubProcesses/BridgeKernels.cc | 9 +- .../gg_ttg.sa/SubProcesses/BridgeKernels.h | 8 +- .../SubProcesses/CommonRandomNumberKernel.cc | 5 +- .../SubProcesses/CrossSectionKernels.cc | 7 +- .../SubProcesses/CrossSectionKernels.h | 6 +- .../gg_ttg.sa/SubProcesses/CudaRuntime.h | 85 ------- .../SubProcesses/CurandRandomNumberKernel.cc | 12 +- .../gg_ttg.sa/SubProcesses/EventStatistics.h | 4 +- .../gg_ttg.sa/SubProcesses/GpuAbstraction.h | 71 ++++++ .../gg_ttg.sa/SubProcesses/GpuRuntime.h | 85 +++++++ .../gg_ttg.sa/SubProcesses/MadgraphTest.h | 8 +- .../SubProcesses/MatrixElementKernels.cc | 26 +- .../SubProcesses/MatrixElementKernels.h | 8 +- .../SubProcesses/MemoryAccessAmplitudes.h | 2 +- .../SubProcesses/MemoryAccessCouplings.h | 2 +- .../SubProcesses/MemoryAccessCouplingsFixed.h | 2 +- .../SubProcesses/MemoryAccessDenominators.h | 2 +- .../gg_ttg.sa/SubProcesses/MemoryAccessGs.h | 2 +- .../SubProcesses/MemoryAccessHelpers.h | 4 +- .../SubProcesses/MemoryAccessMatrixElements.h | 2 +- .../SubProcesses/MemoryAccessMomenta.h | 6 +- .../SubProcesses/MemoryAccessNumerators.h | 2 +- .../SubProcesses/MemoryAccessRandomNumbers.h | 4 +- .../SubProcesses/MemoryAccessVectors.h | 4 +- .../SubProcesses/MemoryAccessWavefunctions.h | 2 +- .../gg_ttg.sa/SubProcesses/MemoryBuffers.h | 64 ++--- .../P1_Sigma_sm_gg_ttxg/CPPProcess.cc | 62 ++--- .../P1_Sigma_sm_gg_ttxg/CPPProcess.h | 10 +- .../P1_Sigma_sm_gg_ttxg/CudaRuntime.h | 1 - .../P1_Sigma_sm_gg_ttxg/GpuAbstraction.h | 1 + .../P1_Sigma_sm_gg_ttxg/GpuRuntime.h | 1 + .../P1_Sigma_sm_gg_ttxg/check_sa.cc | 111 +++++---- .../SubProcesses/RamboSamplingKernels.cc | 20 +- .../SubProcesses/RamboSamplingKernels.h | 6 +- .../SubProcesses/RandomNumberKernels.h | 6 +- .../cudacpp/gg_ttg.sa/SubProcesses/cudacpp.mk | 232 +++++++++++------- .../cudacpp/gg_ttg.sa/SubProcesses/fbridge.cc | 16 +- .../gg_ttg.sa/SubProcesses/fsampler.cc | 8 +- .../cudacpp/gg_ttg.sa/SubProcesses/runTest.cc | 12 +- .../gg_ttg.sa/SubProcesses/testmisc.cc | 8 +- .../cudacpp/gg_ttg.sa/SubProcesses/testxxx.cc | 14 +- epochX/cudacpp/gg_ttg.sa/src/HelAmps_sm.h | 4 +- epochX/cudacpp/gg_ttg.sa/src/Parameters_sm.cc | 4 +- epochX/cudacpp/gg_ttg.sa/src/Parameters_sm.h | 10 +- epochX/cudacpp/gg_ttg.sa/src/cudacpp_src.mk | 23 +- epochX/cudacpp/gg_ttg.sa/src/mgOnGpuConfig.h | 73 ++++-- epochX/cudacpp/gg_ttg.sa/src/mgOnGpuCxtypes.h | 28 +-- epochX/cudacpp/gg_ttg.sa/src/mgOnGpuFptypes.h | 12 +- epochX/cudacpp/gg_ttg.sa/src/mgOnGpuVectors.h | 18 +- epochX/cudacpp/gg_ttg.sa/src/rambo.h | 8 +- .../gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt | 28 +-- epochX/cudacpp/gg_ttgg.mad/COPYRIGHT | 1 + .../cudacpp/gg_ttgg.mad/SubProcesses/Bridge.h | 32 +-- .../gg_ttgg.mad/SubProcesses/BridgeKernels.cc | 9 +- .../gg_ttgg.mad/SubProcesses/BridgeKernels.h | 8 +- .../SubProcesses/CommonRandomNumberKernel.cc | 5 +- .../SubProcesses/CrossSectionKernels.cc | 7 +- .../SubProcesses/CrossSectionKernels.h | 6 +- .../gg_ttgg.mad/SubProcesses/CudaRuntime.h | 85 ------- .../SubProcesses/CurandRandomNumberKernel.cc | 12 +- .../SubProcesses/EventStatistics.h | 4 +- .../gg_ttgg.mad/SubProcesses/GpuAbstraction.h | 32 +-- .../gg_ttgg.mad/SubProcesses/GpuRuntime.h | 85 +++++++ .../gg_ttgg.mad/SubProcesses/MadgraphTest.h | 8 +- .../SubProcesses/MatrixElementKernels.cc | 22 +- .../SubProcesses/MatrixElementKernels.h | 8 +- .../SubProcesses/MemoryAccessAmplitudes.h | 2 +- .../SubProcesses/MemoryAccessCouplings.h | 2 +- .../SubProcesses/MemoryAccessCouplingsFixed.h | 2 +- .../SubProcesses/MemoryAccessDenominators.h | 2 +- 
.../gg_ttgg.mad/SubProcesses/MemoryAccessGs.h | 2 +- .../SubProcesses/MemoryAccessHelpers.h | 4 +- .../SubProcesses/MemoryAccessMatrixElements.h | 2 +- .../SubProcesses/MemoryAccessMomenta.h | 6 +- .../SubProcesses/MemoryAccessNumerators.h | 2 +- .../SubProcesses/MemoryAccessRandomNumbers.h | 4 +- .../SubProcesses/MemoryAccessVectors.h | 4 +- .../SubProcesses/MemoryAccessWavefunctions.h | 2 +- .../gg_ttgg.mad/SubProcesses/MemoryBuffers.h | 64 ++--- .../SubProcesses/P1_gg_ttxgg/CPPProcess.cc | 62 ++--- .../SubProcesses/P1_gg_ttxgg/CPPProcess.h | 10 +- .../SubProcesses/P1_gg_ttxgg/CudaRuntime.h | 1 - .../SubProcesses/P1_gg_ttxgg/GpuAbstraction.h | 1 + .../SubProcesses/P1_gg_ttxgg/GpuRuntime.h | 1 + .../SubProcesses/P1_gg_ttxgg/check_sa.cc | 107 ++++---- .../SubProcesses/RamboSamplingKernels.cc | 20 +- .../SubProcesses/RamboSamplingKernels.h | 6 +- .../SubProcesses/RandomNumberKernels.h | 6 +- .../gg_ttgg.mad/SubProcesses/cudacpp.mk | 232 +++++++++++------- .../gg_ttgg.mad/SubProcesses/fbridge.cc | 16 +- .../gg_ttgg.mad/SubProcesses/fsampler.cc | 8 +- .../gg_ttgg.mad/SubProcesses/runTest.cc | 10 +- .../gg_ttgg.mad/SubProcesses/testmisc.cc | 8 +- .../gg_ttgg.mad/SubProcesses/testxxx.cc | 14 +- epochX/cudacpp/gg_ttgg.mad/src/HelAmps_sm.h | 4 +- .../cudacpp/gg_ttgg.mad/src/Parameters_sm.cc | 4 +- .../cudacpp/gg_ttgg.mad/src/Parameters_sm.h | 10 +- epochX/cudacpp/gg_ttgg.mad/src/cudacpp_src.mk | 23 +- .../cudacpp/gg_ttgg.mad/src/mgOnGpuConfig.h | 71 ++++-- .../cudacpp/gg_ttgg.mad/src/mgOnGpuCxtypes.h | 28 +-- .../cudacpp/gg_ttgg.mad/src/mgOnGpuFptypes.h | 12 +- .../cudacpp/gg_ttgg.mad/src/mgOnGpuVectors.h | 20 +- epochX/cudacpp/gg_ttgg.mad/src/rambo.h | 8 +- .../CODEGEN_cudacpp_gg_ttgg_log.txt | 18 +- epochX/cudacpp/gg_ttgg.sa/COPYRIGHT | 1 + .../cudacpp/gg_ttgg.sa/SubProcesses/Bridge.h | 32 +-- .../gg_ttgg.sa/SubProcesses/BridgeKernels.cc | 9 +- .../gg_ttgg.sa/SubProcesses/BridgeKernels.h | 8 +- .../SubProcesses/CommonRandomNumberKernel.cc | 5 +- .../SubProcesses/CrossSectionKernels.cc | 7 +- .../SubProcesses/CrossSectionKernels.h | 6 +- .../gg_ttgg.sa/SubProcesses/CudaRuntime.h | 85 ------- .../SubProcesses/CurandRandomNumberKernel.cc | 12 +- .../gg_ttgg.sa/SubProcesses/EventStatistics.h | 4 +- .../gg_ttgg.sa/SubProcesses/GpuAbstraction.h | 71 ++++++ .../gg_ttgg.sa/SubProcesses/GpuRuntime.h | 85 +++++++ .../gg_ttgg.sa/SubProcesses/MadgraphTest.h | 8 +- .../SubProcesses/MatrixElementKernels.cc | 26 +- .../SubProcesses/MatrixElementKernels.h | 8 +- .../SubProcesses/MemoryAccessAmplitudes.h | 2 +- .../SubProcesses/MemoryAccessCouplings.h | 2 +- .../SubProcesses/MemoryAccessCouplingsFixed.h | 2 +- .../SubProcesses/MemoryAccessDenominators.h | 2 +- .../gg_ttgg.sa/SubProcesses/MemoryAccessGs.h | 2 +- .../SubProcesses/MemoryAccessHelpers.h | 4 +- .../SubProcesses/MemoryAccessMatrixElements.h | 2 +- .../SubProcesses/MemoryAccessMomenta.h | 6 +- .../SubProcesses/MemoryAccessNumerators.h | 2 +- .../SubProcesses/MemoryAccessRandomNumbers.h | 4 +- .../SubProcesses/MemoryAccessVectors.h | 4 +- .../SubProcesses/MemoryAccessWavefunctions.h | 2 +- .../gg_ttgg.sa/SubProcesses/MemoryBuffers.h | 64 ++--- .../P1_Sigma_sm_gg_ttxgg/CPPProcess.cc | 62 ++--- .../P1_Sigma_sm_gg_ttxgg/CPPProcess.h | 10 +- .../P1_Sigma_sm_gg_ttxgg/CudaRuntime.h | 1 - .../P1_Sigma_sm_gg_ttxgg/GpuAbstraction.h | 1 + .../P1_Sigma_sm_gg_ttxgg/GpuRuntime.h | 1 + .../P1_Sigma_sm_gg_ttxgg/check_sa.cc | 111 +++++---- .../SubProcesses/RamboSamplingKernels.cc | 20 +- .../SubProcesses/RamboSamplingKernels.h | 6 +- 
.../SubProcesses/RandomNumberKernels.h | 6 +- .../gg_ttgg.sa/SubProcesses/cudacpp.mk | 232 +++++++++++------- .../gg_ttgg.sa/SubProcesses/fbridge.cc | 16 +- .../gg_ttgg.sa/SubProcesses/fsampler.cc | 8 +- .../gg_ttgg.sa/SubProcesses/runTest.cc | 12 +- .../gg_ttgg.sa/SubProcesses/testmisc.cc | 8 +- .../gg_ttgg.sa/SubProcesses/testxxx.cc | 14 +- epochX/cudacpp/gg_ttgg.sa/src/HelAmps_sm.h | 4 +- .../cudacpp/gg_ttgg.sa/src/Parameters_sm.cc | 4 +- epochX/cudacpp/gg_ttgg.sa/src/Parameters_sm.h | 10 +- epochX/cudacpp/gg_ttgg.sa/src/cudacpp_src.mk | 23 +- epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuConfig.h | 73 ++++-- .../cudacpp/gg_ttgg.sa/src/mgOnGpuCxtypes.h | 28 +-- .../cudacpp/gg_ttgg.sa/src/mgOnGpuFptypes.h | 12 +- .../cudacpp/gg_ttgg.sa/src/mgOnGpuVectors.h | 18 +- epochX/cudacpp/gg_ttgg.sa/src/rambo.h | 8 +- .../gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt | 26 +- epochX/cudacpp/gg_ttggg.mad/COPYRIGHT | 1 + .../gg_ttggg.mad/SubProcesses/Bridge.h | 32 +-- .../SubProcesses/BridgeKernels.cc | 9 +- .../gg_ttggg.mad/SubProcesses/BridgeKernels.h | 8 +- .../SubProcesses/CommonRandomNumberKernel.cc | 5 +- .../SubProcesses/CrossSectionKernels.cc | 7 +- .../SubProcesses/CrossSectionKernels.h | 6 +- .../gg_ttggg.mad/SubProcesses/CudaRuntime.h | 85 ------- .../SubProcesses/CurandRandomNumberKernel.cc | 12 +- .../SubProcesses/EventStatistics.h | 4 +- .../SubProcesses/GpuAbstraction.h | 71 ++++++ .../gg_ttggg.mad/SubProcesses/GpuRuntime.h | 85 +++++++ .../gg_ttggg.mad/SubProcesses/MadgraphTest.h | 8 +- .../SubProcesses/MatrixElementKernels.cc | 26 +- .../SubProcesses/MatrixElementKernels.h | 8 +- .../SubProcesses/MemoryAccessAmplitudes.h | 2 +- .../SubProcesses/MemoryAccessCouplings.h | 2 +- .../SubProcesses/MemoryAccessCouplingsFixed.h | 2 +- .../SubProcesses/MemoryAccessDenominators.h | 2 +- .../SubProcesses/MemoryAccessGs.h | 2 +- .../SubProcesses/MemoryAccessHelpers.h | 4 +- .../SubProcesses/MemoryAccessMatrixElements.h | 2 +- .../SubProcesses/MemoryAccessMomenta.h | 6 +- .../SubProcesses/MemoryAccessNumerators.h | 2 +- .../SubProcesses/MemoryAccessRandomNumbers.h | 4 +- .../SubProcesses/MemoryAccessVectors.h | 4 +- .../SubProcesses/MemoryAccessWavefunctions.h | 2 +- .../gg_ttggg.mad/SubProcesses/MemoryBuffers.h | 64 ++--- .../SubProcesses/P1_gg_ttxggg/CPPProcess.cc | 62 ++--- .../SubProcesses/P1_gg_ttxggg/CPPProcess.h | 10 +- .../SubProcesses/P1_gg_ttxggg/CudaRuntime.h | 1 - .../P1_gg_ttxggg/GpuAbstraction.h | 1 + .../SubProcesses/P1_gg_ttxggg/GpuRuntime.h | 1 + .../SubProcesses/P1_gg_ttxggg/check_sa.cc | 111 +++++---- .../SubProcesses/RamboSamplingKernels.cc | 20 +- .../SubProcesses/RamboSamplingKernels.h | 6 +- .../SubProcesses/RandomNumberKernels.h | 6 +- .../gg_ttggg.mad/SubProcesses/cudacpp.mk | 232 +++++++++++------- .../gg_ttggg.mad/SubProcesses/fbridge.cc | 16 +- .../gg_ttggg.mad/SubProcesses/fsampler.cc | 8 +- .../gg_ttggg.mad/SubProcesses/runTest.cc | 12 +- .../gg_ttggg.mad/SubProcesses/testmisc.cc | 8 +- .../gg_ttggg.mad/SubProcesses/testxxx.cc | 14 +- epochX/cudacpp/gg_ttggg.mad/src/HelAmps_sm.h | 4 +- .../cudacpp/gg_ttggg.mad/src/Parameters_sm.cc | 4 +- .../cudacpp/gg_ttggg.mad/src/Parameters_sm.h | 10 +- .../cudacpp/gg_ttggg.mad/src/cudacpp_src.mk | 23 +- .../cudacpp/gg_ttggg.mad/src/mgOnGpuConfig.h | 73 ++++-- .../cudacpp/gg_ttggg.mad/src/mgOnGpuCxtypes.h | 28 +-- .../cudacpp/gg_ttggg.mad/src/mgOnGpuFptypes.h | 12 +- .../cudacpp/gg_ttggg.mad/src/mgOnGpuVectors.h | 18 +- epochX/cudacpp/gg_ttggg.mad/src/rambo.h | 8 +- .../CODEGEN_cudacpp_gg_ttggg_log.txt | 16 +- 
epochX/cudacpp/gg_ttggg.sa/COPYRIGHT | 1 + .../cudacpp/gg_ttggg.sa/SubProcesses/Bridge.h | 32 +-- .../gg_ttggg.sa/SubProcesses/BridgeKernels.cc | 9 +- .../gg_ttggg.sa/SubProcesses/BridgeKernels.h | 8 +- .../SubProcesses/CommonRandomNumberKernel.cc | 5 +- .../SubProcesses/CrossSectionKernels.cc | 7 +- .../SubProcesses/CrossSectionKernels.h | 6 +- .../gg_ttggg.sa/SubProcesses/CudaRuntime.h | 85 ------- .../SubProcesses/CurandRandomNumberKernel.cc | 12 +- .../SubProcesses/EventStatistics.h | 4 +- .../gg_ttggg.sa/SubProcesses/GpuAbstraction.h | 71 ++++++ .../gg_ttggg.sa/SubProcesses/GpuRuntime.h | 85 +++++++ .../gg_ttggg.sa/SubProcesses/MadgraphTest.h | 8 +- .../SubProcesses/MatrixElementKernels.cc | 26 +- .../SubProcesses/MatrixElementKernels.h | 8 +- .../SubProcesses/MemoryAccessAmplitudes.h | 2 +- .../SubProcesses/MemoryAccessCouplings.h | 2 +- .../SubProcesses/MemoryAccessCouplingsFixed.h | 2 +- .../SubProcesses/MemoryAccessDenominators.h | 2 +- .../gg_ttggg.sa/SubProcesses/MemoryAccessGs.h | 2 +- .../SubProcesses/MemoryAccessHelpers.h | 4 +- .../SubProcesses/MemoryAccessMatrixElements.h | 2 +- .../SubProcesses/MemoryAccessMomenta.h | 6 +- .../SubProcesses/MemoryAccessNumerators.h | 2 +- .../SubProcesses/MemoryAccessRandomNumbers.h | 4 +- .../SubProcesses/MemoryAccessVectors.h | 4 +- .../SubProcesses/MemoryAccessWavefunctions.h | 2 +- .../gg_ttggg.sa/SubProcesses/MemoryBuffers.h | 64 ++--- .../P1_Sigma_sm_gg_ttxggg/CPPProcess.cc | 62 ++--- .../P1_Sigma_sm_gg_ttxggg/CPPProcess.h | 10 +- .../P1_Sigma_sm_gg_ttxggg/CudaRuntime.h | 1 - .../P1_Sigma_sm_gg_ttxggg/GpuAbstraction.h | 1 + .../P1_Sigma_sm_gg_ttxggg/GpuRuntime.h | 1 + .../P1_Sigma_sm_gg_ttxggg/check_sa.cc | 111 +++++---- .../SubProcesses/RamboSamplingKernels.cc | 20 +- .../SubProcesses/RamboSamplingKernels.h | 6 +- .../SubProcesses/RandomNumberKernels.h | 6 +- .../gg_ttggg.sa/SubProcesses/cudacpp.mk | 232 +++++++++++------- .../gg_ttggg.sa/SubProcesses/fbridge.cc | 16 +- .../gg_ttggg.sa/SubProcesses/fsampler.cc | 8 +- .../gg_ttggg.sa/SubProcesses/runTest.cc | 12 +- .../gg_ttggg.sa/SubProcesses/testmisc.cc | 8 +- .../gg_ttggg.sa/SubProcesses/testxxx.cc | 14 +- epochX/cudacpp/gg_ttggg.sa/src/HelAmps_sm.h | 4 +- .../cudacpp/gg_ttggg.sa/src/Parameters_sm.cc | 4 +- .../cudacpp/gg_ttggg.sa/src/Parameters_sm.h | 10 +- epochX/cudacpp/gg_ttggg.sa/src/cudacpp_src.mk | 23 +- .../cudacpp/gg_ttggg.sa/src/mgOnGpuConfig.h | 73 ++++-- .../cudacpp/gg_ttggg.sa/src/mgOnGpuCxtypes.h | 28 +-- .../cudacpp/gg_ttggg.sa/src/mgOnGpuFptypes.h | 12 +- .../cudacpp/gg_ttggg.sa/src/mgOnGpuVectors.h | 18 +- epochX/cudacpp/gg_ttggg.sa/src/rambo.h | 8 +- .../gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt | 30 +-- epochX/cudacpp/gq_ttq.mad/COPYRIGHT | 1 + .../cudacpp/gq_ttq.mad/SubProcesses/Bridge.h | 32 +-- .../gq_ttq.mad/SubProcesses/BridgeKernels.cc | 9 +- .../gq_ttq.mad/SubProcesses/BridgeKernels.h | 8 +- .../SubProcesses/CommonRandomNumberKernel.cc | 5 +- .../SubProcesses/CrossSectionKernels.cc | 7 +- .../SubProcesses/CrossSectionKernels.h | 6 +- .../gq_ttq.mad/SubProcesses/CudaRuntime.h | 85 ------- .../SubProcesses/CurandRandomNumberKernel.cc | 12 +- .../gq_ttq.mad/SubProcesses/EventStatistics.h | 4 +- .../gq_ttq.mad/SubProcesses/GpuAbstraction.h | 71 ++++++ .../gq_ttq.mad/SubProcesses/GpuRuntime.h | 85 +++++++ .../gq_ttq.mad/SubProcesses/MadgraphTest.h | 8 +- .../SubProcesses/MatrixElementKernels.cc | 26 +- .../SubProcesses/MatrixElementKernels.h | 8 +- .../SubProcesses/MemoryAccessAmplitudes.h | 2 +- .../SubProcesses/MemoryAccessCouplings.h | 2 +- 
.../SubProcesses/MemoryAccessCouplingsFixed.h | 2 +- .../SubProcesses/MemoryAccessDenominators.h | 2 +- .../gq_ttq.mad/SubProcesses/MemoryAccessGs.h | 2 +- .../SubProcesses/MemoryAccessHelpers.h | 4 +- .../SubProcesses/MemoryAccessMatrixElements.h | 2 +- .../SubProcesses/MemoryAccessMomenta.h | 6 +- .../SubProcesses/MemoryAccessNumerators.h | 2 +- .../SubProcesses/MemoryAccessRandomNumbers.h | 4 +- .../SubProcesses/MemoryAccessVectors.h | 4 +- .../SubProcesses/MemoryAccessWavefunctions.h | 2 +- .../gq_ttq.mad/SubProcesses/MemoryBuffers.h | 64 ++--- .../SubProcesses/P1_gu_ttxu/CPPProcess.cc | 62 ++--- .../SubProcesses/P1_gu_ttxu/CPPProcess.h | 10 +- .../SubProcesses/P1_gu_ttxu/CudaRuntime.h | 1 - .../SubProcesses/P1_gu_ttxu/GpuAbstraction.h | 1 + .../SubProcesses/P1_gu_ttxu/GpuRuntime.h | 1 + .../SubProcesses/P1_gu_ttxu/check_sa.cc | 111 +++++---- .../SubProcesses/P1_gux_ttxux/CPPProcess.cc | 62 ++--- .../SubProcesses/P1_gux_ttxux/CPPProcess.h | 10 +- .../SubProcesses/P1_gux_ttxux/CudaRuntime.h | 1 - .../P1_gux_ttxux/GpuAbstraction.h | 1 + .../SubProcesses/P1_gux_ttxux/GpuRuntime.h | 1 + .../SubProcesses/P1_gux_ttxux/check_sa.cc | 111 +++++---- .../SubProcesses/RamboSamplingKernels.cc | 20 +- .../SubProcesses/RamboSamplingKernels.h | 6 +- .../SubProcesses/RandomNumberKernels.h | 6 +- .../gq_ttq.mad/SubProcesses/cudacpp.mk | 232 +++++++++++------- .../gq_ttq.mad/SubProcesses/fbridge.cc | 16 +- .../gq_ttq.mad/SubProcesses/fsampler.cc | 8 +- .../gq_ttq.mad/SubProcesses/runTest.cc | 12 +- .../gq_ttq.mad/SubProcesses/testmisc.cc | 8 +- .../gq_ttq.mad/SubProcesses/testxxx.cc | 14 +- epochX/cudacpp/gq_ttq.mad/src/HelAmps_sm.h | 4 +- .../cudacpp/gq_ttq.mad/src/Parameters_sm.cc | 4 +- epochX/cudacpp/gq_ttq.mad/src/Parameters_sm.h | 10 +- epochX/cudacpp/gq_ttq.mad/src/cudacpp_src.mk | 23 +- epochX/cudacpp/gq_ttq.mad/src/mgOnGpuConfig.h | 73 ++++-- .../cudacpp/gq_ttq.mad/src/mgOnGpuCxtypes.h | 28 +-- .../cudacpp/gq_ttq.mad/src/mgOnGpuFptypes.h | 12 +- .../cudacpp/gq_ttq.mad/src/mgOnGpuVectors.h | 18 +- epochX/cudacpp/gq_ttq.mad/src/rambo.h | 8 +- .../gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt | 16 +- epochX/cudacpp/gq_ttq.sa/COPYRIGHT | 1 + .../cudacpp/gq_ttq.sa/SubProcesses/Bridge.h | 32 +-- .../gq_ttq.sa/SubProcesses/BridgeKernels.cc | 9 +- .../gq_ttq.sa/SubProcesses/BridgeKernels.h | 8 +- .../SubProcesses/CommonRandomNumberKernel.cc | 5 +- .../SubProcesses/CrossSectionKernels.cc | 7 +- .../SubProcesses/CrossSectionKernels.h | 6 +- .../gq_ttq.sa/SubProcesses/CudaRuntime.h | 85 ------- .../SubProcesses/CurandRandomNumberKernel.cc | 12 +- .../gq_ttq.sa/SubProcesses/EventStatistics.h | 4 +- .../gq_ttq.sa/SubProcesses/GpuAbstraction.h | 71 ++++++ .../gq_ttq.sa/SubProcesses/GpuRuntime.h | 85 +++++++ .../gq_ttq.sa/SubProcesses/MadgraphTest.h | 8 +- .../SubProcesses/MatrixElementKernels.cc | 26 +- .../SubProcesses/MatrixElementKernels.h | 8 +- .../SubProcesses/MemoryAccessAmplitudes.h | 2 +- .../SubProcesses/MemoryAccessCouplings.h | 2 +- .../SubProcesses/MemoryAccessCouplingsFixed.h | 2 +- .../SubProcesses/MemoryAccessDenominators.h | 2 +- .../gq_ttq.sa/SubProcesses/MemoryAccessGs.h | 2 +- .../SubProcesses/MemoryAccessHelpers.h | 4 +- .../SubProcesses/MemoryAccessMatrixElements.h | 2 +- .../SubProcesses/MemoryAccessMomenta.h | 6 +- .../SubProcesses/MemoryAccessNumerators.h | 2 +- .../SubProcesses/MemoryAccessRandomNumbers.h | 4 +- .../SubProcesses/MemoryAccessVectors.h | 4 +- .../SubProcesses/MemoryAccessWavefunctions.h | 2 +- .../gq_ttq.sa/SubProcesses/MemoryBuffers.h | 64 ++--- 
.../P1_Sigma_sm_gu_ttxu/CPPProcess.cc | 62 ++--- .../P1_Sigma_sm_gu_ttxu/CPPProcess.h | 10 +- .../P1_Sigma_sm_gu_ttxu/CudaRuntime.h | 1 - .../P1_Sigma_sm_gu_ttxu/GpuAbstraction.h | 1 + .../P1_Sigma_sm_gu_ttxu/GpuRuntime.h | 1 + .../P1_Sigma_sm_gu_ttxu/check_sa.cc | 111 +++++---- .../P1_Sigma_sm_gux_ttxux/CPPProcess.cc | 62 ++--- .../P1_Sigma_sm_gux_ttxux/CPPProcess.h | 10 +- .../P1_Sigma_sm_gux_ttxux/CudaRuntime.h | 1 - .../P1_Sigma_sm_gux_ttxux/GpuAbstraction.h | 1 + .../P1_Sigma_sm_gux_ttxux/GpuRuntime.h | 1 + .../P1_Sigma_sm_gux_ttxux/check_sa.cc | 111 +++++---- .../SubProcesses/RamboSamplingKernels.cc | 20 +- .../SubProcesses/RamboSamplingKernels.h | 6 +- .../SubProcesses/RandomNumberKernels.h | 6 +- .../cudacpp/gq_ttq.sa/SubProcesses/cudacpp.mk | 232 +++++++++++------- .../cudacpp/gq_ttq.sa/SubProcesses/fbridge.cc | 16 +- .../gq_ttq.sa/SubProcesses/fsampler.cc | 8 +- .../cudacpp/gq_ttq.sa/SubProcesses/runTest.cc | 12 +- .../gq_ttq.sa/SubProcesses/testmisc.cc | 8 +- .../cudacpp/gq_ttq.sa/SubProcesses/testxxx.cc | 14 +- epochX/cudacpp/gq_ttq.sa/src/HelAmps_sm.h | 4 +- epochX/cudacpp/gq_ttq.sa/src/Parameters_sm.cc | 4 +- epochX/cudacpp/gq_ttq.sa/src/Parameters_sm.h | 10 +- epochX/cudacpp/gq_ttq.sa/src/cudacpp_src.mk | 23 +- epochX/cudacpp/gq_ttq.sa/src/mgOnGpuConfig.h | 73 ++++-- epochX/cudacpp/gq_ttq.sa/src/mgOnGpuCxtypes.h | 28 +-- epochX/cudacpp/gq_ttq.sa/src/mgOnGpuFptypes.h | 12 +- epochX/cudacpp/gq_ttq.sa/src/mgOnGpuVectors.h | 18 +- epochX/cudacpp/gq_ttq.sa/src/rambo.h | 8 +- .../CODEGEN_cudacpp_heft_gg_h_log.txt | 12 +- epochX/cudacpp/heft_gg_h.sa/COPYRIGHT | 1 + .../heft_gg_h.sa/SubProcesses/Bridge.h | 32 +-- .../SubProcesses/BridgeKernels.cc | 9 +- .../heft_gg_h.sa/SubProcesses/BridgeKernels.h | 8 +- .../SubProcesses/CommonRandomNumberKernel.cc | 5 +- .../SubProcesses/CrossSectionKernels.cc | 7 +- .../SubProcesses/CrossSectionKernels.h | 6 +- .../heft_gg_h.sa/SubProcesses/CudaRuntime.h | 85 ------- .../SubProcesses/CurandRandomNumberKernel.cc | 12 +- .../SubProcesses/EventStatistics.h | 4 +- .../SubProcesses/GpuAbstraction.h | 71 ++++++ .../heft_gg_h.sa/SubProcesses/GpuRuntime.h | 85 +++++++ .../heft_gg_h.sa/SubProcesses/MadgraphTest.h | 8 +- .../SubProcesses/MatrixElementKernels.cc | 26 +- .../SubProcesses/MatrixElementKernels.h | 8 +- .../SubProcesses/MemoryAccessAmplitudes.h | 2 +- .../SubProcesses/MemoryAccessCouplings.h | 2 +- .../SubProcesses/MemoryAccessCouplingsFixed.h | 2 +- .../SubProcesses/MemoryAccessDenominators.h | 2 +- .../SubProcesses/MemoryAccessGs.h | 2 +- .../SubProcesses/MemoryAccessHelpers.h | 4 +- .../SubProcesses/MemoryAccessMatrixElements.h | 2 +- .../SubProcesses/MemoryAccessMomenta.h | 6 +- .../SubProcesses/MemoryAccessNumerators.h | 2 +- .../SubProcesses/MemoryAccessRandomNumbers.h | 4 +- .../SubProcesses/MemoryAccessVectors.h | 4 +- .../SubProcesses/MemoryAccessWavefunctions.h | 2 +- .../heft_gg_h.sa/SubProcesses/MemoryBuffers.h | 64 ++--- .../P1_Sigma_heft_gg_h/CPPProcess.cc | 62 ++--- .../P1_Sigma_heft_gg_h/CPPProcess.h | 10 +- .../P1_Sigma_heft_gg_h/CudaRuntime.h | 1 - .../P1_Sigma_heft_gg_h/GpuAbstraction.h | 1 + .../P1_Sigma_heft_gg_h/GpuRuntime.h | 1 + .../P1_Sigma_heft_gg_h/check_sa.cc | 111 +++++---- .../SubProcesses/RamboSamplingKernels.cc | 20 +- .../SubProcesses/RamboSamplingKernels.h | 6 +- .../SubProcesses/RandomNumberKernels.h | 6 +- .../heft_gg_h.sa/SubProcesses/cudacpp.mk | 232 +++++++++++------- .../heft_gg_h.sa/SubProcesses/fbridge.cc | 16 +- .../heft_gg_h.sa/SubProcesses/fsampler.cc | 8 +- .../heft_gg_h.sa/SubProcesses/runTest.cc | 
12 +- .../heft_gg_h.sa/SubProcesses/testmisc.cc | 8 +- .../heft_gg_h.sa/SubProcesses/testxxx.cc | 14 +- .../cudacpp/heft_gg_h.sa/src/HelAmps_heft.h | 4 +- .../heft_gg_h.sa/src/Parameters_heft.cc | 4 +- .../heft_gg_h.sa/src/Parameters_heft.h | 10 +- .../cudacpp/heft_gg_h.sa/src/cudacpp_src.mk | 23 +- .../cudacpp/heft_gg_h.sa/src/mgOnGpuConfig.h | 73 ++++-- .../cudacpp/heft_gg_h.sa/src/mgOnGpuCxtypes.h | 28 +-- .../cudacpp/heft_gg_h.sa/src/mgOnGpuFptypes.h | 12 +- .../cudacpp/heft_gg_h.sa/src/mgOnGpuVectors.h | 18 +- epochX/cudacpp/heft_gg_h.sa/src/rambo.h | 8 +- .../CODEGEN_mad_pp_tt012j_log.txt | 64 ++--- epochX/cudacpp/pp_tt012j.mad/COPYRIGHT | 1 + .../pp_tt012j.mad/SubProcesses/Bridge.h | 32 +-- .../SubProcesses/BridgeKernels.cc | 9 +- .../SubProcesses/BridgeKernels.h | 8 +- .../SubProcesses/CommonRandomNumberKernel.cc | 5 +- .../SubProcesses/CrossSectionKernels.cc | 7 +- .../SubProcesses/CrossSectionKernels.h | 6 +- .../pp_tt012j.mad/SubProcesses/CudaRuntime.h | 85 ------- .../SubProcesses/CurandRandomNumberKernel.cc | 12 +- .../SubProcesses/EventStatistics.h | 4 +- .../SubProcesses/GpuAbstraction.h | 71 ++++++ .../pp_tt012j.mad/SubProcesses/GpuRuntime.h | 85 +++++++ .../pp_tt012j.mad/SubProcesses/MadgraphTest.h | 8 +- .../SubProcesses/MatrixElementKernels.cc | 26 +- .../SubProcesses/MatrixElementKernels.h | 8 +- .../SubProcesses/MemoryAccessAmplitudes.h | 2 +- .../SubProcesses/MemoryAccessCouplings.h | 2 +- .../SubProcesses/MemoryAccessCouplingsFixed.h | 2 +- .../SubProcesses/MemoryAccessDenominators.h | 2 +- .../SubProcesses/MemoryAccessGs.h | 2 +- .../SubProcesses/MemoryAccessHelpers.h | 4 +- .../SubProcesses/MemoryAccessMatrixElements.h | 2 +- .../SubProcesses/MemoryAccessMomenta.h | 6 +- .../SubProcesses/MemoryAccessNumerators.h | 2 +- .../SubProcesses/MemoryAccessRandomNumbers.h | 4 +- .../SubProcesses/MemoryAccessVectors.h | 4 +- .../SubProcesses/MemoryAccessWavefunctions.h | 2 +- .../SubProcesses/MemoryBuffers.h | 64 ++--- .../SubProcesses/P0_gg_ttx/CPPProcess.cc | 62 ++--- .../SubProcesses/P0_gg_ttx/CPPProcess.h | 10 +- .../SubProcesses/P0_gg_ttx/CudaRuntime.h | 1 - .../SubProcesses/P0_gg_ttx/GpuAbstraction.h | 1 + .../SubProcesses/P0_gg_ttx/GpuRuntime.h | 1 + .../SubProcesses/P0_gg_ttx/check_sa.cc | 111 +++++---- .../SubProcesses/P0_uux_ttx/CPPProcess.cc | 62 ++--- .../SubProcesses/P0_uux_ttx/CPPProcess.h | 10 +- .../SubProcesses/P0_uux_ttx/CudaRuntime.h | 1 - .../SubProcesses/P0_uux_ttx/GpuAbstraction.h | 1 + .../SubProcesses/P0_uux_ttx/GpuRuntime.h | 1 + .../SubProcesses/P0_uux_ttx/check_sa.cc | 111 +++++---- .../SubProcesses/P1_gg_ttxg/CPPProcess.cc | 62 ++--- .../SubProcesses/P1_gg_ttxg/CPPProcess.h | 10 +- .../SubProcesses/P1_gg_ttxg/CudaRuntime.h | 1 - .../SubProcesses/P1_gg_ttxg/GpuAbstraction.h | 1 + .../SubProcesses/P1_gg_ttxg/GpuRuntime.h | 1 + .../SubProcesses/P1_gg_ttxg/check_sa.cc | 111 +++++---- .../SubProcesses/P1_gu_ttxu/CPPProcess.cc | 62 ++--- .../SubProcesses/P1_gu_ttxu/CPPProcess.h | 10 +- .../SubProcesses/P1_gu_ttxu/CudaRuntime.h | 1 - .../SubProcesses/P1_gu_ttxu/GpuAbstraction.h | 1 + .../SubProcesses/P1_gu_ttxu/GpuRuntime.h | 1 + .../SubProcesses/P1_gu_ttxu/check_sa.cc | 111 +++++---- .../SubProcesses/P1_gux_ttxux/CPPProcess.cc | 62 ++--- .../SubProcesses/P1_gux_ttxux/CPPProcess.h | 10 +- .../SubProcesses/P1_gux_ttxux/CudaRuntime.h | 1 - .../P1_gux_ttxux/GpuAbstraction.h | 1 + .../SubProcesses/P1_gux_ttxux/GpuRuntime.h | 1 + .../SubProcesses/P1_gux_ttxux/check_sa.cc | 111 +++++---- .../SubProcesses/P1_uux_ttxg/CPPProcess.cc | 62 ++--- 
.../SubProcesses/P1_uux_ttxg/CPPProcess.h | 10 +- .../SubProcesses/P1_uux_ttxg/CudaRuntime.h | 1 - .../SubProcesses/P1_uux_ttxg/GpuAbstraction.h | 1 + .../SubProcesses/P1_uux_ttxg/GpuRuntime.h | 1 + .../SubProcesses/P1_uux_ttxg/check_sa.cc | 111 +++++---- .../SubProcesses/P2_gg_ttxgg/CPPProcess.cc | 62 ++--- .../SubProcesses/P2_gg_ttxgg/CPPProcess.h | 10 +- .../SubProcesses/P2_gg_ttxgg/CudaRuntime.h | 1 - .../SubProcesses/P2_gg_ttxgg/GpuAbstraction.h | 1 + .../SubProcesses/P2_gg_ttxgg/GpuRuntime.h | 1 + .../SubProcesses/P2_gg_ttxgg/check_sa.cc | 111 +++++---- .../SubProcesses/P2_gg_ttxuux/CPPProcess.cc | 62 ++--- .../SubProcesses/P2_gg_ttxuux/CPPProcess.h | 10 +- .../SubProcesses/P2_gg_ttxuux/CudaRuntime.h | 1 - .../P2_gg_ttxuux/GpuAbstraction.h | 1 + .../SubProcesses/P2_gg_ttxuux/GpuRuntime.h | 1 + .../SubProcesses/P2_gg_ttxuux/check_sa.cc | 111 +++++---- .../SubProcesses/P2_gu_ttxgu/CPPProcess.cc | 62 ++--- .../SubProcesses/P2_gu_ttxgu/CPPProcess.h | 10 +- .../SubProcesses/P2_gu_ttxgu/CudaRuntime.h | 1 - .../SubProcesses/P2_gu_ttxgu/GpuAbstraction.h | 1 + .../SubProcesses/P2_gu_ttxgu/GpuRuntime.h | 1 + .../SubProcesses/P2_gu_ttxgu/check_sa.cc | 111 +++++---- .../SubProcesses/P2_gux_ttxgux/CPPProcess.cc | 62 ++--- .../SubProcesses/P2_gux_ttxgux/CPPProcess.h | 10 +- .../SubProcesses/P2_gux_ttxgux/CudaRuntime.h | 1 - .../P2_gux_ttxgux/GpuAbstraction.h | 1 + .../SubProcesses/P2_gux_ttxgux/GpuRuntime.h | 1 + .../SubProcesses/P2_gux_ttxgux/check_sa.cc | 111 +++++---- .../SubProcesses/P2_uc_ttxuc/CPPProcess.cc | 62 ++--- .../SubProcesses/P2_uc_ttxuc/CPPProcess.h | 10 +- .../SubProcesses/P2_uc_ttxuc/CudaRuntime.h | 1 - .../SubProcesses/P2_uc_ttxuc/GpuAbstraction.h | 1 + .../SubProcesses/P2_uc_ttxuc/GpuRuntime.h | 1 + .../SubProcesses/P2_uc_ttxuc/check_sa.cc | 111 +++++---- .../SubProcesses/P2_ucx_ttxucx/CPPProcess.cc | 62 ++--- .../SubProcesses/P2_ucx_ttxucx/CPPProcess.h | 10 +- .../SubProcesses/P2_ucx_ttxucx/CudaRuntime.h | 1 - .../P2_ucx_ttxucx/GpuAbstraction.h | 1 + .../SubProcesses/P2_ucx_ttxucx/GpuRuntime.h | 1 + .../SubProcesses/P2_ucx_ttxucx/check_sa.cc | 111 +++++---- .../SubProcesses/P2_uu_ttxuu/CPPProcess.cc | 62 ++--- .../SubProcesses/P2_uu_ttxuu/CPPProcess.h | 10 +- .../SubProcesses/P2_uu_ttxuu/CudaRuntime.h | 1 - .../SubProcesses/P2_uu_ttxuu/GpuAbstraction.h | 1 + .../SubProcesses/P2_uu_ttxuu/GpuRuntime.h | 1 + .../SubProcesses/P2_uu_ttxuu/check_sa.cc | 111 +++++---- .../SubProcesses/P2_uux_ttxccx/CPPProcess.cc | 62 ++--- .../SubProcesses/P2_uux_ttxccx/CPPProcess.h | 10 +- .../SubProcesses/P2_uux_ttxccx/CudaRuntime.h | 1 - .../P2_uux_ttxccx/GpuAbstraction.h | 1 + .../SubProcesses/P2_uux_ttxccx/GpuRuntime.h | 1 + .../SubProcesses/P2_uux_ttxccx/check_sa.cc | 111 +++++---- .../SubProcesses/P2_uux_ttxgg/CPPProcess.cc | 62 ++--- .../SubProcesses/P2_uux_ttxgg/CPPProcess.h | 10 +- .../SubProcesses/P2_uux_ttxgg/CudaRuntime.h | 1 - .../P2_uux_ttxgg/GpuAbstraction.h | 1 + .../SubProcesses/P2_uux_ttxgg/GpuRuntime.h | 1 + .../SubProcesses/P2_uux_ttxgg/check_sa.cc | 111 +++++---- .../SubProcesses/P2_uux_ttxuux/CPPProcess.cc | 62 ++--- .../SubProcesses/P2_uux_ttxuux/CPPProcess.h | 10 +- .../SubProcesses/P2_uux_ttxuux/CudaRuntime.h | 1 - .../P2_uux_ttxuux/GpuAbstraction.h | 1 + .../SubProcesses/P2_uux_ttxuux/GpuRuntime.h | 1 + .../SubProcesses/P2_uux_ttxuux/check_sa.cc | 111 +++++---- .../P2_uxcx_ttxuxcx/CPPProcess.cc | 62 ++--- .../SubProcesses/P2_uxcx_ttxuxcx/CPPProcess.h | 10 +- .../P2_uxcx_ttxuxcx/CudaRuntime.h | 1 - .../P2_uxcx_ttxuxcx/GpuAbstraction.h | 1 + 
.../SubProcesses/P2_uxcx_ttxuxcx/GpuRuntime.h | 1 + .../SubProcesses/P2_uxcx_ttxuxcx/check_sa.cc | 111 +++++---- .../P2_uxux_ttxuxux/CPPProcess.cc | 62 ++--- .../SubProcesses/P2_uxux_ttxuxux/CPPProcess.h | 10 +- .../P2_uxux_ttxuxux/CudaRuntime.h | 1 - .../P2_uxux_ttxuxux/GpuAbstraction.h | 1 + .../SubProcesses/P2_uxux_ttxuxux/GpuRuntime.h | 1 + .../SubProcesses/P2_uxux_ttxuxux/check_sa.cc | 111 +++++---- .../SubProcesses/RamboSamplingKernels.cc | 20 +- .../SubProcesses/RamboSamplingKernels.h | 6 +- .../SubProcesses/RandomNumberKernels.h | 6 +- .../pp_tt012j.mad/SubProcesses/cudacpp.mk | 232 +++++++++++------- .../pp_tt012j.mad/SubProcesses/fbridge.cc | 16 +- .../pp_tt012j.mad/SubProcesses/fsampler.cc | 8 +- .../pp_tt012j.mad/SubProcesses/runTest.cc | 12 +- .../pp_tt012j.mad/SubProcesses/testmisc.cc | 8 +- .../pp_tt012j.mad/SubProcesses/testxxx.cc | 14 +- epochX/cudacpp/pp_tt012j.mad/src/HelAmps_sm.h | 4 +- .../pp_tt012j.mad/src/Parameters_sm.cc | 4 +- .../cudacpp/pp_tt012j.mad/src/Parameters_sm.h | 10 +- .../cudacpp/pp_tt012j.mad/src/cudacpp_src.mk | 23 +- .../cudacpp/pp_tt012j.mad/src/mgOnGpuConfig.h | 73 ++++-- .../pp_tt012j.mad/src/mgOnGpuCxtypes.h | 28 +-- .../pp_tt012j.mad/src/mgOnGpuFptypes.h | 12 +- .../pp_tt012j.mad/src/mgOnGpuVectors.h | 18 +- epochX/cudacpp/pp_tt012j.mad/src/rambo.h | 8 +- 859 files changed, 11337 insertions(+), 8348 deletions(-) create mode 100644 epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h rename epochX/cudacpp/ee_mumu.mad/SubProcesses/{CudaRuntime.h => GpuRuntime.h} (62%) delete mode 120000 epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CudaRuntime.h create mode 120000 epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/GpuAbstraction.h create mode 120000 epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/GpuRuntime.h create mode 100644 epochX/cudacpp/ee_mumu.sa/SubProcesses/GpuAbstraction.h rename epochX/cudacpp/{gg_tt.sa/SubProcesses/CudaRuntime.h => ee_mumu.sa/SubProcesses/GpuRuntime.h} (62%) delete mode 120000 epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CudaRuntime.h create mode 120000 epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/GpuAbstraction.h create mode 120000 epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/GpuRuntime.h create mode 100644 epochX/cudacpp/gg_tt.sa/SubProcesses/GpuAbstraction.h rename epochX/cudacpp/{ee_mumu.sa/SubProcesses/CudaRuntime.h => gg_tt.sa/SubProcesses/GpuRuntime.h} (62%) delete mode 120000 epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CudaRuntime.h create mode 120000 epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/GpuAbstraction.h create mode 120000 epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/GpuRuntime.h create mode 100644 epochX/cudacpp/gg_tt01g.mad/SubProcesses/GpuAbstraction.h rename epochX/cudacpp/gg_tt01g.mad/SubProcesses/{CudaRuntime.h => GpuRuntime.h} (62%) delete mode 120000 epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CudaRuntime.h create mode 120000 epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/GpuAbstraction.h create mode 120000 epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/GpuRuntime.h delete mode 120000 epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CudaRuntime.h create mode 120000 epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/GpuAbstraction.h create mode 120000 epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/GpuRuntime.h delete mode 100644 epochX/cudacpp/gg_ttg.mad/SubProcesses/CudaRuntime.h create mode 100644 epochX/cudacpp/gg_ttg.mad/SubProcesses/GpuAbstraction.h create 
mode 100644 epochX/cudacpp/gg_ttg.mad/SubProcesses/GpuRuntime.h delete mode 120000 epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CudaRuntime.h create mode 120000 epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/GpuAbstraction.h create mode 120000 epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/GpuRuntime.h delete mode 100644 epochX/cudacpp/gg_ttg.sa/SubProcesses/CudaRuntime.h create mode 100644 epochX/cudacpp/gg_ttg.sa/SubProcesses/GpuAbstraction.h create mode 100644 epochX/cudacpp/gg_ttg.sa/SubProcesses/GpuRuntime.h delete mode 120000 epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CudaRuntime.h create mode 120000 epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/GpuAbstraction.h create mode 120000 epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/GpuRuntime.h delete mode 100644 epochX/cudacpp/gg_ttgg.mad/SubProcesses/CudaRuntime.h create mode 100644 epochX/cudacpp/gg_ttgg.mad/SubProcesses/GpuRuntime.h delete mode 120000 epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CudaRuntime.h create mode 120000 epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/GpuAbstraction.h create mode 120000 epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/GpuRuntime.h delete mode 100644 epochX/cudacpp/gg_ttgg.sa/SubProcesses/CudaRuntime.h create mode 100644 epochX/cudacpp/gg_ttgg.sa/SubProcesses/GpuAbstraction.h create mode 100644 epochX/cudacpp/gg_ttgg.sa/SubProcesses/GpuRuntime.h delete mode 120000 epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CudaRuntime.h create mode 120000 epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/GpuAbstraction.h create mode 120000 epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/GpuRuntime.h delete mode 100644 epochX/cudacpp/gg_ttggg.mad/SubProcesses/CudaRuntime.h create mode 100644 epochX/cudacpp/gg_ttggg.mad/SubProcesses/GpuAbstraction.h create mode 100644 epochX/cudacpp/gg_ttggg.mad/SubProcesses/GpuRuntime.h delete mode 120000 epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CudaRuntime.h create mode 120000 epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/GpuAbstraction.h create mode 120000 epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/GpuRuntime.h delete mode 100644 epochX/cudacpp/gg_ttggg.sa/SubProcesses/CudaRuntime.h create mode 100644 epochX/cudacpp/gg_ttggg.sa/SubProcesses/GpuAbstraction.h create mode 100644 epochX/cudacpp/gg_ttggg.sa/SubProcesses/GpuRuntime.h delete mode 120000 epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CudaRuntime.h create mode 120000 epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/GpuAbstraction.h create mode 120000 epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/GpuRuntime.h delete mode 100644 epochX/cudacpp/gq_ttq.mad/SubProcesses/CudaRuntime.h create mode 100644 epochX/cudacpp/gq_ttq.mad/SubProcesses/GpuAbstraction.h create mode 100644 epochX/cudacpp/gq_ttq.mad/SubProcesses/GpuRuntime.h delete mode 120000 epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CudaRuntime.h create mode 120000 epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/GpuAbstraction.h create mode 120000 epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/GpuRuntime.h delete mode 120000 epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CudaRuntime.h create mode 120000 epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/GpuAbstraction.h create mode 120000 epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/GpuRuntime.h delete mode 100644 epochX/cudacpp/gq_ttq.sa/SubProcesses/CudaRuntime.h create mode 100644 
epochX/cudacpp/gq_ttq.sa/SubProcesses/GpuAbstraction.h create mode 100644 epochX/cudacpp/gq_ttq.sa/SubProcesses/GpuRuntime.h delete mode 120000 epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CudaRuntime.h create mode 120000 epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/GpuAbstraction.h create mode 120000 epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/GpuRuntime.h delete mode 120000 epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CudaRuntime.h create mode 120000 epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/GpuAbstraction.h create mode 120000 epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/GpuRuntime.h delete mode 100644 epochX/cudacpp/heft_gg_h.sa/SubProcesses/CudaRuntime.h create mode 100644 epochX/cudacpp/heft_gg_h.sa/SubProcesses/GpuAbstraction.h create mode 100644 epochX/cudacpp/heft_gg_h.sa/SubProcesses/GpuRuntime.h delete mode 120000 epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/CudaRuntime.h create mode 120000 epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/GpuAbstraction.h create mode 120000 epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/GpuRuntime.h delete mode 100644 epochX/cudacpp/pp_tt012j.mad/SubProcesses/CudaRuntime.h create mode 100644 epochX/cudacpp/pp_tt012j.mad/SubProcesses/GpuAbstraction.h create mode 100644 epochX/cudacpp/pp_tt012j.mad/SubProcesses/GpuRuntime.h delete mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/CudaRuntime.h create mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/GpuAbstraction.h create mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/GpuRuntime.h delete mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/CudaRuntime.h create mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/GpuAbstraction.h create mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/GpuRuntime.h delete mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/CudaRuntime.h create mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/GpuAbstraction.h create mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/GpuRuntime.h delete mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/CudaRuntime.h create mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/GpuAbstraction.h create mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/GpuRuntime.h delete mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/CudaRuntime.h create mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/GpuAbstraction.h create mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/GpuRuntime.h delete mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/CudaRuntime.h create mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/GpuAbstraction.h create mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/GpuRuntime.h delete mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/CudaRuntime.h create mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/GpuAbstraction.h create mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/GpuRuntime.h delete mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/CudaRuntime.h create mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/GpuAbstraction.h create mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/GpuRuntime.h delete mode 120000 
epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/CudaRuntime.h create mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/GpuAbstraction.h create mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/GpuRuntime.h delete mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/CudaRuntime.h create mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/GpuAbstraction.h create mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/GpuRuntime.h delete mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/CudaRuntime.h create mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/GpuAbstraction.h create mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/GpuRuntime.h delete mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/CudaRuntime.h create mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/GpuAbstraction.h create mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/GpuRuntime.h delete mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/CudaRuntime.h create mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/GpuAbstraction.h create mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/GpuRuntime.h delete mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/CudaRuntime.h create mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/GpuAbstraction.h create mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/GpuRuntime.h delete mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/CudaRuntime.h create mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/GpuAbstraction.h create mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/GpuRuntime.h delete mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/CudaRuntime.h create mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/GpuAbstraction.h create mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/GpuRuntime.h delete mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/CudaRuntime.h create mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/GpuAbstraction.h create mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/GpuRuntime.h delete mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/CudaRuntime.h create mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/GpuAbstraction.h create mode 120000 epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/GpuRuntime.h diff --git a/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt b/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt index 36b42987c5..dd0f31341f 100644 --- a/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt +++ b/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt @@ -52,7 +52,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt -No valid web browser found. Please set in ./input/mg5_configuration.txt +Using default web browser "firefox". 
Set another one in ./input/mg5_configuration.txt import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu.mg The import format was not given, so we guess it as command set stdout_level DEBUG @@ -62,7 +62,7 @@ generate e+ e- > mu+ mu- No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005498409271240234  +DEBUG: model prefixing takes 0.005403280258178711  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -174,7 +174,7 @@ INFO: Generating Helas calls for process: e+ e- > mu+ mu- WEIGHTED<=4 @1 INFO: Processing color information for process: e+ e- > mu+ mu- @1 INFO: Creating files in directory P1_epem_mupmum DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -191,19 +191,19 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. INFO: Generating Feynman diagrams for Process: e+ e- > mu+ mu- WEIGHTED<=4 @1 INFO: Finding symmetric diagrams for subprocess group epem_mupmum Generated helas calls for 1 subprocesses (2 diagrams) in 0.004 s -Wrote files for 8 helas calls in 0.102 s +Wrote files for 8 helas calls in 0.098 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFV2 routines ALOHA: aloha creates FFV4 routines -ALOHA: aloha creates 3 routines in 0.203 s +ALOHA: aloha creates 3 routines in 0.200 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFV2 routines ALOHA: aloha creates FFV4 routines ALOHA: aloha creates FFV2_4 routines -ALOHA: aloha creates 7 routines in 0.260 s +ALOHA: aloha creates 7 routines in 0.537 s FFV1 FFV1 FFV2 @@ -248,9 +248,9 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. quit -real 0m1.900s -user 0m1.697s -sys 0m0.195s +real 0m2.147s +user 0m1.627s +sys 0m0.231s Code generation completed in 2 seconds ************************************************************ * * @@ -277,7 +277,7 @@ INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amc INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards/me5_configuration.txt Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt -No valid web browser found. Please set in ./input/mg5_configuration.txt +Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards run quit INFO: @@ -307,7 +307,7 @@ INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amc INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards/me5_configuration.txt Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt -No valid web browser found. 
Please set in ./input/mg5_configuration.txt +Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards param quit INFO: diff --git a/epochX/cudacpp/ee_mumu.mad/COPYRIGHT b/epochX/cudacpp/ee_mumu.mad/COPYRIGHT index a134b5fef9..84a883fbb0 100644 --- a/epochX/cudacpp/ee_mumu.mad/COPYRIGHT +++ b/epochX/cudacpp/ee_mumu.mad/COPYRIGHT @@ -15,6 +15,7 @@ The full development team currently includes the following authors : Stephan Hageboeck (CERN) Olivier Mattelaer (Universite Catholique de Louvain, original author) Stefan Roiser (CERN, original author) + Joergen Teig (CERN) Andrea Valassi (CERN, original author) Zenny Wettersten (CERN) See https://github.com/madgraph5/madgraph4gpu for more details. For the full diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/Bridge.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/Bridge.h index bf8b5e024d..89437b4c42 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/Bridge.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/Bridge.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef BRIDGE_H #define BRIDGE_H 1 @@ -23,7 +23,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +83,7 @@ namespace mg5amcCpu Bridge& operator=( const Bridge& ) = delete; Bridge& operator=( Bridge&& ) = delete; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL /** * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != gpublocks*gputhreads * (this is needed for BridgeKernel tests rather than for actual production use in Fortran) @@ -150,7 +150,7 @@ namespace mg5amcCpu unsigned int m_nevt; // number of events int m_nGoodHel; // the number of good helicities (-1 initially when they have not yet been calculated) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL int m_gputhreads; // number of gpu threads (default set from number of events, can be modified) int m_gpublocks; // number of gpu blocks (default set from number of events, can be modified) DeviceBuffer m_devMomentaF; @@ -187,12 +187,12 @@ namespace mg5amcCpu // Forward declare transposition methods // -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL template void hst_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); @@ -209,7 +209,7 @@ namespace mg5amcCpu Bridge::Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ) : m_nevt( nevtF ) , m_nGoodHel( -1 ) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL , m_gputhreads( 256 ) // default number of gpu threads , m_gpublocks( m_nevt / m_gputhreads ) // this ensures m_nevt <= m_gpublocks*m_gputhreads , m_devMomentaF( m_nevt ) @@ -233,7 +233,7 @@ namespace mg5amcCpu { if( nparF != CPPProcess::npar ) throw std::runtime_error( "Bridge constructor: npar mismatch" ); if( np4F != CPPProcess::np4 ) throw std::runtime_error( "Bridge constructor: np4 mismatch" ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( ( m_nevt < s_gputhreadsmin ) || ( m_nevt % s_gputhreadsmin != 0 ) ) throw std::runtime_error( "Bridge constructor: nevt should be a multiple 
of " + std::to_string( s_gputhreadsmin ) ); while( m_nevt != m_gpublocks * m_gputhreads ) @@ -249,7 +249,7 @@ namespace mg5amcCpu #else std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate is called from several Fortran threads? @@ -262,7 +262,7 @@ namespace mg5amcCpu process.initProc( paramCard ); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void Bridge::set_gpugrid( const int gpublocks, const int gputhreads ) { @@ -276,7 +276,7 @@ namespace mg5amcCpu } #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void Bridge::gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -291,14 +291,14 @@ namespace mg5amcCpu constexpr int neppM = MemoryAccessMomenta::neppM; if constexpr( neppM == 1 && std::is_same_v ) { - checkCuda( cudaMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), gpuMemcpyHostToDevice ); } else { - checkCuda( cudaMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), gpuMemcpyHostToDevice ); const int thrPerEvt = CPPProcess::npar * CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 event per thread) //const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... this seems slower - dev_transposeMomentaF2C<<>>( m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); + gpuLaunchKernel( dev_transposeMomentaF2C, m_gpublocks * thrPerEvt, m_gputhreads, m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); } if constexpr( std::is_same_v ) { @@ -341,7 +341,7 @@ namespace mg5amcCpu } #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL template void Bridge::cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -396,7 +396,7 @@ namespace mg5amcCpu // - C++ array: momenta[npagM][npar][np4][neppM] with nevt=npagM*neppM (AOSOA) // -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ) { diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/BridgeKernels.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/BridgeKernels.cc index d58066c9c1..eaf4037a24 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/BridgeKernels.cc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/BridgeKernels.cc @@ -1,17 +1,18 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
#include "BridgeKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMomenta.h" #include //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -45,7 +46,7 @@ namespace mg5amcCpu //============================================================================ -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu { @@ -96,7 +97,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/BridgeKernels.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/BridgeKernels.h index 15eb4bff4d..3efef8ce97 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/BridgeKernels.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/BridgeKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef BRIDGEKERNELS_H #define BRIDGEKERNELS_H 1 @@ -12,7 +12,7 @@ #include "MatrixElementKernels.h" #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -49,7 +49,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a CPU host class BridgeKernelHost final : public BridgeKernelBase { @@ -89,7 +89,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a GPU device class BridgeKernelDevice : public BridgeKernelBase { diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/CommonRandomNumberKernel.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/CommonRandomNumberKernel.cc index 985b39f576..010bc4cbd0 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/CommonRandomNumberKernel.cc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/CommonRandomNumberKernel.cc @@ -1,15 +1,16 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "CommonRandomNumbers.h" +#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/CrossSectionKernels.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/CrossSectionKernels.cc index 0b355a3c8d..c15b39844d 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/CrossSectionKernels.cc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/CrossSectionKernels.cc @@ -1,10 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. 
Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "CrossSectionKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessWeights.h" #include "MemoryBuffers.h" @@ -77,7 +78,7 @@ debug_me_is_abnormal( const fptype& me, size_t ievtALL ) //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -185,7 +186,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/CrossSectionKernels.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/CrossSectionKernels.h index 7933ca4bbf..4d9659e04e 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/CrossSectionKernels.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/CrossSectionKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef CROSSSECTIONKERNELS_H #define CROSSSECTIONKERNELS_H 1 @@ -13,7 +13,7 @@ //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -96,7 +96,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating the calculation of event statistics on a GPU device class CrossSectionKernelDevice : public CrossSectionKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/CurandRandomNumberKernel.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/CurandRandomNumberKernel.cc index eb56333b03..08a16f6f2c 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/CurandRandomNumberKernel.cc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/CurandRandomNumberKernel.cc @@ -1,9 +1,9 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
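One practical point visible in the CurandRandomNumberKernel.cc hunk that follows: curand is an NVidia-only library, so its assert-style guard stays CUDA-specific even after the Cuda-to-Gpu renaming, and HIP builds are expected to disable it via MGONGPU_HAS_NO_CURAND (cf. the check_sa.cc hunk near the end of this patch). A sketch of the guard's shape, assuming a checkCurand wrapper macro analogous to checkGpu; the wrapper name and the m_rnGen generator handle are assumptions here, not taken from this patch:

  #define checkCurand( code ) { assertCurand( code, __FILE__, __LINE__ ); }
  // Usage at a call site, e.g. when generating a batch of uniform random numbers:
  checkCurand( curandGenerateUniformDouble( m_rnGen, m_rnarray.data(), m_rnarray.size() ) );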
-#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" @@ -22,7 +22,7 @@ inline void assertCurand( curandStatus_t code, const char *file, int line, bool } #endif /* clang-format on */ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -36,7 +36,7 @@ namespace mg5amcCpu { if( m_isOnDevice ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !m_rnarray.isOnDevice() ) throw std::runtime_error( "CurandRandomNumberKernel on device with a host random number array" ); #else @@ -114,7 +114,7 @@ namespace mg5amcCpu /* printf( "\nCurandRandomNumberKernel::generateRnarray size = %d\n", (int)m_rnarray.size() ); fptype* data = m_rnarray.data(); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( m_rnarray.isOnDevice() ) { data = new fptype[m_rnarray.size()](); @@ -123,7 +123,7 @@ namespace mg5amcCpu #endif for( int i = 0; i < ( (int)m_rnarray.size() / 4 ); i++ ) printf( "[%4d] %f %f %f %f\n", i * 4, data[i * 4], data[i * 4 + 2], data[i * 4 + 2], data[i * 4 + 3] ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( m_rnarray.isOnDevice() ) delete[] data; #endif */ diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/EventStatistics.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/EventStatistics.h index 48b51e0a49..b425a5bade 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/EventStatistics.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/EventStatistics.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef EventStatistics_H #define EventStatistics_H 1 @@ -16,7 +16,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h new file mode 100644 index 0000000000..6a7d9c05c0 --- /dev/null +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h @@ -0,0 +1,71 @@ +// Copyright (C) 2020-2023 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: J. Teig (Jul 2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
+
+#ifndef MG5AMC_GPUABSTRACTION_H
+#define MG5AMC_GPUABSTRACTION_H 1
+
+#include
+
+//--------------------------------------------------------------------------
+
+#ifdef __CUDACC__
+
+#define gpuError_t cudaError_t
+#define gpuPeekAtLastError cudaPeekAtLastError
+#define gpuGetErrorString cudaGetErrorString
+#define gpuSuccess cudaSuccess
+
+#define gpuMallocHost( ptr, size ) checkGpu( cudaMallocHost( ptr, size ) )
+#define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) )
+
+#define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( cudaMemcpy( dstData, srcData, srcBytes, func ) )
+#define gpuMemcpyHostToDevice cudaMemcpyHostToDevice
+#define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost
+#define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( cudaMemcpyToSymbol( type1, type2, size ) )
+
+#define gpuFree( ptr ) checkGpu( cudaFree( ptr ) )
+#define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) )
+
+#define gpuSetDevice cudaSetDevice
+#define gpuDeviceSynchronize cudaDeviceSynchronize
+#define gpuDeviceReset cudaDeviceReset
+
+#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ )
+#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ )
+
+//--------------------------------------------------------------------------
+
+#elif defined __HIPCC__
+
+#include "hip/hip_runtime.h"
+
+#define gpuError_t hipError_t
+#define gpuPeekAtLastError hipPeekAtLastError
+#define gpuGetErrorString hipGetErrorString
+#define gpuSuccess hipSuccess
+
+#define gpuMallocHost( ptr, size ) checkGpu( hipHostMalloc( ptr, size ) ) // HostMalloc better
+#define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) )
+
+#define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( hipMemcpy( dstData, srcData, srcBytes, func ) )
+#define gpuMemcpyHostToDevice hipMemcpyHostToDevice
+#define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost
+#define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( hipMemcpyToSymbol( type1, type2, size ) )
+
+#define gpuFree( ptr ) checkGpu( hipFree( ptr ) )
+#define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) )
+
+#define gpuSetDevice hipSetDevice
+#define gpuDeviceSynchronize hipDeviceSynchronize
+#define gpuDeviceReset hipDeviceReset
+
+#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ )
+#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ )
+
+//--------------------------------------------------------------------------
+
+#endif
+
+#endif // MG5AMC_GPUABSTRACTION_H
diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/CudaRuntime.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuRuntime.h
similarity index 62%
rename from epochX/cudacpp/ee_mumu.mad/SubProcesses/CudaRuntime.h
rename to epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuRuntime.h
index 64ce52f4b3..93579ef08b 100644
--- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/CudaRuntime.h
+++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuRuntime.h
@@ -1,49 +1,50 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
-// Created by: S. Roiser (Jul 2020) for the MG5aMC CUDACPP plugin.
-// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
+// Created by: J. Teig (Jun 2023, based on earlier work by S. Roiser) for the MG5aMC CUDACPP plugin.
+// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
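The two launch macros are the heart of GpuAbstraction.h: they hide the vendor triple-chevron execution configuration behind an ordinary variadic macro, so kernel call sites compile unchanged under both nvcc and hipcc (both compilers accept the chevron syntax in the expansion). A minimal usage sketch; the kernel and argument names here are illustrative only:

  // Expands to myKernel<<<gpublocks, gputhreads>>>( devIn, devOut, nevt ) on either backend
  gpuLaunchKernel( myKernel, gpublocks, gputhreads, devIn, devOut, nevt );

  // Variant with dynamic shared memory:
  gpuLaunchKernelSharedMem( myKernel, gpublocks, gputhreads, sharedBytes, devIn, devOut, nevt );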
-#ifndef MG5AMC_CUDARUNTIME_H
-#define MG5AMC_CUDARUNTIME_H 1
+#ifndef MG5AMC_GPURUNTIME_H
+#define MG5AMC_GPURUNTIME_H 1
 
 // MG5AMC on GPU uses the CUDA runtime API, not the lower level CUDA driver API
 // See https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html#driver-vs-runtime-api
-#include <cuda_runtime.h>
+#include "GpuAbstraction.h"
+
 #include <cassert>
 
 //--------------------------------------------------------------------------
 
 // See https://stackoverflow.com/a/14038590
-#ifdef __CUDACC__ /* clang-format off */
-#define checkCuda( code ) { assertCuda( code, __FILE__, __LINE__ ); }
-inline void assertCuda( cudaError_t code, const char* file, int line, bool abort = true )
+#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */
+#define checkGpu( code ) { assertGpu( code, __FILE__, __LINE__ ); }
+inline void assertGpu( gpuError_t code, const char* file, int line, bool abort = true )
 {
-  if( code != cudaSuccess )
+  if( code != gpuSuccess )
   {
-    printf( "ERROR! assertCuda: '%s' (%d) in %s:%d\n", cudaGetErrorString( code ), code, file, line );
-    if( abort ) assert( code == cudaSuccess );
+    printf( "ERROR! assertGpu: '%s' (%d) in %s:%d\n", gpuGetErrorString( code ), code, file, line );
+    if( abort ) assert( code == gpuSuccess );
   }
 }
 #endif /* clang-format on */
 
 //--------------------------------------------------------------------------
 
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 namespace mg5amcGpu
 {
-  // Instantiate a CudaRuntime at the beginnining of the application's main to
-  // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor
+  // Instantiate a GpuRuntime at the beginning of the application's main to
+  // invoke gpuSetDevice(0) in the constructor and book a gpuDeviceReset() call in the destructor
   // *** FIXME! This will all need to be designed differently when going to multi-GPU nodes! ***
-  struct CudaRuntime final
+  struct GpuRuntime final
   {
-    CudaRuntime( const bool debug = true )
+    GpuRuntime( const bool debug = true )
       : m_debug( debug ) { setUp( m_debug ); }
-    ~CudaRuntime() { tearDown( m_debug ); }
-    CudaRuntime( const CudaRuntime& ) = delete;
-    CudaRuntime( CudaRuntime&& ) = delete;
-    CudaRuntime& operator=( const CudaRuntime& ) = delete;
-    CudaRuntime& operator=( CudaRuntime&& ) = delete;
+    ~GpuRuntime() { tearDown( m_debug ); }
+    GpuRuntime( const GpuRuntime& ) = delete;
+    GpuRuntime( GpuRuntime&& ) = delete;
+    GpuRuntime& operator=( const GpuRuntime& ) = delete;
+    GpuRuntime& operator=( GpuRuntime&& ) = delete;
    bool m_debug;
 
    // Set up CUDA application
@@ -62,8 +63,8 @@ namespace mg5amcGpu
      */
      // Replace cudaFree(0) by cudaSetDevice(0), even if it is not really needed either
      // (but see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs)
-      if( debug ) std::cout << "__CudaRuntime: calling cudaSetDevice(0)" << std::endl;
-      checkCuda( cudaSetDevice( 0 ) ); // SLOW!
+      if( debug ) std::cout << "__GpuRuntime: calling GpuSetDevice(0)" << std::endl;
+      checkGpu( gpuSetDevice( 0 ) ); // SLOW!
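      // [Sketch, not part of the original file: the intended use of this RAII guard,
      //  consistent with the check_sa.cc hunk further down in this patch:
      //    int main()
      //    {
      //      mg5amcGpu::GpuRuntime gpuRuntime( /*debug=*/true ); // ctor calls gpuSetDevice( 0 )
      //      // ... allocate buffers, launch kernels, copy back results ...
      //      return 0; // dtor calls gpuDeviceReset(), keeping leak-check reports clean
      //    }]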
} // Tear down CUDA application (call cudaDeviceReset) @@ -72,14 +73,13 @@ namespace mg5amcGpu // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking static void tearDown( const bool debug = true ) { - if( debug ) std::cout << "__CudaRuntime: calling cudaDeviceReset()" << std::endl; - checkCuda( cudaDeviceReset() ); + if( debug ) std::cout << "__GpuRuntime: calling GpuDeviceReset()" << std::endl; + checkGpu( gpuDeviceReset() ); } }; - } #endif //-------------------------------------------------------------------------- -#endif // MG5AMC_CUDARUNTIME_H +#endif // MG5AMC_GPURUNTIME_H diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MadgraphTest.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MadgraphTest.h index ef40624c88..a64c05c26a 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MadgraphTest.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MadgraphTest.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Dec 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #ifndef MADGRAPHTEST_H_ #define MADGRAPHTEST_H_ 1 @@ -22,7 +22,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; @@ -201,7 +201,7 @@ class MadgraphTest : public testing::TestWithParam // Since we link both the CPU-only and GPU tests into the same executable, we prevent // a multiply defined symbol by only compiling this in the non-CUDA phase: -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL /// Compare momenta and matrix elements. /// This uses an implementation of TestDriverBase to run a madgraph workflow, @@ -307,6 +307,6 @@ TEST_P( MadgraphTest, CompareMomentaAndME ) } } -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL #endif /* MADGRAPHTEST_H_ */ diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MatrixElementKernels.cc index 74b5239ebf..81699dfea9 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MatrixElementKernels.cc @@ -1,12 +1,12 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "MatrixElementKernels.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" // Includes the abstraction for Nvidia/AMD compilation #include "MemoryAccessMomenta.h" #include "MemoryBuffers.h" @@ -14,7 +14,7 @@ //============================================================================ -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu { @@ -150,7 +150,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { @@ -209,13 +209,13 @@ namespace mg5amcGpu PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); DeviceBufferHelicityMask devIsGoodHel( ncomb ); // ... 0d1. 
Compute good helicity mask on the device - computeDependentCouplings<<>>( m_gs.data(), m_couplings.data() ); + gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - sigmaKin_getGoodHel<<>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); + gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); #else - sigmaKin_getGoodHel<<>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); + gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); #endif - checkCuda( cudaPeekAtLastError() ); + checkGpu( gpuPeekAtLastError() ); // ... 0d2. Copy back good helicity mask to the host copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); // ... 0d3. Copy back good helicity list to constant memory on the device @@ -226,19 +226,19 @@ namespace mg5amcGpu void MatrixElementKernelDevice::computeMatrixElements( const unsigned int channelId ) { - computeDependentCouplings<<>>( m_gs.data(), m_couplings.data() ); + gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); #ifndef MGONGPU_NSIGHT_DEBUG constexpr unsigned int sharedMemSize = 0; #else constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - sigmaKin<<>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); + gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); #else - sigmaKin<<>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); + gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); #endif - checkCuda( cudaPeekAtLastError() ); - checkCuda( cudaDeviceSynchronize() ); + checkGpu( gpuPeekAtLastError() ); + checkGpu( gpuDeviceSynchronize() ); } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MatrixElementKernels.h index 23e84757a2..72bd8f195b 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MatrixElementKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
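For orientation, the helicity-filtering sequence that this hunk ports to the abstraction layer has three steps, all named in the comments above (0d1 to 0d3). A condensed sketch using only names that appear in this file (shown here without the multichannel numerator/denominator buffers):

  // 0d1: compute the good-helicity mask on the device
  gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() );
  checkGpu( gpuPeekAtLastError() );
  // 0d2: copy the mask back to the (pinned) host buffer
  copyHostFromDevice( hstIsGoodHel, devIsGoodHel );
  // 0d3: compress the mask into a good-helicity list and upload it to
  // constant memory via gpuMemcpyToSymbol (see CPPProcess.cc below)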
#ifndef MATRIXELEMENTKERNELS_H #define MATRIXELEMENTKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -81,7 +81,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a CPU host class MatrixElementKernelHost final : public MatrixElementKernelBase, public NumberOfEvents { @@ -130,7 +130,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a GPU device class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessAmplitudes.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessAmplitudes.h index 573b3bbbc9..ffb76e93de 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessAmplitudes.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessAmplitudes.h @@ -15,7 +15,7 @@ #define MGONGPU_TRIVIAL_AMPLITUDES 1 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessCouplings.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessCouplings.h index 35a3af42e0..3afdf3e554 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessCouplings.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessCouplings.h @@ -15,7 +15,7 @@ #include "MemoryBuffers.h" // for HostBufferCouplings::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessCouplingsFixed.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessCouplingsFixed.h index dc0d93afff..ffcdf4dbef 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessCouplingsFixed.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessCouplingsFixed.h @@ -14,7 +14,7 @@ //#include "MemoryAccessHelpers.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessDenominators.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessDenominators.h index 3bce635718..66f2d32a6b 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessDenominators.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessDenominators.h @@ -10,7 +10,7 @@ #include "MemoryAccessGs.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessGs.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessGs.h index 31311aa375..4c726b30f3 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessGs.h +++ 
b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessGs.h @@ -13,7 +13,7 @@ #include "MemoryBuffers.h" // for HostBufferMatrixElements::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessHelpers.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessHelpers.h index c82a6c7635..db73e4e064 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessHelpers.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessHelpers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessHelpers_H #define MemoryAccessHelpers_H 1 @@ -105,7 +105,7 @@ class KernelAccessHelper : public MemoryAccessHelper } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid //printf( "kernelAccessRecord: ievt=%d threadId=%d\n", ievt, threadIdx.x ); return T::ieventAccessRecord( buffer, ievt ); // NB fptype and fptype_sv coincide for CUDA diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessMatrixElements.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessMatrixElements.h index f32e6fea5b..3741011971 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessMatrixElements.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessMatrixElements.h @@ -13,7 +13,7 @@ #include "MemoryBuffers.h" // for HostBufferMatrixElements::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessMomenta.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessMomenta.h index 29266de32c..3be229d392 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessMomenta.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessMomenta.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#ifndef MemoryAccessMomenta_H #define MemoryAccessMomenta_H 1 @@ -13,7 +13,7 @@ #include "MemoryAccessVectors.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -30,7 +30,7 @@ namespace mg5amcCpu // Number of Events Per Page in the momenta AOSOA memory buffer layout // (these are all best kept as a compile-time constants: see issue #23) -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ // ----------------------------------------------------------------------------------------------- // --- GPUs: neppM is best set to a power of 2 times the number of fptype's in a 32-byte cacheline // --- This is relevant to ensure coalesced access to momenta in global memory diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessNumerators.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessNumerators.h index b152183b28..18991f4fa6 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessNumerators.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessNumerators.h @@ -10,7 +10,7 @@ #include "MemoryAccessGs.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessRandomNumbers.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessRandomNumbers.h index e2988d39f3..40cb089135 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessRandomNumbers.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessRandomNumbers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessRandomNumbers_H #define MemoryAccessRandomNumbers_H 1 @@ -11,7 +11,7 @@ #include "CPPProcess.h" #include "MemoryAccessHelpers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessVectors.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessVectors.h index e9b197368e..08faccff0f 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessVectors.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessVectors.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
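All of the MemoryAccess* hunks in this patch make the same one-line change, so the pattern is worth spelling out once. Each header compiles into one of two namespaces depending on the build, which lets CPU and GPU object files coexist in a single executable without symbol clashes (see issues #318 and #725 cited in the headers themselves); a sketch of the shared skeleton:

  #ifdef MGONGPUCPP_GPUIMPL
  namespace mg5amcGpu // CUDA or HIP build: device-side types and kernels
  #else
  namespace mg5amcCpu // C++/SIMD build: host-side types
  #endif
  {
    // ... identical class definitions, compiled once per backend ...
  }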
#ifndef MemoryAccessVectors_H #define MemoryAccessVectors_H 1 @@ -10,7 +10,7 @@ #include "mgOnGpuVectors.h" -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu // this is only needed for CPU SIMD vectorization { diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessWavefunctions.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessWavefunctions.h index 5428aaf933..33bef4559e 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessWavefunctions.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessWavefunctions.h @@ -15,7 +15,7 @@ #define MGONGPU_TRIVIAL_WAVEFUNCTIONS 1 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryBuffers.h index 3093e6ed18..7756a71621 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryBuffers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021, based on earlier work by S. Hageboeck) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryBuffers_H #define MemoryBuffers_H 1 @@ -11,12 +11,12 @@ #include "mgOnGpuCxtypes.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "Parameters_sm.h" #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -87,7 +87,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL constexpr bool HostBufferALIGNED = false; // ismisaligned=false constexpr bool HostBufferMISALIGNED = true; // ismisaligned=true @@ -119,7 +119,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer template class PinnedHostBufferBase : public BufferBase @@ -128,18 +128,18 @@ namespace mg5amcCpu PinnedHostBufferBase( const size_t size ) : BufferBase( size, false ) { - checkCuda( cudaMallocHost( &( this->m_data ), this->bytes() ) ); + gpuMallocHost( &( this->m_data ), this->bytes() ); } virtual ~PinnedHostBufferBase() { - checkCuda( cudaFreeHost( this->m_data ) ); + gpuFreeHost( this->m_data ); } }; #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer template class DeviceBufferBase : public BufferBase @@ -148,18 +148,18 @@ namespace mg5amcCpu DeviceBufferBase( const size_t size ) : BufferBase( size, true ) { - checkCuda( cudaMalloc( &( this->m_data ), this->bytes() ) ); + gpuMalloc( &( this->m_data ), this->bytes() ); } virtual ~DeviceBufferBase() { - checkCuda( cudaFree( this->m_data ) ); + gpuFree( this->m_data ); } }; #endif //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for a given number of events template 
class HostBuffer : public HostBufferBase, virtual private NumberOfEvents @@ -175,7 +175,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer for a given number of events template class PinnedHostBuffer : public PinnedHostBufferBase, virtual private NumberOfEvents @@ -191,7 +191,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer for a given number of events template class DeviceBuffer : public DeviceBufferBase, virtual private NumberOfEvents @@ -213,7 +213,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta random numbers constexpr size_t sizePerEventRndNumMomenta = MemoryBuffers::np4 * MemoryBuffers::nparf; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta random numbers typedef HostBuffer HostBufferRndNumMomenta; #else @@ -232,7 +232,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer with ONE fptype per event constexpr size_t sizePerEventOneFp = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer with ONE fptype per event typedef HostBuffer HostBufferOneFp; #else @@ -257,7 +257,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for Gs constexpr size_t sizePerEventGs = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferGs; #else @@ -276,7 +276,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for numerators constexpr size_t sizePerEventNumerators = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferNumerators; #else @@ -296,7 +296,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for denominators constexpr size_t sizePerEventDenominators = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferDenominators; #else @@ -315,7 +315,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for random numbers constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferCouplings; #else @@ -333,7 +333,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta constexpr size_t sizePerEventMomenta = MemoryBuffers::np4 * MemoryBuffers::npar; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta typedef HostBuffer HostBufferMomenta; //typedef HostBuffer HostBufferMomenta; // TEST MISALIGNMENT! 
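The buffer classes edited in this file all follow the same RAII shape: acquire device or pinned host memory in the constructor, release it in the destructor, with the gpu* macros supplying the vendor call and the error check. A reduced sketch of the idiom; the class and member names are simplified for illustration, and the real classes above additionally track event counts and alignment:

  template<typename T>
  class DeviceArraySketch
  {
  public:
    explicit DeviceArraySketch( const size_t n ) { gpuMalloc( &m_data, n * sizeof( T ) ); } // checkGpu folded in
    ~DeviceArraySketch() { gpuFree( m_data ); }
    T* data() { return m_data; }
  private:
    T* m_data = nullptr;
  };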
@@ -352,7 +352,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for sampling weights constexpr size_t sizePerEventWeights = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for sampling weights typedef HostBuffer HostBufferWeights; #else @@ -370,7 +370,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for matrix elements constexpr size_t sizePerEventMatrixElements = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for matrix elements typedef HostBuffer HostBufferMatrixElements; #else @@ -385,7 +385,7 @@ namespace mg5amcCpu // A base class encapsulating a memory buffer for the helicity mask typedef BufferBase BufferHelicityMask; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for the helicity mask typedef HostBufferBase HostBufferHelicityMask; #else @@ -403,7 +403,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for wavefunctions constexpr size_t sizePerEventWavefunctions = MemoryBuffers::nw6 * MemoryBuffers::nx2; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for wavefunctions typedef HostBuffer HostBufferWavefunctions; #else @@ -421,7 +421,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity random numbers constexpr size_t sizePerEventRndNumHelicity = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity random numbers typedef HostBuffer HostBufferRndNumHelicity; #else @@ -439,7 +439,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color random numbers constexpr size_t sizePerEventRndNumColor = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color random numbers typedef HostBuffer HostBufferRndNumColor; #else @@ -457,7 +457,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity selection constexpr size_t sizePerEventSelectedHelicity = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity selection typedef HostBuffer HostBufferSelectedHelicity; #else @@ -475,7 +475,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color selection constexpr size_t sizePerEventSelectedColor = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color selection typedef HostBuffer HostBufferSelectedColor; #else @@ -487,7 +487,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy { @@ -504,13 +504,13 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyHostToDevice ); } #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void copyHostFromDevice( Tdst& dst, const Tsrc& src ) // keep the same order 
of arguments as in memcpy { @@ -527,7 +527,7 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyDeviceToHost ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyDeviceToHost ); } #endif diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.cc index 9193aa2382..83e5b15013 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.cc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -46,7 +45,7 @@ // Class member functions for calculating the matrix elements for // Process: e+ e- > mu+ mu- WEIGHTED<=4 @1 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -80,7 +79,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MZ, (fptype)Parameters_sm::mdl_WZ }; __device__ const fptype cIPC[6] = { (fptype)Parameters_sm::GC_3.real(), (fptype)Parameters_sm::GC_3.imag(), (fptype)Parameters_sm::GC_50.real(), (fptype)Parameters_sm::GC_50.imag(), (fptype)Parameters_sm::GC_59.real(), (fptype)Parameters_sm::GC_59.imag() }; #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype cIPC[6]; #else @@ -90,7 +89,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -118,13 +117,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -151,7 +150,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef 
__CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -187,7 +186,7 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity < nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings @@ -200,8 +199,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -286,7 +287,7 @@ namespace mg5amcCpu // [NB do keep 'static' for these constexpr arrays, see issue #283] static constexpr fptype2 cf[ncolor][ncolor] = { { 1 } }; // 2-D array[1][1] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -343,7 +344,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -402,7 +403,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -449,8 +450,8 @@ namespace mg5amcCpu { 1, -1, 1, 1 }, { 1, -1, -1, -1 }, { 1, -1, -1, 1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -490,9 +491,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MZ, (fptype)m_pars->mdl_WZ }; const cxtype tIPC[3] = { cxmake( m_pars->GC_3 ), cxmake( m_pars->GC_50 ), cxmake( m_pars->GC_59 ) }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 3 * sizeof( cxtype ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + gpuMemcpyToSymbol( cIPC, tIPC, 3 * sizeof( cxtype ) ); #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); memcpy( cIPC, tIPC, 3 * sizeof( cxtype ) ); @@ -529,7 +530,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] 
// [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -594,12 +595,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -620,7 +621,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -746,9 +747,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -772,7 +773,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -792,7 +793,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 4 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -806,9 +807,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -836,7 +840,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -1046,7 +1050,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL 
if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.h index 77b610753c..0b29ffb3ff 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -107,7 +107,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -120,7 +120,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -150,7 +150,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CudaRuntime.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/GpuAbstraction.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/GpuAbstraction.h new file mode 120000 index 0000000000..72054e19ba --- /dev/null +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/GpuAbstraction.h @@ -0,0 +1 @@ +../GpuAbstraction.h \ No newline at end of file diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/GpuRuntime.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/GpuRuntime.h new file mode 120000 index 0000000000..3920e83be4 --- /dev/null +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/GpuRuntime.h @@ -0,0 +1 @@ +../GpuRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/check_sa.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/check_sa.cc index 3fbf0ffbee..7cac5ab47b 100644 --- 
a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/check_sa.cc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -65,7 +66,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -96,7 +97,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -134,9 +135,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784) + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) +#elif defined __HIPCC__ +#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random #elif defined __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU if build has curand + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #endif @@ -146,10 +149,10 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) 
bool bridge = false; @@ -177,7 +180,7 @@ main( int argc, char** argv ) else if( arg == "--curdev" ) { #ifndef __CUDACC__ - throw std::runtime_error( "CurandDevice is not supported on CPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" ); #elif defined MGONGPU_HAS_NO_CURAND throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" ); #else @@ -198,7 +201,7 @@ } else if( arg == "--rmbdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -272,13 +275,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -296,14 +299,14 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL - // --- 00. Initialise cuda - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - const std::string cdinKey = "00 CudaInit"; + // --- 00. Initialise GPU + // Instantiate a GpuRuntime at the beginning of the application's main. + // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. + const std::string cdinKey = "00 GpuInit"; timermap.start( cdinKey ); - CudaRuntime cudaRuntime( debug ); + GpuRuntime gpuRuntime( debug ); #endif // --- 0a.
Initialise physics process @@ -325,7 +328,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -333,7 +336,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -341,7 +344,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -349,7 +352,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -366,7 +369,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -375,7 +378,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -384,7 +387,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -392,7 +395,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -400,7 +403,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -438,7 +441,7 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! 
CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } @@ -450,7 +453,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -461,7 +464,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -469,7 +472,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -511,7 +514,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -543,7 +546,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -588,7 +591,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -617,7 +620,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -760,18 +763,22 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; -#endif +#endif /* clang-format off */ // -- DOUBLE or FLOAT? #if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) @@ -781,7 +788,7 @@ main( int argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif +#endif /* clang-format on */ // -- CUCOMPLEX or THRUST or STD complex numbers? 
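The complex-type branches that follow mirror the backend selection made later in this patch in src/mgOnGpuConfig.h (THRUST by default in CUDA, cxsmpl as the only option in HIP, cxsmpl as the new default in C++); condensed from that file:

#ifdef __CUDACC__
#define MGONGPU_CUCXTYPE_THRUST 1 // default in CUDA (cucomplex and cxsmpl are the alternatives)
#elif defined __HIPCC__
#define MGONGPU_CUCXTYPE_CXSMPL 1 // only option in HIP
#else
#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default in C++ (std::complex is the alternative)
#endif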
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -793,6 +800,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_CUCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -818,7 +831,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -874,7 +887,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -895,6 +908,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -921,21 +936,21 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? " == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -966,7 +981,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1062,14 +1077,14 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1077,7 +1092,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? 
" == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/RamboSamplingKernels.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/RamboSamplingKernels.cc index da68aa9255..79abbcc4f8 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/RamboSamplingKernels.cc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/RamboSamplingKernels.cc @@ -1,11 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "RamboSamplingKernels.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessRandomNumbers.h" #include "MemoryAccessWeights.h" @@ -14,7 +14,7 @@ #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -92,7 +92,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingKernelDevice::RamboSamplingKernelDevice( const fptype energy, // input: energy const BufferRndNumMomenta& rndmom, // input: random numbers in [0,1] BufferMomenta& momenta, // output: momenta @@ -135,7 +135,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaInitialDevice( const fptype energy, fptype* momenta ) @@ -147,17 +147,17 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaInitial() { - getMomentaInitialDevice<<>>( m_energy, m_momenta.data() ); + gpuLaunchKernel( getMomentaInitialDevice, m_gpublocks, m_gputhreads, m_energy, m_momenta.data() ); } #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaFinalDevice( const fptype energy, const fptype* rndmom, @@ -171,11 +171,11 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaFinal() { - getMomentaFinalDevice<<>>( m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); + gpuLaunchKernel( getMomentaFinalDevice, m_gpublocks, m_gputhreads, m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); } #endif diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/RamboSamplingKernels.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/RamboSamplingKernels.h index 184089efd7..7c214cd74b 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/RamboSamplingKernels.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/RamboSamplingKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#ifndef RAMBOSAMPLINGKERNELS_H #define RAMBOSAMPLINGKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating RAMBO phase space sampling on a GPU device class RamboSamplingKernelDevice final : public SamplingKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/RandomNumberKernels.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/RandomNumberKernels.h index 188a72c2c9..21d63beeac 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/RandomNumberKernels.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/RandomNumberKernels.h @@ -1,14 +1,14 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef RANDOMNUMBERKERNELS_H #define RANDOMNUMBERKERNELS_H 1 #include "mgOnGpuConfig.h" -// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when __CUDACC__ is not defined +// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when MGONGPUCPP_GPUIMPL is not defined #ifndef MGONGPU_HAS_NO_CURAND //#include "curand.h" struct curandGenerator_st; // forward definition from curand.h @@ -16,7 +16,7 @@ struct curandGenerator_st; // forward definition from curand.h #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk index 509307506b..f2cfa349da 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ # Copyright (C) 2020-2023 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +# Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts @@ -42,10 +42,10 @@ endif #------------------------------------------------------------------------------- -#=== Configure common compiler flags for C++ and CUDA +#=== Configure common compiler flags for C++ and CUDA/HIP INCFLAGS = -I. 
-OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here +OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here # Dependency on src directory MG5AMC_COMMONLIB = mg5amc_common @@ -121,24 +121,46 @@ endif #------------------------------------------------------------------------------- -#=== Configure the CUDA compiler +#=== Configure the GPU compiler (CUDA or HIP) -# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) -# This is because it is impossible to pass this to "CUFLAGS += -ccbin " below +# FIXME! (AV 24.01.2024) +# In the current implementation (without separate builds for C++ and CUDA/HIP), we first check for nvcc and hipcc in CUDA_HOME and HIP_HOME. +# If CUDA_HOME or HIP_HOME are not set, try to determine them from the path to nvcc and hipcc. +# While convoluted, this is currently necessary to allow disabling CUDA/HIP builds by setting CUDA_HOME or HIP_HOME to invalid paths. +# This will (probably?) be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775). + +# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA and HIP builds (issue #505) +# This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside - $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") + $(warning CUDA and HIP builds are not supported for multi-word CXX "$(CXX)") override CUDA_HOME=disabled + override HIP_HOME=disabled endif -# If CUDA_HOME is not set, try to set it from the location of nvcc +# If CUDA_HOME is not set, try to set it from the path to nvcc ifndef CUDA_HOME CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null)) $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") endif -# Set NVCC as $(CUDA_HOME)/bin/nvcc if it exists +# If HIP_HOME is not set, try to set it from the path to hipcc +ifndef HIP_HOME + HIP_HOME = $(patsubst %bin/hipcc,%,$(HIP_COMPILER_PATH)) + $(warning HIP_HOME was not set: using "$(HIP_HOME)") +endif + +# FIXME! (AV 24.01.2024) +# In the current implementation (without separate builds for C++ and CUDA/HIP), +# builds are performed for HIP only if CUDA is not found in the path. +# If both CUDA and HIP are installed, HIP builds can be triggered by unsetting CUDA_HOME. +# This will be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775). + +#--- Option 1: CUDA exists -> use CUDA + +# Set GPUCC as $(CUDA_HOME)/bin/nvcc if it exists ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) - NVCC = $(CUDA_HOME)/bin/nvcc + + GPUCC = $(CUDA_HOME)/bin/nvcc USE_NVTX ?=-DUSE_NVTX # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ @@ -158,41 +180,78 @@ ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here!
endif CUOPTFLAGS = -lineinfo - CUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math - ###CUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow - ###NVCC_VERSION = $(shell $(NVCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) - CUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h + ###GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + GPUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + ###GPUCC_VERSION = $(shell $(GPUCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) + GPUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + CUBUILDRULEFLAGS = -Xcompiler -fPIC -c + CCBUILDRULEFLAGS = -Xcompiler -fPIC -c -x cu + CUDATESTFLAGS = -lcuda + + # Set the host C++ compiler for GPUCC via "-ccbin " + # (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) + GPUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) + + # Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) + ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) + GPUFLAGS += -allow-unsupported-compiler + endif + else ifneq ($(origin REQUIRE_CUDA),undefined) + # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) - $(error No cuda installation found (set CUDA_HOME or make nvcc visible in PATH)) + $(error No cuda installation found (set CUDA_HOME or make GPUCC visible in PATH)) + +#--- Option 2: CUDA does not exist, HIP exists -> use HIP + +# Set GPUCC as $(HIP_HOME)/bin/hipcc if it exists +else ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),) + + GPUCC = $(HIP_HOME)/bin/hipcc + #USE_NVTX ?=-DUSE_NVTX # should maybe find something equivalent to this in HIP? 
+ HIPARCHFLAGS = -target x86_64-linux-gnu --offload-arch=gfx90a + HIPINC = -I$(HIP_HOME)/include/ + # Note: -DHIP_FAST_MATH is equivalent to -use_fast_math in HIP + # (but only for single precision line 208: https://rocm-developer-tools.github.io/HIP/hcc__detail_2math__functions_8h_source.html) + GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -DHIP_FAST_MATH -DHIP_PLATFORM=amd -fPIC + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + GPUFLAGS += -std=c++17 + ###GPUFLAGS+= --maxrregcount 255 # (AV: is this option valid on HIP and meaningful on AMD GPUs?) + CUBUILDRULEFLAGS = -fPIC -c + CCBUILDRULEFLAGS = -fPIC -c + +else ifneq ($(origin REQUIRE_HIP),undefined) + + # If REQUIRE_HIP is set but no HIP is found, stop here (e.g. for CI tests on GPU #443) + $(error No hip installation found (set HIP_HOME or make GPUCC visible in PATH)) + +#--- Option 3: CUDA does not exist, HIP does not exist -> switch off both CUDA and HIP + else - # No cuda. Switch cuda compilation off and go to common random numbers in C++ + + # No nvcc and no hipcc: switch CUDA and HIP compilation off and go to common random numbers in C++ $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda) + $(warning HIP_HOME is not set or is invalid: export HIP_HOME to compile with hip) + override GPUCC= override USE_NVTX= override CUINC= override CURANDLIBFLAGS= -endif -export NVCC -export CUFLAGS - -# Set the host C++ compiler for nvcc via "-ccbin " -# (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) -CUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) -# Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) -ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) -CUFLAGS += -allow-unsupported-compiler endif +# Export GPUCC (so that it can also be used in cudacpp_src.mk?) +export GPUCC +export GPUFLAGS + #------------------------------------------------------------------------------- -#=== Configure ccache for C++ and CUDA builds +#=== Configure ccache for C++ and CUDA/HIP builds # Enable ccache if USECCACHE=1 ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) @@ -201,15 +260,15 @@ endif #ifeq ($(USECCACHE)$(shell echo $(AR) | grep ccache),1) # override AR:=ccache $(AR) #endif -ifneq ($(NVCC),) - ifeq ($(USECCACHE)$(shell echo $(NVCC) | grep ccache),1) - override NVCC:=ccache $(NVCC) +ifneq ($(GPUCC),) - ifeq ($(USECCACHE)$(shell echo $(GPUCC) | grep ccache),1) - override GPUCC:=ccache $(GPUCC) endif endif #------------------------------------------------------------------------------- -#=== Configure PowerPC-specific compiler flags for C++ and CUDA +#=== Configure PowerPC-specific compiler flags for C++ and CUDA/HIP # PowerPC-specific CXX compiler flags (being reviewed) ifeq ($(UNAME_P),ppc64le) @@ -225,9 +284,9 @@ else ######CXXFLAGS+= -fno-semantic-interposition # no benefit (neither alone, nor combined with -flto) endif -# PowerPC-specific CUDA compiler flags (to be reviewed!) +# PowerPC-specific CUDA/HIP compiler flags (to be reviewed!)
ifeq ($(UNAME_P),ppc64le) - CUFLAGS+= -Xcompiler -mno-float128 + GPUFLAGS+= -Xcompiler -mno-float128 endif #------------------------------------------------------------------------------- @@ -237,7 +296,7 @@ endif # Set the default OMPFLAGS choice ifneq ($(shell $(CXX) --version | egrep '^Intel'),) override OMPFLAGS = -fopenmp -###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without nvcc but not ok with nvcc before #578) +###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without GPUCC but not ok with GPUCC before #578) else ifneq ($(shell $(CXX) --version | egrep '^(clang)'),) override OMPFLAGS = -fopenmp ###override OMPFLAGS = # disable OpenMP MT on clang (was not ok without or with nvcc before #578) @@ -293,7 +352,10 @@ endif # Set the default RNDGEN (random number generator) choice ifeq ($(RNDGEN),) - ifeq ($(NVCC),) + ifeq ($(GPUCC),) + override RNDGEN = hasNoCurand + # Edgecase for HIP compilation + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) override RNDGEN = hasNoCurand else ifeq ($(RNDGEN),) override RNDGEN = hasCurand @@ -310,7 +372,7 @@ export OMPFLAGS #------------------------------------------------------------------------------- -#=== Set the CUDA/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN +#=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN # Set the build flags appropriate to OMPFLAGS $(info OMPFLAGS=$(OMPFLAGS)) @@ -368,13 +430,13 @@ CXXFLAGS+= $(AVXFLAGS) $(info FPTYPE=$(FPTYPE)) ifeq ($(FPTYPE),d) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE - CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE else ifeq ($(FPTYPE),f) CXXFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT - CUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT else ifeq ($(FPTYPE),m) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT - CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT else $(error Unknown FPTYPE='$(FPTYPE)': only 'd', 'f' and 'm' are supported) endif @@ -383,7 +445,7 @@ endif $(info HELINL=$(HELINL)) ifeq ($(HELINL),1) CXXFLAGS += -DMGONGPU_INLINE_HELAMPS - CUFLAGS += -DMGONGPU_INLINE_HELAMPS + GPUFLAGS += -DMGONGPU_INLINE_HELAMPS else ifneq ($(HELINL),0) $(error Unknown HELINL='$(HELINL)': only '0' and '1' are supported) endif @@ -392,7 +454,7 @@ endif $(info HRDCOD=$(HRDCOD)) ifeq ($(HRDCOD),1) CXXFLAGS += -DMGONGPU_HARDCODE_PARAM - CUFLAGS += -DMGONGPU_HARDCODE_PARAM + GPUFLAGS += -DMGONGPU_HARDCODE_PARAM else ifneq ($(HRDCOD),0) $(error Unknown HRDCOD='$(HRDCOD)': only '0' and '1' are supported) endif @@ -444,11 +506,11 @@ ifeq ($(UNAME_S),Darwin) override CULIBFLAGSRPATH2 = else # RPATH to cuda/cpp libs when linking executables - override CXXLIBFLAGSRPATH = -Wl,-rpath,$(LIBDIRRPATH) - override CULIBFLAGSRPATH = -Xlinker -rpath,$(LIBDIRRPATH) + override CXXLIBFLAGSRPATH = -Wl,-rpath=$(LIBDIRRPATH) + override CULIBFLAGSRPATH = -Xlinker -rpath=$(LIBDIRRPATH) # RPATH to common lib when linking cuda/cpp libs - override CXXLIBFLAGSRPATH2 = -Wl,-rpath,'$$ORIGIN' - override CULIBFLAGSRPATH2 = -Xlinker -rpath,'$$ORIGIN' + override CXXLIBFLAGSRPATH2 = -Wl,-rpath='$$ORIGIN' + override CULIBFLAGSRPATH2 = -Xlinker -rpath='$$ORIGIN' endif # Setting LD_LIBRARY_PATH or DYLD_LIBRARY_PATH in the RUNTIME is no longer necessary 
(neither on Linux nor on Mac) @@ -461,7 +523,7 @@ override RUNTIME = cxx_main=$(BUILDDIR)/check.exe fcxx_main=$(BUILDDIR)/fcheck.exe -ifneq ($(NVCC),) +ifneq ($(GPUCC),) cu_main=$(BUILDDIR)/gcheck.exe fcu_main=$(BUILDDIR)/fgcheck.exe else @@ -492,15 +554,16 @@ $(BUILDDIR)/.build.$(TAG): @touch $(BUILDDIR)/.build.$(TAG) # Generic target and build rules: objects from CUDA compilation -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/%.o : %.cu *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CUBUILDRULEFLAGS) $< -o $@ $(BUILDDIR)/%_cu.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CCBUILDRULEFLAGS) $< -o $@ endif +# (for nvcc, the -x cu flag from the %_cu.o rule above is now carried by CCBUILDRULEFLAGS) # Generic target and build rules: objects from C++ compilation # (NB do not include CUINC here! add it only for NVTX or curand #679) @@ -509,11 +572,14 @@ $(BUILDDIR)/%.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) $(CXX) $(CPPFLAGS) $(CXXFLAGS) -fPIC -c $< -o $@ # Apply special build flags only to CrossSectionKernel.cc and gCrossSectionKernel.cu (no fast math, see #117 and #516) +# Added an edge case for HIP compilation ifeq ($(shell $(CXX) --version | grep ^nvc++),) $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS := $(filter-out -ffast-math,$(CXXFLAGS)) $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math -ifneq ($(NVCC),) -$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -fno-fast-math +ifeq ($(findstring nvcc,$(GPUCC)),nvcc) + $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -fno-fast-math +else + $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -fno-fast-math endif endif @@ -530,10 +596,10 @@ ifeq ($(RNDGEN),hasCurand) $(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CUINC) endif -# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in nvcc with icx2023 (#592) +# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in GPUCC with icx2023 (#592) ifneq ($(shell $(CXX) --version | egrep '^(Intel)'),) -ifneq ($(NVCC),) -CUFLAGS += -Xcompiler -Wno-deprecated-builtins +ifneq ($(GPUCC),) +GPUFLAGS += -Wno-deprecated-builtins endif endif @@ -541,8 +607,8 @@ endif # This patch does remove the warning, but I prefer to keep it disabled for the moment...
###ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang|Intel)'),) ###$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -Wno-overriding-t-option -###ifneq ($(NVCC),) -###$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -Wno-overriding-t-option +###ifneq ($(GPUCC),) +###$(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -Wno-overriding-t-option ###endif ###endif @@ -569,7 +635,7 @@ MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp cxx_objects_lib=$(BUILDDIR)/CPPProcess.o $(BUILDDIR)/MatrixElementKernels.o $(BUILDDIR)/BridgeKernels.o $(BUILDDIR)/CrossSectionKernels.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel.o $(BUILDDIR)/RamboSamplingKernels.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) MG5AMC_CULIB = mg5amc_$(processid_short)_cuda cu_objects_lib=$(BUILDDIR)/gCPPProcess.o $(BUILDDIR)/gMatrixElementKernels.o $(BUILDDIR)/gBridgeKernels.o $(BUILDDIR)/gCrossSectionKernels.o cu_objects_exe=$(BUILDDIR)/gCommonRandomNumberKernel.o $(BUILDDIR)/gRamboSamplingKernels.o @@ -581,11 +647,11 @@ $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: cxx_objects_lib += $(BUILDDIR)/fbridge.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(CXX) -shared -o $@ $(cxx_objects_lib) $(CXXLIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: cu_objects_lib += $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cu_objects_lib) - $(NVCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) endif #------------------------------------------------------------------------------- @@ -602,16 +668,16 @@ $(cxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PAT $(cxx_main): $(BUILDDIR)/check_sa.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CXX) -o $@ $(BUILDDIR)/check_sa.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CURANDLIBFLAGS) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(cu_main): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(cu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(cu_main): $(BUILDDIR)/gcheck_sa.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o - $(NVCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) + $(GPUCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) endif 
#------------------------------------------------------------------------------- @@ -637,17 +703,17 @@ $(fcxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PA $(fcxx_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') endif ifeq ($(UNAME_S),Darwin) $(fcu_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(fcu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(fcu_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) - $(NVCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) + $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) endif #------------------------------------------------------------------------------- @@ -659,7 +725,7 @@ $(BUILDDIR)/testxxx.o: testxxx_cc_ref.txt $(testmain): $(BUILDDIR)/testxxx.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testxxx.o # Comment out this line to skip the C++ test of xxx functions -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testxxx_cu.o: $(GTESTLIBS) $(BUILDDIR)/testxxx_cu.o: INCFLAGS += $(GTESTINC) $(BUILDDIR)/testxxx_cu.o: testxxx_cc_ref.txt @@ -672,7 +738,7 @@ $(BUILDDIR)/testmisc.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testmisc.o # Comment out this line to skip the C++ miscellaneous tests -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testmisc_cu.o: $(GTESTLIBS) $(BUILDDIR)/testmisc_cu.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc_cu.o @@ -684,12 +750,12 @@ $(BUILDDIR)/runTest.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/runTest.o $(testmain): cxx_objects_exe += $(BUILDDIR)/runTest.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/runTest_cu.o: $(GTESTLIBS) $(BUILDDIR)/runTest_cu.o: INCFLAGS += $(GTESTINC) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(testmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif @@ -713,14 +779,14 @@ $(testmain): LIBFLAGS += -lgomp endif endif -ifeq 
($(NVCC),) # link only runTest.o +ifeq ($(GPUCC),) # link only runTest.o $(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS) $(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS) else # link both runTest.o and runTest_cu.o $(testmain): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) $(GTESTLIBS) - $(NVCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) -lcuda + $(GPUCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS) endif # Use target gtestlibs to build only googletest @@ -829,9 +895,9 @@ ifeq ($(USECCACHE),1) ccache --version | head -1 endif @echo "" - @echo NVCC=$(NVCC) -ifneq ($(NVCC),) - $(NVCC) --version + @echo GPUCC=$(GPUCC) +ifneq ($(GPUCC),) + $(GPUCC) --version endif @echo "" @echo CXX=$(CXX) @@ -850,7 +916,7 @@ endif # Target: check (run the C++ test executable) # [NB THIS IS WHAT IS USED IN THE GITHUB CI!] -ifneq ($(NVCC),) +ifneq ($(GPUCC),) check: runTest cmpFcheck cmpFGcheck else check: runTest cmpFcheck diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/fbridge.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/fbridge.cc index 2d2b36d560..22ce3f5115 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/fbridge.cc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/fbridge.cc @@ -1,11 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Oct 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "Bridge.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" extern "C" { @@ -22,7 +22,7 @@ extern "C" * Using the same Fortran MadEvent code, linking to the heterogeneous library would allow access to both CPU and GPU implementations. * The specific heterogeneous configuration (how many GPUs, how many threads on each CPU, etc) could be loaded in CUDA/C++ from a data file.
*/ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -46,8 +46,8 @@ extern "C" */ void fbridgecreate_( CppObjectInFortran** ppbridge, const int* pnevtF, const int* pnparF, const int* pnp4F ) { -#ifdef __CUDACC__ - CudaRuntime::setUp(); +#ifdef MGONGPUCPP_GPUIMPL + GpuRuntime::setUp(); #endif // (NB: CPPProcess::initProc no longer needs to be executed here because it is called in the Bridge constructor) // FIXME: disable OMP in Bridge when called from Fortran @@ -65,8 +65,8 @@ extern "C" Bridge* pbridge = dynamic_cast*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgedelete_: invalid Bridge address" ); delete pbridge; -#ifdef __CUDACC__ - CudaRuntime::tearDown(); +#ifdef MGONGPUCPP_GPUIMPL + GpuRuntime::tearDown(); #endif } @@ -96,7 +96,7 @@ extern "C" { Bridge* pbridge = dynamic_cast*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgesequence_: invalid Bridge address" ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Use the device/GPU implementation in the CUDA library // (there is also a host implementation in this library) pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, *pchannelId, mes, selhel, selcol ); diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/fsampler.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/fsampler.cc index 2fb445372d..3743934f41 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/fsampler.cc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/fsampler.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Feb 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "mgOnGpuConfig.h" @@ -13,7 +13,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -40,7 +40,7 @@ namespace mg5amcCpu private: const int m_nevt; // The number of events in each iteration int m_iiter; // The iteration counter (for random number seeding) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta m_hstRndmom; // Memory buffers for random numbers HostBufferMomenta m_hstMomenta; // Memory buffers for momenta HostBufferWeights m_hstWeights; // Memory buffers for sampling weights @@ -105,7 +105,7 @@ namespace mg5amcCpu extern "C" { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/runTest.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/runTest.cc index d4a760a71b..de327f2321 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/runTest.cc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/runTest.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
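The GpuRuntime::setUp() and GpuRuntime::tearDown() calls above take over the roles of the deleted CudaRuntime helpers. GpuRuntime.h is a new file in this patch whose body is not shown in these hunks; a minimal sketch, assuming it keeps the CudaRuntime shape behind portable gpu* aliases (checkGpu and gpuDeviceReset appear in the runTest.cc hunk below; gpuSetDevice and the alias definitions here are assumptions):

#include <cassert>
#include <iostream>
// Assumed aliases (GpuAbstraction.h is expected to provide the real ones)
#ifdef __CUDACC__
#define gpuSetDevice cudaSetDevice
#define gpuDeviceReset cudaDeviceReset
#define gpuSuccess cudaSuccess
#elif defined __HIPCC__
#include "hip/hip_runtime.h"
#define gpuSetDevice hipSetDevice
#define gpuDeviceReset hipDeviceReset
#define gpuSuccess hipSuccess
#endif
#define checkGpu( code ) assert( ( code ) == gpuSuccess ) // simplified stand-in for the real error checker

struct GpuRuntime final
{
  GpuRuntime( const bool debug = true ) : m_debug( debug ) { setUp( m_debug ); }
  ~GpuRuntime() { tearDown( m_debug ); }
  static void setUp( const bool debug = true )
  {
    if( debug ) std::cout << "__GpuRuntime: calling gpuSetDevice(0)" << std::endl;
    checkGpu( gpuSetDevice( 0 ) ); // create the GPU context in the constructor
  }
  static void tearDown( const bool debug = true )
  {
    if( debug ) std::cout << "__GpuRuntime: calling gpuDeviceReset()" << std::endl;
    checkGpu( gpuDeviceReset() ); // e.g. needed by cuda-memcheck --leak-check full
  }
  const bool m_debug;
};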
//---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -18,7 +18,7 @@ #include "RandomNumberKernels.h" #include "epoch_process_id.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -35,7 +35,7 @@ struct CUDA_CPU_TestBase : public TestDriverBase : TestDriverBase( npar, refFileName ) {} }; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL struct CPUTest : public CUDA_CPU_TestBase { // Struct data members (process, and memory structures for random numbers, momenta, matrix elements and weights on host and device) @@ -119,7 +119,7 @@ struct CPUTest : public CUDA_CPU_TestBase }; #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL struct CUDATest : public CUDA_CPU_TestBase { // Reset the device when our test goes out of scope. Note that this should happen after @@ -128,7 +128,7 @@ struct CUDATest : public CUDA_CPU_TestBase { ~DeviceReset() { - checkCuda( cudaDeviceReset() ); // this is needed by cuda-memcheck --leak-check full + checkGpu( gpuDeviceReset() ); // this is needed by cuda-memcheck --leak-check full } } deviceResetter; @@ -256,7 +256,7 @@ INSTANTIATE_TEST_SUITE_P( prefix, \ test_suite_name, \ testing::Values( new CUDATest( MG_EPOCH_REFERENCE_FILE_NAME ) ) ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL MG_INSTANTIATE_TEST_SUITE_GPU( XTESTID_GPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); #else MG_INSTANTIATE_TEST_SUITE_CPU( XTESTID_CPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/testmisc.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/testmisc.cc index 895d6eeb56..ba9e59a8a3 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/testmisc.cc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/testmisc.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*misc to run only testmisc.cc tests //---------------------------------------------------------------------------- @@ -17,7 +17,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_MISC #else #define TESTID( s ) s##_CPU_MISC @@ -26,7 +26,7 @@ #define XTESTID( s ) TESTID( s ) // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -59,7 +59,7 @@ namespace mg5amcCpu TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testmisc ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/testxxx.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/testxxx.cc index 3361fe5aa9..e5167de00c 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/testxxx.cc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/testxxx.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). 
// Created by: A. Valassi (Apr 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -24,7 +24,7 @@ #include #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_XXX #else #define TESTID( s ) s##_CPU_XXX @@ -32,7 +32,7 @@ #define XTESTID( s ) TESTID( s ) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -42,7 +42,7 @@ namespace mg5amcCpu int FPEhandlerIevt = -1; inline void FPEhandler( int sig ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL std::cerr << "Floating Point Exception (GPU): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; #else std::cerr << "Floating Point Exception (CPU neppV=" << neppV << "): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; @@ -53,7 +53,7 @@ namespace mg5amcCpu TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -77,7 +77,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) assert( nevt % neppM == 0 ); // nevt must be a multiple of neppM assert( nevt % neppV == 0 ); // nevt must be a multiple of neppV // Fill in the input momenta -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL mg5amcGpu::PinnedHostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] #else mg5amcCpu::HostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] @@ -322,7 +322,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { for( int ievt = 0; ievt < nevt; ievt++ ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/ee_mumu.mad/src/HelAmps_sm.h b/epochX/cudacpp/ee_mumu.mad/src/HelAmps_sm.h index 9fa30cfd7f..e878fcd28e 100644 --- a/epochX/cudacpp/ee_mumu.mad/src/HelAmps_sm.h +++ b/epochX/cudacpp/ee_mumu.mad/src/HelAmps_sm.h @@ -5,7 +5,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -28,7 +28,7 @@ //#include //#include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/ee_mumu.mad/src/Parameters_sm.cc b/epochX/cudacpp/ee_mumu.mad/src/Parameters_sm.cc index 0b4be4d5ed..cffc5d3bff 100644 --- a/epochX/cudacpp/ee_mumu.mad/src/Parameters_sm.cc +++ b/epochX/cudacpp/ee_mumu.mad/src/Parameters_sm.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
+// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -17,7 +17,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/ee_mumu.mad/src/Parameters_sm.h b/epochX/cudacpp/ee_mumu.mad/src/Parameters_sm.h index 64d0b8e761..2a6d960581 100644 --- a/epochX/cudacpp/ee_mumu.mad/src/Parameters_sm.h +++ b/epochX/cudacpp/ee_mumu.mad/src/Parameters_sm.h @@ -27,7 +27,7 @@ #include "read_slha.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu #include // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -218,7 +218,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -235,7 +235,7 @@ namespace mg5amcCpu #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wunused-variable" // e.g. <> #pragma GCC diagnostic ignored "-Wunused-parameter" // e.g. <> -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma nv_diagnostic push #pragma nv_diag_suppress 177 // e.g. <> #endif @@ -259,7 +259,7 @@ namespace mg5amcCpu // End SM implementation - no special handling of vectors of floats as in EFT (#439) return out; } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma GCC diagnostic pop #pragma nv_diagnostic pop #endif diff --git a/epochX/cudacpp/ee_mumu.mad/src/cudacpp_src.mk b/epochX/cudacpp/ee_mumu.mad/src/cudacpp_src.mk index d4cc628aec..159e19a46d 100644 --- a/epochX/cudacpp/ee_mumu.mad/src/cudacpp_src.mk +++ b/epochX/cudacpp/ee_mumu.mad/src/cudacpp_src.mk @@ -19,7 +19,7 @@ SHELL := /bin/bash #=== Configure common compiler flags for CUDA and C++ INCFLAGS = -I. 
-OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here +OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here #------------------------------------------------------------------------------- @@ -45,13 +45,13 @@ endif #------------------------------------------------------------------------------- -#=== Configure the CUDA compiler (note: NVCC is already exported including ccache) +#=== Configure the CUDA compiler (note: GPUCC is already exported including ccache) -###$(info NVCC=$(NVCC)) +###$(info GPUCC=$(GPUCC)) #------------------------------------------------------------------------------- -#=== Configure ccache for C++ builds (note: NVCC is already exported including ccache) +#=== Configure ccache for C++ builds (note: GPUCC is already exported including ccache) # Enable ccache if USECCACHE=1 ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) @@ -92,6 +92,13 @@ endif ###$(info OMPFLAGS=$(OMPFLAGS)) CXXFLAGS += $(OMPFLAGS) +# Add the correct build-rule flags when compiling with nvcc (CUDA) or hipcc (HIP) +ifeq ($(findstring nvcc,$(GPUCC)),nvcc) + GPUFLAGS += -Xcompiler -fPIC -c -x cu +else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) + GPUFLAGS += -fPIC -c +endif + # Set the build flags appropriate to each AVX choice (example: "make AVX=none") # [NB MGONGPU_PVW512 is needed because "-mprefer-vector-width=256" is not exposed in a macro] # [See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=96476] @@ -253,20 +260,20 @@ $(BUILDDIR)/%.o : %.cc *.h $(BUILDDIR)/.build.$(TAG) # Generic target and build rules: objects from CUDA compilation $(BUILDDIR)/%_cu.o : %.cc *.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $< -o $@ #------------------------------------------------------------------------------- cxx_objects=$(addprefix $(BUILDDIR)/, Parameters_sm.o read_slha.o) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) cu_objects=$(addprefix $(BUILDDIR)/, Parameters_sm_cu.o) endif # Target (and build rules): common (src) library -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) $(cu_objects) @if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi - $(NVCC) -shared -o $@ $(cxx_objects) $(cu_objects) $(LDFLAGS) + $(GPUCC) -shared -o $@ $(cxx_objects) $(cu_objects) $(LDFLAGS) else $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) @if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi diff --git a/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuConfig.h index 80032e528b..55d03f1252 100644 --- a/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuConfig.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
#ifndef MGONGPUCONFIG_H #define MGONGPUCONFIG_H 1 @@ -10,12 +10,25 @@ // There are two different code bases for standalone_cudacpp (without multichannel) and madevent+cudacpp (with multichannel) #define MGONGPU_SUPPORTS_MULTICHANNEL 1 +// Is this a GPU (CUDA, HIP) or CPU implementation? +#ifdef __CUDACC__ +#define MGONGPUCPP_GPUIMPL cuda +#elif defined __HIPCC__ +#define MGONGPUCPP_GPUIMPL hip +#else +#undef MGONGPUCPP_GPUIMPL +#endif + // ** NB1 Throughputs (e.g. 6.8E8) are events/sec for "./gcheck.exe -p 65536 128 12" // ** NB2 Baseline on b7g47n0004 fluctuates (probably depends on load on other VMs) // Choose if curand is supported for generating random numbers +// For HIP, by default, do not use curand (common random numbers will be used instead) // For both CUDA and C++, by default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_CURAND -// (there exist CUDA installations, e.g. using the HPC package, which do not include curand - see PR #784) +// (there exist CUDA installations, e.g. using the HPC package, which do not include curand - see PR #784 and #785) +#if defined __HIPCC__ +#define MGONGPU_HAS_NO_CURAND 1 +#else //#ifdef __CUDACC__ //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 @@ -23,6 +36,7 @@ //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 //#endif +#endif // Choose floating point precision (for everything but color algebra #537) // If one of these macros has been set from outside with e.g. -DMGONGPU_FPTYPE_FLOAT, nothing happens (issue #167) @@ -54,23 +68,28 @@ //#undef MGONGPU_HARDCODE_PARAM // default ////#define MGONGPU_HARDCODE_PARAM 1 -// Complex type in c++: std::complex or cxsmpl (CHOOSE ONLY ONE) -#ifndef __CUDACC__ -//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) -#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) -#endif - -// Complex type in cuda: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) +// Complex type in CUDA: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) #ifdef __CUDACC__ #define MGONGPU_CUCXTYPE_THRUST 1 // default (~1.15E9/double, ~3.2E9/float) //#define MGONGPU_CUCXTYPE_CUCOMPLEX 1 // ~10 percent slower (1.03E9/double, ~2.8E9/float) //#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) + +// Complex type in HIP: cxsmpl (ONLY ONE OPTION POSSIBLE) +#elif defined __HIPCC__ +#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) + +// Complex type in C++: std::complex or cxsmpl (CHOOSE ONLY ONE) +#else +//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) +#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif -// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ -#undef MGONGPU_NSIGHT_DEBUG // default +#undef MGONGPU_NSIGHT_DEBUG // default in CUDA //#define MGONGPU_NSIGHT_DEBUG 1 +#else +#undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif // SANITY CHECKS (floating point precision for everything but color algebra #537) @@ -86,17 +105,21 @@ #error You cannot use double precision for color algebra and single precision elsewhere #endif -// SANITY CHECKS (c++ complex number implementation) -#ifndef __CUDACC__ -#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and 
defined MGONGPU_CPPCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL +// SANITY CHECKS (CUDA complex number implementation) +#ifdef __CUDACC__ +#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX for CUDA +#elif defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CXSMPL for CUDA +#elif defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE OF MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL for CUDA #endif #endif -// SANITY CHECKS (cuda complex number implementation) -#ifdef __CUDACC__ -#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL +// SANITY CHECKS (C++ complex number implementation) +#ifndef MGONGPUCPP_GPUIMPL +#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL for C++ #endif #endif @@ -134,7 +157,7 @@ namespace mgOnGpu // Alignment requirement for using reinterpret_cast with SIMD vectorized code // (using reinterpret_cast with non aligned memory may lead to segmentation faults!) // Only needed for C++ code but can be enforced also in NVCC builds of C++ code using CUDA>=11.2 and C++17 (#318, #319, #333) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL constexpr int cppAlign = 64; // alignment requirement for SIMD vectorization (64-byte i.e. 
512-bit) #endif @@ -145,7 +168,7 @@ using mgOnGpu::fptype; using mgOnGpu::fptype2; // C++ SIMD vectorization width (this will be used to set neppV) -#ifdef __CUDACC__ // CUDA implementation has no SIMD +#ifdef MGONGPUCPP_GPUIMPL // CUDA and HIP implementations have no SIMD #undef MGONGPU_CPPSIMD #elif defined __AVX512VL__ && defined MGONGPU_PVW512 // C++ "512z" AVX512 with 512 width (512-bit ie 64-byte): 8 (DOUBLE) or 16 (FLOAT) #ifdef MGONGPU_FPTYPE_DOUBLE @@ -175,9 +198,9 @@ using mgOnGpu::fptype2; #undef MGONGPU_CPPSIMD #endif -// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation // Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) -#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */ +#if defined MGONGPUCPP_GPUIMPL && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */ #define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; #define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } #define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } @@ -189,8 +212,8 @@ using mgOnGpu::fptype2; #define mgDebugFinalise() { /*noop*/ } #endif /* clang-format on */ -// Define empty CUDA declaration specifiers for C++ -#ifndef __CUDACC__ +// Define empty CUDA/HIP declaration specifiers for C++ +#ifndef MGONGPUCPP_GPUIMPL #define __global__ #define __host__ #define __device__ diff --git a/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuCxtypes.h b/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuCxtypes.h index ca9a9f00c0..5532e22fa1 100644 --- a/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuCxtypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022, based on earlier work by D. Smith) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
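(Illustration, not part of the patch: a minimal sketch of how a translation unit selects its backend and namespace through the new MGONGPUCPP_GPUIMPL macro defined in mgOnGpuConfig.h above; the dummy kernel name is hypothetical.)

#include "mgOnGpuConfig.h"
#ifdef MGONGPUCPP_GPUIMPL // defined under nvcc (__CUDACC__) and hipcc (__HIPCC__), undefined otherwise
namespace mg5amcGpu
#else // plain C++ compiler: the GPU macro is undefined and the SIMD path stays available
namespace mg5amcCpu
#endif
{
  __global__ void dummyKernel() {} // __global__ expands to the empty string in C++-only builds
}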
#ifndef MGONGPUCXTYPES_H #define MGONGPUCXTYPES_H 1 @@ -19,7 +19,7 @@ #include // Complex type in cuda: thrust or cucomplex or cxsmpl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #if defined MGONGPU_CUCXTYPE_THRUST #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wtautological-compare" // for icpx2021/clang13 (https://stackoverflow.com/a/15864661) @@ -82,7 +82,7 @@ namespace mgOnGpu /* clang-format off */ using mgOnGpu::cxsmpl; // Printout to stream for user defined types -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -92,7 +92,7 @@ namespace mg5amcCpu inline __host__ std::ostream& operator<<( std::ostream& out, const cxsmpl& c ) { - out << std::complex( c.real(), c.imag() ); + out << std::complex( c.real(), c.imag() ); return out; } @@ -215,14 +215,14 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu #endif { // --- Type definitions (complex type: cxtype) -#ifdef __CUDACC__ // cuda +#ifdef MGONGPUCPP_GPUIMPL // cuda #if defined MGONGPU_CUCXTYPE_THRUST typedef thrust::complex cxtype; #elif defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -255,7 +255,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -307,7 +307,7 @@ namespace mg5amcCpu //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust +#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust //------------------------------ // CUDA - using thrust::complex @@ -343,11 +343,11 @@ namespace mg5amcCpu return c; } -#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST +#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex +#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex //------------------------------ // CUDA - using cuComplex @@ -562,11 +562,11 @@ namespace mg5amcCpu return cxmake( c.real(), c.imag() ); } -#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX //========================================================================== -#if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex +#if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex //------------------------------ // C++ - using std::complex @@ -610,7 +610,7 @@ namespace mg5amcCpu } #endif -#endif // #if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX +#endif // #if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX //========================================================================== @@ -633,7 +633,7 @@ namespace mg5amcCpu //========================================================================== // 
NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuFptypes.h b/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuFptypes.h index 905c97d700..fa3a02664b 100644 --- a/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuFptypes.h +++ b/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuFptypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUFPTYPES_H #define MGONGPUFPTYPES_H 1 @@ -12,7 +12,7 @@ #include // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // cuda namespace mg5amcGpu #else namespace mg5amcCpu @@ -20,7 +20,7 @@ namespace mg5amcCpu { //========================================================================== -#ifdef __CUDACC__ // cuda +#ifdef MGONGPUCPP_GPUIMPL // cuda //------------------------------ // Floating point types - Cuda @@ -64,11 +64,11 @@ namespace mg5amcCpu #endif } -#endif // #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //------------------------------ // Floating point types - C++ @@ -92,7 +92,7 @@ namespace mg5amcCpu return std::sqrt( f ); } -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== diff --git a/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuVectors.h b/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuVectors.h index e1299ba81e..cdae04326b 100644 --- a/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuVectors.h @@ -32,7 +32,7 @@ #endif // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -131,7 +131,7 @@ namespace mg5amcCpu #endif #endif -#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef __CUDACC__) +#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef MGONGPUCPP_GPUIMPL) const int neppV = 1; @@ -153,13 +153,13 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu #endif { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Printout to stream for user defined types @@ -805,11 +805,11 @@ namespace mg5amcCpu #endif // #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //------------------------------ // Vector types - CUDA @@ -853,12 +853,12 @@ namespace mg5amcCpu return 
mask; } -#endif // #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== // Scalar-or-vector types: scalar in CUDA, vector or scalar in C++ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL typedef bool bool_sv; typedef fptype fptype_sv; typedef fptype2 fptype2_sv; @@ -879,7 +879,7 @@ namespace mg5amcCpu #endif // Scalar-or-vector zeros: scalar in CUDA, vector or scalar in C++ -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ inline __host__ __device__ cxtype cxzero_sv(){ return cxtype( 0, 0 ); } #elif defined MGONGPU_CPPSIMD inline cxtype_v cxzero_sv() { return cxtype_v(); } // RRRR=0000 IIII=0000 diff --git a/epochX/cudacpp/ee_mumu.mad/src/rambo.h b/epochX/cudacpp/ee_mumu.mad/src/rambo.h index e02ea52496..cd7e1008ea 100644 --- a/epochX/cudacpp/ee_mumu.mad/src/rambo.h +++ b/epochX/cudacpp/ee_mumu.mad/src/rambo.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -18,7 +18,7 @@ #include // Simplified rambo version for 2 to N (with N>=2) processes with massless particles -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +83,7 @@ namespace mg5amcCpu static bool first = true; if( first ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if constexpr( M_ACCESS::isOnDevice() ) // avoid { const int ievt0 = 0; @@ -166,7 +166,7 @@ namespace mg5amcCpu wt = po2log; if( nparf != 2 ) wt = ( 2. * nparf - 4. ) * log( energy ) + z[nparf - 1]; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // issue warnings if weight is too small or too large static int iwarn[5] = { 0, 0, 0, 0, 0 }; if( wt < -180. ) diff --git a/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt b/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt index 636fab0372..20d35a4a26 100644 --- a/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt +++ b/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt @@ -52,7 +52,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt -No valid web browser found. Please set in ./input/mg5_configuration.txt +Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu.mg The import format was not given, so we guess it as command set stdout_level DEBUG @@ -62,7 +62,7 @@ generate e+ e- > mu+ mu- No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.00569605827331543  +DEBUG: model prefixing takes 0.005757331848144531  INFO: Restrict model sm with file models/sm/restrict_default.dat . 
DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -181,7 +181,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFV2 routines ALOHA: aloha creates FFV4 routines ALOHA: aloha creates FFV2_4 routines -ALOHA: aloha creates 4 routines in 0.271 s +ALOHA: aloha creates 4 routines in 0.267 s FFV1 FFV1 FFV2 @@ -201,6 +201,6 @@ INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu quit real 0m0.662s -user 0m0.604s -sys 0m0.052s +user 0m0.596s +sys 0m0.051s Code generation completed in 1 seconds diff --git a/epochX/cudacpp/ee_mumu.sa/COPYRIGHT b/epochX/cudacpp/ee_mumu.sa/COPYRIGHT index a134b5fef9..84a883fbb0 100644 --- a/epochX/cudacpp/ee_mumu.sa/COPYRIGHT +++ b/epochX/cudacpp/ee_mumu.sa/COPYRIGHT @@ -15,6 +15,7 @@ The full development team currently includes the following authors : Stephan Hageboeck (CERN) Olivier Mattelaer (Universite Catholique de Louvain, original author) Stefan Roiser (CERN, original author) + Joergen Teig (CERN) Andrea Valassi (CERN, original author) Zenny Wettersten (CERN) See https://github.com/madgraph5/madgraph4gpu for more details. For the full diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/Bridge.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/Bridge.h index bf8b5e024d..89437b4c42 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/Bridge.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/Bridge.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#ifndef BRIDGE_H #define BRIDGE_H 1 @@ -23,7 +23,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +83,7 @@ namespace mg5amcCpu Bridge& operator=( const Bridge& ) = delete; Bridge& operator=( Bridge&& ) = delete; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL /** * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != gpublocks*gputhreads * (this is needed for BridgeKernel tests rather than for actual production use in Fortran) @@ -150,7 +150,7 @@ namespace mg5amcCpu unsigned int m_nevt; // number of events int m_nGoodHel; // the number of good helicities (-1 initially when they have not yet been calculated) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL int m_gputhreads; // number of gpu threads (default set from number of events, can be modified) int m_gpublocks; // number of gpu blocks (default set from number of events, can be modified) DeviceBuffer m_devMomentaF; @@ -187,12 +187,12 @@ namespace mg5amcCpu // Forward declare transposition methods // -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL template void hst_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); @@ -209,7 +209,7 @@ namespace mg5amcCpu Bridge::Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ) : m_nevt( nevtF ) , m_nGoodHel( -1 ) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL , m_gputhreads( 256 ) // default number of gpu threads , m_gpublocks( m_nevt / m_gputhreads ) // this ensures m_nevt <= m_gpublocks*m_gputhreads , m_devMomentaF( m_nevt ) @@ -233,7 +233,7 @@ namespace mg5amcCpu { if( nparF != CPPProcess::npar ) throw std::runtime_error( "Bridge constructor: npar mismatch" ); if( np4F != CPPProcess::np4 ) throw std::runtime_error( "Bridge constructor: np4 mismatch" ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( ( m_nevt < s_gputhreadsmin ) || ( m_nevt % s_gputhreadsmin != 0 ) ) throw std::runtime_error( "Bridge constructor: nevt should be a multiple of " + std::to_string( s_gputhreadsmin ) ); while( m_nevt != m_gpublocks * m_gputhreads ) @@ -249,7 +249,7 @@ namespace mg5amcCpu #else std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate is called from several Fortran threads? 
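(Illustration, not part of the patch: a sketch of the gpuLaunchKernel rewrite of the triple-chevron launches in Bridge.h above, valid for GPU builds only; the kernel and helper names are hypothetical, while gpuLaunchKernel, checkGpu, gpuPeekAtLastError and gpuDeviceSynchronize are the macros introduced by this patch.)

#include "GpuRuntime.h" // pulls in GpuAbstraction.h and the checkGpu error-check macro
__global__ void scaleArray( double* d, int n )
{
  const int i = blockDim.x * blockIdx.x + threadIdx.x; // one array element per thread
  if( i < n ) d[i] *= 2.;
}
void runScale( double* d, int n, int gpublocks, int gputhreads )
{
  // expands to scaleArray<<<gpublocks, gputhreads>>>( d, n ) under both nvcc and hipcc
  gpuLaunchKernel( scaleArray, gpublocks, gputhreads, d, n );
  checkGpu( gpuPeekAtLastError() );   // surface any launch error
  checkGpu( gpuDeviceSynchronize() ); // wait for the kernel to complete
}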
@@ -262,7 +262,7 @@ namespace mg5amcCpu process.initProc( paramCard ); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void Bridge::set_gpugrid( const int gpublocks, const int gputhreads ) { @@ -276,7 +276,7 @@ namespace mg5amcCpu } #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void Bridge::gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -291,14 +291,14 @@ namespace mg5amcCpu constexpr int neppM = MemoryAccessMomenta::neppM; if constexpr( neppM == 1 && std::is_same_v ) { - checkCuda( cudaMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), gpuMemcpyHostToDevice ); } else { - checkCuda( cudaMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), gpuMemcpyHostToDevice ); const int thrPerEvt = CPPProcess::npar * CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 event per thread) //const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... this seems slower - dev_transposeMomentaF2C<<>>( m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); + gpuLaunchKernel( dev_transposeMomentaF2C, m_gpublocks * thrPerEvt, m_gputhreads, m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); } if constexpr( std::is_same_v ) { @@ -341,7 +341,7 @@ namespace mg5amcCpu } #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL template void Bridge::cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -396,7 +396,7 @@ namespace mg5amcCpu // - C++ array: momenta[npagM][npar][np4][neppM] with nevt=npagM*neppM (AOSOA) // -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ) { diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/BridgeKernels.cc b/epochX/cudacpp/ee_mumu.sa/SubProcesses/BridgeKernels.cc index d58066c9c1..eaf4037a24 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/BridgeKernels.cc +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/BridgeKernels.cc @@ -1,17 +1,18 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "BridgeKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMomenta.h" #include //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -45,7 +46,7 @@ namespace mg5amcCpu //============================================================================ -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu { @@ -96,7 +97,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/BridgeKernels.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/BridgeKernels.h index 15eb4bff4d..3efef8ce97 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/BridgeKernels.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/BridgeKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. 
// Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef BRIDGEKERNELS_H #define BRIDGEKERNELS_H 1 @@ -12,7 +12,7 @@ #include "MatrixElementKernels.h" #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -49,7 +49,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a CPU host class BridgeKernelHost final : public BridgeKernelBase { @@ -89,7 +89,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a GPU device class BridgeKernelDevice : public BridgeKernelBase { diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/CommonRandomNumberKernel.cc b/epochX/cudacpp/ee_mumu.sa/SubProcesses/CommonRandomNumberKernel.cc index 985b39f576..010bc4cbd0 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/CommonRandomNumberKernel.cc +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/CommonRandomNumberKernel.cc @@ -1,15 +1,16 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "CommonRandomNumbers.h" +#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/CrossSectionKernels.cc b/epochX/cudacpp/ee_mumu.sa/SubProcesses/CrossSectionKernels.cc index 0b355a3c8d..c15b39844d 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/CrossSectionKernels.cc +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/CrossSectionKernels.cc @@ -1,10 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
#include "CrossSectionKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessWeights.h" #include "MemoryBuffers.h" @@ -77,7 +78,7 @@ debug_me_is_abnormal( const fptype& me, size_t ievtALL ) //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -185,7 +186,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/CrossSectionKernels.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/CrossSectionKernels.h index 7933ca4bbf..4d9659e04e 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/CrossSectionKernels.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/CrossSectionKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef CROSSSECTIONKERNELS_H #define CROSSSECTIONKERNELS_H 1 @@ -13,7 +13,7 @@ //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -96,7 +96,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating the calculation of event statistics on a GPU device class CrossSectionKernelDevice : public CrossSectionKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/CurandRandomNumberKernel.cc b/epochX/cudacpp/ee_mumu.sa/SubProcesses/CurandRandomNumberKernel.cc index eb56333b03..08a16f6f2c 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/CurandRandomNumberKernel.cc +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/CurandRandomNumberKernel.cc @@ -1,9 +1,9 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
-#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" @@ -22,7 +22,7 @@ inline void assertCurand( curandStatus_t code, const char *file, int line, bool } #endif /* clang-format on */ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -36,7 +36,7 @@ namespace mg5amcCpu { if( m_isOnDevice ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !m_rnarray.isOnDevice() ) throw std::runtime_error( "CurandRandomNumberKernel on device with a host random number array" ); #else @@ -114,7 +114,7 @@ namespace mg5amcCpu /* printf( "\nCurandRandomNumberKernel::generateRnarray size = %d\n", (int)m_rnarray.size() ); fptype* data = m_rnarray.data(); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( m_rnarray.isOnDevice() ) { data = new fptype[m_rnarray.size()](); @@ -123,7 +123,7 @@ namespace mg5amcCpu #endif for( int i = 0; i < ( (int)m_rnarray.size() / 4 ); i++ ) printf( "[%4d] %f %f %f %f\n", i * 4, data[i * 4], data[i * 4 + 2], data[i * 4 + 2], data[i * 4 + 3] ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( m_rnarray.isOnDevice() ) delete[] data; #endif */ diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/EventStatistics.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/EventStatistics.h index 48b51e0a49..b425a5bade 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/EventStatistics.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/EventStatistics.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef EventStatistics_H #define EventStatistics_H 1 @@ -16,7 +16,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/GpuAbstraction.h new file mode 100644 index 0000000000..6a7d9c05c0 --- /dev/null +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/GpuAbstraction.h @@ -0,0 +1,71 @@ +// Copyright (C) 2020-2023 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: J. Teig (Jul 2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
+ +#ifndef MG5AMC_GPUABSTRACTION_H +#define MG5AMC_GPUABSTRACTION_H 1 + +#include + +//-------------------------------------------------------------------------- + +#ifdef __CUDACC__ + +#define gpuError_t cudaError_t +#define gpuPeekAtLastError cudaPeekAtLastError +#define gpuGetErrorString cudaGetErrorString +#define gpuSuccess cudaSuccess + +#define gpuMallocHost( ptr, size ) checkGpu( cudaMallocHost( ptr, size ) ) +#define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) ) + +#define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( cudaMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemcpyHostToDevice cudaMemcpyHostToDevice +#define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost +#define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( cudaMemcpyToSymbol( type1, type2, size ) ) + +#define gpuFree( ptr ) checkGpu( cudaFree( ptr ) ) +#define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) ) + +#define gpuSetDevice cudaSetDevice +#define gpuDeviceSynchronize cudaDeviceSynchronize +#define gpuDeviceReset cudaDeviceReset + +#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) +#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_ARGS__ ) + +//-------------------------------------------------------------------------- + +#elif defined __HIPCC__ + +#include "hip/hip_runtime.h" + +#define gpuError_t hipError_t +#define gpuPeekAtLastError hipPeekAtLastError +#define gpuGetErrorString hipGetErrorString +#define gpuSuccess hipSuccess + +#define gpuMallocHost( ptr, size ) checkGpu( hipHostMalloc( ptr, size ) ) // HostMalloc better +#define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) ) + +#define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( hipMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemcpyHostToDevice hipMemcpyHostToDevice +#define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost +#define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( hipMemcpyToSymbol( type1, type2, size ) ) + +#define gpuFree( ptr ) checkGpu( hipFree( ptr ) ) +#define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) ) + +#define gpuSetDevice hipSetDevice +#define gpuDeviceSynchronize hipDeviceSynchronize +#define gpuDeviceReset hipDeviceReset + +#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) +#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_ARGS__ ) + +//-------------------------------------------------------------------------- + +#endif + +#endif // MG5AMC_GPUABSTRACTION_H diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/CudaRuntime.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/GpuRuntime.h similarity index 62% rename from epochX/cudacpp/gg_tt.sa/SubProcesses/CudaRuntime.h rename to epochX/cudacpp/ee_mumu.sa/SubProcesses/GpuRuntime.h index 64ce52f4b3..93579ef08b 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/CudaRuntime.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/GpuRuntime.h @@ -1,49 +1,50 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). -// Created by: S. Roiser (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Created by: J. Teig (Jun 2023, based on earlier work by S. Roiser) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
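(Illustration, not part of the patch: the macro layer in GpuAbstraction.h above keeps call sites free of vendor APIs; a minimal host-to-device round trip, assuming compilation by nvcc or hipcc so that the gpu* macros are defined; the function name is hypothetical.)

#include "GpuRuntime.h" // GpuAbstraction.h macros plus checkGpu
#include <cstddef>
void deviceRoundTrip()
{
  constexpr size_t n = 1024;
  double host[n] = {};
  double* dev = nullptr;
  gpuMalloc( &dev, n * sizeof( double ) );                             // cudaMalloc or hipMalloc, checkGpu-wrapped
  gpuMemcpy( dev, host, n * sizeof( double ), gpuMemcpyHostToDevice ); // upload
  gpuMemcpy( host, dev, n * sizeof( double ), gpuMemcpyDeviceToHost ); // download
  gpuFree( dev );                                                      // cudaFree or hipFree
}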
-#ifndef MG5AMC_CUDARUNTIME_H -#define MG5AMC_CUDARUNTIME_H 1 +#ifndef MG5AMC_GPURUNTIME_H +#define MG5AMC_GPURUNTIME_H 1 // MG5AMC on GPU uses the CUDA runtime API, not the lower level CUDA driver API // See https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html#driver-vs-runtime-api -#include +#include "GpuAbstraction.h" + #include //-------------------------------------------------------------------------- // See https://stackoverflow.com/a/14038590 -#ifdef __CUDACC__ /* clang-format off */ -#define checkCuda( code ) { assertCuda( code, __FILE__, __LINE__ ); } -inline void assertCuda( cudaError_t code, const char* file, int line, bool abort = true ) +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#define checkGpu( code ) { assertGpu( code, __FILE__, __LINE__ ); } +inline void assertGpu( gpuError_t code, const char* file, int line, bool abort = true ) { - if( code != cudaSuccess ) + if( code != gpuSuccess ) { - printf( "ERROR! assertCuda: '%s' (%d) in %s:%d\n", cudaGetErrorString( code ), code, file, line ); - if( abort ) assert( code == cudaSuccess ); + printf( "ERROR! assertGpu: '%s' (%d) in %s:%d\n", gpuGetErrorString( code ), code, file, line ); + if( abort ) assert( code == gpuSuccess ); } } #endif /* clang-format on */ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor + // Instantiate a GpuRuntime at the beginning of the application's main to + // invoke gpuSetDevice(0) in the constructor and book a gpuDeviceReset() call in the destructor // *** FIXME! This will all need to be designed differently when going to multi-GPU nodes! *** - struct CudaRuntime final + struct GpuRuntime final { - CudaRuntime( const bool debug = true ) + GpuRuntime( const bool debug = true ) : m_debug( debug ) { setUp( m_debug ); } - ~CudaRuntime() { tearDown( m_debug ); } - CudaRuntime( const CudaRuntime& ) = delete; - CudaRuntime( CudaRuntime&& ) = delete; - CudaRuntime& operator=( const CudaRuntime& ) = delete; - CudaRuntime& operator=( CudaRuntime&& ) = delete; + ~GpuRuntime() { tearDown( m_debug ); } + GpuRuntime( const GpuRuntime& ) = delete; + GpuRuntime( GpuRuntime&& ) = delete; + GpuRuntime& operator=( const GpuRuntime& ) = delete; + GpuRuntime& operator=( GpuRuntime&& ) = delete; bool m_debug; // Set up CUDA application @@ -62,8 +63,8 @@ namespace mg5amcGpu */ // Replace cudaFree(0) by cudaSetDevice(0), even if it is not really needed either // (but see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs) - if( debug ) std::cout << "__CudaRuntime: calling cudaSetDevice(0)" << std::endl; - checkCuda( cudaSetDevice( 0 ) ); // SLOW! + if( debug ) std::cout << "__GpuRuntime: calling gpuSetDevice(0)" << std::endl; + checkGpu( gpuSetDevice( 0 ) ); // SLOW! 
} // Tear down CUDA application (call cudaDeviceReset) @@ -72,14 +73,13 @@ namespace mg5amcGpu // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking static void tearDown( const bool debug = true ) { - if( debug ) std::cout << "__CudaRuntime: calling cudaDeviceReset()" << std::endl; - checkCuda( cudaDeviceReset() ); + if( debug ) std::cout << "__GpuRuntime: calling gpuDeviceReset()" << std::endl; + checkGpu( gpuDeviceReset() ); } }; - } #endif //-------------------------------------------------------------------------- -#endif // MG5AMC_CUDARUNTIME_H +#endif // MG5AMC_GPURUNTIME_H diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MadgraphTest.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MadgraphTest.h index ef40624c88..a64c05c26a 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MadgraphTest.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MadgraphTest.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Dec 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #ifndef MADGRAPHTEST_H_ #define MADGRAPHTEST_H_ 1 @@ -22,7 +22,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; #endif @@ -201,7 +201,7 @@ class MadgraphTest : public testing::TestWithParam // Since we link both the CPU-only and GPU tests into the same executable, we prevent // a multiply defined symbol by only compiling this in the non-CUDA phase: -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL /// Compare momenta and matrix elements. /// This uses an implementation of TestDriverBase to run a madgraph workflow, @@ -307,6 +307,6 @@ TEST_P( MadgraphTest, CompareMomentaAndME ) } } -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL #endif /* MADGRAPHTEST_H_ */ diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MatrixElementKernels.cc index 74b5239ebf..81699dfea9 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MatrixElementKernels.cc @@ -1,12 +1,12 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "MatrixElementKernels.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" // Includes the abstraction for Nvidia/AMD compilation #include "MemoryAccessMomenta.h" #include "MemoryBuffers.h" @@ -14,7 +14,7 @@ //============================================================================ -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu { @@ -150,7 +150,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { @@ -209,13 +209,13 @@ namespace mg5amcGpu PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); DeviceBufferHelicityMask devIsGoodHel( ncomb ); // ... 0d1. 
Compute good helicity mask on the device - computeDependentCouplings<<>>( m_gs.data(), m_couplings.data() ); + gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - sigmaKin_getGoodHel<<>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); + gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); #else - sigmaKin_getGoodHel<<>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); + gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); #endif - checkCuda( cudaPeekAtLastError() ); + checkGpu( gpuPeekAtLastError() ); // ... 0d2. Copy back good helicity mask to the host copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); // ... 0d3. Copy back good helicity list to constant memory on the device @@ -226,19 +226,19 @@ namespace mg5amcGpu void MatrixElementKernelDevice::computeMatrixElements( const unsigned int channelId ) { - computeDependentCouplings<<>>( m_gs.data(), m_couplings.data() ); + gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); #ifndef MGONGPU_NSIGHT_DEBUG constexpr unsigned int sharedMemSize = 0; #else constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - sigmaKin<<>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); + gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); #else - sigmaKin<<>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); + gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); #endif - checkCuda( cudaPeekAtLastError() ); - checkCuda( cudaDeviceSynchronize() ); + checkGpu( gpuPeekAtLastError() ); + checkGpu( gpuDeviceSynchronize() ); } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MatrixElementKernels.h index 23e84757a2..72bd8f195b 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MatrixElementKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
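(Illustration, not part of the patch: the gpuLaunchKernelSharedMem call above forwards the dynamic shared-memory size as the third launch parameter; a hypothetical kernel that relies on it.)

__global__ void blockFirstElement( const float* in, float* out )
{
  extern __shared__ float tmp[]; // sized at launch time via the sharedMem macro argument
  tmp[threadIdx.x] = in[blockDim.x * blockIdx.x + threadIdx.x];
  __syncthreads();
  if( threadIdx.x == 0 ) out[blockIdx.x] = tmp[0]; // one value per block
}
// launch: gpuLaunchKernelSharedMem( blockFirstElement, blocks, threads, threads * sizeof( float ), d_in, d_out );
// which expands to blockFirstElement<<<blocks, threads, threads * sizeof( float )>>>( d_in, d_out )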
#ifndef MATRIXELEMENTKERNELS_H #define MATRIXELEMENTKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -81,7 +81,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a CPU host class MatrixElementKernelHost final : public MatrixElementKernelBase, public NumberOfEvents { @@ -130,7 +130,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a GPU device class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessAmplitudes.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessAmplitudes.h index 573b3bbbc9..ffb76e93de 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessAmplitudes.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessAmplitudes.h @@ -15,7 +15,7 @@ #define MGONGPU_TRIVIAL_AMPLITUDES 1 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessCouplings.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessCouplings.h index 35a3af42e0..3afdf3e554 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessCouplings.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessCouplings.h @@ -15,7 +15,7 @@ #include "MemoryBuffers.h" // for HostBufferCouplings::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessCouplingsFixed.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessCouplingsFixed.h index dc0d93afff..ffcdf4dbef 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessCouplingsFixed.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessCouplingsFixed.h @@ -14,7 +14,7 @@ //#include "MemoryAccessHelpers.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessDenominators.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessDenominators.h index 3bce635718..66f2d32a6b 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessDenominators.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessDenominators.h @@ -10,7 +10,7 @@ #include "MemoryAccessGs.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessGs.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessGs.h index 31311aa375..4c726b30f3 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessGs.h +++ 
b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessGs.h @@ -13,7 +13,7 @@ #include "MemoryBuffers.h" // for HostBufferMatrixElements::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessHelpers.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessHelpers.h index c82a6c7635..db73e4e064 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessHelpers.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessHelpers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessHelpers_H #define MemoryAccessHelpers_H 1 @@ -105,7 +105,7 @@ class KernelAccessHelper : public MemoryAccessHelper } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid //printf( "kernelAccessRecord: ievt=%d threadId=%d\n", ievt, threadIdx.x ); return T::ieventAccessRecord( buffer, ievt ); // NB fptype and fptype_sv coincide for CUDA diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessMatrixElements.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessMatrixElements.h index f32e6fea5b..3741011971 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessMatrixElements.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessMatrixElements.h @@ -13,7 +13,7 @@ #include "MemoryBuffers.h" // for HostBufferMatrixElements::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessMomenta.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessMomenta.h index 29266de32c..3be229d392 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessMomenta.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessMomenta.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
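(Illustration, not part of the patch: the KernelAccessHelper hunk above keeps the canonical one-thread-per-event index under the new macro; a hypothetical device-side sketch of the same pattern.)

#include "mgOnGpuConfig.h" // for fptype
__global__ void perEventKernel( const fptype* in, fptype* out )
{
  const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid
  out[ievt] = in[ievt]; // trivial per-event operation; real kernels go through the memory access classes
}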
#ifndef MemoryAccessMomenta_H #define MemoryAccessMomenta_H 1 @@ -13,7 +13,7 @@ #include "MemoryAccessVectors.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -30,7 +30,7 @@ namespace mg5amcCpu // Number of Events Per Page in the momenta AOSOA memory buffer layout // (these are all best kept as a compile-time constants: see issue #23) -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ // ----------------------------------------------------------------------------------------------- // --- GPUs: neppM is best set to a power of 2 times the number of fptype's in a 32-byte cacheline // --- This is relevant to ensure coalesced access to momenta in global memory diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessNumerators.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessNumerators.h index b152183b28..18991f4fa6 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessNumerators.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessNumerators.h @@ -10,7 +10,7 @@ #include "MemoryAccessGs.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessRandomNumbers.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessRandomNumbers.h index e2988d39f3..40cb089135 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessRandomNumbers.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessRandomNumbers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessRandomNumbers_H #define MemoryAccessRandomNumbers_H 1 @@ -11,7 +11,7 @@ #include "CPPProcess.h" #include "MemoryAccessHelpers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessVectors.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessVectors.h index e9b197368e..08faccff0f 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessVectors.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessVectors.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
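(Illustration, not part of the patch: the neppM discussion above fixes the AOSOA page size that makes momenta reads coalesced; a sketch of the index arithmetic for momenta[npagM][npar][np4][neppM] with nevt = npagM * neppM, following the layout comments in Bridge.h; the helper name is hypothetical.)

#include <cstddef>
// ievt: event; ipar: particle; ip4: momentum component (E, px, py, pz)
inline std::size_t momentaIndex( std::size_t ievt, std::size_t ipar, std::size_t ip4,
                                 std::size_t npar, std::size_t np4, std::size_t neppM )
{
  const std::size_t ipagM = ievt / neppM; // page holding this event
  const std::size_t ieppM = ievt % neppM; // position of the event within its page
  // for fixed (ipar, ip4) the neppM events of a page are contiguous: coalesced on GPU, SIMD-friendly on CPU
  return ( ( ipagM * npar + ipar ) * np4 + ip4 ) * neppM + ieppM;
}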
#ifndef MemoryAccessVectors_H #define MemoryAccessVectors_H 1 @@ -10,7 +10,7 @@ #include "mgOnGpuVectors.h" -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu // this is only needed for CPU SIMD vectorization { diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessWavefunctions.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessWavefunctions.h index 5428aaf933..33bef4559e 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessWavefunctions.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessWavefunctions.h @@ -15,7 +15,7 @@ #define MGONGPU_TRIVIAL_WAVEFUNCTIONS 1 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryBuffers.h index 3093e6ed18..7756a71621 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryBuffers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021, based on earlier work by S. Hageboeck) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryBuffers_H #define MemoryBuffers_H 1 @@ -11,12 +11,12 @@ #include "mgOnGpuCxtypes.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "Parameters_sm.h" #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -87,7 +87,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL constexpr bool HostBufferALIGNED = false; // ismisaligned=false constexpr bool HostBufferMISALIGNED = true; // ismisaligned=true @@ -119,7 +119,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer template class PinnedHostBufferBase : public BufferBase @@ -128,18 +128,18 @@ namespace mg5amcCpu PinnedHostBufferBase( const size_t size ) : BufferBase( size, false ) { - checkCuda( cudaMallocHost( &( this->m_data ), this->bytes() ) ); + gpuMallocHost( &( this->m_data ), this->bytes() ); } virtual ~PinnedHostBufferBase() { - checkCuda( cudaFreeHost( this->m_data ) ); + gpuFreeHost( this->m_data ); } }; #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer template class DeviceBufferBase : public BufferBase @@ -148,18 +148,18 @@ namespace mg5amcCpu DeviceBufferBase( const size_t size ) : BufferBase( size, true ) { - checkCuda( cudaMalloc( &( this->m_data ), this->bytes() ) ); + gpuMalloc( &( this->m_data ), this->bytes() ); } virtual ~DeviceBufferBase() { - checkCuda( cudaFree( this->m_data ) ); + gpuFree( this->m_data ); } }; #endif //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for a given number of events template class 
HostBuffer : public HostBufferBase, virtual private NumberOfEvents @@ -175,7 +175,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer for a given number of events template class PinnedHostBuffer : public PinnedHostBufferBase, virtual private NumberOfEvents @@ -191,7 +191,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer for a given number of events template class DeviceBuffer : public DeviceBufferBase, virtual private NumberOfEvents @@ -213,7 +213,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta random numbers constexpr size_t sizePerEventRndNumMomenta = MemoryBuffers::np4 * MemoryBuffers::nparf; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta random numbers typedef HostBuffer HostBufferRndNumMomenta; #else @@ -232,7 +232,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer with ONE fptype per event constexpr size_t sizePerEventOneFp = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer with ONE fptype per event typedef HostBuffer HostBufferOneFp; #else @@ -257,7 +257,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for Gs constexpr size_t sizePerEventGs = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferGs; #else @@ -276,7 +276,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for numerators constexpr size_t sizePerEventNumerators = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferNumerators; #else @@ -296,7 +296,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for denominators constexpr size_t sizePerEventDenominators = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferDenominators; #else @@ -315,7 +315,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for random numbers constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferCouplings; #else @@ -333,7 +333,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta constexpr size_t sizePerEventMomenta = MemoryBuffers::np4 * MemoryBuffers::npar; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta typedef HostBuffer HostBufferMomenta; //typedef HostBuffer HostBufferMomenta; // TEST MISALIGNMENT! 
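The buffer classes in this file now route every allocation through gpu* wrappers, and the copy helpers further down use gpuMemcpy in the same way (CPPProcess.cc below also relies on gpuMemcpyToSymbol). A plausible shape for these wrappers, assuming a checkGpu error-check helper analogous to the former checkCuda; the actual definitions live in the new GpuAbstraction.h, which these hunks do not show:

// Sketch (assumed shape): CUDA and HIP spellings of the allocation/copy wrappers.
#ifdef __CUDACC__
#define gpuMallocHost( ptr, size )     checkGpu( cudaMallocHost( ptr, size ) )
#define gpuMalloc( ptr, size )         checkGpu( cudaMalloc( ptr, size ) )
#define gpuFreeHost( ptr )             checkGpu( cudaFreeHost( ptr ) )
#define gpuFree( ptr )                 checkGpu( cudaFree( ptr ) )
#define gpuMemcpy( d, s, bytes, dir )  checkGpu( cudaMemcpy( d, s, bytes, dir ) )
#define gpuMemcpyHostToDevice          cudaMemcpyHostToDevice
#define gpuMemcpyDeviceToHost          cudaMemcpyDeviceToHost
#define gpuMemcpyToSymbol( sym, s, b ) checkGpu( cudaMemcpyToSymbol( sym, s, b ) )
#elif defined __HIPCC__
#define gpuMallocHost( ptr, size )     checkGpu( hipHostMalloc( ptr, size ) ) // NB "hipMallocHost" is deprecated
#define gpuMalloc( ptr, size )         checkGpu( hipMalloc( ptr, size ) )
#define gpuFreeHost( ptr )             checkGpu( hipHostFree( ptr ) )
#define gpuFree( ptr )                 checkGpu( hipFree( ptr ) )
#define gpuMemcpy( d, s, bytes, dir )  checkGpu( hipMemcpy( d, s, bytes, dir ) )
#define gpuMemcpyHostToDevice          hipMemcpyHostToDevice
#define gpuMemcpyDeviceToHost          hipMemcpyDeviceToHost
#define gpuMemcpyToSymbol( sym, s, b ) checkGpu( hipMemcpyToSymbol( HIP_SYMBOL( sym ), s, b ) )
#endif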
@@ -352,7 +352,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for sampling weights constexpr size_t sizePerEventWeights = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for sampling weights typedef HostBuffer HostBufferWeights; #else @@ -370,7 +370,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for matrix elements constexpr size_t sizePerEventMatrixElements = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for matrix elements typedef HostBuffer HostBufferMatrixElements; #else @@ -385,7 +385,7 @@ namespace mg5amcCpu // A base class encapsulating a memory buffer for the helicity mask typedef BufferBase BufferHelicityMask; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for the helicity mask typedef HostBufferBase HostBufferHelicityMask; #else @@ -403,7 +403,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for wavefunctions constexpr size_t sizePerEventWavefunctions = MemoryBuffers::nw6 * MemoryBuffers::nx2; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for wavefunctions typedef HostBuffer HostBufferWavefunctions; #else @@ -421,7 +421,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity random numbers constexpr size_t sizePerEventRndNumHelicity = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity random numbers typedef HostBuffer HostBufferRndNumHelicity; #else @@ -439,7 +439,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color random numbers constexpr size_t sizePerEventRndNumColor = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color random numbers typedef HostBuffer HostBufferRndNumColor; #else @@ -457,7 +457,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity selection constexpr size_t sizePerEventSelectedHelicity = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity selection typedef HostBuffer HostBufferSelectedHelicity; #else @@ -475,7 +475,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color selection constexpr size_t sizePerEventSelectedColor = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color selection typedef HostBuffer HostBufferSelectedColor; #else @@ -487,7 +487,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy { @@ -504,13 +504,13 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyHostToDevice ); } #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void copyHostFromDevice( Tdst& dst, const Tsrc& src ) // keep the same order 
of arguments as in memcpy { @@ -527,7 +527,7 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyDeviceToHost ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyDeviceToHost ); } #endif diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.cc b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.cc index 87bcecccd9..13429436af 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.cc +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -46,7 +45,7 @@ // Class member functions for calculating the matrix elements for // Process: e+ e- > mu+ mu- WEIGHTED<=4 @1 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -80,7 +79,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MZ, (fptype)Parameters_sm::mdl_WZ }; __device__ const fptype cIPC[6] = { (fptype)Parameters_sm::GC_3.real(), (fptype)Parameters_sm::GC_3.imag(), (fptype)Parameters_sm::GC_50.real(), (fptype)Parameters_sm::GC_50.imag(), (fptype)Parameters_sm::GC_59.real(), (fptype)Parameters_sm::GC_59.imag() }; #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype cIPC[6]; #else @@ -90,7 +89,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -118,13 +117,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -151,7 +150,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: 
ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -187,7 +186,7 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity < nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings @@ -200,8 +199,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -284,7 +285,7 @@ namespace mg5amcCpu // [NB do keep 'static' for these constexpr arrays, see issue #283] static constexpr fptype2 cf[ncolor][ncolor] = { { 1 } }; // 2-D array[1][1] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -341,7 +342,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -400,7 +401,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -447,8 +448,8 @@ namespace mg5amcCpu { 1, -1, 1, 1 }, { 1, -1, -1, -1 }, { 1, -1, -1, 1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -488,9 +489,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MZ, (fptype)m_pars->mdl_WZ }; const cxtype tIPC[3] = { cxmake( m_pars->GC_3 ), cxmake( m_pars->GC_50 ), cxmake( m_pars->GC_59 ) }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 3 * sizeof( cxtype ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + gpuMemcpyToSymbol( cIPC, tIPC, 3 * sizeof( cxtype ) ); #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); memcpy( cIPC, tIPC, 3 * sizeof( cxtype ) ); @@ -527,7 +528,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] 
// [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -592,12 +593,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -618,7 +619,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -744,9 +745,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -770,7 +771,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -790,7 +791,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 4 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -804,9 +805,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -834,7 +838,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -1044,7 +1048,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL 
if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.h index 77b610753c..0b29ffb3ff 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -107,7 +107,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -120,7 +120,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -150,7 +150,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CudaRuntime.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/GpuAbstraction.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/GpuAbstraction.h new file mode 120000 index 0000000000..72054e19ba --- /dev/null +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/GpuAbstraction.h @@ -0,0 +1 @@ +../GpuAbstraction.h \ No newline at end of file diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/GpuRuntime.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/GpuRuntime.h new file mode 120000 index 0000000000..3920e83be4 --- /dev/null +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/GpuRuntime.h @@ -0,0 +1 @@ +../GpuRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/check_sa.cc 
b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/check_sa.cc index 3fbf0ffbee..7cac5ab47b 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/check_sa.cc +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -65,7 +66,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -96,7 +97,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -134,9 +135,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784) + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) +#elif defined __HIPCC__ +#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random #elif defined __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU if build has curand + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #endif @@ -146,10 +149,10 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) 
bool bridge = false; @@ -177,7 +180,7 @@ main( int argc, char** argv ) else if( arg == "--curdev" ) { #ifndef __CUDACC__ - throw std::runtime_error( "CurandDevice is not supported on CPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" ); #elif defined MGONGPU_HAS_NO_CURAND throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" ); #else @@ -198,7 +201,7 @@ } else if( arg == "--rmbdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -272,13 +275,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -296,14 +299,14 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL - // --- 00. Initialise cuda - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - const std::string cdinKey = "00 CudaInit"; + // --- 00. Initialise GPU + // Instantiate a GpuRuntime at the beginning of the application's main. + // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. + const std::string cdinKey = "00 GpuInit"; timermap.start( cdinKey ); - CudaRuntime cudaRuntime( debug ); + GpuRuntime GpuRuntime( debug ); #endif // --- 0a.
Initialise physics process @@ -325,7 +328,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -333,7 +336,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -341,7 +344,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -349,7 +352,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -366,7 +369,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -375,7 +378,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -384,7 +387,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -392,7 +395,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -400,7 +403,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -438,7 +441,7 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! 
CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } @@ -450,7 +453,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -461,7 +464,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -469,7 +472,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -511,7 +514,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -543,7 +546,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -588,7 +591,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -617,7 +620,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -760,18 +763,22 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; -#endif +#endif /* clang-format off */ // -- DOUBLE or FLOAT? #if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) @@ -781,7 +788,7 @@ main( int argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif +#endif /* clang-format on */ // -- CUCOMPLEX or THRUST or STD complex numbers? 
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -793,6 +800,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_CUCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -818,7 +831,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -874,7 +887,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -895,6 +908,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -921,21 +936,21 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? " == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -966,7 +981,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1062,14 +1077,14 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1077,7 +1092,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? 
" == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/RamboSamplingKernels.cc b/epochX/cudacpp/ee_mumu.sa/SubProcesses/RamboSamplingKernels.cc index da68aa9255..79abbcc4f8 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/RamboSamplingKernels.cc +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/RamboSamplingKernels.cc @@ -1,11 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "RamboSamplingKernels.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessRandomNumbers.h" #include "MemoryAccessWeights.h" @@ -14,7 +14,7 @@ #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -92,7 +92,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingKernelDevice::RamboSamplingKernelDevice( const fptype energy, // input: energy const BufferRndNumMomenta& rndmom, // input: random numbers in [0,1] BufferMomenta& momenta, // output: momenta @@ -135,7 +135,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaInitialDevice( const fptype energy, fptype* momenta ) @@ -147,17 +147,17 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaInitial() { - getMomentaInitialDevice<<>>( m_energy, m_momenta.data() ); + gpuLaunchKernel( getMomentaInitialDevice, m_gpublocks, m_gputhreads, m_energy, m_momenta.data() ); } #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaFinalDevice( const fptype energy, const fptype* rndmom, @@ -171,11 +171,11 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaFinal() { - getMomentaFinalDevice<<>>( m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); + gpuLaunchKernel( getMomentaFinalDevice, m_gpublocks, m_gputhreads, m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); } #endif diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/RamboSamplingKernels.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/RamboSamplingKernels.h index 184089efd7..7c214cd74b 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/RamboSamplingKernels.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/RamboSamplingKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#ifndef RAMBOSAMPLINGKERNELS_H #define RAMBOSAMPLINGKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating RAMBO phase space sampling on a GPU device class RamboSamplingKernelDevice final : public SamplingKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/RandomNumberKernels.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/RandomNumberKernels.h index 188a72c2c9..21d63beeac 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/RandomNumberKernels.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/RandomNumberKernels.h @@ -1,14 +1,14 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef RANDOMNUMBERKERNELS_H #define RANDOMNUMBERKERNELS_H 1 #include "mgOnGpuConfig.h" -// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when __CUDACC__ is not defined +// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when MGONGPUCPP_GPUIMPL is not defined #ifndef MGONGPU_HAS_NO_CURAND //#include "curand.h" struct curandGenerator_st; // forward definition from curand.h @@ -16,7 +16,7 @@ struct curandGenerator_st; // forward definition from curand.h #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp.mk index 509307506b..f2cfa349da 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ # Copyright (C) 2020-2023 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +# Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts @@ -42,10 +42,10 @@ endif #------------------------------------------------------------------------------- -#=== Configure common compiler flags for C++ and CUDA +#=== Configure common compiler flags for C++ and CUDA/HIP INCFLAGS = -I. 
-OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here +OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here # Dependency on src directory MG5AMC_COMMONLIB = mg5amc_common @@ -121,24 +121,46 @@ endif #------------------------------------------------------------------------------- -#=== Configure the CUDA compiler +#=== Configure the GPU compiler (CUDA or HIP) -# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) -# This is because it is impossible to pass this to "CUFLAGS += -ccbin " below +# FIXME! (AV 24.01.2024) +# In the current implementation (without separate builds for C++ and CUDA/HIP), we first check for nvcc and hipcc in CUDA_HOME and HIP_HOME. +# If CUDA_HOME or HIP_HOME are not set, try to determine them from the paths to nvcc and hipcc. +# While convoluted, this is currently necessary to allow disabling CUDA/HIP builds by setting CUDA_HOME or HIP_HOME to invalid paths. +# This will (probably?) be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775). + +# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA and HIP builds (issue #505) +# This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside - $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") + $(warning CUDA and HIP builds are not supported for multi-word CXX "$(CXX)") override CUDA_HOME=disabled + override HIP_HOME=disabled endif -# If CUDA_HOME is not set, try to set it from the location of nvcc +# If CUDA_HOME is not set, try to set it from the path to nvcc ifndef CUDA_HOME CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null)) $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") endif -# Set NVCC as $(CUDA_HOME)/bin/nvcc if it exists +# If HIP_HOME is not set, try to set it from the path to hipcc ifndef HIP_HOME + HIP_HOME = $(patsubst %bin/hipcc,%,$(HIP_COMPILER_PATH)) + $(warning HIP_HOME was not set: using "$(HIP_HOME)") +endif + +# FIXME! (AV 24.01.2024) +# In the current implementation (without separate builds for C++ and CUDA/HIP), +# builds are performed for HIP only if CUDA is not found in the path. +# If both CUDA and HIP are installed, HIP builds can be triggered by unsetting CUDA_HOME. +# This will be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775). + +#--- Option 1: CUDA exists -> use CUDA + +# Set GPUCC as $(CUDA_HOME)/bin/nvcc if it exists ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) - NVCC = $(CUDA_HOME)/bin/nvcc + + GPUCC = $(CUDA_HOME)/bin/nvcc USE_NVTX ?=-DUSE_NVTX # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ @@ -158,41 +180,78 @@ ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here!
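For orientation, CURANDLIBFLAGS links -lcurand for both of the curand modes that check_sa.cc can select, host-side and device-side generation. A minimal sketch against the public curand host API (a generic example, not the plugin's CurandRandomNumberKernel code):

#include <curand.h>
// Fill n doubles uniformly in (0,1]; 'out' must be a device pointer if onDevice
// is true, a host pointer otherwise.
void generateUniform( double* out, size_t n, bool onDevice )
{
  curandGenerator_t gen;
  if( onDevice )
    curandCreateGenerator( &gen, CURAND_RNG_PSEUDO_MRG32K3A );     // device generation
  else
    curandCreateGeneratorHost( &gen, CURAND_RNG_PSEUDO_MRG32K3A ); // host generation
  curandSetPseudoRandomGeneratorSeed( gen, 12345 );                // any fixed seed
  curandGenerateUniformDouble( gen, out, n );
  curandDestroyGenerator( gen );
}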
endif CUOPTFLAGS = -lineinfo - CUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math - ###CUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow - ###NVCC_VERSION = $(shell $(NVCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) - CUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h + ###GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + GPUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + ###GPUCC_VERSION = $(shell $(GPUCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) + GPUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + CUBUILDRULEFLAGS = -Xcompiler -fPIC -c + CCBUILDRULEFLAGS = -Xcompiler -fPIC -c -x cu + CUDATESTFLAGS = -lcuda + + # Set the host C++ compiler for GPUCC via "-ccbin " + # (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) + GPUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) + + # Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) + ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) + GPUFLAGS += -allow-unsupported-compiler + endif + else ifneq ($(origin REQUIRE_CUDA),undefined) + # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) - $(error No cuda installation found (set CUDA_HOME or make nvcc visible in PATH)) + $(error No cuda installation found (set CUDA_HOME or make GPUCC visible in PATH)) + +#--- Option 2: CUDA does not exist, HIP exists -> use HIP + +# Set GPUCC as $(HIP_HOME)/bin/hipcc if it exists +else ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),) + + GPUCC = $(HIP_HOME)/bin/hipcc + #USE_NVTX ?=-DUSE_NVTX # should maybe find something equivalent to this in HIP? 
+ HIPARCHFLAGS = -target x86_64-linux-gnu --offload-arch=gfx90a + HIPINC = -I$(HIP_HOME)/include/ + # Note: -DHIP_FAST_MATH is equivalent to -use_fast_math in HIP + # (but only for single precision line 208: https://rocm-developer-tools.github.io/HIP/hcc__detail_2math__functions_8h_source.html) + GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -DHIP_FAST_MATH -DHIP_PLATFORM=amd -fPIC + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + GPUFLAGS += -std=c++17 + ###GPUFLAGS+= --maxrregcount 255 # (AV: is this option valid on HIP and meaningful on AMD GPUs?) + CUBUILDRULEFLAGS = -fPIC -c + CCBUILDRULEFLAGS = -fPIC -c + +else ifneq ($(origin REQUIRE_HIP),undefined) + + # If REQUIRE_HIP is set but no HIP is found, stop here (e.g. for CI tests on GPU #443) + $(error No hip installation found (set HIP_HOME or make GPUCC visible in PATH)) + +#--- Option 3: CUDA does not exist, HIP does not exist -> switch off both CUDA and HIP + else - # No cuda. Switch cuda compilation off and go to common random numbers in C++ + + # No cudacc and no hipcc: switch CUDA and HIP compilation off and go to common random numbers in C++ $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda) - override NVCC= + $(warning HIP_HOME is not set or is invalid: export HIP_HOME to compile with hip) + override GPUCC= override USE_NVTX= override CUINC= override CURANDLIBFLAGS= -endif -export NVCC -export CUFLAGS - -# Set the host C++ compiler for nvcc via "-ccbin " -# (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) -CUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) -# Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) -ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) -CUFLAGS += -allow-unsupported-compiler endif +# Export GPUCC (so that it can also be used in cudacpp_src.mk?) +export GPUCC +export GPUFLAGS + #------------------------------------------------------------------------------- -#=== Configure ccache for C++ and CUDA builds +#=== Configure ccache for C++ and CUDA/HIP builds # Enable ccache if USECCACHE=1 ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) @@ -201,15 +260,15 @@ endif #ifeq ($(USECCACHE)$(shell echo $(AR) | grep ccache),1) # override AR:=ccache $(AR) #endif -ifneq ($(NVCC),) - ifeq ($(USECCACHE)$(shell echo $(NVCC) | grep ccache),1) - override NVCC:=ccache $(NVCC) +ifneq ($(GPUCC),) + ifeq ($(USECCACHE)$(shell echo $(GPUCC) | grep ccache),1) + override GPUCC:=ccache $(GPUCC) endif endif #------------------------------------------------------------------------------- -#=== Configure PowerPC-specific compiler flags for C++ and CUDA +#=== Configure PowerPC-specific compiler flags for C++ and CUDA/HIP # PowerPC-specific CXX compiler flags (being reviewed) ifeq ($(UNAME_P),ppc64le) @@ -225,9 +284,9 @@ else ######CXXFLAGS+= -fno-semantic-interposition # no benefit (neither alone, nor combined with -flto) endif -# PowerPC-specific CUDA compiler flags (to be reviewed!) +# PowerPC-specific CUDA/HIP compiler flags (to be reviewed!) 
ifeq ($(UNAME_P),ppc64le) - CUFLAGS+= -Xcompiler -mno-float128 + GPUFLAGS+= -Xcompiler -mno-float128 endif #------------------------------------------------------------------------------- @@ -237,7 +296,7 @@ endif # Set the default OMPFLAGS choice ifneq ($(shell $(CXX) --version | egrep '^Intel'),) override OMPFLAGS = -fopenmp -###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without nvcc but not ok with nvcc before #578) +###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without GPUCC but not ok with GPUCC before #578) else ifneq ($(shell $(CXX) --version | egrep '^(clang)'),) override OMPFLAGS = -fopenmp ###override OMPFLAGS = # disable OpenMP MT on clang (was not ok without or with nvcc before #578) @@ -293,7 +352,10 @@ endif # Set the default RNDGEN (random number generator) choice ifeq ($(RNDGEN),) - ifeq ($(NVCC),) + ifeq ($(GPUCC),) + override RNDGEN = hasNoCurand + # Edgecase for HIP compilation + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) override RNDGEN = hasNoCurand else ifeq ($(RNDGEN),) override RNDGEN = hasCurand @@ -310,7 +372,7 @@ export OMPFLAGS #------------------------------------------------------------------------------- -#=== Set the CUDA/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN +#=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN # Set the build flags appropriate to OMPFLAGS $(info OMPFLAGS=$(OMPFLAGS)) @@ -368,13 +430,13 @@ CXXFLAGS+= $(AVXFLAGS) $(info FPTYPE=$(FPTYPE)) ifeq ($(FPTYPE),d) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE - CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE else ifeq ($(FPTYPE),f) CXXFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT - CUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT else ifeq ($(FPTYPE),m) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT - CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT else $(error Unknown FPTYPE='$(FPTYPE)': only 'd', 'f' and 'm' are supported) endif @@ -383,7 +445,7 @@ endif $(info HELINL=$(HELINL)) ifeq ($(HELINL),1) CXXFLAGS += -DMGONGPU_INLINE_HELAMPS - CUFLAGS += -DMGONGPU_INLINE_HELAMPS + GPUFLAGS += -DMGONGPU_INLINE_HELAMPS else ifneq ($(HELINL),0) $(error Unknown HELINL='$(HELINL)': only '0' and '1' are supported) endif @@ -392,7 +454,7 @@ endif $(info HRDCOD=$(HRDCOD)) ifeq ($(HRDCOD),1) CXXFLAGS += -DMGONGPU_HARDCODE_PARAM - CUFLAGS += -DMGONGPU_HARDCODE_PARAM + GPUFLAGS += -DMGONGPU_HARDCODE_PARAM else ifneq ($(HRDCOD),0) $(error Unknown HRDCOD='$(HRDCOD)': only '0' and '1' are supported) endif @@ -444,11 +506,11 @@ ifeq ($(UNAME_S),Darwin) override CULIBFLAGSRPATH2 = else # RPATH to cuda/cpp libs when linking executables - override CXXLIBFLAGSRPATH = -Wl,-rpath,$(LIBDIRRPATH) - override CULIBFLAGSRPATH = -Xlinker -rpath,$(LIBDIRRPATH) + override CXXLIBFLAGSRPATH = -Wl,-rpath=$(LIBDIRRPATH) + override CULIBFLAGSRPATH = -Xlinker -rpath=$(LIBDIRRPATH) # RPATH to common lib when linking cuda/cpp libs - override CXXLIBFLAGSRPATH2 = -Wl,-rpath,'$$ORIGIN' - override CULIBFLAGSRPATH2 = -Xlinker -rpath,'$$ORIGIN' + override CXXLIBFLAGSRPATH2 = -Wl,-rpath='$$ORIGIN' + override CULIBFLAGSRPATH2 = -Xlinker -rpath='$$ORIGIN' endif # Setting LD_LIBRARY_PATH or DYLD_LIBRARY_PATH in the RUNTIME is no longer necessary 
(neither on Linux nor on Mac)
@@ -461,7 +523,7 @@ override RUNTIME =

cxx_main=$(BUILDDIR)/check.exe
fcxx_main=$(BUILDDIR)/fcheck.exe

-ifneq ($(NVCC),)
+ifneq ($(GPUCC),)
  cu_main=$(BUILDDIR)/gcheck.exe
  fcu_main=$(BUILDDIR)/fgcheck.exe
else
@@ -492,15 +554,16 @@ $(BUILDDIR)/.build.$(TAG):
  @touch $(BUILDDIR)/.build.$(TAG)

# Generic target and build rules: objects from CUDA compilation
-ifneq ($(NVCC),)
+ifneq ($(GPUCC),)
$(BUILDDIR)/%.o : %.cu *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
  @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-  $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c $< -o $@
+  $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CUBUILDRULEFLAGS) $< -o $@

$(BUILDDIR)/%_cu.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
  @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-  $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@
+  $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CCBUILDRULEFLAGS) $< -o $@
endif
+# (NB: for CUDA builds, the '-x cu' option needed to compile .cc files as CUDA is included in CCBUILDRULEFLAGS above)

# Generic target and build rules: objects from C++ compilation
# (NB do not include CUINC here! add it only for NVTX or curand #679)
@@ -509,11 +572,14 @@ $(BUILDDIR)/%.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
  $(CXX) $(CPPFLAGS) $(CXXFLAGS) -fPIC -c $< -o $@

# Apply special build flags only to CrossSectionKernel.cc and gCrossSectionKernel.cu (no fast math, see #117 and #516)
+# (with an edge case for HIP compilation, which passes -fno-fast-math directly instead of via -Xcompiler)
ifeq ($(shell $(CXX) --version | grep ^nvc++),)
$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS := $(filter-out -ffast-math,$(CXXFLAGS))
$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math
-ifneq ($(NVCC),)
-$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -fno-fast-math
+ifeq ($(findstring nvcc,$(GPUCC)),nvcc)
+  $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -fno-fast-math
+else
+  $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -fno-fast-math
endif
endif
@@ -530,10 +596,10 @@ ifeq ($(RNDGEN),hasCurand)
$(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CUINC)
endif

-# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in nvcc with icx2023 (#592)
+# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in GPUCC with icx2023 (#592)
ifneq ($(shell $(CXX) --version | egrep '^(Intel)'),)
-ifneq ($(NVCC),)
-CUFLAGS += -Xcompiler -Wno-deprecated-builtins
+ifneq ($(GPUCC),)
+GPUFLAGS += -Wno-deprecated-builtins
endif
endif
@@ -541,8 +607,8 @@ endif
# This patch does remove the warning, but I prefer to keep it disabled for the moment...
###ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang|Intel)'),) ###$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -Wno-overriding-t-option -###ifneq ($(NVCC),) -###$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -Wno-overriding-t-option +###ifneq ($(GPUCC),) +###$(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -Wno-overriding-t-option ###endif ###endif @@ -569,7 +635,7 @@ MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp cxx_objects_lib=$(BUILDDIR)/CPPProcess.o $(BUILDDIR)/MatrixElementKernels.o $(BUILDDIR)/BridgeKernels.o $(BUILDDIR)/CrossSectionKernels.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel.o $(BUILDDIR)/RamboSamplingKernels.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) MG5AMC_CULIB = mg5amc_$(processid_short)_cuda cu_objects_lib=$(BUILDDIR)/gCPPProcess.o $(BUILDDIR)/gMatrixElementKernels.o $(BUILDDIR)/gBridgeKernels.o $(BUILDDIR)/gCrossSectionKernels.o cu_objects_exe=$(BUILDDIR)/gCommonRandomNumberKernel.o $(BUILDDIR)/gRamboSamplingKernels.o @@ -581,11 +647,11 @@ $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: cxx_objects_lib += $(BUILDDIR)/fbridge.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(CXX) -shared -o $@ $(cxx_objects_lib) $(CXXLIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: cu_objects_lib += $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cu_objects_lib) - $(NVCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) endif #------------------------------------------------------------------------------- @@ -602,16 +668,16 @@ $(cxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PAT $(cxx_main): $(BUILDDIR)/check_sa.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CXX) -o $@ $(BUILDDIR)/check_sa.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CURANDLIBFLAGS) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(cu_main): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(cu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(cu_main): $(BUILDDIR)/gcheck_sa.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o - $(NVCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) + $(GPUCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) endif 
#------------------------------------------------------------------------------- @@ -637,17 +703,17 @@ $(fcxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PA $(fcxx_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') endif ifeq ($(UNAME_S),Darwin) $(fcu_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(fcu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(fcu_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) - $(NVCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) + $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) endif #------------------------------------------------------------------------------- @@ -659,7 +725,7 @@ $(BUILDDIR)/testxxx.o: testxxx_cc_ref.txt $(testmain): $(BUILDDIR)/testxxx.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testxxx.o # Comment out this line to skip the C++ test of xxx functions -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testxxx_cu.o: $(GTESTLIBS) $(BUILDDIR)/testxxx_cu.o: INCFLAGS += $(GTESTINC) $(BUILDDIR)/testxxx_cu.o: testxxx_cc_ref.txt @@ -672,7 +738,7 @@ $(BUILDDIR)/testmisc.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testmisc.o # Comment out this line to skip the C++ miscellaneous tests -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testmisc_cu.o: $(GTESTLIBS) $(BUILDDIR)/testmisc_cu.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc_cu.o @@ -684,12 +750,12 @@ $(BUILDDIR)/runTest.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/runTest.o $(testmain): cxx_objects_exe += $(BUILDDIR)/runTest.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/runTest_cu.o: $(GTESTLIBS) $(BUILDDIR)/runTest_cu.o: INCFLAGS += $(GTESTINC) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(testmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif @@ -713,14 +779,14 @@ $(testmain): LIBFLAGS += -lgomp endif endif -ifeq 
($(NVCC),) # link only runTest.o
+ifeq ($(GPUCC),) # link only runTest.o
$(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH
$(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS)
  $(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS)
else # link both runTest.o and runTest_cu.o
$(testmain): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH
$(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) $(GTESTLIBS)
-  $(NVCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) -lcuda
+  $(GPUCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS)
endif

# Use target gtestlibs to build only googletest
@@ -829,9 +895,9 @@ ifeq ($(USECCACHE),1)
  ccache --version | head -1
endif
  @echo ""
-  @echo NVCC=$(NVCC)
-ifneq ($(NVCC),)
-  $(NVCC) --version
+  @echo GPUCC=$(GPUCC)
+ifneq ($(GPUCC),)
+  $(GPUCC) --version
endif
  @echo ""
  @echo CXX=$(CXX)
@@ -850,7 +916,7 @@ endif

# Target: check (run the C++ test executable)
# [NB THIS IS WHAT IS USED IN THE GITHUB CI!]
-ifneq ($(NVCC),)
+ifneq ($(GPUCC),)
check: runTest cmpFcheck cmpFGcheck
else
check: runTest cmpFcheck

diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/fbridge.cc b/epochX/cudacpp/ee_mumu.sa/SubProcesses/fbridge.cc
index 2d2b36d560..22ce3f5115 100644
--- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/fbridge.cc
+++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/fbridge.cc
@@ -1,11 +1,11 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: S. Roiser (Oct 2021) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.

 #include "Bridge.h"
 #include "CPPProcess.h"
-#include "CudaRuntime.h"
+#include "GpuRuntime.h"

 extern "C"
 {
@@ -22,7 +22,7 @@ extern "C"
  * Using the same Fortran MadEvent code, linking to the heterogeneous library would allow access to both CPU and GPU implementations.
  * The specific heterogeneous configuration (how many GPUs, how many threads on each CPU, etc) could be loaded in CUDA/C++ from a data file.
 */
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 using namespace mg5amcGpu;
#else
 using namespace mg5amcCpu;
#endif
@@ -46,8 +46,8 @@ extern "C"
  */
 void fbridgecreate_( CppObjectInFortran** ppbridge, const int* pnevtF, const int* pnparF, const int* pnp4F )
 {
-#ifdef __CUDACC__
-  CudaRuntime::setUp();
+#ifdef MGONGPUCPP_GPUIMPL
+  GpuRuntime::setUp();
#endif
 // (NB: CPPProcess::initProc no longer needs to be executed here because it is called in the Bridge constructor)
 // FIXME: disable OMP in Bridge when called from Fortran
@@ -65,8 +65,8 @@ extern "C"
  Bridge<FORTRANFPTYPE>* pbridge = dynamic_cast<Bridge<FORTRANFPTYPE>*>( *ppbridge );
  if( pbridge == 0 ) throw std::runtime_error( "fbridgedelete_: invalid Bridge address" );
  delete pbridge;
-#ifdef __CUDACC__
-  CudaRuntime::tearDown();
+#ifdef MGONGPUCPP_GPUIMPL
+  GpuRuntime::tearDown();
#endif
 }
@@ -96,7 +96,7 @@ extern "C"
 {
  Bridge<FORTRANFPTYPE>* pbridge = dynamic_cast<Bridge<FORTRANFPTYPE>*>( *ppbridge );
  if( pbridge == 0 ) throw std::runtime_error( "fbridgesequence_: invalid Bridge address" );
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
  // Use the device/GPU implementation in the CUDA library
  // (there is also a host implementation in this library)
  pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, *pchannelId, mes, selhel, selcol );

diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/fsampler.cc b/epochX/cudacpp/ee_mumu.sa/SubProcesses/fsampler.cc
index 2fb445372d..3743934f41 100644
--- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/fsampler.cc
+++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/fsampler.cc
@@ -1,7 +1,7 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Feb 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.

 #include "mgOnGpuConfig.h"
@@ -13,7 +13,7 @@

 //--------------------------------------------------------------------------

-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 namespace mg5amcGpu
#else
 namespace mg5amcCpu
#endif
@@ -40,7 +40,7 @@ namespace mg5amcCpu
 private:
  const int m_nevt; // The number of events in each iteration
  int m_iiter; // The iteration counter (for random number seeding)
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
  HostBufferRndNumMomenta m_hstRndmom; // Memory buffers for random numbers
  HostBufferMomenta m_hstMomenta; // Memory buffers for momenta
  HostBufferWeights m_hstWeights; // Memory buffers for sampling weights
@@ -105,7 +105,7 @@ namespace mg5amcCpu

 extern "C"
 {
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
  using namespace mg5amcGpu;
#else
  using namespace mg5amcCpu;

diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/runTest.cc b/epochX/cudacpp/ee_mumu.sa/SubProcesses/runTest.cc
index d4a760a71b..de327f2321 100644
--- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/runTest.cc
+++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/runTest.cc
@@ -1,7 +1,7 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: S. Hageboeck (Nov 2020) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
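// ------------------------------------------------------------------------
// Editorial sketch (not part of the patch): the opaque-handle idiom used by
// the fbridge functions above. Fortran keeps a CppObjectInFortran pointer as
// an opaque handle; each later call recovers the typed Bridge object via
// dynamic_cast and validates it before use. All names ending in "_example"
// below are hypothetical stand-ins for the real Bridge API.
#include <stdexcept>

struct CppObjectInFortran_example // polymorphic base, so dynamic_cast works
{
  virtual ~CppObjectInFortran_example() {}
};

template<typename FPTYPE>
struct Bridge_example : public CppObjectInFortran_example {};

extern "C" void examplecreate_( CppObjectInFortran_example** pphandle )
{
  *pphandle = new Bridge_example<double>(); // returned to Fortran as an opaque address
}

extern "C" void exampledelete_( CppObjectInFortran_example** pphandle )
{
  auto* p = dynamic_cast<Bridge_example<double>*>( *pphandle );
  if( p == 0 ) throw std::runtime_error( "exampledelete_: invalid handle" ); // same check as fbridgedelete_
  delete p;
}
// ------------------------------------------------------------------------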
//---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -18,7 +18,7 @@ #include "RandomNumberKernels.h" #include "epoch_process_id.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -35,7 +35,7 @@ struct CUDA_CPU_TestBase : public TestDriverBase : TestDriverBase( npar, refFileName ) {} }; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL struct CPUTest : public CUDA_CPU_TestBase { // Struct data members (process, and memory structures for random numbers, momenta, matrix elements and weights on host and device) @@ -119,7 +119,7 @@ struct CPUTest : public CUDA_CPU_TestBase }; #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL struct CUDATest : public CUDA_CPU_TestBase { // Reset the device when our test goes out of scope. Note that this should happen after @@ -128,7 +128,7 @@ struct CUDATest : public CUDA_CPU_TestBase { ~DeviceReset() { - checkCuda( cudaDeviceReset() ); // this is needed by cuda-memcheck --leak-check full + checkGpu( gpuDeviceReset() ); // this is needed by cuda-memcheck --leak-check full } } deviceResetter; @@ -256,7 +256,7 @@ INSTANTIATE_TEST_SUITE_P( prefix, \ test_suite_name, \ testing::Values( new CUDATest( MG_EPOCH_REFERENCE_FILE_NAME ) ) ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL MG_INSTANTIATE_TEST_SUITE_GPU( XTESTID_GPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); #else MG_INSTANTIATE_TEST_SUITE_CPU( XTESTID_CPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/testmisc.cc b/epochX/cudacpp/ee_mumu.sa/SubProcesses/testmisc.cc index 895d6eeb56..ba9e59a8a3 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/testmisc.cc +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/testmisc.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*misc to run only testmisc.cc tests //---------------------------------------------------------------------------- @@ -17,7 +17,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_MISC #else #define TESTID( s ) s##_CPU_MISC @@ -26,7 +26,7 @@ #define XTESTID( s ) TESTID( s ) // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -59,7 +59,7 @@ namespace mg5amcCpu TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testmisc ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/testxxx.cc b/epochX/cudacpp/ee_mumu.sa/SubProcesses/testxxx.cc index 3361fe5aa9..e5167de00c 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/testxxx.cc +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/testxxx.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). 
// Created by: A. Valassi (Apr 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -24,7 +24,7 @@ #include #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_XXX #else #define TESTID( s ) s##_CPU_XXX @@ -32,7 +32,7 @@ #define XTESTID( s ) TESTID( s ) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -42,7 +42,7 @@ namespace mg5amcCpu int FPEhandlerIevt = -1; inline void FPEhandler( int sig ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL std::cerr << "Floating Point Exception (GPU): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; #else std::cerr << "Floating Point Exception (CPU neppV=" << neppV << "): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; @@ -53,7 +53,7 @@ namespace mg5amcCpu TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -77,7 +77,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) assert( nevt % neppM == 0 ); // nevt must be a multiple of neppM assert( nevt % neppV == 0 ); // nevt must be a multiple of neppV // Fill in the input momenta -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL mg5amcGpu::PinnedHostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] #else mg5amcCpu::HostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] @@ -322,7 +322,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { for( int ievt = 0; ievt < nevt; ievt++ ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/ee_mumu.sa/src/HelAmps_sm.h b/epochX/cudacpp/ee_mumu.sa/src/HelAmps_sm.h index 9fa30cfd7f..e878fcd28e 100644 --- a/epochX/cudacpp/ee_mumu.sa/src/HelAmps_sm.h +++ b/epochX/cudacpp/ee_mumu.sa/src/HelAmps_sm.h @@ -5,7 +5,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -28,7 +28,7 @@ //#include //#include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/ee_mumu.sa/src/Parameters_sm.cc b/epochX/cudacpp/ee_mumu.sa/src/Parameters_sm.cc index 0b4be4d5ed..cffc5d3bff 100644 --- a/epochX/cudacpp/ee_mumu.sa/src/Parameters_sm.cc +++ b/epochX/cudacpp/ee_mumu.sa/src/Parameters_sm.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. 
Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -17,7 +17,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/ee_mumu.sa/src/Parameters_sm.h b/epochX/cudacpp/ee_mumu.sa/src/Parameters_sm.h index 64d0b8e761..2a6d960581 100644 --- a/epochX/cudacpp/ee_mumu.sa/src/Parameters_sm.h +++ b/epochX/cudacpp/ee_mumu.sa/src/Parameters_sm.h @@ -27,7 +27,7 @@ #include "read_slha.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu #include // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -218,7 +218,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -235,7 +235,7 @@ namespace mg5amcCpu #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wunused-variable" // e.g. <> #pragma GCC diagnostic ignored "-Wunused-parameter" // e.g. <> -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma nv_diagnostic push #pragma nv_diag_suppress 177 // e.g. <> #endif @@ -259,7 +259,7 @@ namespace mg5amcCpu // End SM implementation - no special handling of vectors of floats as in EFT (#439) return out; } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma GCC diagnostic pop #pragma nv_diagnostic pop #endif diff --git a/epochX/cudacpp/ee_mumu.sa/src/cudacpp_src.mk b/epochX/cudacpp/ee_mumu.sa/src/cudacpp_src.mk index d4cc628aec..159e19a46d 100644 --- a/epochX/cudacpp/ee_mumu.sa/src/cudacpp_src.mk +++ b/epochX/cudacpp/ee_mumu.sa/src/cudacpp_src.mk @@ -19,7 +19,7 @@ SHELL := /bin/bash #=== Configure common compiler flags for CUDA and C++ INCFLAGS = -I. 
-OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here
+OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here

#-------------------------------------------------------------------------------
@@ -45,13 +45,13 @@ endif

#-------------------------------------------------------------------------------

-#=== Configure the CUDA compiler (note: NVCC is already exported including ccache)
+#=== Configure the CUDA compiler (note: GPUCC is already exported including ccache)

-###$(info NVCC=$(NVCC))
+###$(info GPUCC=$(GPUCC))

#-------------------------------------------------------------------------------

-#=== Configure ccache for C++ builds (note: NVCC is already exported including ccache)
+#=== Configure ccache for C++ builds (note: GPUCC is already exported including ccache)

# Enable ccache if USECCACHE=1
ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1)
@@ -92,6 +92,13 @@ endif
###$(info OMPFLAGS=$(OMPFLAGS))
CXXFLAGS += $(OMPFLAGS)

+# Set the build-rule flags appropriate to the GPU compiler
+# (nvcc needs -Xcompiler to forward -fPIC to the host compiler, and -x cu to compile .cc files as CUDA; hipcc needs neither)
+ifeq ($(findstring nvcc,$(GPUCC)),nvcc)
+  GPUFLAGS += -Xcompiler -fPIC -c -x cu
+else ifeq ($(findstring hipcc,$(GPUCC)),hipcc)
+  GPUFLAGS += -fPIC -c
+endif
+
# Set the build flags appropriate to each AVX choice (example: "make AVX=none")
# [NB MGONGPU_PVW512 is needed because "-mprefer-vector-width=256" is not exposed in a macro]
# [See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=96476]
@@ -253,20 +260,20 @@ $(BUILDDIR)/%.o : %.cc *.h $(BUILDDIR)/.build.$(TAG)

# Generic target and build rules: objects from CUDA compilation
$(BUILDDIR)/%_cu.o : %.cc *.h $(BUILDDIR)/.build.$(TAG)
  @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-  $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@
+  $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $< -o $@

#-------------------------------------------------------------------------------

cxx_objects=$(addprefix $(BUILDDIR)/, Parameters_sm.o read_slha.o)
-ifneq ($(NVCC),)
+ifneq ($(GPUCC),)
cu_objects=$(addprefix $(BUILDDIR)/, Parameters_sm_cu.o)
endif

# Target (and build rules): common (src) library
-ifneq ($(NVCC),)
+ifneq ($(GPUCC),)
$(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) $(cu_objects)
  @if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi
-  $(NVCC) -shared -o $@ $(cxx_objects) $(cu_objects) $(LDFLAGS)
+  $(GPUCC) -shared -o $@ $(cxx_objects) $(cu_objects) $(LDFLAGS)
else
$(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects)
  @if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi

diff --git a/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuConfig.h b/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuConfig.h
index b247654dcf..da4ba36ad8 100644
--- a/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuConfig.h
+++ b/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuConfig.h
@@ -1,7 +1,7 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Jul 2020) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
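// ------------------------------------------------------------------------
// Editorial sketch (not part of the patch): a standalone probe showing the
// compiler-detection logic that mgOnGpuConfig.h (below) builds on. nvcc
// defines __CUDACC__, hipcc defines __HIPCC__, and a plain C++ compiler
// defines neither; MGONGPUCPP_GPUIMPL is then defined only in the two GPU
// cases. main_example is a hypothetical entry point for illustration only.
#include <iostream>
int main_example()
{
#if defined __CUDACC__
  std::cout << "GPU build (CUDA)" << std::endl;
#elif defined __HIPCC__
  std::cout << "GPU build (HIP)" << std::endl;
#else
  std::cout << "CPU build (C++): SIMD and OpenMP code paths" << std::endl;
#endif
  return 0;
}
// ------------------------------------------------------------------------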
#ifndef MGONGPUCONFIG_H #define MGONGPUCONFIG_H 1 @@ -10,12 +10,25 @@ // There are two different code bases for standalone_cudacpp (without multichannel) and madevent+cudacpp (with multichannel) #undef MGONGPU_SUPPORTS_MULTICHANNEL +// Is this a GPU (CUDA, HIP) or CPU implementation? +#ifdef __CUDACC__ +#define MGONGPUCPP_GPUIMPL cuda +#elif defined __HIPCC__ +#define MGONGPUCPP_GPUIMPL hip +#else +#undef MGONGPUCPP_GPUIMPL +#endif + // ** NB1 Throughputs (e.g. 6.8E8) are events/sec for "./gcheck.exe -p 65536 128 12" // ** NB2 Baseline on b7g47n0004 fluctuates (probably depends on load on other VMs) // Choose if curand is supported for generating random numbers +// For HIP, by default, do not use curand (common random numbers will be used instead) // For both CUDA and C++, by default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_CURAND -// (there exist CUDA installations, e.g. using the HPC package, which do not include curand - see PR #784) +// (there exist CUDA installations, e.g. using the HPC package, which do not include curand - see PR #784 and #785) +#if defined __HIPCC__ +#define MGONGPU_HAS_NO_CURAND 1 +#else //#ifdef __CUDACC__ //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 @@ -23,6 +36,7 @@ //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 //#endif +#endif // Choose floating point precision (for everything but color algebra #537) // If one of these macros has been set from outside with e.g. -DMGONGPU_FPTYPE_FLOAT, nothing happens (issue #167) @@ -54,23 +68,28 @@ //#undef MGONGPU_HARDCODE_PARAM // default ////#define MGONGPU_HARDCODE_PARAM 1 -// Complex type in c++: std::complex or cxsmpl (CHOOSE ONLY ONE) -#ifndef __CUDACC__ -//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) -#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) -#endif - -// Complex type in cuda: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) +// Complex type in CUDA: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) #ifdef __CUDACC__ #define MGONGPU_CUCXTYPE_THRUST 1 // default (~1.15E9/double, ~3.2E9/float) //#define MGONGPU_CUCXTYPE_CUCOMPLEX 1 // ~10 percent slower (1.03E9/double, ~2.8E9/float) //#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) + +// Complex type in HIP: cxsmpl (ONLY ONE OPTION POSSIBLE) +#elif defined __HIPCC__ +#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) + +// Complex type in C++: std::complex or cxsmpl (CHOOSE ONLY ONE) +#else +//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) +#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif -// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ -#undef MGONGPU_NSIGHT_DEBUG // default +#undef MGONGPU_NSIGHT_DEBUG // default in CUDA //#define MGONGPU_NSIGHT_DEBUG 1 +#else +#undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif // SANITY CHECKS (floating point precision for everything but color algebra #537) @@ -86,17 +105,21 @@ #error You cannot use double precision for color algebra and single precision elsewhere #endif -// SANITY CHECKS (c++ complex number implementation) -#ifndef __CUDACC__ -#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and 
defined MGONGPU_CPPCXTYPE_CXSMPL
-#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL
+// SANITY CHECKS (CUDA complex number implementation)
+#ifdef __CUDACC__
+#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX
+#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX for CUDA
+#elif defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CXSMPL
+#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CXSMPL for CUDA
+#elif defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL
+#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL for CUDA
 #endif
 #endif

-// SANITY CHECKS (cuda complex number implementation)
-#ifdef __CUDACC__
-#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL
-#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL
+// SANITY CHECKS (C++ complex number implementation)
+#ifndef MGONGPUCPP_GPUIMPL
+#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL
+#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL for C++
 #endif
 #endif

@@ -134,7 +157,7 @@ namespace mgOnGpu
 // Alignment requirement for using reinterpret_cast with SIMD vectorized code
 // (using reinterpret_cast with non aligned memory may lead to segmentation faults!)
 // Only needed for C++ code but can be enforced also in NVCC builds of C++ code using CUDA>=11.2 and C++17 (#318, #319, #333)
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
 constexpr int cppAlign = 64; // alignment requirement for SIMD vectorization (64-byte i.e.
512-bit) #endif @@ -145,7 +168,7 @@ using mgOnGpu::fptype; using mgOnGpu::fptype2; // C++ SIMD vectorization width (this will be used to set neppV) -#ifdef __CUDACC__ // CUDA implementation has no SIMD +#ifdef MGONGPUCPP_GPUIMPL // CUDA and HIP implementations have no SIMD #undef MGONGPU_CPPSIMD #elif defined __AVX512VL__ && defined MGONGPU_PVW512 // C++ "512z" AVX512 with 512 width (512-bit ie 64-byte): 8 (DOUBLE) or 16 (FLOAT) #ifdef MGONGPU_FPTYPE_DOUBLE @@ -175,9 +198,9 @@ using mgOnGpu::fptype2; #undef MGONGPU_CPPSIMD #endif -// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation // Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) -#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */ +#if defined __CUDA__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */ #define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; #define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } #define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } @@ -189,8 +212,8 @@ using mgOnGpu::fptype2; #define mgDebugFinalise() { /*noop*/ } #endif /* clang-format on */ -// Define empty CUDA declaration specifiers for C++ -#ifndef __CUDACC__ +// Define empty CUDA/HIP declaration specifiers for C++ +#ifndef MGONGPUCPP_GPUIMPL #define __global__ #define __host__ #define __device__ diff --git a/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuCxtypes.h b/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuCxtypes.h index ca9a9f00c0..5532e22fa1 100644 --- a/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuCxtypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022, based on earlier work by D. Smith) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
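// ------------------------------------------------------------------------
// Editorial sketch (not part of the patch): why a hand-rolled complex type
// matters in mgOnGpuCxtypes.h below. CUDA builds can use thrust::complex,
// but HIP builds in this patch fall back to cxsmpl, a plain struct along the
// lines of the hypothetical stand-in below, which compiles unchanged under
// nvcc, hipcc or a host C++ compiler.
template<typename FP>
struct cxsmpl_example
{
  FP m_real, m_imag;
  constexpr cxsmpl_example( FP r, FP i ) : m_real( r ), m_imag( i ) {}
  constexpr FP real() const { return m_real; }
  constexpr FP imag() const { return m_imag; }
};

template<typename FP>
constexpr cxsmpl_example<FP> operator*( const cxsmpl_example<FP>& a, const cxsmpl_example<FP>& b )
{
  // (a.r + i*a.i) * (b.r + i*b.i) = (a.r*b.r - a.i*b.i) + i*(a.r*b.i + a.i*b.r)
  return cxsmpl_example<FP>( a.real() * b.real() - a.imag() * b.imag(),
                             a.real() * b.imag() + a.imag() * b.real() );
}

// e.g. cxsmpl_example<double>( 1., 2. ) * cxsmpl_example<double>( 0., 1. ) == ( -2., 1. )
// ------------------------------------------------------------------------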
#ifndef MGONGPUCXTYPES_H #define MGONGPUCXTYPES_H 1 @@ -19,7 +19,7 @@ #include // Complex type in cuda: thrust or cucomplex or cxsmpl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #if defined MGONGPU_CUCXTYPE_THRUST #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wtautological-compare" // for icpx2021/clang13 (https://stackoverflow.com/a/15864661) @@ -82,7 +82,7 @@ namespace mgOnGpu /* clang-format off */ using mgOnGpu::cxsmpl; // Printout to stream for user defined types -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -92,7 +92,7 @@ namespace mg5amcCpu inline __host__ std::ostream& operator<<( std::ostream& out, const cxsmpl& c ) { - out << std::complex( c.real(), c.imag() ); + out << std::complex( c.real(), c.imag() ); return out; } @@ -215,14 +215,14 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu #endif { // --- Type definitions (complex type: cxtype) -#ifdef __CUDACC__ // cuda +#ifdef MGONGPUCPP_GPUIMPL // cuda #if defined MGONGPU_CUCXTYPE_THRUST typedef thrust::complex cxtype; #elif defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -255,7 +255,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -307,7 +307,7 @@ namespace mg5amcCpu //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust +#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust //------------------------------ // CUDA - using thrust::complex @@ -343,11 +343,11 @@ namespace mg5amcCpu return c; } -#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST +#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex +#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex //------------------------------ // CUDA - using cuComplex @@ -562,11 +562,11 @@ namespace mg5amcCpu return cxmake( c.real(), c.imag() ); } -#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX //========================================================================== -#if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex +#if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex //------------------------------ // C++ - using std::complex @@ -610,7 +610,7 @@ namespace mg5amcCpu } #endif -#endif // #if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX +#endif // #if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX //========================================================================== @@ -633,7 +633,7 @@ namespace mg5amcCpu //========================================================================== // 
NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuFptypes.h b/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuFptypes.h index 905c97d700..fa3a02664b 100644 --- a/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuFptypes.h +++ b/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuFptypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUFPTYPES_H #define MGONGPUFPTYPES_H 1 @@ -12,7 +12,7 @@ #include // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // cuda namespace mg5amcGpu #else namespace mg5amcCpu @@ -20,7 +20,7 @@ namespace mg5amcCpu { //========================================================================== -#ifdef __CUDACC__ // cuda +#ifdef MGONGPUCPP_GPUIMPL // cuda //------------------------------ // Floating point types - Cuda @@ -64,11 +64,11 @@ namespace mg5amcCpu #endif } -#endif // #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //------------------------------ // Floating point types - C++ @@ -92,7 +92,7 @@ namespace mg5amcCpu return std::sqrt( f ); } -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== diff --git a/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuVectors.h b/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuVectors.h index e1299ba81e..cdae04326b 100644 --- a/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuVectors.h @@ -32,7 +32,7 @@ #endif // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -131,7 +131,7 @@ namespace mg5amcCpu #endif #endif -#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef __CUDACC__) +#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef MGONGPUCPP_GPUIMPL) const int neppV = 1; @@ -153,13 +153,13 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu #endif { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Printout to stream for user defined types @@ -805,11 +805,11 @@ namespace mg5amcCpu #endif // #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //------------------------------ // Vector types - CUDA @@ -853,12 +853,12 @@ namespace mg5amcCpu return mask; } 
-#endif // #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== // Scalar-or-vector types: scalar in CUDA, vector or scalar in C++ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL typedef bool bool_sv; typedef fptype fptype_sv; typedef fptype2 fptype2_sv; @@ -879,7 +879,7 @@ namespace mg5amcCpu #endif // Scalar-or-vector zeros: scalar in CUDA, vector or scalar in C++ -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ inline __host__ __device__ cxtype cxzero_sv(){ return cxtype( 0, 0 ); } #elif defined MGONGPU_CPPSIMD inline cxtype_v cxzero_sv() { return cxtype_v(); } // RRRR=0000 IIII=0000 diff --git a/epochX/cudacpp/ee_mumu.sa/src/rambo.h b/epochX/cudacpp/ee_mumu.sa/src/rambo.h index e02ea52496..cd7e1008ea 100644 --- a/epochX/cudacpp/ee_mumu.sa/src/rambo.h +++ b/epochX/cudacpp/ee_mumu.sa/src/rambo.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -18,7 +18,7 @@ #include // Simplified rambo version for 2 to N (with N>=2) processes with massless particles -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +83,7 @@ namespace mg5amcCpu static bool first = true; if( first ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if constexpr( M_ACCESS::isOnDevice() ) // avoid { const int ievt0 = 0; @@ -166,7 +166,7 @@ namespace mg5amcCpu wt = po2log; if( nparf != 2 ) wt = ( 2. * nparf - 4. ) * log( energy ) + z[nparf - 1]; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // issue warnings if weight is too small or too large static int iwarn[5] = { 0, 0, 0, 0, 0 }; if( wt < -180. ) diff --git a/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt b/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt index 360771ac98..75c84e12fb 100644 --- a/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt +++ b/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005492210388183594  +DEBUG: model prefixing takes 0.005261659622192383  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -175,7 +175,7 @@ INFO: Generating Helas calls for process: g g > t t~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t t~ @1 INFO: Creating files in directory P1_gg_ttx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -191,11 +191,11 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
INFO: Generating Feynman diagrams for Process: g g > t t~ WEIGHTED<=2 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttx Generated helas calls for 1 subprocesses (3 diagrams) in 0.006 s -Wrote files for 10 helas calls in 0.101 s +Wrote files for 10 helas calls in 0.100 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 2 routines in 0.145 s +ALOHA: aloha creates 2 routines in 0.144 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 @@ -237,9 +237,9 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. quit -real 0m1.713s -user 0m1.482s -sys 0m0.227s +real 0m1.690s +user 0m1.458s +sys 0m0.220s Code generation completed in 2 seconds ************************************************************ * * diff --git a/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt b/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt index 0db09949ad..5542e5323b 100644 --- a/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt +++ b/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt @@ -52,7 +52,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt -No valid web browser found. Please set in ./input/mg5_configuration.txt +Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt.mg The import format was not given, so we guess it as command set stdout_level DEBUG @@ -62,7 +62,7 @@ generate g g > t t~ No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005459308624267578  +DEBUG: model prefixing takes 0.005713224411010742  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -180,7 +180,7 @@ Generated helas calls for 1 subprocesses (3 diagrams) in 0.006 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 2 routines in 0.146 s +ALOHA: aloha creates 2 routines in 0.145 s VVV1 FFV1 FFV1 @@ -195,7 +195,7 @@ INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/. 
quit -real 0m0.545s -user 0m0.487s -sys 0m0.049s +real 0m0.623s +user 0m0.466s +sys 0m0.061s Code generation completed in 1 seconds diff --git a/epochX/cudacpp/gg_tt.sa/COPYRIGHT b/epochX/cudacpp/gg_tt.sa/COPYRIGHT index a134b5fef9..84a883fbb0 100644 --- a/epochX/cudacpp/gg_tt.sa/COPYRIGHT +++ b/epochX/cudacpp/gg_tt.sa/COPYRIGHT @@ -15,6 +15,7 @@ The full development team currently includes the following authors : Stephan Hageboeck (CERN) Olivier Mattelaer (Universite Catholique de Louvain, original author) Stefan Roiser (CERN, original author) + Joergen Teig (CERN) Andrea Valassi (CERN, original author) Zenny Wettersten (CERN) See https://github.com/madgraph5/madgraph4gpu for more details. For the full diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/Bridge.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/Bridge.h index bf8b5e024d..89437b4c42 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/Bridge.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef BRIDGE_H #define BRIDGE_H 1 @@ -23,7 +23,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +83,7 @@ namespace mg5amcCpu Bridge& operator=( const Bridge& ) = delete; Bridge& operator=( Bridge&& ) = delete; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL /** * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != gpublocks*gputhreads * (this is needed for BridgeKernel tests rather than for actual production use in Fortran) @@ -150,7 +150,7 @@ namespace mg5amcCpu unsigned int m_nevt; // number of events int m_nGoodHel; // the number of good helicities (-1 initially when they have not yet been calculated) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL int m_gputhreads; // number of gpu threads (default set from number of events, can be modified) int m_gpublocks; // number of gpu blocks (default set from number of events, can be modified) DeviceBuffer m_devMomentaF; @@ -187,12 +187,12 @@ namespace mg5amcCpu // Forward declare transposition methods // -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL template void hst_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); @@ -209,7 +209,7 @@ namespace mg5amcCpu Bridge::Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ) : m_nevt( nevtF ) , m_nGoodHel( -1 ) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL , m_gputhreads( 256 ) // default number of gpu threads , m_gpublocks( m_nevt / m_gputhreads ) // this ensures m_nevt <= m_gpublocks*m_gputhreads , m_devMomentaF( m_nevt ) @@ -233,7 +233,7 @@ namespace mg5amcCpu { if( nparF != CPPProcess::npar ) throw std::runtime_error( "Bridge constructor: npar mismatch" ); if( np4F != CPPProcess::np4 ) throw std::runtime_error( "Bridge constructor: np4 mismatch" ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( ( m_nevt < s_gputhreadsmin ) || ( m_nevt % s_gputhreadsmin != 0 ) ) throw std::runtime_error( "Bridge constructor: nevt should be a multiple of " + std::to_string( s_gputhreadsmin ) ); while( 
m_nevt != m_gpublocks * m_gputhreads ) @@ -249,7 +249,7 @@ namespace mg5amcCpu #else std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate is called from several Fortran threads? @@ -262,7 +262,7 @@ namespace mg5amcCpu process.initProc( paramCard ); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void Bridge::set_gpugrid( const int gpublocks, const int gputhreads ) { @@ -276,7 +276,7 @@ namespace mg5amcCpu } #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void Bridge::gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -291,14 +291,14 @@ namespace mg5amcCpu constexpr int neppM = MemoryAccessMomenta::neppM; if constexpr( neppM == 1 && std::is_same_v ) { - checkCuda( cudaMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), gpuMemcpyHostToDevice ); } else { - checkCuda( cudaMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), gpuMemcpyHostToDevice ); const int thrPerEvt = CPPProcess::npar * CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 event per thread) //const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... this seems slower - dev_transposeMomentaF2C<<>>( m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); + gpuLaunchKernel( dev_transposeMomentaF2C, m_gpublocks * thrPerEvt, m_gputhreads, m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); } if constexpr( std::is_same_v ) { @@ -341,7 +341,7 @@ namespace mg5amcCpu } #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL template void Bridge::cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -396,7 +396,7 @@ namespace mg5amcCpu // - C++ array: momenta[npagM][npar][np4][neppM] with nevt=npagM*neppM (AOSOA) // -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ) { diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/BridgeKernels.cc b/epochX/cudacpp/gg_tt.sa/SubProcesses/BridgeKernels.cc index d58066c9c1..eaf4037a24 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/BridgeKernels.cc +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/BridgeKernels.cc @@ -1,17 +1,18 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
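// ------------------------------------------------------------------------
// Editorial sketch (not part of the patch): the #ifdef/namespace idiom that
// BridgeKernels.cc below, and all the files that follow, repeat. The same
// source is compiled twice, once by the GPU compiler and once by the host
// C++ compiler, and lands in a different namespace each time, so the CPU and
// GPU libraries can be linked into one executable without symbol clashes.
// backendId_example is a hypothetical function for illustration only.
#ifdef MGONGPUCPP_GPUIMPL
namespace mg5amcGpu
#else
namespace mg5amcCpu
#endif
{
  inline int backendId_example()
  {
#ifdef MGONGPUCPP_GPUIMPL
    return 1; // compiled by nvcc/hipcc: this is mg5amcGpu::backendId_example
#else
    return 0; // compiled by the host C++ compiler: this is mg5amcCpu::backendId_example
#endif
  }
}
// ------------------------------------------------------------------------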
#include "BridgeKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMomenta.h" #include //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -45,7 +46,7 @@ namespace mg5amcCpu //============================================================================ -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu { @@ -96,7 +97,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/BridgeKernels.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/BridgeKernels.h index 15eb4bff4d..3efef8ce97 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/BridgeKernels.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/BridgeKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef BRIDGEKERNELS_H #define BRIDGEKERNELS_H 1 @@ -12,7 +12,7 @@ #include "MatrixElementKernels.h" #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -49,7 +49,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a CPU host class BridgeKernelHost final : public BridgeKernelBase { @@ -89,7 +89,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a GPU device class BridgeKernelDevice : public BridgeKernelBase { diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/CommonRandomNumberKernel.cc b/epochX/cudacpp/gg_tt.sa/SubProcesses/CommonRandomNumberKernel.cc index 985b39f576..010bc4cbd0 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/CommonRandomNumberKernel.cc +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/CommonRandomNumberKernel.cc @@ -1,15 +1,16 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "CommonRandomNumbers.h" +#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/CrossSectionKernels.cc b/epochX/cudacpp/gg_tt.sa/SubProcesses/CrossSectionKernels.cc index 0b355a3c8d..c15b39844d 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/CrossSectionKernels.cc +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/CrossSectionKernels.cc @@ -1,10 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. 
-// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "CrossSectionKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessWeights.h" #include "MemoryBuffers.h" @@ -77,7 +78,7 @@ debug_me_is_abnormal( const fptype& me, size_t ievtALL ) //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -185,7 +186,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/CrossSectionKernels.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/CrossSectionKernels.h index 7933ca4bbf..4d9659e04e 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/CrossSectionKernels.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/CrossSectionKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef CROSSSECTIONKERNELS_H #define CROSSSECTIONKERNELS_H 1 @@ -13,7 +13,7 @@ //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -96,7 +96,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating the calculation of event statistics on a GPU device class CrossSectionKernelDevice : public CrossSectionKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/CurandRandomNumberKernel.cc b/epochX/cudacpp/gg_tt.sa/SubProcesses/CurandRandomNumberKernel.cc index eb56333b03..08a16f6f2c 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/CurandRandomNumberKernel.cc +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/CurandRandomNumberKernel.cc @@ -1,9 +1,9 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
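(The hunks above repeat one mechanical change many times: the CUDA-only guard __CUDACC__ becomes the backend-neutral guard MGONGPUCPP_GPUIMPL, which, judging by the GpuAbstraction.h introduced later in this patch, is meant to be defined for CUDA and HIP builds alike. Shown in isolation, with an invented function name, the recurring namespace idiom works as follows: the same source text lands in mg5amcGpu in a GPU build and in mg5amcCpu in a CPU build, so both flavours can be linked into one executable without symbol clashes.)

#ifdef MGONGPUCPP_GPUIMPL
namespace mg5amcGpu
#else
namespace mg5amcCpu
#endif
{
  void doSomething(); // one source line, two possible qualified names: mg5amcGpu::doSomething or mg5amcCpu::doSomething
}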
-#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" @@ -22,7 +22,7 @@ inline void assertCurand( curandStatus_t code, const char *file, int line, bool } #endif /* clang-format on */ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -36,7 +36,7 @@ namespace mg5amcCpu { if( m_isOnDevice ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !m_rnarray.isOnDevice() ) throw std::runtime_error( "CurandRandomNumberKernel on device with a host random number array" ); #else @@ -114,7 +114,7 @@ namespace mg5amcCpu /* printf( "\nCurandRandomNumberKernel::generateRnarray size = %d\n", (int)m_rnarray.size() ); fptype* data = m_rnarray.data(); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( m_rnarray.isOnDevice() ) { data = new fptype[m_rnarray.size()](); @@ -123,7 +123,7 @@ namespace mg5amcCpu #endif for( int i = 0; i < ( (int)m_rnarray.size() / 4 ); i++ ) printf( "[%4d] %f %f %f %f\n", i * 4, data[i * 4], data[i * 4 + 2], data[i * 4 + 2], data[i * 4 + 3] ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( m_rnarray.isOnDevice() ) delete[] data; #endif */ diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/EventStatistics.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/EventStatistics.h index 48b51e0a49..b425a5bade 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/EventStatistics.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/EventStatistics.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef EventStatistics_H #define EventStatistics_H 1 @@ -16,7 +16,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/GpuAbstraction.h new file mode 100644 index 0000000000..6a7d9c05c0 --- /dev/null +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/GpuAbstraction.h @@ -0,0 +1,71 @@ +// Copyright (C) 2020-2023 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: J. Teig (Jul 2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
+ +#ifndef MG5AMC_GPUABSTRACTION_H +#define MG5AMC_GPUABSTRACTION_H 1 + +#include + +//-------------------------------------------------------------------------- + +#ifdef __CUDACC__ + +#define gpuError_t cudaError_t +#define gpuPeekAtLastError cudaPeekAtLastError +#define gpuGetErrorString cudaGetErrorString +#define gpuSuccess cudaSuccess + +#define gpuMallocHost( ptr, size ) checkGpu( cudaMallocHost( ptr, size ) ) +#define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) ) + +#define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( cudaMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemcpyHostToDevice cudaMemcpyHostToDevice +#define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost +#define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( cudaMemcpyToSymbol( type1, type2, size ) ) + +#define gpuFree( ptr ) checkGpu( cudaFree( ptr ) ) +#define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) ) + +#define gpuSetDevice cudaSetDevice +#define gpuDeviceSynchronize cudaDeviceSynchronize +#define gpuDeviceReset cudaDeviceReset + +#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ ) +#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ ) + +//-------------------------------------------------------------------------- + +#elif defined __HIPCC__ + +#include "hip/hip_runtime.h" + +#define gpuError_t hipError_t +#define gpuPeekAtLastError hipPeekAtLastError +#define gpuGetErrorString hipGetErrorString +#define gpuSuccess hipSuccess + +#define gpuMallocHost( ptr, size ) checkGpu( hipHostMalloc( ptr, size ) ) // HostMalloc better +#define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) ) + +#define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( hipMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemcpyHostToDevice hipMemcpyHostToDevice +#define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost +#define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( hipMemcpyToSymbol( type1, type2, size ) ) + +#define gpuFree( ptr ) checkGpu( hipFree( ptr ) ) +#define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) ) + +#define gpuSetDevice hipSetDevice +#define gpuDeviceSynchronize hipDeviceSynchronize +#define gpuDeviceReset hipDeviceReset + +#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ ) +#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ ) + +//-------------------------------------------------------------------------- + +#endif + +#endif // MG5AMC_GPUABSTRACTION_H diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/CudaRuntime.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/GpuRuntime.h similarity index 62% rename from epochX/cudacpp/ee_mumu.sa/SubProcesses/CudaRuntime.h rename to epochX/cudacpp/gg_tt.sa/SubProcesses/GpuRuntime.h index 64ce52f4b3..93579ef08b 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/CudaRuntime.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/GpuRuntime.h @@ -1,49 +1,50 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). -// Created by: S. Roiser (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Created by: J. Teig (Jun 2023, based on earlier work by S. Roiser) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
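(A self-contained usage sketch for the macros defined above; the kernel, the data and the grid sizes are invented for illustration and do not appear in the patch. It assumes a CUDA or HIP build in which MGONGPUCPP_GPUIMPL is defined, so that checkGpu/assertGpu and the GpuRuntime RAII helper from the GpuRuntime.h rename just below are available. Under nvcc the gpu* macros expand to the cuda* runtime calls, under hipcc to the hip* ones, so this one source file builds for either backend.)

#include "GpuRuntime.h" // defines checkGpu/assertGpu and GpuRuntime, pulls in GpuAbstraction.h

__device__ __constant__ double cFactor[1]; // constant-memory symbol, filled via gpuMemcpyToSymbol

__global__ void scale( double* data )
{
  const int i = blockDim.x * blockIdx.x + threadIdx.x;
  data[i] *= cFactor[0]; // one element per thread
}

int main()
{
  using namespace mg5amcGpu;
  GpuRuntime gpuRuntime( /*debug=*/true ); // ctor: gpuSetDevice(0); dtor: gpuDeviceReset() (see below)
  const int gpublocks = 2, gputhreads = 256;
  const int nevt = gpublocks * gputhreads;
  double* hstData = new double[nevt](); // zero-initialised host input, invented for this sketch
  const double hFactor[1] = { 2.0 };
  gpuMemcpyToSymbol( cFactor, hFactor, sizeof( double ) ); // was: checkCuda( cudaMemcpyToSymbol( ... ) )
  double* devData = nullptr;
  gpuMalloc( &devData, nevt * sizeof( double ) ); // was: checkCuda( cudaMalloc( ... ) )
  gpuMemcpy( devData, hstData, nevt * sizeof( double ), gpuMemcpyHostToDevice );
  gpuLaunchKernel( scale, gpublocks, gputhreads, devData ); // was: scale<<<gpublocks, gputhreads>>>( devData )
  checkGpu( gpuPeekAtLastError() );
  gpuMemcpy( hstData, devData, nevt * sizeof( double ), gpuMemcpyDeviceToHost );
  gpuFree( devData );
  delete[] hstData;
  return 0; // ~GpuRuntime() books the device reset here
}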
-#ifndef MG5AMC_CUDARUNTIME_H -#define MG5AMC_CUDARUNTIME_H 1 +#ifndef MG5AMC_GPURUNTIME_H +#define MG5AMC_GPURUNTIME_H 1 // MG5AMC on GPU uses the CUDA runtime API, not the lower level CUDA driver API // See https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html#driver-vs-runtime-api -#include +#include "GpuAbstraction.h" + #include //-------------------------------------------------------------------------- // See https://stackoverflow.com/a/14038590 -#ifdef __CUDACC__ /* clang-format off */ -#define checkCuda( code ) { assertCuda( code, __FILE__, __LINE__ ); } -inline void assertCuda( cudaError_t code, const char* file, int line, bool abort = true ) +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#define checkGpu( code ) { assertGpu( code, __FILE__, __LINE__ ); } +inline void assertGpu( gpuError_t code, const char* file, int line, bool abort = true ) { - if( code != cudaSuccess ) + if( code != gpuSuccess ) { - printf( "ERROR! assertCuda: '%s' (%d) in %s:%d\n", cudaGetErrorString( code ), code, file, line ); - if( abort ) assert( code == cudaSuccess ); + printf( "ERROR! assertGpu: '%s' (%d) in %s:%d\n", gpuGetErrorString( code ), code, file, line ); + if( abort ) assert( code == gpuSuccess ); } } #endif /* clang-format on */ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor + // Instantiate a GpuRuntime at the beginning of the application's main to + // invoke gpuSetDevice(0) in the constructor and book a gpuDeviceReset() call in the destructor // *** FIXME! This will all need to be designed differently when going to multi-GPU nodes! *** - struct CudaRuntime final + struct GpuRuntime final { - CudaRuntime( const bool debug = true ) + GpuRuntime( const bool debug = true ) : m_debug( debug ) { setUp( m_debug ); } - ~CudaRuntime() { tearDown( m_debug ); } - CudaRuntime( const CudaRuntime& ) = delete; - CudaRuntime( CudaRuntime&& ) = delete; - CudaRuntime& operator=( const CudaRuntime& ) = delete; - CudaRuntime& operator=( CudaRuntime&& ) = delete; + ~GpuRuntime() { tearDown( m_debug ); } + GpuRuntime( const GpuRuntime& ) = delete; + GpuRuntime( GpuRuntime&& ) = delete; + GpuRuntime& operator=( const GpuRuntime& ) = delete; + GpuRuntime& operator=( GpuRuntime&& ) = delete; bool m_debug; // Set up CUDA application @@ -62,8 +63,8 @@ namespace mg5amcGpu */ // Replace cudaFree(0) by cudaSetDevice(0), even if it is not really needed either // (but see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs) - if( debug ) std::cout << "__CudaRuntime: calling cudaSetDevice(0)" << std::endl; - checkCuda( cudaSetDevice( 0 ) ); // SLOW! + if( debug ) std::cout << "__GpuRuntime: calling gpuSetDevice(0)" << std::endl; + checkGpu( gpuSetDevice( 0 ) ); // SLOW!
} // Tear down CUDA application (call cudaDeviceReset) @@ -72,14 +73,13 @@ namespace mg5amcGpu // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking static void tearDown( const bool debug = true ) { - if( debug ) std::cout << "__CudaRuntime: calling cudaDeviceReset()" << std::endl; - checkCuda( cudaDeviceReset() ); + if( debug ) std::cout << "__GpuRuntime: calling gpuDeviceReset()" << std::endl; + checkGpu( gpuDeviceReset() ); } }; - } #endif //-------------------------------------------------------------------------- -#endif // MG5AMC_CUDARUNTIME_H +#endif // MG5AMC_GPURUNTIME_H diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/MadgraphTest.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/MadgraphTest.h index ef40624c88..a64c05c26a 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/MadgraphTest.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/MadgraphTest.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Dec 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #ifndef MADGRAPHTEST_H_ #define MADGRAPHTEST_H_ 1 @@ -22,7 +22,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; #endif @@ -201,7 +201,7 @@ class MadgraphTest : public testing::TestWithParam // Since we link both the CPU-only and GPU tests into the same executable, we prevent // a multiply defined symbol by only compiling this in the non-CUDA phase: -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL /// Compare momenta and matrix elements. /// This uses an implementation of TestDriverBase to run a madgraph workflow, @@ -307,6 +307,6 @@ TEST_P( MadgraphTest, CompareMomentaAndME ) } } -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL #endif /* MADGRAPHTEST_H_ */ diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_tt.sa/SubProcesses/MatrixElementKernels.cc index 74b5239ebf..81699dfea9 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/MatrixElementKernels.cc @@ -1,12 +1,12 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "MatrixElementKernels.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" // Includes the abstraction for Nvidia/AMD compilation #include "MemoryAccessMomenta.h" #include "MemoryBuffers.h" @@ -14,7 +14,7 @@ //============================================================================ -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu { @@ -150,7 +150,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { @@ -209,13 +209,13 @@ namespace mg5amcGpu PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); DeviceBufferHelicityMask devIsGoodHel( ncomb ); // ... 0d1.
Compute good helicity mask on the device - computeDependentCouplings<<<m_gpublocks, m_gputhreads>>>( m_gs.data(), m_couplings.data() ); + gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - sigmaKin_getGoodHel<<<m_gpublocks, m_gputhreads>>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); + gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); #else - sigmaKin_getGoodHel<<<m_gpublocks, m_gputhreads>>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); + gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); #endif - checkCuda( cudaPeekAtLastError() ); + checkGpu( gpuPeekAtLastError() ); // ... 0d2. Copy back good helicity mask to the host copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); // ... 0d3. Copy back good helicity list to constant memory on the device @@ -226,19 +226,19 @@ namespace mg5amcGpu void MatrixElementKernelDevice::computeMatrixElements( const unsigned int channelId ) { - computeDependentCouplings<<<m_gpublocks, m_gputhreads>>>( m_gs.data(), m_couplings.data() ); + gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); #ifndef MGONGPU_NSIGHT_DEBUG constexpr unsigned int sharedMemSize = 0; #else constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - sigmaKin<<<m_gpublocks, m_gputhreads, sharedMemSize>>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); + gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); #else - sigmaKin<<<m_gpublocks, m_gputhreads, sharedMemSize>>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); + gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); #endif - checkCuda( cudaPeekAtLastError() ); - checkCuda( cudaDeviceSynchronize() ); + checkGpu( gpuPeekAtLastError() ); + checkGpu( gpuDeviceSynchronize() ); } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/MatrixElementKernels.h index 23e84757a2..72bd8f195b 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/MatrixElementKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
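(The difference between the two launch macros used above, shown on an invented kernel: gpuLaunchKernel forwards only the grid and block sizes, while gpuLaunchKernelSharedMem also forwards a dynamic shared-memory byte count, exactly what sigmaKin needs when MGONGPU_NSIGHT_DEBUG makes sharedMemSize non-zero. The reduction below is a sketch, not code from the repository.)

__global__ void blockSums( const double* in, double* out )
{
  extern __shared__ double buf[]; // dynamic shared memory, sized at launch time
  const int i = blockDim.x * blockIdx.x + threadIdx.x;
  buf[threadIdx.x] = in[i];
  __syncthreads();
  if( threadIdx.x == 0 )
  {
    double sum = 0;
    for( unsigned int j = 0; j < blockDim.x; j++ ) sum += buf[j];
    out[blockIdx.x] = sum; // one partial sum per block
  }
}

void launchBlockSums( const double* devIn, double* devOut, const int gpublocks, const int gputhreads )
{
  // expands to: blockSums<<<gpublocks, gputhreads, gputhreads * sizeof( double )>>>( devIn, devOut )
  gpuLaunchKernelSharedMem( blockSums, gpublocks, gputhreads, gputhreads * sizeof( double ), devIn, devOut );
  checkGpu( gpuPeekAtLastError() ); // same error-check sequence as computeMatrixElements above
  checkGpu( gpuDeviceSynchronize() );
}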
#ifndef MATRIXELEMENTKERNELS_H #define MATRIXELEMENTKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -81,7 +81,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a CPU host class MatrixElementKernelHost final : public MatrixElementKernelBase, public NumberOfEvents { @@ -130,7 +130,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a GPU device class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessAmplitudes.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessAmplitudes.h index 573b3bbbc9..ffb76e93de 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessAmplitudes.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessAmplitudes.h @@ -15,7 +15,7 @@ #define MGONGPU_TRIVIAL_AMPLITUDES 1 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessCouplings.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessCouplings.h index 35a3af42e0..3afdf3e554 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessCouplings.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessCouplings.h @@ -15,7 +15,7 @@ #include "MemoryBuffers.h" // for HostBufferCouplings::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessCouplingsFixed.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessCouplingsFixed.h index dc0d93afff..ffcdf4dbef 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessCouplingsFixed.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessCouplingsFixed.h @@ -14,7 +14,7 @@ //#include "MemoryAccessHelpers.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessDenominators.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessDenominators.h index 3bce635718..66f2d32a6b 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessDenominators.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessDenominators.h @@ -10,7 +10,7 @@ #include "MemoryAccessGs.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessGs.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessGs.h index 31311aa375..4c726b30f3 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessGs.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessGs.h @@ -13,7 +13,7 
@@ #include "MemoryBuffers.h" // for HostBufferMatrixElements::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessHelpers.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessHelpers.h index c82a6c7635..db73e4e064 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessHelpers.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessHelpers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessHelpers_H #define MemoryAccessHelpers_H 1 @@ -105,7 +105,7 @@ class KernelAccessHelper : public MemoryAccessHelper } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid //printf( "kernelAccessRecord: ievt=%d threadId=%d\n", ievt, threadIdx.x ); return T::ieventAccessRecord( buffer, ievt ); // NB fptype and fptype_sv coincide for CUDA diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessMatrixElements.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessMatrixElements.h index f32e6fea5b..3741011971 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessMatrixElements.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessMatrixElements.h @@ -13,7 +13,7 @@ #include "MemoryBuffers.h" // for HostBufferMatrixElements::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessMomenta.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessMomenta.h index 29266de32c..3be229d392 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessMomenta.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessMomenta.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#ifndef MemoryAccessMomenta_H #define MemoryAccessMomenta_H 1 @@ -13,7 +13,7 @@ #include "MemoryAccessVectors.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -30,7 +30,7 @@ namespace mg5amcCpu // Number of Events Per Page in the momenta AOSOA memory buffer layout // (these are all best kept as a compile-time constants: see issue #23) -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ // ----------------------------------------------------------------------------------------------- // --- GPUs: neppM is best set to a power of 2 times the number of fptype's in a 32-byte cacheline // --- This is relevant to ensure coalesced access to momenta in global memory diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessNumerators.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessNumerators.h index b152183b28..18991f4fa6 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessNumerators.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessNumerators.h @@ -10,7 +10,7 @@ #include "MemoryAccessGs.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessRandomNumbers.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessRandomNumbers.h index e2988d39f3..40cb089135 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessRandomNumbers.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessRandomNumbers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessRandomNumbers_H #define MemoryAccessRandomNumbers_H 1 @@ -11,7 +11,7 @@ #include "CPPProcess.h" #include "MemoryAccessHelpers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessVectors.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessVectors.h index e9b197368e..08faccff0f 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessVectors.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessVectors.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
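(A numerical illustration of the neppM guidance quoted in the MemoryAccessMomenta.h hunk above, under its stated 32-byte cacheline assumption; the helper below is ours, not code from the repository. With neppM chosen this way, the neppM consecutive events of one AOSOA page fill whole cachelines, which is what makes the global-memory loads coalesce.)

#include <cstddef>

constexpr std::size_t cachelineBytes = 32; // as in the comment above

constexpr std::size_t neppMFor( std::size_t fptypeBytes, std::size_t powerOfTwo = 1 )
{
  return powerOfTwo * ( cachelineBytes / fptypeBytes ); // "a power of 2 times the number of fptype's in a 32-byte cacheline"
}

static_assert( neppMFor( sizeof( double ) ) == 4, "double: 4 events per 32-byte cacheline" );
static_assert( neppMFor( sizeof( float ) ) == 8, "float: 8 events per 32-byte cacheline" );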
#ifndef MemoryAccessVectors_H #define MemoryAccessVectors_H 1 @@ -10,7 +10,7 @@ #include "mgOnGpuVectors.h" -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu // this is only needed for CPU SIMD vectorization { diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessWavefunctions.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessWavefunctions.h index 5428aaf933..33bef4559e 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessWavefunctions.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessWavefunctions.h @@ -15,7 +15,7 @@ #define MGONGPU_TRIVIAL_WAVEFUNCTIONS 1 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryBuffers.h index 3093e6ed18..7756a71621 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryBuffers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021, based on earlier work by S. Hageboeck) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryBuffers_H #define MemoryBuffers_H 1 @@ -11,12 +11,12 @@ #include "mgOnGpuCxtypes.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "Parameters_sm.h" #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -87,7 +87,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL constexpr bool HostBufferALIGNED = false; // ismisaligned=false constexpr bool HostBufferMISALIGNED = true; // ismisaligned=true @@ -119,7 +119,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer template<typename T> class PinnedHostBufferBase : public BufferBase<T> { public: PinnedHostBufferBase( const size_t size ) : BufferBase<T>( size, false ) { - checkCuda( cudaMallocHost( &( this->m_data ), this->bytes() ) ); + gpuMallocHost( &( this->m_data ), this->bytes() ); } virtual ~PinnedHostBufferBase() { - checkCuda( cudaFreeHost( this->m_data ) ); + gpuFreeHost( this->m_data ); } }; #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer template<typename T> class DeviceBufferBase : public BufferBase<T> { public: DeviceBufferBase( const size_t size ) : BufferBase<T>( size, true ) { - checkCuda( cudaMalloc( &( this->m_data ), this->bytes() ) ); + gpuMalloc( &( this->m_data ), this->bytes() ); } virtual ~DeviceBufferBase() { - checkCuda( cudaFree( this->m_data ) ); + gpuFree( this->m_data ); } }; #endif //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for a given number of events template class HostBuffer :
public HostBufferBase, virtual private NumberOfEvents @@ -175,7 +175,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer for a given number of events template class PinnedHostBuffer : public PinnedHostBufferBase, virtual private NumberOfEvents @@ -191,7 +191,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer for a given number of events template class DeviceBuffer : public DeviceBufferBase, virtual private NumberOfEvents @@ -213,7 +213,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta random numbers constexpr size_t sizePerEventRndNumMomenta = MemoryBuffers::np4 * MemoryBuffers::nparf; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta random numbers typedef HostBuffer HostBufferRndNumMomenta; #else @@ -232,7 +232,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer with ONE fptype per event constexpr size_t sizePerEventOneFp = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer with ONE fptype per event typedef HostBuffer HostBufferOneFp; #else @@ -257,7 +257,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for Gs constexpr size_t sizePerEventGs = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferGs; #else @@ -276,7 +276,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for numerators constexpr size_t sizePerEventNumerators = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferNumerators; #else @@ -296,7 +296,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for denominators constexpr size_t sizePerEventDenominators = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferDenominators; #else @@ -315,7 +315,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for random numbers constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferCouplings; #else @@ -333,7 +333,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta constexpr size_t sizePerEventMomenta = MemoryBuffers::np4 * MemoryBuffers::npar; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta typedef HostBuffer HostBufferMomenta; //typedef HostBuffer HostBufferMomenta; // TEST MISALIGNMENT! 
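(How the rewritten buffer classes are meant to be used together with the copyDeviceFromHost/copyHostFromDevice helpers that follow below; a sketch assuming a GPU build with MemoryBuffers.h included, using the momenta typedefs as the example. Allocation and deallocation live entirely in the constructors and destructors, so no explicit gpuFree appears at the call site.)

void momentaRoundTrip( const size_t nevt )
{
  PinnedHostBufferMomenta hstMomenta( nevt ); // gpuMallocHost in ctor, gpuFreeHost in dtor
  DeviceBufferMomenta devMomenta( nevt );     // gpuMalloc in ctor, gpuFree in dtor
  copyDeviceFromHost( devMomenta, hstMomenta ); // size check, then gpuMemcpy( ..., gpuMemcpyHostToDevice )
  // ... launch kernels that read and fill devMomenta.data() ...
  copyHostFromDevice( hstMomenta, devMomenta ); // gpuMemcpy( ..., gpuMemcpyDeviceToHost )
} // both buffers are released here by the destructors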
@@ -352,7 +352,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for sampling weights constexpr size_t sizePerEventWeights = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for sampling weights typedef HostBuffer HostBufferWeights; #else @@ -370,7 +370,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for matrix elements constexpr size_t sizePerEventMatrixElements = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for matrix elements typedef HostBuffer HostBufferMatrixElements; #else @@ -385,7 +385,7 @@ namespace mg5amcCpu // A base class encapsulating a memory buffer for the helicity mask typedef BufferBase BufferHelicityMask; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for the helicity mask typedef HostBufferBase HostBufferHelicityMask; #else @@ -403,7 +403,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for wavefunctions constexpr size_t sizePerEventWavefunctions = MemoryBuffers::nw6 * MemoryBuffers::nx2; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for wavefunctions typedef HostBuffer HostBufferWavefunctions; #else @@ -421,7 +421,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity random numbers constexpr size_t sizePerEventRndNumHelicity = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity random numbers typedef HostBuffer HostBufferRndNumHelicity; #else @@ -439,7 +439,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color random numbers constexpr size_t sizePerEventRndNumColor = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color random numbers typedef HostBuffer HostBufferRndNumColor; #else @@ -457,7 +457,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity selection constexpr size_t sizePerEventSelectedHelicity = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity selection typedef HostBuffer HostBufferSelectedHelicity; #else @@ -475,7 +475,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color selection constexpr size_t sizePerEventSelectedColor = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color selection typedef HostBuffer HostBufferSelectedColor; #else @@ -487,7 +487,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy { @@ -504,13 +504,13 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyHostToDevice ); } #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void copyHostFromDevice( Tdst& dst, const Tsrc& src ) // keep the same order 
of arguments as in memcpy { @@ -527,7 +527,7 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyDeviceToHost ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyDeviceToHost ); } #endif diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.cc b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.cc index d390883453..e7dbb05570 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.cc +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -46,7 +45,7 @@ // Class member functions for calculating the matrix elements for // Process: g g > t t~ WEIGHTED<=2 @1 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -80,7 +79,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -90,7 +89,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -118,13 +117,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -151,7 +150,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -187,7 +186,7 @@ namespace mg5amcCpu #endif for( int iParity 
= 0; iParity < nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings @@ -200,8 +199,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -299,7 +300,7 @@ namespace mg5amcCpu { 16, -2 }, { -2, 16 } }; // 2-D array[2][2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -356,7 +357,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -415,7 +416,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -462,8 +463,8 @@ namespace mg5amcCpu { 1, 1, -1, -1 }, { 1, 1, 1, 1 }, { 1, 1, 1, -1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -503,9 +504,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -541,7 +542,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] 
// [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -606,12 +607,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -632,7 +633,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -758,9 +759,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -784,7 +785,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: color selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -804,7 +805,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 256 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -818,9 +819,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -848,7 +852,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -1058,7 +1062,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef
MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.h index 3ebd92c038..4a88a07226 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -107,7 +107,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -120,7 +120,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -150,7 +150,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CudaRuntime.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/GpuAbstraction.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/GpuAbstraction.h new file mode 120000 index 0000000000..72054e19ba --- /dev/null +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/GpuAbstraction.h @@ -0,0 +1 @@ +../GpuAbstraction.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/GpuRuntime.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/GpuRuntime.h new file mode 120000 index 0000000000..3920e83be4 --- /dev/null +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/GpuRuntime.h @@ -0,0 +1 @@ +../GpuRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/check_sa.cc b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/check_sa.cc index 3fbf0ffbee..7cac5ab47b 100644 
--- a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/check_sa.cc +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -65,7 +66,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -96,7 +97,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -134,9 +135,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784) + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) +#elif defined __HIPCC__ +#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random #elif defined __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU if build has curand + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #endif @@ -146,10 +149,10 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) 
bool bridge = false; @@ -177,7 +180,7 @@ main( int argc, char** argv ) else if( arg == "--curdev" ) { #ifndef __CUDACC__ - throw std::runtime_error( "CurandDevice is not supported on CPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" ); #elif defined MGONGPU_HAS_NO_CURAND throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" ); #else @@ -198,7 +201,7 @@ main( int argc, char** argv ) } else if( arg == "--rmbdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -272,13 +275,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -296,14 +299,14 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL - // --- 00. Initialise cuda - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - const std::string cdinKey = "00 CudaInit"; + // --- 00. Initialise GPU + // Instantiate a GpuRuntime at the beginning of the application's main. + // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. + const std::string cdinKey = "00 GpuInit"; timermap.start( cdinKey ); - CudaRuntime cudaRuntime( debug ); + GpuRuntime gpuRuntime( debug ); #endif // --- 0a.
Initialise physics process @@ -325,7 +328,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -333,7 +336,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -341,7 +344,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -349,7 +352,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -366,7 +369,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -375,7 +378,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -384,7 +387,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -392,7 +395,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -400,7 +403,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -438,7 +441,7 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! 
CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } @@ -450,7 +453,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -461,7 +464,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -469,7 +472,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -511,7 +514,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -543,7 +546,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -588,7 +591,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -617,7 +620,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -760,18 +763,22 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; -#endif +#endif /* clang-format off */ // -- DOUBLE or FLOAT? #if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) @@ -781,7 +788,7 @@ main( int argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif +#endif /* clang-format on */ // -- CUCOMPLEX or THRUST or STD complex numbers? 
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -793,6 +800,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_CUCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -818,7 +831,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -874,7 +887,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -895,6 +908,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -921,21 +936,21 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? " == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -966,7 +981,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1062,14 +1077,14 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1077,7 +1092,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? 
" == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/RamboSamplingKernels.cc b/epochX/cudacpp/gg_tt.sa/SubProcesses/RamboSamplingKernels.cc index da68aa9255..79abbcc4f8 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/RamboSamplingKernels.cc +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/RamboSamplingKernels.cc @@ -1,11 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "RamboSamplingKernels.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessRandomNumbers.h" #include "MemoryAccessWeights.h" @@ -14,7 +14,7 @@ #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -92,7 +92,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingKernelDevice::RamboSamplingKernelDevice( const fptype energy, // input: energy const BufferRndNumMomenta& rndmom, // input: random numbers in [0,1] BufferMomenta& momenta, // output: momenta @@ -135,7 +135,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaInitialDevice( const fptype energy, fptype* momenta ) @@ -147,17 +147,17 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaInitial() { - getMomentaInitialDevice<<>>( m_energy, m_momenta.data() ); + gpuLaunchKernel( getMomentaInitialDevice, m_gpublocks, m_gputhreads, m_energy, m_momenta.data() ); } #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaFinalDevice( const fptype energy, const fptype* rndmom, @@ -171,11 +171,11 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaFinal() { - getMomentaFinalDevice<<>>( m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); + gpuLaunchKernel( getMomentaFinalDevice, m_gpublocks, m_gputhreads, m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); } #endif diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/RamboSamplingKernels.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/RamboSamplingKernels.h index 184089efd7..7c214cd74b 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/RamboSamplingKernels.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/RamboSamplingKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#ifndef RAMBOSAMPLINGKERNELS_H #define RAMBOSAMPLINGKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating RAMBO phase space sampling on a GPU device class RamboSamplingKernelDevice final : public SamplingKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/RandomNumberKernels.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/RandomNumberKernels.h index 188a72c2c9..21d63beeac 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/RandomNumberKernels.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/RandomNumberKernels.h @@ -1,14 +1,14 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef RANDOMNUMBERKERNELS_H #define RANDOMNUMBERKERNELS_H 1 #include "mgOnGpuConfig.h" -// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when __CUDACC__ is not defined +// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when MGONGPUCPP_GPUIMPL is not defined #ifndef MGONGPU_HAS_NO_CURAND //#include "curand.h" struct curandGenerator_st; // forward definition from curand.h @@ -16,7 +16,7 @@ struct curandGenerator_st; // forward definition from curand.h #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk index 509307506b..f2cfa349da 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ # Copyright (C) 2020-2023 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +# Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts @@ -42,10 +42,10 @@ endif #------------------------------------------------------------------------------- -#=== Configure common compiler flags for C++ and CUDA +#=== Configure common compiler flags for C++ and CUDA/HIP INCFLAGS = -I. 
-OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here +OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here # Dependency on src directory MG5AMC_COMMONLIB = mg5amc_common @@ -121,24 +121,46 @@ endif #------------------------------------------------------------------------------- -#=== Configure the CUDA compiler +#=== Configure the GPU compiler (CUDA or HIP) -# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) -# This is because it is impossible to pass this to "CUFLAGS += -ccbin " below +# FIXME! (AV 24.01.2024) +# In the current implementation (without separate builds for C++ and CUDA/HIP), we first check for nvcc and hipcc in CUDA_HOME and HIP_HOME. +# If CUDA_HOME or HIP_HOME are not set, try to determine them from the path to nvcc and hipcc. +# While convoluted, this is currently necessary to allow disabling CUDA/HIP builds by setting CUDA_HOME or HIP_HOME to invalid paths. +# This will (probably?) be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775). + +# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA and HIP builds (issue #505) +# This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside - $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") + $(warning CUDA and HIP builds are not supported for multi-word CXX "$(CXX)") override CUDA_HOME=disabled + override HIP_HOME=disabled endif -# If CUDA_HOME is not set, try to set it from the location of nvcc +# If CUDA_HOME is not set, try to set it from the path to nvcc ifndef CUDA_HOME CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null)) $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") endif -# Set NVCC as $(CUDA_HOME)/bin/nvcc if it exists +# If HIP_HOME is not set, try to set it from the path to hipcc ifndef HIP_HOME + HIP_HOME = $(patsubst %bin/hipcc,%,$(shell which hipcc 2>/dev/null)) + $(warning HIP_HOME was not set: using "$(HIP_HOME)") +endif + +# FIXME! (AV 24.01.2024) +# In the current implementation (without separate builds for C++ and CUDA/HIP), +# builds are performed for HIP only if CUDA is not found in the path. +# If both CUDA and HIP are installed, HIP builds can be triggered by unsetting CUDA_HOME. +# This will be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775). + +#--- Option 1: CUDA exists -> use CUDA + +# Set GPUCC as $(CUDA_HOME)/bin/nvcc if it exists ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) - NVCC = $(CUDA_HOME)/bin/nvcc + + GPUCC = $(CUDA_HOME)/bin/nvcc USE_NVTX ?=-DUSE_NVTX # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ @@ -158,41 +180,78 @@ ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here!
endif CUOPTFLAGS = -lineinfo - CUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math - ###CUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow - ###NVCC_VERSION = $(shell $(NVCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) - CUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h + ###GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + GPUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + ###GPUCC_VERSION = $(shell $(GPUCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) + GPUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + CUBUILDRULEFLAGS = -Xcompiler -fPIC -c + CCBUILDRULEFLAGS = -Xcompiler -fPIC -c -x cu + CUDATESTFLAGS = -lcuda + + # Set the host C++ compiler for GPUCC via "-ccbin " + # (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) + GPUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) + + # Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) + ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) + GPUFLAGS += -allow-unsupported-compiler + endif + else ifneq ($(origin REQUIRE_CUDA),undefined) + # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) - $(error No cuda installation found (set CUDA_HOME or make nvcc visible in PATH)) + $(error No cuda installation found (set CUDA_HOME or make GPUCC visible in PATH)) + +#--- Option 2: CUDA does not exist, HIP exists -> use HIP + +# Set GPUCC as $(HIP_HOME)/bin/hipcc if it exists +else ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),) + + GPUCC = $(HIP_HOME)/bin/hipcc + #USE_NVTX ?=-DUSE_NVTX # should maybe find something equivalent to this in HIP? 
+ HIPARCHFLAGS = -target x86_64-linux-gnu --offload-arch=gfx90a + HIPINC = -I$(HIP_HOME)/include/ + # Note: -DHIP_FAST_MATH is equivalent to -use_fast_math in HIP + # (but only for single precision line 208: https://rocm-developer-tools.github.io/HIP/hcc__detail_2math__functions_8h_source.html) + GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -DHIP_FAST_MATH -DHIP_PLATFORM=amd -fPIC + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + GPUFLAGS += -std=c++17 + ###GPUFLAGS+= --maxrregcount 255 # (AV: is this option valid on HIP and meaningful on AMD GPUs?) + CUBUILDRULEFLAGS = -fPIC -c + CCBUILDRULEFLAGS = -fPIC -c + +else ifneq ($(origin REQUIRE_HIP),undefined) + + # If REQUIRE_HIP is set but no HIP is found, stop here (e.g. for CI tests on GPU #443) + $(error No hip installation found (set HIP_HOME or make GPUCC visible in PATH)) + +#--- Option 3: CUDA does not exist, HIP does not exist -> switch off both CUDA and HIP + else - # No cuda. Switch cuda compilation off and go to common random numbers in C++ + + # No cudacc and no hipcc: switch CUDA and HIP compilation off and go to common random numbers in C++ $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda) - override NVCC= + $(warning HIP_HOME is not set or is invalid: export HIP_HOME to compile with hip) + override GPUCC= override USE_NVTX= override CUINC= override CURANDLIBFLAGS= -endif -export NVCC -export CUFLAGS - -# Set the host C++ compiler for nvcc via "-ccbin " -# (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) -CUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) -# Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) -ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) -CUFLAGS += -allow-unsupported-compiler endif +# Export GPUCC (so that it can also be used in cudacpp_src.mk?) +export GPUCC +export GPUFLAGS + #------------------------------------------------------------------------------- -#=== Configure ccache for C++ and CUDA builds +#=== Configure ccache for C++ and CUDA/HIP builds # Enable ccache if USECCACHE=1 ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) @@ -201,15 +260,15 @@ endif #ifeq ($(USECCACHE)$(shell echo $(AR) | grep ccache),1) # override AR:=ccache $(AR) #endif -ifneq ($(NVCC),) - ifeq ($(USECCACHE)$(shell echo $(NVCC) | grep ccache),1) - override NVCC:=ccache $(NVCC) +ifneq ($(GPUCC),) + ifeq ($(USECCACHE)$(shell echo $(GPUCC) | grep ccache),1) + override GPUCC:=ccache $(GPUCC) endif endif #------------------------------------------------------------------------------- -#=== Configure PowerPC-specific compiler flags for C++ and CUDA +#=== Configure PowerPC-specific compiler flags for C++ and CUDA/HIP # PowerPC-specific CXX compiler flags (being reviewed) ifeq ($(UNAME_P),ppc64le) @@ -225,9 +284,9 @@ else ######CXXFLAGS+= -fno-semantic-interposition # no benefit (neither alone, nor combined with -flto) endif -# PowerPC-specific CUDA compiler flags (to be reviewed!) +# PowerPC-specific CUDA/HIP compiler flags (to be reviewed!) 
ifeq ($(UNAME_P),ppc64le) - CUFLAGS+= -Xcompiler -mno-float128 + GPUFLAGS+= -Xcompiler -mno-float128 endif #------------------------------------------------------------------------------- @@ -237,7 +296,7 @@ endif # Set the default OMPFLAGS choice ifneq ($(shell $(CXX) --version | egrep '^Intel'),) override OMPFLAGS = -fopenmp -###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without nvcc but not ok with nvcc before #578) +###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without GPUCC but not ok with GPUCC before #578) else ifneq ($(shell $(CXX) --version | egrep '^(clang)'),) override OMPFLAGS = -fopenmp ###override OMPFLAGS = # disable OpenMP MT on clang (was not ok without or with nvcc before #578) @@ -293,7 +352,10 @@ endif # Set the default RNDGEN (random number generator) choice ifeq ($(RNDGEN),) - ifeq ($(NVCC),) + ifeq ($(GPUCC),) + override RNDGEN = hasNoCurand + # Edgecase for HIP compilation + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) override RNDGEN = hasNoCurand else ifeq ($(RNDGEN),) override RNDGEN = hasCurand @@ -310,7 +372,7 @@ export OMPFLAGS #------------------------------------------------------------------------------- -#=== Set the CUDA/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN +#=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN # Set the build flags appropriate to OMPFLAGS $(info OMPFLAGS=$(OMPFLAGS)) @@ -368,13 +430,13 @@ CXXFLAGS+= $(AVXFLAGS) $(info FPTYPE=$(FPTYPE)) ifeq ($(FPTYPE),d) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE - CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE else ifeq ($(FPTYPE),f) CXXFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT - CUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT else ifeq ($(FPTYPE),m) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT - CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT else $(error Unknown FPTYPE='$(FPTYPE)': only 'd', 'f' and 'm' are supported) endif @@ -383,7 +445,7 @@ endif $(info HELINL=$(HELINL)) ifeq ($(HELINL),1) CXXFLAGS += -DMGONGPU_INLINE_HELAMPS - CUFLAGS += -DMGONGPU_INLINE_HELAMPS + GPUFLAGS += -DMGONGPU_INLINE_HELAMPS else ifneq ($(HELINL),0) $(error Unknown HELINL='$(HELINL)': only '0' and '1' are supported) endif @@ -392,7 +454,7 @@ endif $(info HRDCOD=$(HRDCOD)) ifeq ($(HRDCOD),1) CXXFLAGS += -DMGONGPU_HARDCODE_PARAM - CUFLAGS += -DMGONGPU_HARDCODE_PARAM + GPUFLAGS += -DMGONGPU_HARDCODE_PARAM else ifneq ($(HRDCOD),0) $(error Unknown HRDCOD='$(HRDCOD)': only '0' and '1' are supported) endif @@ -444,11 +506,11 @@ ifeq ($(UNAME_S),Darwin) override CULIBFLAGSRPATH2 = else # RPATH to cuda/cpp libs when linking executables - override CXXLIBFLAGSRPATH = -Wl,-rpath,$(LIBDIRRPATH) - override CULIBFLAGSRPATH = -Xlinker -rpath,$(LIBDIRRPATH) + override CXXLIBFLAGSRPATH = -Wl,-rpath=$(LIBDIRRPATH) + override CULIBFLAGSRPATH = -Xlinker -rpath=$(LIBDIRRPATH) # RPATH to common lib when linking cuda/cpp libs - override CXXLIBFLAGSRPATH2 = -Wl,-rpath,'$$ORIGIN' - override CULIBFLAGSRPATH2 = -Xlinker -rpath,'$$ORIGIN' + override CXXLIBFLAGSRPATH2 = -Wl,-rpath='$$ORIGIN' + override CULIBFLAGSRPATH2 = -Xlinker -rpath='$$ORIGIN' endif # Setting LD_LIBRARY_PATH or DYLD_LIBRARY_PATH in the RUNTIME is no longer necessary 
(neither on Linux nor on Mac) @@ -461,7 +523,7 @@ override RUNTIME = cxx_main=$(BUILDDIR)/check.exe fcxx_main=$(BUILDDIR)/fcheck.exe -ifneq ($(NVCC),) +ifneq ($(GPUCC),) cu_main=$(BUILDDIR)/gcheck.exe fcu_main=$(BUILDDIR)/fgcheck.exe else @@ -492,15 +554,16 @@ $(BUILDDIR)/.build.$(TAG): @touch $(BUILDDIR)/.build.$(TAG) # Generic target and build rules: objects from CUDA compilation -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/%.o : %.cu *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CUBUILDRULEFLAGS) $< -o $@ $(BUILDDIR)/%_cu.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CCBUILDRULEFLAGS) $< -o $@ endif +# -x cu in line above # Generic target and build rules: objects from C++ compilation # (NB do not include CUINC here! add it only for NVTX or curand #679) @@ -509,11 +572,14 @@ $(BUILDDIR)/%.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) $(CXX) $(CPPFLAGS) $(CXXFLAGS) -fPIC -c $< -o $@ # Apply special build flags only to CrossSectionKernel.cc and gCrossSectionKernel.cu (no fast math, see #117 and #516) +# Added edgecase for HIP compilation ifeq ($(shell $(CXX) --version | grep ^nvc++),) $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS := $(filter-out -ffast-math,$(CXXFLAGS)) $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math -ifneq ($(NVCC),) -$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -fno-fast-math +ifeq ($(findstring nvcc,$(GPUCC)),nvcc) + $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -fno-fast-math +else + $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -fno-fast-math endif endif @@ -530,10 +596,10 @@ ifeq ($(RNDGEN),hasCurand) $(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CUINC) endif -# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in nvcc with icx2023 (#592) +# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in GPUCC with icx2023 (#592) ifneq ($(shell $(CXX) --version | egrep '^(Intel)'),) -ifneq ($(NVCC),) -CUFLAGS += -Xcompiler -Wno-deprecated-builtins +ifneq ($(GPUCC),) +GPUFLAGS += -Wno-deprecated-builtins endif endif @@ -541,8 +607,8 @@ endif # This patch does remove the warning, but I prefer to keep it disabled for the moment... 
###ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang|Intel)'),) ###$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -Wno-overriding-t-option -###ifneq ($(NVCC),) -###$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -Wno-overriding-t-option +###ifneq ($(GPUCC),) +###$(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -Wno-overriding-t-option ###endif ###endif @@ -569,7 +635,7 @@ MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp cxx_objects_lib=$(BUILDDIR)/CPPProcess.o $(BUILDDIR)/MatrixElementKernels.o $(BUILDDIR)/BridgeKernels.o $(BUILDDIR)/CrossSectionKernels.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel.o $(BUILDDIR)/RamboSamplingKernels.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) MG5AMC_CULIB = mg5amc_$(processid_short)_cuda cu_objects_lib=$(BUILDDIR)/gCPPProcess.o $(BUILDDIR)/gMatrixElementKernels.o $(BUILDDIR)/gBridgeKernels.o $(BUILDDIR)/gCrossSectionKernels.o cu_objects_exe=$(BUILDDIR)/gCommonRandomNumberKernel.o $(BUILDDIR)/gRamboSamplingKernels.o @@ -581,11 +647,11 @@ $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: cxx_objects_lib += $(BUILDDIR)/fbridge.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(CXX) -shared -o $@ $(cxx_objects_lib) $(CXXLIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: cu_objects_lib += $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cu_objects_lib) - $(NVCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) endif #------------------------------------------------------------------------------- @@ -602,16 +668,16 @@ $(cxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PAT $(cxx_main): $(BUILDDIR)/check_sa.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CXX) -o $@ $(BUILDDIR)/check_sa.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CURANDLIBFLAGS) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(cu_main): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(cu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(cu_main): $(BUILDDIR)/gcheck_sa.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o - $(NVCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) + $(GPUCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) endif 
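Note how the build rules above compile each physics source twice from the same .cc file: $(CXX) produces the *.o objects for the C++ library, while $(GPUCC) produces the g*.o / *_cu.o objects for the CUDA/HIP library (nvcc needs the "-x cu" flag from CCBUILDRULEFLAGS to treat a .cc file as CUDA source; hipcc does not). A minimal sketch of how a single source file can serve both backends, using only constructs that appear in the hunks of this patch (the sketch itself is not a file from the patch):

// One .cc source, two compilations (illustrative sketch only)
#include "mgOnGpuConfig.h" // defines or undefines MGONGPUCPP_GPUIMPL
#ifdef MGONGPUCPP_GPUIMPL
namespace mg5amcGpu // GPU pass: nvcc or hipcc builds this into the *_cuda library
#else
namespace mg5amcCpu // CPU pass: gcc/clang/icpx builds this into the *_cpp library
#endif
{
  // identical kernel and helper code in both namespaces; __global__, __host__ and
  // __device__ expand to nothing in the C++ pass (see the mgOnGpuConfig.h hunk below)
}

Linking both libraries into a single test executable works precisely because the two passes end up in different namespaces (mg5amcGpu vs mg5amcCpu), as the #318/#725 comments in the headers below point out.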
#------------------------------------------------------------------------------- @@ -637,17 +703,17 @@ $(fcxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PA $(fcxx_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') endif ifeq ($(UNAME_S),Darwin) $(fcu_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(fcu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(fcu_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) - $(NVCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) + $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) endif #------------------------------------------------------------------------------- @@ -659,7 +725,7 @@ $(BUILDDIR)/testxxx.o: testxxx_cc_ref.txt $(testmain): $(BUILDDIR)/testxxx.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testxxx.o # Comment out this line to skip the C++ test of xxx functions -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testxxx_cu.o: $(GTESTLIBS) $(BUILDDIR)/testxxx_cu.o: INCFLAGS += $(GTESTINC) $(BUILDDIR)/testxxx_cu.o: testxxx_cc_ref.txt @@ -672,7 +738,7 @@ $(BUILDDIR)/testmisc.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testmisc.o # Comment out this line to skip the C++ miscellaneous tests -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testmisc_cu.o: $(GTESTLIBS) $(BUILDDIR)/testmisc_cu.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc_cu.o @@ -684,12 +750,12 @@ $(BUILDDIR)/runTest.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/runTest.o $(testmain): cxx_objects_exe += $(BUILDDIR)/runTest.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/runTest_cu.o: $(GTESTLIBS) $(BUILDDIR)/runTest_cu.o: INCFLAGS += $(GTESTINC) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(testmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif @@ -713,14 +779,14 @@ $(testmain): LIBFLAGS += -lgomp endif endif -ifeq 
($(NVCC),) # link only runTest.o +ifeq ($(GPUCC),) # link only runTest.o $(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS) $(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS) else # link both runTest.o and runTest_cu.o $(testmain): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) $(GTESTLIBS) - $(NVCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) -lcuda + $(GPUCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS) endif # Use target gtestlibs to build only googletest @@ -829,9 +895,9 @@ ifeq ($(USECCACHE),1) ccache --version | head -1 endif @echo "" - @echo NVCC=$(NVCC) -ifneq ($(NVCC),) - $(NVCC) --version + @echo GPUCC=$(GPUCC) +ifneq ($(GPUCC),) + $(GPUCC) --version endif @echo "" @echo CXX=$(CXX) @@ -850,7 +916,7 @@ endif # Target: check (run the C++ test executable) # [NB THIS IS WHAT IS USED IN THE GITHUB CI!] -ifneq ($(NVCC),) +ifneq ($(GPUCC),) check: runTest cmpFcheck cmpFGcheck else check: runTest cmpFcheck diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/fbridge.cc b/epochX/cudacpp/gg_tt.sa/SubProcesses/fbridge.cc index 2d2b36d560..22ce3f5115 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/fbridge.cc +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/fbridge.cc @@ -1,11 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Oct 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "Bridge.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" extern "C" { @@ -22,7 +22,7 @@ extern "C" * Using the same Fortran MadEvent code, linking to the heterogeneous library would allow access to both CPU and GPU implementations. * The specific heterogeneous configuration (how many GPUs, how many threads on each CPU, etc) could be loaded in CUDA/C++ from a data file.
*/ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -46,8 +46,8 @@ extern "C" */ void fbridgecreate_( CppObjectInFortran** ppbridge, const int* pnevtF, const int* pnparF, const int* pnp4F ) { -#ifdef __CUDACC__ - CudaRuntime::setUp(); +#ifdef MGONGPUCPP_GPUIMPL + GpuRuntime::setUp(); #endif // (NB: CPPProcess::initProc no longer needs to be executed here because it is called in the Bridge constructor) // FIXME: disable OMP in Bridge when called from Fortran @@ -65,8 +65,8 @@ extern "C" Bridge* pbridge = dynamic_cast*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgedelete_: invalid Bridge address" ); delete pbridge; -#ifdef __CUDACC__ - CudaRuntime::tearDown(); +#ifdef MGONGPUCPP_GPUIMPL + GpuRuntime::tearDown(); #endif } @@ -96,7 +96,7 @@ extern "C" { Bridge* pbridge = dynamic_cast*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgesequence_: invalid Bridge address" ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Use the device/GPU implementation in the CUDA library // (there is also a host implementation in this library) pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, *pchannelId, mes, selhel, selcol ); diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/fsampler.cc b/epochX/cudacpp/gg_tt.sa/SubProcesses/fsampler.cc index 2fb445372d..3743934f41 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/fsampler.cc +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/fsampler.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Feb 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "mgOnGpuConfig.h" @@ -13,7 +13,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -40,7 +40,7 @@ namespace mg5amcCpu private: const int m_nevt; // The number of events in each iteration int m_iiter; // The iteration counter (for random number seeding) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta m_hstRndmom; // Memory buffers for random numbers HostBufferMomenta m_hstMomenta; // Memory buffers for momenta HostBufferWeights m_hstWeights; // Memory buffers for sampling weights @@ -105,7 +105,7 @@ namespace mg5amcCpu extern "C" { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/runTest.cc b/epochX/cudacpp/gg_tt.sa/SubProcesses/runTest.cc index d4a760a71b..de327f2321 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/runTest.cc +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/runTest.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
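The fbridge.cc hunk above brackets the bridge lifecycle with GpuRuntime::setUp() in fbridgecreate_ and GpuRuntime::tearDown() in fbridgedelete_ on GPU builds. A hypothetical C++ driver (illustrative only; the real callers are the Fortran check programs, and the fbridgedelete_ signature is assumed here to mirror fbridgecreate_) would exercise the same extern "C" entry points as follows:

#include <iostream>
extern "C"
{
  struct CppObjectInFortran; // opaque handle, as in fbridge.cc
  void fbridgecreate_( CppObjectInFortran** ppbridge, const int* pnevtF, const int* pnparF, const int* pnp4F );
  void fbridgedelete_( CppObjectInFortran** ppbridge );
}
int main()
{
  CppObjectInFortran* pbridge = nullptr;
  int nevt = 16, npar = 4, np4 = 4; // illustrative values for a 2->2 process
  fbridgecreate_( &pbridge, &nevt, &npar, &np4 ); // GpuRuntime::setUp() on GPU builds
  // ... fill momenta and call fbridgesequence_( &pbridge, ... ) once per iteration ...
  fbridgedelete_( &pbridge ); // GpuRuntime::tearDown() on GPU builds
  std::cout << "fbridge lifecycle completed" << std::endl;
  return 0;
}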
//---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -18,7 +18,7 @@ #include "RandomNumberKernels.h" #include "epoch_process_id.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -35,7 +35,7 @@ struct CUDA_CPU_TestBase : public TestDriverBase : TestDriverBase( npar, refFileName ) {} }; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL struct CPUTest : public CUDA_CPU_TestBase { // Struct data members (process, and memory structures for random numbers, momenta, matrix elements and weights on host and device) @@ -119,7 +119,7 @@ struct CPUTest : public CUDA_CPU_TestBase }; #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL struct CUDATest : public CUDA_CPU_TestBase { // Reset the device when our test goes out of scope. Note that this should happen after @@ -128,7 +128,7 @@ struct CUDATest : public CUDA_CPU_TestBase { ~DeviceReset() { - checkCuda( cudaDeviceReset() ); // this is needed by cuda-memcheck --leak-check full + checkGpu( gpuDeviceReset() ); // this is needed by cuda-memcheck --leak-check full } } deviceResetter; @@ -256,7 +256,7 @@ INSTANTIATE_TEST_SUITE_P( prefix, \ test_suite_name, \ testing::Values( new CUDATest( MG_EPOCH_REFERENCE_FILE_NAME ) ) ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL MG_INSTANTIATE_TEST_SUITE_GPU( XTESTID_GPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); #else MG_INSTANTIATE_TEST_SUITE_CPU( XTESTID_CPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/testmisc.cc b/epochX/cudacpp/gg_tt.sa/SubProcesses/testmisc.cc index 895d6eeb56..ba9e59a8a3 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/testmisc.cc +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/testmisc.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*misc to run only testmisc.cc tests //---------------------------------------------------------------------------- @@ -17,7 +17,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_MISC #else #define TESTID( s ) s##_CPU_MISC @@ -26,7 +26,7 @@ #define XTESTID( s ) TESTID( s ) // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -59,7 +59,7 @@ namespace mg5amcCpu TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testmisc ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/testxxx.cc b/epochX/cudacpp/gg_tt.sa/SubProcesses/testxxx.cc index 3361fe5aa9..e5167de00c 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/testxxx.cc +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/testxxx.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. 
Valassi (Apr 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -24,7 +24,7 @@ #include #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_XXX #else #define TESTID( s ) s##_CPU_XXX @@ -32,7 +32,7 @@ #define XTESTID( s ) TESTID( s ) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -42,7 +42,7 @@ namespace mg5amcCpu int FPEhandlerIevt = -1; inline void FPEhandler( int sig ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL std::cerr << "Floating Point Exception (GPU): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; #else std::cerr << "Floating Point Exception (CPU neppV=" << neppV << "): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; @@ -53,7 +53,7 @@ namespace mg5amcCpu TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -77,7 +77,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) assert( nevt % neppM == 0 ); // nevt must be a multiple of neppM assert( nevt % neppV == 0 ); // nevt must be a multiple of neppV // Fill in the input momenta -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL mg5amcGpu::PinnedHostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] #else mg5amcCpu::HostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] @@ -322,7 +322,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { for( int ievt = 0; ievt < nevt; ievt++ ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_tt.sa/src/HelAmps_sm.h b/epochX/cudacpp/gg_tt.sa/src/HelAmps_sm.h index 55f43bb43a..add8fce575 100644 --- a/epochX/cudacpp/gg_tt.sa/src/HelAmps_sm.h +++ b/epochX/cudacpp/gg_tt.sa/src/HelAmps_sm.h @@ -5,7 +5,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -28,7 +28,7 @@ //#include //#include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.sa/src/Parameters_sm.cc b/epochX/cudacpp/gg_tt.sa/src/Parameters_sm.cc index a9bc93ff98..c5dd6e7e4c 100644 --- a/epochX/cudacpp/gg_tt.sa/src/Parameters_sm.cc +++ b/epochX/cudacpp/gg_tt.sa/src/Parameters_sm.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. 
Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -17,7 +17,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_tt.sa/src/Parameters_sm.h b/epochX/cudacpp/gg_tt.sa/src/Parameters_sm.h index 932f123fea..5f2f4391b9 100644 --- a/epochX/cudacpp/gg_tt.sa/src/Parameters_sm.h +++ b/epochX/cudacpp/gg_tt.sa/src/Parameters_sm.h @@ -27,7 +27,7 @@ #include "read_slha.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu #include // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -217,7 +217,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -236,7 +236,7 @@ namespace mg5amcCpu #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wunused-variable" // e.g. <> #pragma GCC diagnostic ignored "-Wunused-parameter" // e.g. <> -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma nv_diagnostic push #pragma nv_diag_suppress 177 // e.g. <> #endif @@ -263,7 +263,7 @@ namespace mg5amcCpu // End SM implementation - no special handling of vectors of floats as in EFT (#439) return out; } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma GCC diagnostic pop #pragma nv_diagnostic pop #endif diff --git a/epochX/cudacpp/gg_tt.sa/src/cudacpp_src.mk b/epochX/cudacpp/gg_tt.sa/src/cudacpp_src.mk index d4cc628aec..159e19a46d 100644 --- a/epochX/cudacpp/gg_tt.sa/src/cudacpp_src.mk +++ b/epochX/cudacpp/gg_tt.sa/src/cudacpp_src.mk @@ -19,7 +19,7 @@ SHELL := /bin/bash #=== Configure common compiler flags for CUDA and C++ INCFLAGS = -I. 
-OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here +OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here #------------------------------------------------------------------------------- @@ -45,13 +45,13 @@ endif #------------------------------------------------------------------------------- -#=== Configure the CUDA compiler (note: NVCC is already exported including ccache) +#=== Configure the CUDA compiler (note: GPUCC is already exported including ccache) -###$(info NVCC=$(NVCC)) +###$(info GPUCC=$(GPUCC)) #------------------------------------------------------------------------------- -#=== Configure ccache for C++ builds (note: NVCC is already exported including ccache) +#=== Configure ccache for C++ builds (note: GPUCC is already exported including ccache) # Enable ccache if USECCACHE=1 ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) @@ -92,6 +92,13 @@ endif ###$(info OMPFLAGS=$(OMPFLAGS)) CXXFLAGS += $(OMPFLAGS) +# Add correct -DHIP_PLATFORM when compiling for HIP +ifeq ($(findstring nvcc,$(GPUCC)),nvcc) + GPUFLAGS += -Xcompiler -fPIC -c -x cu +else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) + GPUFLAGS += -fPIC -c +endif + # Set the build flags appropriate to each AVX choice (example: "make AVX=none") # [NB MGONGPU_PVW512 is needed because "-mprefer-vector-width=256" is not exposed in a macro] # [See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=96476] @@ -253,20 +260,20 @@ $(BUILDDIR)/%.o : %.cc *.h $(BUILDDIR)/.build.$(TAG) # Generic target and build rules: objects from CUDA compilation $(BUILDDIR)/%_cu.o : %.cc *.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $< -o $@ #------------------------------------------------------------------------------- cxx_objects=$(addprefix $(BUILDDIR)/, Parameters_sm.o read_slha.o) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) cu_objects=$(addprefix $(BUILDDIR)/, Parameters_sm_cu.o) endif # Target (and build rules): common (src) library -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) $(cu_objects) @if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi - $(NVCC) -shared -o $@ $(cxx_objects) $(cu_objects) $(LDFLAGS) + $(GPUCC) -shared -o $@ $(cxx_objects) $(cu_objects) $(LDFLAGS) else $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) @if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi diff --git a/epochX/cudacpp/gg_tt.sa/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_tt.sa/src/mgOnGpuConfig.h index b247654dcf..da4ba36ad8 100644 --- a/epochX/cudacpp/gg_tt.sa/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/gg_tt.sa/src/mgOnGpuConfig.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
#ifndef MGONGPUCONFIG_H #define MGONGPUCONFIG_H 1 @@ -10,12 +10,25 @@ // There are two different code bases for standalone_cudacpp (without multichannel) and madevent+cudacpp (with multichannel) #undef MGONGPU_SUPPORTS_MULTICHANNEL +// Is this a GPU (CUDA, HIP) or CPU implementation? +#ifdef __CUDACC__ +#define MGONGPUCPP_GPUIMPL cuda +#elif defined __HIPCC__ +#define MGONGPUCPP_GPUIMPL hip +#else +#undef MGONGPUCPP_GPUIMPL +#endif + // ** NB1 Throughputs (e.g. 6.8E8) are events/sec for "./gcheck.exe -p 65536 128 12" // ** NB2 Baseline on b7g47n0004 fluctuates (probably depends on load on other VMs) // Choose if curand is supported for generating random numbers +// For HIP, by default, do not use curand (common random numbers will be used instead) // For both CUDA and C++, by default, do not disable curand, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_CURAND -// (there exist CUDA installations, e.g. using the HPC package, which do not include curand - see PR #784) +// (there exist CUDA installations, e.g. using the HPC package, which do not include curand - see PR #784 and #785) +#if defined __HIPCC__ +#define MGONGPU_HAS_NO_CURAND 1 +#else //#ifdef __CUDACC__ //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 @@ -23,6 +36,7 @@ //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 //#endif +#endif // Choose floating point precision (for everything but color algebra #537) // If one of these macros has been set from outside with e.g. -DMGONGPU_FPTYPE_FLOAT, nothing happens (issue #167) @@ -54,23 +68,28 @@ //#undef MGONGPU_HARDCODE_PARAM // default ////#define MGONGPU_HARDCODE_PARAM 1 -// Complex type in c++: std::complex or cxsmpl (CHOOSE ONLY ONE) -#ifndef __CUDACC__ -//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) -#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) -#endif - -// Complex type in cuda: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) +// Complex type in CUDA: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) #ifdef __CUDACC__ #define MGONGPU_CUCXTYPE_THRUST 1 // default (~1.15E9/double, ~3.2E9/float) //#define MGONGPU_CUCXTYPE_CUCOMPLEX 1 // ~10 percent slower (1.03E9/double, ~2.8E9/float) //#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) + +// Complex type in HIP: cxsmpl (ONLY ONE OPTION POSSIBLE) +#elif defined __HIPCC__ +#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) + +// Complex type in C++: std::complex or cxsmpl (CHOOSE ONLY ONE) +#else +//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) +#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif -// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ -#undef MGONGPU_NSIGHT_DEBUG // default +#undef MGONGPU_NSIGHT_DEBUG // default in CUDA //#define MGONGPU_NSIGHT_DEBUG 1 +#else +#undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif // SANITY CHECKS (floating point precision for everything but color algebra #537) @@ -86,17 +105,21 @@ #error You cannot use double precision for color algebra and single precision elsewhere #endif -// SANITY CHECKS (c++ complex number implementation) -#ifndef __CUDACC__ -#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and
defined MGONGPU_CPPCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL +// SANITY CHECKS (CUDA complex number implementation) +#ifdef __CUDACC__ +#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX for CUDA +#elif defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CXSMPL for CUDA +#elif defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL for CUDA #endif #endif -// SANITY CHECKS (cuda complex number implementation) -#ifdef __CUDACC__ -#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL +// SANITY CHECKS (C++ complex number implementation) +#ifndef MGONGPUCPP_GPUIMPL +#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL for C++ #endif #endif @@ -134,7 +157,7 @@ namespace mgOnGpu // Alignment requirement for using reinterpret_cast with SIMD vectorized code // (using reinterpret_cast with non aligned memory may lead to segmentation faults!) // Only needed for C++ code but can be enforced also in NVCC builds of C++ code using CUDA>=11.2 and C++17 (#318, #319, #333) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL constexpr int cppAlign = 64; // alignment requirement for SIMD vectorization (64-byte i.e.
512-bit) #endif @@ -145,7 +168,7 @@ using mgOnGpu::fptype; using mgOnGpu::fptype2; // C++ SIMD vectorization width (this will be used to set neppV) -#ifdef __CUDACC__ // CUDA implementation has no SIMD +#ifdef MGONGPUCPP_GPUIMPL // CUDA and HIP implementations have no SIMD #undef MGONGPU_CPPSIMD #elif defined __AVX512VL__ && defined MGONGPU_PVW512 // C++ "512z" AVX512 with 512 width (512-bit ie 64-byte): 8 (DOUBLE) or 16 (FLOAT) #ifdef MGONGPU_FPTYPE_DOUBLE @@ -175,9 +198,9 @@ using mgOnGpu::fptype2; #undef MGONGPU_CPPSIMD #endif -// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation // Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) #if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */ #define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; #define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } #define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } @@ -189,8 +212,8 @@ using mgOnGpu::fptype2; #define mgDebugFinalise() { /*noop*/ } #endif /* clang-format on */ -// Define empty CUDA declaration specifiers for C++ -#ifndef __CUDACC__ +// Define empty CUDA/HIP declaration specifiers for C++ +#ifndef MGONGPUCPP_GPUIMPL #define __global__ #define __host__ #define __device__ diff --git a/epochX/cudacpp/gg_tt.sa/src/mgOnGpuCxtypes.h b/epochX/cudacpp/gg_tt.sa/src/mgOnGpuCxtypes.h index ca9a9f00c0..5532e22fa1 100644 --- a/epochX/cudacpp/gg_tt.sa/src/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/gg_tt.sa/src/mgOnGpuCxtypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022, based on earlier work by D. Smith) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
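[Editorial note, not part of the patch: a minimal sketch of how client code is meant to consume the new MGONGPUCPP_GPUIMPL macro from the mgOnGpuConfig.h hunks above. The file name and the function backendName are hypothetical; the same source builds with nvcc (CUDA), hipcc (HIP) or a plain C++ compiler.]

#include "mgOnGpuConfig.h"
#ifdef MGONGPUCPP_GPUIMPL
namespace mg5amcGpu // GPU build: MGONGPUCPP_GPUIMPL was set because __CUDACC__ or __HIPCC__ is defined
#else
namespace mg5amcCpu // CPU build: MGONGPUCPP_GPUIMPL stays undefined
#endif
{
  inline const char* backendName()
  {
#if defined __CUDACC__
    return "CUDA";
#elif defined __HIPCC__
    return "HIP";
#else
    return "C++";
#endif
  }
}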
#ifndef MGONGPUCXTYPES_H #define MGONGPUCXTYPES_H 1 @@ -19,7 +19,7 @@ #include // Complex type in cuda: thrust or cucomplex or cxsmpl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #if defined MGONGPU_CUCXTYPE_THRUST #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wtautological-compare" // for icpx2021/clang13 (https://stackoverflow.com/a/15864661) @@ -82,7 +82,7 @@ namespace mgOnGpu /* clang-format off */ using mgOnGpu::cxsmpl; // Printout to stream for user defined types -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -92,7 +92,7 @@ namespace mg5amcCpu inline __host__ std::ostream& operator<<( std::ostream& out, const cxsmpl& c ) { - out << std::complex( c.real(), c.imag() ); + out << std::complex( c.real(), c.imag() ); return out; } @@ -215,14 +215,14 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu #endif { // --- Type definitions (complex type: cxtype) -#ifdef __CUDACC__ // cuda +#ifdef MGONGPUCPP_GPUIMPL // cuda #if defined MGONGPU_CUCXTYPE_THRUST typedef thrust::complex cxtype; #elif defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -255,7 +255,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -307,7 +307,7 @@ namespace mg5amcCpu //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust +#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust //------------------------------ // CUDA - using thrust::complex @@ -343,11 +343,11 @@ namespace mg5amcCpu return c; } -#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST +#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex +#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex //------------------------------ // CUDA - using cuComplex @@ -562,11 +562,11 @@ namespace mg5amcCpu return cxmake( c.real(), c.imag() ); } -#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX //========================================================================== -#if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex +#if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex //------------------------------ // C++ - using std::complex @@ -610,7 +610,7 @@ namespace mg5amcCpu } #endif -#endif // #if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX +#endif // #if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX //========================================================================== @@ -633,7 +633,7 @@ namespace mg5amcCpu //========================================================================== // 
NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.sa/src/mgOnGpuFptypes.h b/epochX/cudacpp/gg_tt.sa/src/mgOnGpuFptypes.h index 905c97d700..fa3a02664b 100644 --- a/epochX/cudacpp/gg_tt.sa/src/mgOnGpuFptypes.h +++ b/epochX/cudacpp/gg_tt.sa/src/mgOnGpuFptypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUFPTYPES_H #define MGONGPUFPTYPES_H 1 @@ -12,7 +12,7 @@ #include // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // cuda namespace mg5amcGpu #else namespace mg5amcCpu @@ -20,7 +20,7 @@ namespace mg5amcCpu { //========================================================================== -#ifdef __CUDACC__ // cuda +#ifdef MGONGPUCPP_GPUIMPL // cuda //------------------------------ // Floating point types - Cuda @@ -64,11 +64,11 @@ namespace mg5amcCpu #endif } -#endif // #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //------------------------------ // Floating point types - C++ @@ -92,7 +92,7 @@ namespace mg5amcCpu return std::sqrt( f ); } -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== diff --git a/epochX/cudacpp/gg_tt.sa/src/mgOnGpuVectors.h b/epochX/cudacpp/gg_tt.sa/src/mgOnGpuVectors.h index e1299ba81e..cdae04326b 100644 --- a/epochX/cudacpp/gg_tt.sa/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/gg_tt.sa/src/mgOnGpuVectors.h @@ -32,7 +32,7 @@ #endif // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -131,7 +131,7 @@ namespace mg5amcCpu #endif #endif -#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef __CUDACC__) +#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef MGONGPUCPP_GPUIMPL) const int neppV = 1; @@ -153,13 +153,13 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu #endif { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Printout to stream for user defined types @@ -805,11 +805,11 @@ namespace mg5amcCpu #endif // #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //------------------------------ // Vector types - CUDA @@ -853,12 +853,12 @@ namespace mg5amcCpu return mask; } -#endif // #ifdef 
__CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== // Scalar-or-vector types: scalar in CUDA, vector or scalar in C++ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL typedef bool bool_sv; typedef fptype fptype_sv; typedef fptype2 fptype2_sv; @@ -879,7 +879,7 @@ namespace mg5amcCpu #endif // Scalar-or-vector zeros: scalar in CUDA, vector or scalar in C++ -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ inline __host__ __device__ cxtype cxzero_sv(){ return cxtype( 0, 0 ); } #elif defined MGONGPU_CPPSIMD inline cxtype_v cxzero_sv() { return cxtype_v(); } // RRRR=0000 IIII=0000 diff --git a/epochX/cudacpp/gg_tt.sa/src/rambo.h b/epochX/cudacpp/gg_tt.sa/src/rambo.h index e02ea52496..cd7e1008ea 100644 --- a/epochX/cudacpp/gg_tt.sa/src/rambo.h +++ b/epochX/cudacpp/gg_tt.sa/src/rambo.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -18,7 +18,7 @@ #include // Simplified rambo version for 2 to N (with N>=2) processes with massless particles -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +83,7 @@ namespace mg5amcCpu static bool first = true; if( first ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if constexpr( M_ACCESS::isOnDevice() ) // avoid { const int ievt0 = 0; @@ -166,7 +166,7 @@ namespace mg5amcCpu wt = po2log; if( nparf != 2 ) wt = ( 2. * nparf - 4. ) * log( energy ) + z[nparf - 1]; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // issue warnings if weight is too small or too large static int iwarn[5] = { 0, 0, 0, 0, 0 }; if( wt < -180. ) diff --git a/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt b/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt index b3d319e039..f38b6ec6e6 100644 --- a/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt +++ b/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt @@ -52,7 +52,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt -No valid web browser found. Please set in ./input/mg5_configuration.txt +Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g.mg The import format was not given, so we guess it as command set stdout_level DEBUG @@ -62,7 +62,7 @@ generate g g > t t~ No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005671977996826172  +DEBUG: model prefixing takes 0.005505561828613281  INFO: Restrict model sm with file models/sm/restrict_default.dat . 
DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -185,7 +185,7 @@ INFO: Generating Helas calls for process: g g > t t~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t t~ @1 INFO: Creating files in directory P2_gg_ttxg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -202,7 +202,7 @@ INFO: Generating Feynman diagrams for Process: g g > t t~ g WEIGHTED<=3 @2 INFO: Finding symmetric diagrams for subprocess group gg_ttxg INFO: Creating files in directory P1_gg_ttx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -217,15 +217,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: g g > t t~ WEIGHTED<=2 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttx -Generated helas calls for 2 subprocesses (19 diagrams) in 0.044 s -Wrote files for 46 helas calls in 0.247 s +Generated helas calls for 2 subprocesses (19 diagrams) in 0.042 s +Wrote files for 46 helas calls in 0.243 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 5 routines in 0.330 s +ALOHA: aloha creates 5 routines in 0.324 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines @@ -233,7 +233,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 10 routines in 0.316 s +ALOHA: aloha creates 10 routines in 0.308 s VVV1 VVV1 FFV1 @@ -283,10 +283,10 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. quit -real 0m2.334s -user 0m2.083s -sys 0m0.238s -Code generation completed in 2 seconds +real 0m2.484s +user 0m2.030s +sys 0m0.256s +Code generation completed in 3 seconds ************************************************************ * * * W E L C O M E to * @@ -312,7 +312,7 @@ INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amc INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards/me5_configuration.txt Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt -No valid web browser found. Please set in ./input/mg5_configuration.txt +Using default web browser "firefox". 
Set another one in ./input/mg5_configuration.txt treatcards run quit INFO: @@ -342,7 +342,7 @@ INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amc INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards/me5_configuration.txt Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt -No valid web browser found. Please set in ./input/mg5_configuration.txt +Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards param quit INFO: diff --git a/epochX/cudacpp/gg_tt01g.mad/COPYRIGHT b/epochX/cudacpp/gg_tt01g.mad/COPYRIGHT index a134b5fef9..84a883fbb0 100644 --- a/epochX/cudacpp/gg_tt01g.mad/COPYRIGHT +++ b/epochX/cudacpp/gg_tt01g.mad/COPYRIGHT @@ -15,6 +15,7 @@ The full development team currently includes the following authors : Stephan Hageboeck (CERN) Olivier Mattelaer (Universite Catholique de Louvain, original author) Stefan Roiser (CERN, original author) + Joergen Teig (CERN) Andrea Valassi (CERN, original author) Zenny Wettersten (CERN) See https://github.com/madgraph5/madgraph4gpu for more details. For the full diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/Bridge.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/Bridge.h index bf8b5e024d..89437b4c42 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/Bridge.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#ifndef BRIDGE_H #define BRIDGE_H 1 @@ -23,7 +23,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +83,7 @@ namespace mg5amcCpu Bridge& operator=( const Bridge& ) = delete; Bridge& operator=( Bridge&& ) = delete; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL /** * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != gpublocks*gputhreads * (this is needed for BridgeKernel tests rather than for actual production use in Fortran) @@ -150,7 +150,7 @@ namespace mg5amcCpu unsigned int m_nevt; // number of events int m_nGoodHel; // the number of good helicities (-1 initially when they have not yet been calculated) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL int m_gputhreads; // number of gpu threads (default set from number of events, can be modified) int m_gpublocks; // number of gpu blocks (default set from number of events, can be modified) DeviceBuffer m_devMomentaF; @@ -187,12 +187,12 @@ namespace mg5amcCpu // Forward declare transposition methods // -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL template void hst_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); @@ -209,7 +209,7 @@ namespace mg5amcCpu Bridge::Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ) : m_nevt( nevtF ) , m_nGoodHel( -1 ) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL , m_gputhreads( 256 ) // default number of gpu threads , m_gpublocks( m_nevt / m_gputhreads ) // this ensures m_nevt <= m_gpublocks*m_gputhreads , m_devMomentaF( m_nevt ) @@ -233,7 +233,7 @@ namespace mg5amcCpu { if( nparF != CPPProcess::npar ) throw std::runtime_error( "Bridge constructor: npar mismatch" ); if( np4F != CPPProcess::np4 ) throw std::runtime_error( "Bridge constructor: np4 mismatch" ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( ( m_nevt < s_gputhreadsmin ) || ( m_nevt % s_gputhreadsmin != 0 ) ) throw std::runtime_error( "Bridge constructor: nevt should be a multiple of " + std::to_string( s_gputhreadsmin ) ); while( m_nevt != m_gpublocks * m_gputhreads ) @@ -249,7 +249,7 @@ namespace mg5amcCpu #else std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate is called from several Fortran threads? 
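[Editorial note, not part of the patch: the gpu_sequence hunks that follow replace explicit cudaMemcpy calls and <<<...>>> kernel launches with the portable gpuMemcpy and gpuLaunchKernel macros. A minimal self-contained sketch of that pattern, for GPU builds only; the kernel scaleByTwo and the helper scaleOnDevice are hypothetical names.]

#include "mgOnGpuConfig.h" // defines MGONGPUCPP_GPUIMPL under nvcc or hipcc
#include "GpuRuntime.h"    // brings in GpuAbstraction.h and checkGpu
__global__ void scaleByTwo( double* data, const unsigned int nevt )
{
  const unsigned int ievt = blockDim.x * blockIdx.x + threadIdx.x; // one element per thread
  if( ievt < nevt ) data[ievt] *= 2.;
}
void scaleOnDevice( double* devData, const unsigned int nevt, const int gpublocks, const int gputhreads )
{
  // expands to scaleByTwo<<<gpublocks, gputhreads>>>( devData, nevt ) under both nvcc and hipcc
  gpuLaunchKernel( scaleByTwo, gpublocks, gputhreads, devData, nevt );
  checkGpu( gpuPeekAtLastError() ); // surface launch errors immediately
}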
@@ -262,7 +262,7 @@ namespace mg5amcCpu process.initProc( paramCard ); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void Bridge::set_gpugrid( const int gpublocks, const int gputhreads ) { @@ -276,7 +276,7 @@ namespace mg5amcCpu } #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void Bridge::gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -291,14 +291,14 @@ namespace mg5amcCpu constexpr int neppM = MemoryAccessMomenta::neppM; if constexpr( neppM == 1 && std::is_same_v ) { - checkCuda( cudaMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), gpuMemcpyHostToDevice ); } else { - checkCuda( cudaMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), gpuMemcpyHostToDevice ); const int thrPerEvt = CPPProcess::npar * CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 event per thread) //const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... this seems slower - dev_transposeMomentaF2C<<<m_gpublocks * thrPerEvt, m_gputhreads>>>( m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); + gpuLaunchKernel( dev_transposeMomentaF2C, m_gpublocks * thrPerEvt, m_gputhreads, m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); } if constexpr( std::is_same_v ) { @@ -341,7 +341,7 @@ namespace mg5amcCpu } #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL template void Bridge::cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -396,7 +396,7 @@ namespace mg5amcCpu // - C++ array: momenta[npagM][npar][np4][neppM] with nevt=npagM*neppM (AOSOA) // -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ) { diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/BridgeKernels.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/BridgeKernels.cc index d58066c9c1..eaf4037a24 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/BridgeKernels.cc +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/BridgeKernels.cc @@ -1,17 +1,18 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "BridgeKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMomenta.h" #include //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -45,7 +46,7 @@ namespace mg5amcCpu //============================================================================ -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu { @@ -96,7 +97,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/BridgeKernels.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/BridgeKernels.h index 15eb4bff4d..3efef8ce97 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/BridgeKernels.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/BridgeKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain.
// Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef BRIDGEKERNELS_H #define BRIDGEKERNELS_H 1 @@ -12,7 +12,7 @@ #include "MatrixElementKernels.h" #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -49,7 +49,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a CPU host class BridgeKernelHost final : public BridgeKernelBase { @@ -89,7 +89,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a GPU device class BridgeKernelDevice : public BridgeKernelBase { diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/CommonRandomNumberKernel.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/CommonRandomNumberKernel.cc index 985b39f576..010bc4cbd0 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/CommonRandomNumberKernel.cc +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/CommonRandomNumberKernel.cc @@ -1,15 +1,16 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "CommonRandomNumbers.h" +#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/CrossSectionKernels.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/CrossSectionKernels.cc index 0b355a3c8d..c15b39844d 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/CrossSectionKernels.cc +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/CrossSectionKernels.cc @@ -1,10 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
#include "CrossSectionKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessWeights.h" #include "MemoryBuffers.h" @@ -77,7 +78,7 @@ debug_me_is_abnormal( const fptype& me, size_t ievtALL ) //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -185,7 +186,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/CrossSectionKernels.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/CrossSectionKernels.h index 7933ca4bbf..4d9659e04e 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/CrossSectionKernels.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/CrossSectionKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef CROSSSECTIONKERNELS_H #define CROSSSECTIONKERNELS_H 1 @@ -13,7 +13,7 @@ //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -96,7 +96,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating the calculation of event statistics on a GPU device class CrossSectionKernelDevice : public CrossSectionKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/CurandRandomNumberKernel.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/CurandRandomNumberKernel.cc index eb56333b03..08a16f6f2c 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/CurandRandomNumberKernel.cc +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/CurandRandomNumberKernel.cc @@ -1,9 +1,9 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
-#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" @@ -22,7 +22,7 @@ inline void assertCurand( curandStatus_t code, const char *file, int line, bool } #endif /* clang-format on */ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -36,7 +36,7 @@ namespace mg5amcCpu { if( m_isOnDevice ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !m_rnarray.isOnDevice() ) throw std::runtime_error( "CurandRandomNumberKernel on device with a host random number array" ); #else @@ -114,7 +114,7 @@ namespace mg5amcCpu /* printf( "\nCurandRandomNumberKernel::generateRnarray size = %d\n", (int)m_rnarray.size() ); fptype* data = m_rnarray.data(); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( m_rnarray.isOnDevice() ) { data = new fptype[m_rnarray.size()](); @@ -123,7 +123,7 @@ namespace mg5amcCpu #endif for( int i = 0; i < ( (int)m_rnarray.size() / 4 ); i++ ) printf( "[%4d] %f %f %f %f\n", i * 4, data[i * 4], data[i * 4 + 2], data[i * 4 + 2], data[i * 4 + 3] ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( m_rnarray.isOnDevice() ) delete[] data; #endif */ diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/EventStatistics.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/EventStatistics.h index 48b51e0a49..b425a5bade 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/EventStatistics.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/EventStatistics.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef EventStatistics_H #define EventStatistics_H 1 @@ -16,7 +16,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/GpuAbstraction.h new file mode 100644 index 0000000000..6a7d9c05c0 --- /dev/null +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/GpuAbstraction.h @@ -0,0 +1,71 @@ +// Copyright (C) 2020-2023 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: J. Teig (Jul 2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
+ +#ifndef MG5AMC_GPUABSTRACTION_H +#define MG5AMC_GPUABSTRACTION_H 1 + +#include + +//-------------------------------------------------------------------------- + +#ifdef __CUDACC__ + +#define gpuError_t cudaError_t +#define gpuPeekAtLastError cudaPeekAtLastError +#define gpuGetErrorString cudaGetErrorString +#define gpuSuccess cudaSuccess + +#define gpuMallocHost( ptr, size ) checkGpu( cudaMallocHost( ptr, size ) ) +#define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) ) + +#define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( cudaMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemcpyHostToDevice cudaMemcpyHostToDevice +#define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost +#define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( cudaMemcpyToSymbol( type1, type2, size ) ) + +#define gpuFree( ptr ) checkGpu( cudaFree( ptr ) ) +#define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) ) + +#define gpuSetDevice cudaSetDevice +#define gpuDeviceSynchronize cudaDeviceSynchronize +#define gpuDeviceReset cudaDeviceReset + +#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ ) +#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ ) + +//-------------------------------------------------------------------------- + +#elif defined __HIPCC__ + +#include "hip/hip_runtime.h" + +#define gpuError_t hipError_t +#define gpuPeekAtLastError hipPeekAtLastError +#define gpuGetErrorString hipGetErrorString +#define gpuSuccess hipSuccess + +#define gpuMallocHost( ptr, size ) checkGpu( hipHostMalloc( ptr, size ) ) // NB: hipHostMalloc, as hipMallocHost is deprecated +#define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) ) + +#define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( hipMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemcpyHostToDevice hipMemcpyHostToDevice +#define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost +#define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( hipMemcpyToSymbol( type1, type2, size ) ) + +#define gpuFree( ptr ) checkGpu( hipFree( ptr ) ) +#define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) ) + +#define gpuSetDevice hipSetDevice +#define gpuDeviceSynchronize hipDeviceSynchronize +#define gpuDeviceReset hipDeviceReset + +#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ ) +#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ ) + +//-------------------------------------------------------------------------- + +#endif + +#endif // MG5AMC_GPUABSTRACTION_H diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/CudaRuntime.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/GpuRuntime.h similarity index 62% rename from epochX/cudacpp/gg_tt01g.mad/SubProcesses/CudaRuntime.h rename to epochX/cudacpp/gg_tt01g.mad/SubProcesses/GpuRuntime.h index 64ce52f4b3..93579ef08b 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/CudaRuntime.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/GpuRuntime.h @@ -1,49 +1,50 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). -// Created by: S. Roiser (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Created by: J. Teig (Jun 2023, based on earlier work by S. Roiser) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
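[Editorial note, not part of the patch: a minimal allocate/copy/free round trip through the GpuAbstraction.h macros above, for GPU builds only; roundTrip is a hypothetical name. Under nvcc every call expands to the cuda* API, under hipcc to the hip* API, each wrapped in checkGpu from GpuRuntime.h below.]

#include "mgOnGpuConfig.h"
#include "GpuRuntime.h"
void roundTrip()
{
  constexpr size_t n = 1024;
  double* hst = nullptr; // pinned host memory
  double* dev = nullptr; // device memory
  gpuMallocHost( (void**)&hst, n * sizeof( double ) );
  gpuMalloc( (void**)&dev, n * sizeof( double ) );
  for( size_t i = 0; i < n; i++ ) hst[i] = 1. * i;
  gpuMemcpy( dev, hst, n * sizeof( double ), gpuMemcpyHostToDevice ); // H2D copy
  gpuMemcpy( hst, dev, n * sizeof( double ), gpuMemcpyDeviceToHost ); // D2H copy
  gpuFree( dev );
  gpuFreeHost( hst );
}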
-#ifndef MG5AMC_CUDARUNTIME_H -#define MG5AMC_CUDARUNTIME_H 1 +#ifndef MG5AMC_GPURUNTIME_H +#define MG5AMC_GPURUNTIME_H 1 // MG5AMC on GPU uses the CUDA runtime API, not the lower level CUDA driver API // See https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html#driver-vs-runtime-api -#include +#include "GpuAbstraction.h" + +#include //-------------------------------------------------------------------------- // See https://stackoverflow.com/a/14038590 -#ifdef __CUDACC__ /* clang-format off */ -#define checkCuda( code ) { assertCuda( code, __FILE__, __LINE__ ); } -inline void assertCuda( cudaError_t code, const char* file, int line, bool abort = true ) +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#define checkGpu( code ) { assertGpu( code, __FILE__, __LINE__ ); } +inline void assertGpu( gpuError_t code, const char* file, int line, bool abort = true ) { - if( code != cudaSuccess ) + if( code != gpuSuccess ) { - printf( "ERROR! assertCuda: '%s' (%d) in %s:%d\n", cudaGetErrorString( code ), code, file, line ); - if( abort ) assert( code == cudaSuccess ); + printf( "ERROR! assertGpu: '%s' (%d) in %s:%d\n", gpuGetErrorString( code ), code, file, line ); + if( abort ) assert( code == gpuSuccess ); } } #endif /* clang-format on */ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor + // Instantiate a GpuRuntime at the beginning of the application's main to + // invoke gpuSetDevice(0) in the constructor and book a gpuDeviceReset() call in the destructor // *** FIXME! This will all need to be designed differently when going to multi-GPU nodes! *** - struct CudaRuntime final + struct GpuRuntime final { - CudaRuntime( const bool debug = true ) + GpuRuntime( const bool debug = true ) : m_debug( debug ) { setUp( m_debug ); } - ~CudaRuntime() { tearDown( m_debug ); } - CudaRuntime( const CudaRuntime& ) = delete; - CudaRuntime( CudaRuntime&& ) = delete; - CudaRuntime& operator=( const CudaRuntime& ) = delete; - CudaRuntime& operator=( CudaRuntime&& ) = delete; + ~GpuRuntime() { tearDown( m_debug ); } + GpuRuntime( const GpuRuntime& ) = delete; + GpuRuntime( GpuRuntime&& ) = delete; + GpuRuntime& operator=( const GpuRuntime& ) = delete; + GpuRuntime& operator=( GpuRuntime&& ) = delete; bool m_debug; // Set up CUDA application @@ -62,8 +63,8 @@ */ // Replace cudaFree(0) by cudaSetDevice(0), even if it is not really needed either // (but see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs) - if( debug ) std::cout << "__CudaRuntime: calling cudaSetDevice(0)" << std::endl; - checkCuda( cudaSetDevice( 0 ) ); // SLOW! + if( debug ) std::cout << "__GpuRuntime: calling GpuSetDevice(0)" << std::endl; + checkGpu( gpuSetDevice( 0 ) ); // SLOW!
} // Tear down CUDA application (call cudaDeviceReset) @@ -72,14 +73,13 @@ namespace mg5amcGpu // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking static void tearDown( const bool debug = true ) { - if( debug ) std::cout << "__CudaRuntime: calling cudaDeviceReset()" << std::endl; - checkCuda( cudaDeviceReset() ); + if( debug ) std::cout << "__GpuRuntime: calling GpuDeviceReset()" << std::endl; + checkGpu( gpuDeviceReset() ); } }; - } #endif //-------------------------------------------------------------------------- -#endif // MG5AMC_CUDARUNTIME_H +#endif // MG5AMC_GPURUNTIME_H diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MadgraphTest.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MadgraphTest.h index ef40624c88..a64c05c26a 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MadgraphTest.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MadgraphTest.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Dec 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #ifndef MADGRAPHTEST_H_ #define MADGRAPHTEST_H_ 1 @@ -22,7 +22,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; @@ -201,7 +201,7 @@ class MadgraphTest : public testing::TestWithParam // Since we link both the CPU-only and GPU tests into the same executable, we prevent // a multiply defined symbol by only compiling this in the non-CUDA phase: -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL /// Compare momenta and matrix elements. /// This uses an implementation of TestDriverBase to run a madgraph workflow, @@ -307,6 +307,6 @@ TEST_P( MadgraphTest, CompareMomentaAndME ) } } -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL #endif /* MADGRAPHTEST_H_ */ diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MatrixElementKernels.cc index 74b5239ebf..81699dfea9 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MatrixElementKernels.cc @@ -1,12 +1,12 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "MatrixElementKernels.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" // Includes the abstraction for Nvidia/AMD compilation #include "MemoryAccessMomenta.h" #include "MemoryBuffers.h" @@ -14,7 +14,7 @@ //============================================================================ -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu { @@ -150,7 +150,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { @@ -209,13 +209,13 @@ namespace mg5amcGpu PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); DeviceBufferHelicityMask devIsGoodHel( ncomb ); // ... 0d1. 
Compute good helicity mask on the device - computeDependentCouplings<<<m_gpublocks, m_gputhreads>>>( m_gs.data(), m_couplings.data() ); + gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - sigmaKin_getGoodHel<<<m_gpublocks, m_gputhreads>>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); + gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); #else - sigmaKin_getGoodHel<<<m_gpublocks, m_gputhreads>>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); + gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); #endif - checkCuda( cudaPeekAtLastError() ); + checkGpu( gpuPeekAtLastError() ); // ... 0d2. Copy back good helicity mask to the host copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); // ... 0d3. Copy back good helicity list to constant memory on the device @@ -226,19 +226,19 @@ void MatrixElementKernelDevice::computeMatrixElements( const unsigned int channelId ) { - computeDependentCouplings<<<m_gpublocks, m_gputhreads>>>( m_gs.data(), m_couplings.data() ); + gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); #ifndef MGONGPU_NSIGHT_DEBUG constexpr unsigned int sharedMemSize = 0; #else constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - sigmaKin<<<m_gpublocks, m_gputhreads, sharedMemSize>>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); + gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); #else - sigmaKin<<<m_gpublocks, m_gputhreads, sharedMemSize>>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); + gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); #endif - checkCuda( cudaPeekAtLastError() ); - checkCuda( cudaDeviceSynchronize() ); + checkGpu( gpuPeekAtLastError() ); + checkGpu( gpuDeviceSynchronize() ); } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MatrixElementKernels.h index 23e84757a2..72bd8f195b 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MatrixElementKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
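[Editorial note, not part of the patch: the launch-and-check idiom used above in MatrixElementKernelDevice, reduced to a self-contained skeleton for GPU builds only; blockSum and launchAndCheck are hypothetical names. gpuPeekAtLastError catches launch-configuration errors immediately; gpuDeviceSynchronize then surfaces any asynchronous execution error before the host reads back results.]

#include "mgOnGpuConfig.h"
#include "GpuRuntime.h"
__global__ void blockSum( const double* in, double* out )
{
  extern __shared__ double buf[]; // dynamic shared memory, sized at launch time
  buf[threadIdx.x] = in[blockDim.x * blockIdx.x + threadIdx.x];
  __syncthreads();
  if( threadIdx.x == 0 )
  {
    double sum = 0;
    for( unsigned int i = 0; i < blockDim.x; i++ ) sum += buf[i];
    out[blockIdx.x] = sum;
  }
}
void launchAndCheck( const double* devIn, double* devOut, const int gpublocks, const int gputhreads )
{
  const unsigned int sharedMemSize = gputhreads * sizeof( double );
  gpuLaunchKernelSharedMem( blockSum, gpublocks, gputhreads, sharedMemSize, devIn, devOut );
  checkGpu( gpuPeekAtLastError() );
  checkGpu( gpuDeviceSynchronize() );
}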
#ifndef MATRIXELEMENTKERNELS_H #define MATRIXELEMENTKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -81,7 +81,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a CPU host class MatrixElementKernelHost final : public MatrixElementKernelBase, public NumberOfEvents { @@ -130,7 +130,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a GPU device class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessAmplitudes.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessAmplitudes.h index 573b3bbbc9..ffb76e93de 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessAmplitudes.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessAmplitudes.h @@ -15,7 +15,7 @@ #define MGONGPU_TRIVIAL_AMPLITUDES 1 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessCouplings.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessCouplings.h index 35a3af42e0..3afdf3e554 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessCouplings.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessCouplings.h @@ -15,7 +15,7 @@ #include "MemoryBuffers.h" // for HostBufferCouplings::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessCouplingsFixed.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessCouplingsFixed.h index dc0d93afff..ffcdf4dbef 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessCouplingsFixed.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessCouplingsFixed.h @@ -14,7 +14,7 @@ //#include "MemoryAccessHelpers.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessDenominators.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessDenominators.h index 3bce635718..66f2d32a6b 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessDenominators.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessDenominators.h @@ -10,7 +10,7 @@ #include "MemoryAccessGs.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessGs.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessGs.h index 31311aa375..4c726b30f3 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessGs.h 
+++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessGs.h @@ -13,7 +13,7 @@ #include "MemoryBuffers.h" // for HostBufferMatrixElements::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessHelpers.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessHelpers.h index c82a6c7635..db73e4e064 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessHelpers.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessHelpers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessHelpers_H #define MemoryAccessHelpers_H 1 @@ -105,7 +105,7 @@ class KernelAccessHelper : public MemoryAccessHelper } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid //printf( "kernelAccessRecord: ievt=%d threadId=%d\n", ievt, threadIdx.x ); return T::ieventAccessRecord( buffer, ievt ); // NB fptype and fptype_sv coincide for CUDA diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessMatrixElements.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessMatrixElements.h index f32e6fea5b..3741011971 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessMatrixElements.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessMatrixElements.h @@ -13,7 +13,7 @@ #include "MemoryBuffers.h" // for HostBufferMatrixElements::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessMomenta.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessMomenta.h index 29266de32c..3be229d392 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessMomenta.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessMomenta.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#ifndef MemoryAccessMomenta_H #define MemoryAccessMomenta_H 1 @@ -13,7 +13,7 @@ #include "MemoryAccessVectors.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -30,7 +30,7 @@ namespace mg5amcCpu // Number of Events Per Page in the momenta AOSOA memory buffer layout // (these are all best kept as a compile-time constants: see issue #23) -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ // ----------------------------------------------------------------------------------------------- // --- GPUs: neppM is best set to a power of 2 times the number of fptype's in a 32-byte cacheline // --- This is relevant to ensure coalesced access to momenta in global memory diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessNumerators.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessNumerators.h index b152183b28..18991f4fa6 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessNumerators.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessNumerators.h @@ -10,7 +10,7 @@ #include "MemoryAccessGs.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessRandomNumbers.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessRandomNumbers.h index e2988d39f3..40cb089135 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessRandomNumbers.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessRandomNumbers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessRandomNumbers_H #define MemoryAccessRandomNumbers_H 1 @@ -11,7 +11,7 @@ #include "CPPProcess.h" #include "MemoryAccessHelpers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessVectors.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessVectors.h index e9b197368e..08faccff0f 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessVectors.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessVectors.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
#ifndef MemoryAccessVectors_H #define MemoryAccessVectors_H 1 @@ -10,7 +10,7 @@ #include "mgOnGpuVectors.h" -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu // this is only needed for CPU SIMD vectorization { diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessWavefunctions.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessWavefunctions.h index 5428aaf933..33bef4559e 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessWavefunctions.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessWavefunctions.h @@ -15,7 +15,7 @@ #define MGONGPU_TRIVIAL_WAVEFUNCTIONS 1 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryBuffers.h index 3093e6ed18..7756a71621 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryBuffers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021, based on earlier work by S. Hageboeck) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryBuffers_H #define MemoryBuffers_H 1 @@ -11,12 +11,12 @@ #include "mgOnGpuCxtypes.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "Parameters_sm.h" #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -87,7 +87,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL constexpr bool HostBufferALIGNED = false; // ismisaligned=false constexpr bool HostBufferMISALIGNED = true; // ismisaligned=true @@ -119,7 +119,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer template class PinnedHostBufferBase : public BufferBase @@ -128,18 +128,18 @@ namespace mg5amcCpu PinnedHostBufferBase( const size_t size ) : BufferBase( size, false ) { - checkCuda( cudaMallocHost( &( this->m_data ), this->bytes() ) ); + gpuMallocHost( &( this->m_data ), this->bytes() ); } virtual ~PinnedHostBufferBase() { - checkCuda( cudaFreeHost( this->m_data ) ); + gpuFreeHost( this->m_data ); } }; #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer template class DeviceBufferBase : public BufferBase @@ -148,18 +148,18 @@ namespace mg5amcCpu DeviceBufferBase( const size_t size ) : BufferBase( size, true ) { - checkCuda( cudaMalloc( &( this->m_data ), this->bytes() ) ); + gpuMalloc( &( this->m_data ), this->bytes() ); } virtual ~DeviceBufferBase() { - checkCuda( cudaFree( this->m_data ) ); + gpuFree( this->m_data ); } }; #endif //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for a given number of events 
template class HostBuffer : public HostBufferBase, virtual private NumberOfEvents @@ -175,7 +175,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer for a given number of events template class PinnedHostBuffer : public PinnedHostBufferBase, virtual private NumberOfEvents @@ -191,7 +191,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer for a given number of events template class DeviceBuffer : public DeviceBufferBase, virtual private NumberOfEvents @@ -213,7 +213,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta random numbers constexpr size_t sizePerEventRndNumMomenta = MemoryBuffers::np4 * MemoryBuffers::nparf; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta random numbers typedef HostBuffer HostBufferRndNumMomenta; #else @@ -232,7 +232,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer with ONE fptype per event constexpr size_t sizePerEventOneFp = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer with ONE fptype per event typedef HostBuffer HostBufferOneFp; #else @@ -257,7 +257,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for Gs constexpr size_t sizePerEventGs = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferGs; #else @@ -276,7 +276,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for numerators constexpr size_t sizePerEventNumerators = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferNumerators; #else @@ -296,7 +296,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for denominators constexpr size_t sizePerEventDenominators = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferDenominators; #else @@ -315,7 +315,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for random numbers constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferCouplings; #else @@ -333,7 +333,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta constexpr size_t sizePerEventMomenta = MemoryBuffers::np4 * MemoryBuffers::npar; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta typedef HostBuffer HostBufferMomenta; //typedef HostBuffer HostBufferMomenta; // TEST MISALIGNMENT! 
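In the MemoryBuffers.h hunks above, every explicit checkCuda( cudaMallocHost/cudaMalloc/cudaFree... ) call becomes a bare gpuMallocHost/gpuMalloc/gpuFree call, which suggests the error check has been folded into the abstraction layer itself. A plausible CUDA-backend sketch of those wrappers (an assumption for illustration: GpuAbstraction.h is only added as a symlink in this patch, and checkGpu is a hypothetical error-check helper; a HIP backend would map to hipHostMalloc, hipFree, hipMemcpy, etc. in the same way):

    // Plausible CUDA-backend sketch of the gpu* wrappers (illustrative only)
    #define gpuMallocHost( ptr, size ) checkGpu( cudaMallocHost( ptr, size ) )
    #define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) )
    #define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) )
    #define gpuFree( ptr ) checkGpu( cudaFree( ptr ) )
    #define gpuMemcpy( dst, src, count, kind ) checkGpu( cudaMemcpy( dst, src, count, kind ) )
    #define gpuMemcpyHostToDevice cudaMemcpyHostToDevice
    #define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost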
@@ -352,7 +352,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for sampling weights constexpr size_t sizePerEventWeights = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for sampling weights typedef HostBuffer HostBufferWeights; #else @@ -370,7 +370,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for matrix elements constexpr size_t sizePerEventMatrixElements = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for matrix elements typedef HostBuffer HostBufferMatrixElements; #else @@ -385,7 +385,7 @@ namespace mg5amcCpu // A base class encapsulating a memory buffer for the helicity mask typedef BufferBase BufferHelicityMask; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for the helicity mask typedef HostBufferBase HostBufferHelicityMask; #else @@ -403,7 +403,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for wavefunctions constexpr size_t sizePerEventWavefunctions = MemoryBuffers::nw6 * MemoryBuffers::nx2; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for wavefunctions typedef HostBuffer HostBufferWavefunctions; #else @@ -421,7 +421,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity random numbers constexpr size_t sizePerEventRndNumHelicity = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity random numbers typedef HostBuffer HostBufferRndNumHelicity; #else @@ -439,7 +439,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color random numbers constexpr size_t sizePerEventRndNumColor = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color random numbers typedef HostBuffer HostBufferRndNumColor; #else @@ -457,7 +457,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity selection constexpr size_t sizePerEventSelectedHelicity = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity selection typedef HostBuffer HostBufferSelectedHelicity; #else @@ -475,7 +475,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color selection constexpr size_t sizePerEventSelectedColor = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color selection typedef HostBuffer HostBufferSelectedColor; #else @@ -487,7 +487,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy { @@ -504,13 +504,13 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyHostToDevice ); } #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void copyHostFromDevice( Tdst& dst, const Tsrc& src ) // keep the same order 
of arguments as in memcpy { @@ -527,7 +527,7 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyDeviceToHost ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyDeviceToHost ); } #endif diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc index 18052b6676..f20c229897 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -46,7 +45,7 @@ // Class member functions for calculating the matrix elements for // Process: g g > t t~ WEIGHTED<=2 @1 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -80,7 +79,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -90,7 +89,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -118,13 +117,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -151,7 +150,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -187,7 +186,7 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity < 
nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings @@ -200,8 +199,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -302,7 +303,7 @@ namespace mg5amcCpu { 16, -2 }, { -2, 16 } }; // 2-D array[2][2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -359,7 +360,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -418,7 +419,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -465,8 +466,8 @@ namespace mg5amcCpu { 1, 1, -1, -1 }, { 1, 1, 1, 1 }, { 1, 1, 1, -1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -506,9 +507,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -544,7 +545,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] 
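The cHel/cIPD hunks in this file replace checkCuda( cudaMemcpyToSymbol( ... ) ) with gpuMemcpyToSymbol( ... ). One backend detail worth spelling out: CUDA's cudaMemcpyToSymbol takes the symbol directly, while HIP's hipMemcpyToSymbol requires wrapping it in HIP_SYMBOL(). A hedged sketch of the mapping (the actual macro lives in GpuAbstraction.h, not shown here; checkGpu is the same hypothetical error-check helper as above):

    // Hedged sketch of a constant-memory copy wrapper (illustrative only)
    #ifdef __CUDACC__
    #define gpuMemcpyToSymbol( symbol, src, count ) checkGpu( cudaMemcpyToSymbol( symbol, src, count ) )
    #else // assume __HIPCC__
    #define gpuMemcpyToSymbol( symbol, src, count ) checkGpu( hipMemcpyToSymbol( HIP_SYMBOL( symbol ), src, count ) )
    #endif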
// [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -609,12 +610,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -635,7 +636,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -761,9 +762,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -787,7 +788,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -807,7 +808,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 256 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -821,9 +822,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -851,7 +855,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -1061,7 +1065,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CPPProcess.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CPPProcess.h index 3ebd92c038..4a88a07226 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CPPProcess.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -107,7 +107,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -120,7 +120,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -150,7 +150,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CudaRuntime.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/GpuAbstraction.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/GpuAbstraction.h new file mode 120000 index 0000000000..72054e19ba --- /dev/null +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/GpuAbstraction.h @@ -0,0 +1 @@ +../GpuAbstraction.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/GpuRuntime.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/GpuRuntime.h new file mode 120000 index 0000000000..3920e83be4 --- /dev/null +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/GpuRuntime.h @@ -0,0 +1 @@ +../GpuRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/check_sa.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/check_sa.cc index 3fbf0ffbee..7cac5ab47b 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/check_sa.cc +++ 
b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -65,7 +66,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -96,7 +97,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -134,9 +135,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784) + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) +#elif defined __HIPCC__ +#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random #elif defined __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU if build has curand + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #endif @@ -146,10 +149,10 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) 
bool bridge = false; @@ -177,7 +180,7 @@ main( int argc, char** argv ) else if( arg == "--curdev" ) { #ifndef __CUDACC__ - throw std::runtime_error( "CurandDevice is not supported on CPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" ); #elif defined MGONGPU_HAS_NO_CURAND throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" ); #else @@ -198,7 +201,7 @@ main( int argc, char** argv ) } else if( arg == "--rmbdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -272,13 +275,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -296,14 +299,14 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL - // --- 00. Initialise cuda - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - const std::string cdinKey = "00 CudaInit"; + // --- 00. Initialise GPU + // Instantiate a GpuRuntime at the beginning of the application's main. + // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. + const std::string cdinKey = "00 GpuInit"; timermap.start( cdinKey ); - CudaRuntime cudaRuntime( debug ); + GpuRuntime gpuRuntime( debug ); #endif // --- 0a.
Initialise physics process @@ -325,7 +328,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -333,7 +336,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -341,7 +344,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -349,7 +352,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -366,7 +369,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -375,7 +378,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -384,7 +387,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -392,7 +395,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -400,7 +403,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -438,7 +441,7 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! 
CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } @@ -450,7 +453,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -461,7 +464,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -469,7 +472,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -511,7 +514,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -543,7 +546,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -588,7 +591,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -617,7 +620,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -760,18 +763,22 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; -#endif +#endif /* clang-format off */ // -- DOUBLE or FLOAT? #if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) @@ -781,7 +788,7 @@ main( int argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif +#endif /* clang-format on */ // -- CUCOMPLEX or THRUST or STD complex numbers? 
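Earlier in this file (the "STEP 0 - INITIALISE" hunk), the CudaRuntime instance becomes a GpuRuntime, and the updated comment states that for CUDA the constructor invokes cudaSetDevice(0) while the destructor books a cudaDeviceReset() call. A minimal RAII sketch consistent with that comment (the real GpuRuntime.h is only added as a symlink in this patch; checkGpu and the debug printouts are assumptions):

    // Minimal RAII sketch of a GpuRuntime helper (illustrative only)
    #include <cuda_runtime.h>
    #include <iostream>
    struct GpuRuntime
    {
      GpuRuntime( const bool debug = true ) : m_debug( debug ) { setUp( m_debug ); }
      ~GpuRuntime() { tearDown( m_debug ); }
      static void setUp( const bool debug )
      {
        checkGpu( cudaSetDevice( 0 ) ); // select device 0 up front (CUDA backend assumed)
        if( debug ) std::cout << "GpuRuntime: cudaSetDevice(0)" << std::endl;
      }
      static void tearDown( const bool debug )
      {
        if( debug ) std::cout << "GpuRuntime: cudaDeviceReset()" << std::endl;
        checkGpu( cudaDeviceReset() ); // flush device state (e.g. profiler buffers) at exit
      }
      const bool m_debug;
    };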
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -793,6 +800,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_CUCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -818,7 +831,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -874,7 +887,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -895,6 +908,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -921,21 +936,21 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? " == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -966,7 +981,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1062,14 +1077,14 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1077,7 +1092,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? 
" == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CPPProcess.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CPPProcess.cc index bfab81142d..3c7715b235 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CPPProcess.cc +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -46,7 +45,7 @@ // Class member functions for calculating the matrix elements for // Process: g g > t t~ g WEIGHTED<=3 @2 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -80,7 +79,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -90,7 +89,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -118,13 +117,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -151,7 +150,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -187,7 +186,7 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity < nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and 
independent couplings @@ -200,8 +199,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -505,7 +506,7 @@ namespace mg5amcCpu { 1, -8, 10, 1, 64, -8 }, { 10, 1, 1, -8, -8, 64 } }; // 2-D array[6][6] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -562,7 +563,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -621,7 +622,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -684,8 +685,8 @@ namespace mg5amcCpu { 1, 1, 1, 1, 1 }, { 1, 1, 1, -1, -1 }, { 1, 1, 1, -1, 1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -726,9 +727,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -765,7 +766,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] 
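The __global__ kernels whose signatures are updated in this file (computeDependentCouplings, sigmaKin_getGoodHel, sigmaKin) are launched with gpublocks blocks of gputhreads threads, one event per GPU thread, so that nevt == gpublocks * gputhreads and each kernel derives its own event index as blockDim.x * blockIdx.x + threadIdx.x. A hypothetical launch helper in the same gpu* spirit (purely illustrative; the actual launch sites are outside this section and may invoke the triple-chevron syntax directly):

    // Hypothetical launch helper (illustrative only); checkGpu as above
    template<typename Kernel, typename... Args>
    void gpuLaunchKernel( Kernel kernel, const int gpublocks, const int gputhreads, Args... args )
    {
      kernel<<<gpublocks, gputhreads>>>( args... ); // one thread per event
      checkGpu( cudaPeekAtLastError() );            // surface launch errors immediately
    }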
// [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -830,12 +831,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -856,7 +857,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -982,9 +983,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -1008,7 +1009,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -1028,7 +1029,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 256 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -1042,9 +1043,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -1072,7 +1076,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -1282,7 +1286,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CPPProcess.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CPPProcess.h index 3901ddcb20..d4b3c0445c 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CPPProcess.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -107,7 +107,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -120,7 +120,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -150,7 +150,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CudaRuntime.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/GpuAbstraction.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/GpuAbstraction.h new file mode 120000 index 0000000000..72054e19ba --- /dev/null +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/GpuAbstraction.h @@ -0,0 +1 @@ +../GpuAbstraction.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/GpuRuntime.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/GpuRuntime.h new file mode 120000 index 0000000000..3920e83be4 --- /dev/null +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/GpuRuntime.h @@ -0,0 +1 @@ +../GpuRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/check_sa.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/check_sa.cc index 3fbf0ffbee..7cac5ab47b 100644 --- 
a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/check_sa.cc +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -65,7 +66,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -96,7 +97,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -134,9 +135,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784) + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) +#elif defined __HIPCC__ +#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random #elif defined __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU if build has curand + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #endif @@ -146,10 +149,10 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) 
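Taken together, the buffer hunks in check_sa.cc pair each pinned host buffer with a device buffer of the same payload and move data through the copy helpers defined in MemoryBuffers.h. A condensed illustration of the GPU-path pattern, using names as they appear in these hunks (the sequencing compresses the timed steps 2d and 3b):

    // Condensed illustration of the pinned-host/device buffer pattern
    PinnedHostBufferMomenta hstMomenta( nevt ); // page-locked host memory (gpuMallocHost)
    DeviceBufferMomenta devMomenta( nevt );     // device-resident memory (gpuMalloc)
    // ... fill hstMomenta on the host, e.g. via RamboHost sampling ...
    copyDeviceFromHost( devMomenta, hstMomenta );               // step 2d: HtoD momenta
    // ... run the matrix-element kernels on the device ...
    copyHostFromDevice( hstMatrixElements, devMatrixElements ); // step 3b: DtoH MEs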
bool bridge = false; @@ -177,7 +180,7 @@ main( int argc, char** argv ) else if( arg == "--curdev" ) { #ifndef __CUDACC__ - throw std::runtime_error( "CurandDevice is not supported on CPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" ); #elif defined MGONGPU_HAS_NO_CURAND throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" ); #else @@ -198,7 +201,7 @@ main( int argc, char** argv ) } else if( arg == "--rmbdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -272,13 +275,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -296,14 +299,14 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL - // --- 00. Initialise cuda - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - const std::string cdinKey = "00 CudaInit"; + // --- 00. Initialise GPU + // Instantiate a GpuRuntime at the beginning of the application's main. + // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. + const std::string cdinKey = "00 GpuInit"; timermap.start( cdinKey ); - CudaRuntime cudaRuntime( debug ); + GpuRuntime gpuRuntime( debug ); #endif // --- 0a.
Initialise physics process @@ -325,7 +328,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -333,7 +336,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -341,7 +344,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -349,7 +352,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -366,7 +369,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -375,7 +378,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -384,7 +387,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -392,7 +395,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -400,7 +403,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -438,7 +441,7 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! 
CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } @@ -450,7 +453,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -461,7 +464,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -469,7 +472,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -511,7 +514,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -543,7 +546,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -588,7 +591,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -617,7 +620,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -760,18 +763,22 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; -#endif +#endif /* clang-format off */ // -- DOUBLE or FLOAT? #if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) @@ -781,7 +788,7 @@ main( int argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif +#endif /* clang-format on */ // -- CUCOMPLEX or THRUST or STD complex numbers? 
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -793,6 +800,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_CUCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -818,7 +831,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -874,7 +887,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -895,6 +908,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -921,21 +936,21 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? " == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -966,7 +981,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1062,14 +1077,14 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1077,7 +1092,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? 
" == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/RamboSamplingKernels.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/RamboSamplingKernels.cc index da68aa9255..79abbcc4f8 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/RamboSamplingKernels.cc +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/RamboSamplingKernels.cc @@ -1,11 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "RamboSamplingKernels.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessRandomNumbers.h" #include "MemoryAccessWeights.h" @@ -14,7 +14,7 @@ #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -92,7 +92,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingKernelDevice::RamboSamplingKernelDevice( const fptype energy, // input: energy const BufferRndNumMomenta& rndmom, // input: random numbers in [0,1] BufferMomenta& momenta, // output: momenta @@ -135,7 +135,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaInitialDevice( const fptype energy, fptype* momenta ) @@ -147,17 +147,17 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaInitial() { - getMomentaInitialDevice<<>>( m_energy, m_momenta.data() ); + gpuLaunchKernel( getMomentaInitialDevice, m_gpublocks, m_gputhreads, m_energy, m_momenta.data() ); } #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaFinalDevice( const fptype energy, const fptype* rndmom, @@ -171,11 +171,11 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaFinal() { - getMomentaFinalDevice<<>>( m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); + gpuLaunchKernel( getMomentaFinalDevice, m_gpublocks, m_gputhreads, m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); } #endif diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/RamboSamplingKernels.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/RamboSamplingKernels.h index 184089efd7..7c214cd74b 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/RamboSamplingKernels.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/RamboSamplingKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#ifndef RAMBOSAMPLINGKERNELS_H #define RAMBOSAMPLINGKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating RAMBO phase space sampling on a GPU device class RamboSamplingKernelDevice final : public SamplingKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/RandomNumberKernels.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/RandomNumberKernels.h index 188a72c2c9..21d63beeac 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/RandomNumberKernels.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/RandomNumberKernels.h @@ -1,14 +1,14 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef RANDOMNUMBERKERNELS_H #define RANDOMNUMBERKERNELS_H 1 #include "mgOnGpuConfig.h" -// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when __CUDACC__ is not defined +// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when MGONGPUCPP_GPUIMPL is not defined #ifndef MGONGPU_HAS_NO_CURAND //#include "curand.h" struct curandGenerator_st; // forward definition from curand.h @@ -16,7 +16,7 @@ struct curandGenerator_st; // forward definition from curand.h #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cudacpp.mk index 509307506b..f2cfa349da 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ # Copyright (C) 2020-2023 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +# Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts @@ -42,10 +42,10 @@ endif #------------------------------------------------------------------------------- -#=== Configure common compiler flags for C++ and CUDA +#=== Configure common compiler flags for C++ and CUDA/HIP INCFLAGS = -I. 
-OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here +OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here # Dependency on src directory MG5AMC_COMMONLIB = mg5amc_common @@ -121,24 +121,46 @@ endif #------------------------------------------------------------------------------- -#=== Configure the CUDA compiler +#=== Configure the GPU compiler (CUDA or HIP) -# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) -# This is because it is impossible to pass this to "CUFLAGS += -ccbin " below +# FIXME! (AV 24.01.2024) +# In the current implementation (without separate builds for C++ and CUDA/HIP), we first check for nvcc and hipcc in CUDA_HOME and HIP_HOME. +# If CUDA_HOME or HIP_HOME are not set, try to determine them from the path to nvcc and hipcc. +# While convoluted, this is currently necessary to allow disabling CUDA/HIP builds by setting CUDA_HOME or HIP_HOME to invalid paths. +# This will (probably?) be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775). + +# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA and HIP builds (issue #505) +# This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside - $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") + $(warning CUDA and HIP builds are not supported for multi-word CXX "$(CXX)") override CUDA_HOME=disabled + override HIP_HOME=disabled endif -# If CUDA_HOME is not set, try to set it from the location of nvcc +# If CUDA_HOME is not set, try to set it from the path to nvcc ifndef CUDA_HOME CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null)) $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") endif -# Set NVCC as $(CUDA_HOME)/bin/nvcc if it exists +# If HIP_HOME is not set, try to set it from the path to hipcc ifndef HIP_HOME + HIP_HOME = $(patsubst %bin/hipcc,%,$(HIP_COMPILER_PATH)) + $(warning HIP_HOME was not set: using "$(HIP_HOME)") +endif + +# FIXME! (AV 24.01.2024) +# In the current implementation (without separate builds for C++ and CUDA/HIP), +# builds are performed for HIP only if CUDA is not found in the path. +# If both CUDA and HIP are installed, HIP builds can be triggered by unsetting CUDA_HOME. +# This will be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775). + +#--- Option 1: CUDA exists -> use CUDA + +# Set GPUCC as $(CUDA_HOME)/bin/nvcc if it exists ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) - NVCC = $(CUDA_HOME)/bin/nvcc + + GPUCC = $(CUDA_HOME)/bin/nvcc USE_NVTX ?=-DUSE_NVTX # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ @@ -158,41 +180,78 @@ ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here!
endif CUOPTFLAGS = -lineinfo - CUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math - ###CUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow - ###NVCC_VERSION = $(shell $(NVCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) - CUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h + ###GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + GPUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + ###GPUCC_VERSION = $(shell $(GPUCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) + GPUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + CUBUILDRULEFLAGS = -Xcompiler -fPIC -c + CCBUILDRULEFLAGS = -Xcompiler -fPIC -c -x cu + CUDATESTFLAGS = -lcuda + + # Set the host C++ compiler for GPUCC via "-ccbin " + # (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) + GPUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) + + # Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) + ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) + GPUFLAGS += -allow-unsupported-compiler + endif + else ifneq ($(origin REQUIRE_CUDA),undefined) + # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) - $(error No cuda installation found (set CUDA_HOME or make nvcc visible in PATH)) + $(error No cuda installation found (set CUDA_HOME or make GPUCC visible in PATH)) + +#--- Option 2: CUDA does not exist, HIP exists -> use HIP + +# Set GPUCC as $(HIP_HOME)/bin/hipcc if it exists +else ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),) + + GPUCC = $(HIP_HOME)/bin/hipcc + #USE_NVTX ?=-DUSE_NVTX # should maybe find something equivalent to this in HIP? 
+ HIPARCHFLAGS = -target x86_64-linux-gnu --offload-arch=gfx90a + HIPINC = -I$(HIP_HOME)/include/ + # Note: -DHIP_FAST_MATH is equivalent to -use_fast_math in HIP + # (but only for single precision line 208: https://rocm-developer-tools.github.io/HIP/hcc__detail_2math__functions_8h_source.html) + GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -DHIP_FAST_MATH -DHIP_PLATFORM=amd -fPIC + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + GPUFLAGS += -std=c++17 + ###GPUFLAGS+= --maxrregcount 255 # (AV: is this option valid on HIP and meaningful on AMD GPUs?) + CUBUILDRULEFLAGS = -fPIC -c + CCBUILDRULEFLAGS = -fPIC -c + +else ifneq ($(origin REQUIRE_HIP),undefined) + + # If REQUIRE_HIP is set but no HIP is found, stop here (e.g. for CI tests on GPU #443) + $(error No hip installation found (set HIP_HOME or make GPUCC visible in PATH)) + +#--- Option 3: CUDA does not exist, HIP does not exist -> switch off both CUDA and HIP + else - # No cuda. Switch cuda compilation off and go to common random numbers in C++ + + # No nvcc and no hipcc: switch CUDA and HIP compilation off and go to common random numbers in C++ $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda) + $(warning HIP_HOME is not set or is invalid: export HIP_HOME to compile with hip) + override GPUCC= override USE_NVTX= override CUINC= override CURANDLIBFLAGS= -endif -export NVCC -export CUFLAGS - -# Set the host C++ compiler for nvcc via "-ccbin " -# (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) -CUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) -# Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) -ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) -CUFLAGS += -allow-unsupported-compiler endif +# Export GPUCC (so that it can also be used in cudacpp_src.mk?) +export GPUCC +export GPUFLAGS + #------------------------------------------------------------------------------- -#=== Configure ccache for C++ and CUDA builds +#=== Configure ccache for C++ and CUDA/HIP builds # Enable ccache if USECCACHE=1 ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) @@ -201,15 +260,15 @@ endif #ifeq ($(USECCACHE)$(shell echo $(AR) | grep ccache),1) # override AR:=ccache $(AR) #endif -ifneq ($(NVCC),) - ifeq ($(USECCACHE)$(shell echo $(NVCC) | grep ccache),1) - override NVCC:=ccache $(NVCC) +ifneq ($(GPUCC),) + ifeq ($(USECCACHE)$(shell echo $(GPUCC) | grep ccache),1) + override GPUCC:=ccache $(GPUCC) endif endif #------------------------------------------------------------------------------- -#=== Configure PowerPC-specific compiler flags for C++ and CUDA +#=== Configure PowerPC-specific compiler flags for C++ and CUDA/HIP # PowerPC-specific CXX compiler flags (being reviewed) ifeq ($(UNAME_P),ppc64le) @@ -225,9 +284,9 @@ else ######CXXFLAGS+= -fno-semantic-interposition # no benefit (neither alone, nor combined with -flto) endif -# PowerPC-specific CUDA compiler flags (to be reviewed!) +# PowerPC-specific CUDA/HIP compiler flags (to be reviewed!)
ifeq ($(UNAME_P),ppc64le) - CUFLAGS+= -Xcompiler -mno-float128 + GPUFLAGS+= -Xcompiler -mno-float128 endif #------------------------------------------------------------------------------- @@ -237,7 +296,7 @@ endif # Set the default OMPFLAGS choice ifneq ($(shell $(CXX) --version | egrep '^Intel'),) override OMPFLAGS = -fopenmp -###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without nvcc but not ok with nvcc before #578) +###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without GPUCC but not ok with GPUCC before #578) else ifneq ($(shell $(CXX) --version | egrep '^(clang)'),) override OMPFLAGS = -fopenmp ###override OMPFLAGS = # disable OpenMP MT on clang (was not ok without or with nvcc before #578) @@ -293,7 +352,10 @@ endif # Set the default RNDGEN (random number generator) choice ifeq ($(RNDGEN),) - ifeq ($(NVCC),) + ifeq ($(GPUCC),) + override RNDGEN = hasNoCurand + # Edgecase for HIP compilation + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) override RNDGEN = hasNoCurand else ifeq ($(RNDGEN),) override RNDGEN = hasCurand @@ -310,7 +372,7 @@ export OMPFLAGS #------------------------------------------------------------------------------- -#=== Set the CUDA/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN +#=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN # Set the build flags appropriate to OMPFLAGS $(info OMPFLAGS=$(OMPFLAGS)) @@ -368,13 +430,13 @@ CXXFLAGS+= $(AVXFLAGS) $(info FPTYPE=$(FPTYPE)) ifeq ($(FPTYPE),d) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE - CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE else ifeq ($(FPTYPE),f) CXXFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT - CUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT else ifeq ($(FPTYPE),m) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT - CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT else $(error Unknown FPTYPE='$(FPTYPE)': only 'd', 'f' and 'm' are supported) endif @@ -383,7 +445,7 @@ endif $(info HELINL=$(HELINL)) ifeq ($(HELINL),1) CXXFLAGS += -DMGONGPU_INLINE_HELAMPS - CUFLAGS += -DMGONGPU_INLINE_HELAMPS + GPUFLAGS += -DMGONGPU_INLINE_HELAMPS else ifneq ($(HELINL),0) $(error Unknown HELINL='$(HELINL)': only '0' and '1' are supported) endif @@ -392,7 +454,7 @@ endif $(info HRDCOD=$(HRDCOD)) ifeq ($(HRDCOD),1) CXXFLAGS += -DMGONGPU_HARDCODE_PARAM - CUFLAGS += -DMGONGPU_HARDCODE_PARAM + GPUFLAGS += -DMGONGPU_HARDCODE_PARAM else ifneq ($(HRDCOD),0) $(error Unknown HRDCOD='$(HRDCOD)': only '0' and '1' are supported) endif @@ -444,11 +506,11 @@ ifeq ($(UNAME_S),Darwin) override CULIBFLAGSRPATH2 = else # RPATH to cuda/cpp libs when linking executables - override CXXLIBFLAGSRPATH = -Wl,-rpath,$(LIBDIRRPATH) - override CULIBFLAGSRPATH = -Xlinker -rpath,$(LIBDIRRPATH) + override CXXLIBFLAGSRPATH = -Wl,-rpath=$(LIBDIRRPATH) + override CULIBFLAGSRPATH = -Xlinker -rpath=$(LIBDIRRPATH) # RPATH to common lib when linking cuda/cpp libs - override CXXLIBFLAGSRPATH2 = -Wl,-rpath,'$$ORIGIN' - override CULIBFLAGSRPATH2 = -Xlinker -rpath,'$$ORIGIN' + override CXXLIBFLAGSRPATH2 = -Wl,-rpath='$$ORIGIN' + override CULIBFLAGSRPATH2 = -Xlinker -rpath='$$ORIGIN' endif # Setting LD_LIBRARY_PATH or DYLD_LIBRARY_PATH in the RUNTIME is no longer necessary 
(neither on Linux nor on Mac) @@ -461,7 +523,7 @@ override RUNTIME = cxx_main=$(BUILDDIR)/check.exe fcxx_main=$(BUILDDIR)/fcheck.exe -ifneq ($(NVCC),) +ifneq ($(GPUCC),) cu_main=$(BUILDDIR)/gcheck.exe fcu_main=$(BUILDDIR)/fgcheck.exe else @@ -492,15 +554,16 @@ $(BUILDDIR)/.build.$(TAG): @touch $(BUILDDIR)/.build.$(TAG) # Generic target and build rules: objects from CUDA compilation -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/%.o : %.cu *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CUBUILDRULEFLAGS) $< -o $@ $(BUILDDIR)/%_cu.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CCBUILDRULEFLAGS) $< -o $@ endif +# -x cu in line above # Generic target and build rules: objects from C++ compilation # (NB do not include CUINC here! add it only for NVTX or curand #679) @@ -509,11 +572,14 @@ $(BUILDDIR)/%.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) $(CXX) $(CPPFLAGS) $(CXXFLAGS) -fPIC -c $< -o $@ # Apply special build flags only to CrossSectionKernel.cc and gCrossSectionKernel.cu (no fast math, see #117 and #516) +# Added edgecase for HIP compilation ifeq ($(shell $(CXX) --version | grep ^nvc++),) $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS := $(filter-out -ffast-math,$(CXXFLAGS)) $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math -ifneq ($(NVCC),) -$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -fno-fast-math +ifeq ($(findstring nvcc,$(GPUCC)),nvcc) + $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -fno-fast-math +else + $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -fno-fast-math endif endif @@ -530,10 +596,10 @@ ifeq ($(RNDGEN),hasCurand) $(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CUINC) endif -# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in nvcc with icx2023 (#592) +# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in GPUCC with icx2023 (#592) ifneq ($(shell $(CXX) --version | egrep '^(Intel)'),) -ifneq ($(NVCC),) -CUFLAGS += -Xcompiler -Wno-deprecated-builtins +ifneq ($(GPUCC),) +GPUFLAGS += -Wno-deprecated-builtins endif endif @@ -541,8 +607,8 @@ endif # This patch does remove the warning, but I prefer to keep it disabled for the moment... 
###ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang|Intel)'),) ###$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -Wno-overriding-t-option -###ifneq ($(NVCC),) -###$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -Wno-overriding-t-option +###ifneq ($(GPUCC),) +###$(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -Wno-overriding-t-option ###endif ###endif @@ -569,7 +635,7 @@ MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp cxx_objects_lib=$(BUILDDIR)/CPPProcess.o $(BUILDDIR)/MatrixElementKernels.o $(BUILDDIR)/BridgeKernels.o $(BUILDDIR)/CrossSectionKernels.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel.o $(BUILDDIR)/RamboSamplingKernels.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) MG5AMC_CULIB = mg5amc_$(processid_short)_cuda cu_objects_lib=$(BUILDDIR)/gCPPProcess.o $(BUILDDIR)/gMatrixElementKernels.o $(BUILDDIR)/gBridgeKernels.o $(BUILDDIR)/gCrossSectionKernels.o cu_objects_exe=$(BUILDDIR)/gCommonRandomNumberKernel.o $(BUILDDIR)/gRamboSamplingKernels.o @@ -581,11 +647,11 @@ $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: cxx_objects_lib += $(BUILDDIR)/fbridge.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(CXX) -shared -o $@ $(cxx_objects_lib) $(CXXLIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: cu_objects_lib += $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cu_objects_lib) - $(NVCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) endif #------------------------------------------------------------------------------- @@ -602,16 +668,16 @@ $(cxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PAT $(cxx_main): $(BUILDDIR)/check_sa.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CXX) -o $@ $(BUILDDIR)/check_sa.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CURANDLIBFLAGS) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(cu_main): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(cu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(cu_main): $(BUILDDIR)/gcheck_sa.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o - $(NVCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) + $(GPUCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) endif 
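(Usage note for the compiler selection above: since HIP is only tried when nvcc is not found, a HIP build on a machine that also has CUDA installed can be forced by pointing CUDA_HOME at an invalid path, e.g. an invocation like "make CUDA_HOME=/invalid" (an illustrative example, not a documented target); conversely, setting REQUIRE_CUDA or REQUIRE_HIP turns a missing toolkit into a hard build error, e.g. for CI tests on GPU #443.)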
#------------------------------------------------------------------------------- @@ -637,17 +703,17 @@ $(fcxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PA $(fcxx_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') endif ifeq ($(UNAME_S),Darwin) $(fcu_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(fcu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(fcu_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) - $(NVCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) + $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) endif #------------------------------------------------------------------------------- @@ -659,7 +725,7 @@ $(BUILDDIR)/testxxx.o: testxxx_cc_ref.txt $(testmain): $(BUILDDIR)/testxxx.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testxxx.o # Comment out this line to skip the C++ test of xxx functions -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testxxx_cu.o: $(GTESTLIBS) $(BUILDDIR)/testxxx_cu.o: INCFLAGS += $(GTESTINC) $(BUILDDIR)/testxxx_cu.o: testxxx_cc_ref.txt @@ -672,7 +738,7 @@ $(BUILDDIR)/testmisc.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testmisc.o # Comment out this line to skip the C++ miscellaneous tests -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testmisc_cu.o: $(GTESTLIBS) $(BUILDDIR)/testmisc_cu.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc_cu.o @@ -684,12 +750,12 @@ $(BUILDDIR)/runTest.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/runTest.o $(testmain): cxx_objects_exe += $(BUILDDIR)/runTest.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/runTest_cu.o: $(GTESTLIBS) $(BUILDDIR)/runTest_cu.o: INCFLAGS += $(GTESTINC) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(testmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif @@ -713,14 +779,14 @@ $(testmain): LIBFLAGS += -lgomp endif endif -ifeq 
($(NVCC),) # link only runTest.o +ifeq ($(GPUCC),) # link only runTest.o $(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS) $(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS) else # link both runTest.o and runTest_cu.o $(testmain): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) $(GTESTLIBS) - $(NVCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) -lcuda + $(GPUCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS) endif # Use target gtestlibs to build only googletest @@ -829,9 +895,9 @@ ifeq ($(USECCACHE),1) ccache --version | head -1 endif @echo "" - @echo NVCC=$(NVCC) -ifneq ($(NVCC),) - $(NVCC) --version + @echo GPUCC=$(GPUCC) +ifneq ($(GPUCC),) + $(GPUCC) --version endif @echo "" @echo CXX=$(CXX) @@ -850,7 +916,7 @@ endif # Target: check (run the C++ test executable) # [NB THIS IS WHAT IS USED IN THE GITHUB CI!] -ifneq ($(NVCC),) +ifneq ($(GPUCC),) check: runTest cmpFcheck cmpFGcheck else check: runTest cmpFcheck diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/fbridge.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/fbridge.cc index 2d2b36d560..22ce3f5115 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/fbridge.cc +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/fbridge.cc @@ -1,11 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Oct 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "Bridge.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" extern "C" { @@ -22,7 +22,7 @@ extern "C" * Using the same Fortran MadEvent code, linking to the hetrerogeneous library would allow access to both CPU and GPU implementations. * The specific heterogeneous configuration (how many GPUs, how many threads on each CPU, etc) could be loaded in CUDA/C++ from a data file. 
*/ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -46,8 +46,8 @@ extern "C" */ void fbridgecreate_( CppObjectInFortran** ppbridge, const int* pnevtF, const int* pnparF, const int* pnp4F ) { -#ifdef __CUDACC__ - CudaRuntime::setUp(); +#ifdef MGONGPUCPP_GPUIMPL + GpuRuntime::setUp(); #endif // (NB: CPPProcess::initProc no longer needs to be executed here because it is called in the Bridge constructor) // FIXME: disable OMP in Bridge when called from Fortran @@ -65,8 +65,8 @@ extern "C" Bridge* pbridge = dynamic_cast*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgedelete_: invalid Bridge address" ); delete pbridge; -#ifdef __CUDACC__ - CudaRuntime::tearDown(); +#ifdef MGONGPUCPP_GPUIMPL + GpuRuntime::tearDown(); #endif } @@ -96,7 +96,7 @@ extern "C" { Bridge* pbridge = dynamic_cast*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgesequence_: invalid Bridge address" ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Use the device/GPU implementation in the CUDA library // (there is also a host implementation in this library) pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, *pchannelId, mes, selhel, selcol ); diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/fsampler.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/fsampler.cc index 2fb445372d..3743934f41 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/fsampler.cc +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/fsampler.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Feb 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "mgOnGpuConfig.h" @@ -13,7 +13,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -40,7 +40,7 @@ namespace mg5amcCpu private: const int m_nevt; // The number of events in each iteration int m_iiter; // The iteration counter (for random number seeding) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta m_hstRndmom; // Memory buffers for random numbers HostBufferMomenta m_hstMomenta; // Memory buffers for momenta HostBufferWeights m_hstWeights; // Memory buffers for sampling weights @@ -105,7 +105,7 @@ namespace mg5amcCpu extern "C" { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/runTest.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/runTest.cc index d4a760a71b..de327f2321 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/runTest.cc +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/runTest.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
//---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -18,7 +18,7 @@ #include "RandomNumberKernels.h" #include "epoch_process_id.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -35,7 +35,7 @@ struct CUDA_CPU_TestBase : public TestDriverBase : TestDriverBase( npar, refFileName ) {} }; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL struct CPUTest : public CUDA_CPU_TestBase { // Struct data members (process, and memory structures for random numbers, momenta, matrix elements and weights on host and device) @@ -119,7 +119,7 @@ struct CPUTest : public CUDA_CPU_TestBase }; #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL struct CUDATest : public CUDA_CPU_TestBase { // Reset the device when our test goes out of scope. Note that this should happen after @@ -128,7 +128,7 @@ struct CUDATest : public CUDA_CPU_TestBase { ~DeviceReset() { - checkCuda( cudaDeviceReset() ); // this is needed by cuda-memcheck --leak-check full + checkGpu( gpuDeviceReset() ); // this is needed by cuda-memcheck --leak-check full } } deviceResetter; @@ -256,7 +256,7 @@ INSTANTIATE_TEST_SUITE_P( prefix, \ test_suite_name, \ testing::Values( new CUDATest( MG_EPOCH_REFERENCE_FILE_NAME ) ) ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL MG_INSTANTIATE_TEST_SUITE_GPU( XTESTID_GPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); #else MG_INSTANTIATE_TEST_SUITE_CPU( XTESTID_CPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/testmisc.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/testmisc.cc index 895d6eeb56..ba9e59a8a3 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/testmisc.cc +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/testmisc.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*misc to run only testmisc.cc tests //---------------------------------------------------------------------------- @@ -17,7 +17,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_MISC #else #define TESTID( s ) s##_CPU_MISC @@ -26,7 +26,7 @@ #define XTESTID( s ) TESTID( s ) // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -59,7 +59,7 @@ namespace mg5amcCpu TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testmisc ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/testxxx.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/testxxx.cc index 3361fe5aa9..e5167de00c 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/testxxx.cc +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/testxxx.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. 
// Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Apr 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -24,7 +24,7 @@ #include #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_XXX #else #define TESTID( s ) s##_CPU_XXX @@ -32,7 +32,7 @@ #define XTESTID( s ) TESTID( s ) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -42,7 +42,7 @@ namespace mg5amcCpu int FPEhandlerIevt = -1; inline void FPEhandler( int sig ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL std::cerr << "Floating Point Exception (GPU): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; #else std::cerr << "Floating Point Exception (CPU neppV=" << neppV << "): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; @@ -53,7 +53,7 @@ namespace mg5amcCpu TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -77,7 +77,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) assert( nevt % neppM == 0 ); // nevt must be a multiple of neppM assert( nevt % neppV == 0 ); // nevt must be a multiple of neppV // Fill in the input momenta -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL mg5amcGpu::PinnedHostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] #else mg5amcCpu::HostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] @@ -322,7 +322,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { for( int ievt = 0; ievt < nevt; ievt++ ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_tt01g.mad/src/HelAmps_sm.h b/epochX/cudacpp/gg_tt01g.mad/src/HelAmps_sm.h index 361b488401..0dd0f3ebba 100644 --- a/epochX/cudacpp/gg_tt01g.mad/src/HelAmps_sm.h +++ b/epochX/cudacpp/gg_tt01g.mad/src/HelAmps_sm.h @@ -5,7 +5,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -28,7 +28,7 @@ //#include //#include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt01g.mad/src/Parameters_sm.cc b/epochX/cudacpp/gg_tt01g.mad/src/Parameters_sm.cc index 64fc3fea62..067445b198 100644 --- a/epochX/cudacpp/gg_tt01g.mad/src/Parameters_sm.cc +++ b/epochX/cudacpp/gg_tt01g.mad/src/Parameters_sm.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. 
-// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -17,7 +17,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_tt01g.mad/src/Parameters_sm.h b/epochX/cudacpp/gg_tt01g.mad/src/Parameters_sm.h index b6568d3761..9581d66e0e 100644 --- a/epochX/cudacpp/gg_tt01g.mad/src/Parameters_sm.h +++ b/epochX/cudacpp/gg_tt01g.mad/src/Parameters_sm.h @@ -27,7 +27,7 @@ #include "read_slha.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu #include // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -218,7 +218,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -239,7 +239,7 @@ namespace mg5amcCpu #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wunused-variable" // e.g. <> #pragma GCC diagnostic ignored "-Wunused-parameter" // e.g. <> -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma nv_diagnostic push #pragma nv_diag_suppress 177 // e.g. <> #endif @@ -267,7 +267,7 @@ namespace mg5amcCpu // End SM implementation - no special handling of vectors of floats as in EFT (#439) return out; } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma GCC diagnostic pop #pragma nv_diagnostic pop #endif diff --git a/epochX/cudacpp/gg_tt01g.mad/src/cudacpp_src.mk b/epochX/cudacpp/gg_tt01g.mad/src/cudacpp_src.mk index d4cc628aec..159e19a46d 100644 --- a/epochX/cudacpp/gg_tt01g.mad/src/cudacpp_src.mk +++ b/epochX/cudacpp/gg_tt01g.mad/src/cudacpp_src.mk @@ -19,7 +19,7 @@ SHELL := /bin/bash #=== Configure common compiler flags for CUDA and C++ INCFLAGS = -I. 
-OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here +OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here #------------------------------------------------------------------------------- @@ -45,13 +45,13 @@ endif #------------------------------------------------------------------------------- -#=== Configure the CUDA compiler (note: NVCC is already exported including ccache) +#=== Configure the CUDA compiler (note: GPUCC is already exported including ccache) -###$(info NVCC=$(NVCC)) +###$(info GPUCC=$(GPUCC)) #------------------------------------------------------------------------------- -#=== Configure ccache for C++ builds (note: NVCC is already exported including ccache) +#=== Configure ccache for C++ builds (note: GPUCC is already exported including ccache) # Enable ccache if USECCACHE=1 ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) @@ -92,6 +92,13 @@ endif ###$(info OMPFLAGS=$(OMPFLAGS)) CXXFLAGS += $(OMPFLAGS) +# Add correct -DHIP_PLATFORM when compiling for HIP +ifeq ($(findstring nvcc,$(GPUCC)),nvcc) + GPUFLAGS += -Xcompiler -fPIC -c -x cu +else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) + GPUFLAGS += -fPIC -c +endif + # Set the build flags appropriate to each AVX choice (example: "make AVX=none") # [NB MGONGPU_PVW512 is needed because "-mprefer-vector-width=256" is not exposed in a macro] # [See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=96476] @@ -253,20 +260,20 @@ $(BUILDDIR)/%.o : %.cc *.h $(BUILDDIR)/.build.$(TAG) # Generic target and build rules: objects from CUDA compilation $(BUILDDIR)/%_cu.o : %.cc *.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $< -o $@ #------------------------------------------------------------------------------- cxx_objects=$(addprefix $(BUILDDIR)/, Parameters_sm.o read_slha.o) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) cu_objects=$(addprefix $(BUILDDIR)/, Parameters_sm_cu.o) endif # Target (and build rules): common (src) library -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) $(cu_objects) @if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi - $(NVCC) -shared -o $@ $(cxx_objects) $(cu_objects) $(LDFLAGS) + $(GPUCC) -shared -o $@ $(cxx_objects) $(cu_objects) $(LDFLAGS) else $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) @if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi diff --git a/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuConfig.h index 80032e528b..55d03f1252 100644 --- a/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuConfig.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
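(The mgOnGpuConfig.h hunks below are the heart of the abstraction: a single MGONGPUCPP_GPUIMPL macro is defined for any GPU build, from __CUDACC__ under nvcc or __HIPCC__ under hipcc, and undefined for C++ builds, so most code only needs to test "GPU or CPU", while vendor-specific branches still test the compiler macros directly. A minimal sketch of the intended usage pattern, illustrative only and mirroring the namespace idiom used throughout the files above; the backendName helper is hypothetical:)

  #include "mgOnGpuConfig.h"
  #ifdef MGONGPUCPP_GPUIMPL // any GPU build (CUDA or HIP)
  namespace mg5amcGpu
  #else // CPU build
  namespace mg5amcCpu
  #endif
  {
    inline const char* backendName() // hypothetical helper, for illustration only
    {
  #ifdef __CUDACC__
      return "CUDA";
  #elif defined __HIPCC__
      return "HIP";
  #else
      return "C++";
  #endif
    }
  }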
#ifndef MGONGPUCONFIG_H #define MGONGPUCONFIG_H 1 @@ -10,12 +10,25 @@ // There are two different code bases for standalone_cudacpp (without multichannel) and madevent+cudacpp (with multichannel) #define MGONGPU_SUPPORTS_MULTICHANNEL 1 +// Is this a GPU (CUDA, HIP) or CPU implementation? +#ifdef __CUDACC__ +#define MGONGPUCPP_GPUIMPL cuda +#elif defined __HIPCC__ +#define MGONGPUCPP_GPUIMPL hip +#else +#undef MGONGPUCPP_GPUIMPL +#endif + // ** NB1 Throughputs (e.g. 6.8E8) are events/sec for "./gcheck.exe -p 65536 128 12" // ** NB2 Baseline on b7g47n0004 fluctuates (probably depends on load on other VMs) // Choose if curand is supported for generating random numbers +// For HIP, by default, do not use curand (common random numbers will be used instead) // For both CUDA and C++, by default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_CURAND -// (there exist CUDA installations, e.g. using the HPC package, which do not include curand - see PR #784) +// (there exist CUDA installations, e.g. using the HPC package, which do not include curand - see PR #784 and #785) +#if defined __HIPCC__ +#define MGONGPU_HAS_NO_CURAND 1 +#else //#ifdef __CUDACC__ //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 @@ -23,6 +36,7 @@ //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 //#endif +#endif // Choose floating point precision (for everything but color algebra #537) // If one of these macros has been set from outside with e.g. -DMGONGPU_FPTYPE_FLOAT, nothing happens (issue #167) @@ -54,23 +68,28 @@ //#undef MGONGPU_HARDCODE_PARAM // default ////#define MGONGPU_HARDCODE_PARAM 1 -// Complex type in c++: std::complex or cxsmpl (CHOOSE ONLY ONE) -#ifndef __CUDACC__ -//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) -#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) -#endif - -// Complex type in cuda: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) +// Complex type in CUDA: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) #ifdef __CUDACC__ #define MGONGPU_CUCXTYPE_THRUST 1 // default (~1.15E9/double, ~3.2E9/float) //#define MGONGPU_CUCXTYPE_CUCOMPLEX 1 // ~10 percent slower (1.03E9/double, ~2.8E9/float) //#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) + +// Complex type in HIP: cxsmpl (ONLY ONE OPTION POSSIBLE) +#elif defined __HIPCC__ +#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) + +// Complex type in C++: std::complex or cxsmpl (CHOOSE ONLY ONE) +#else +//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) +#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif -// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ -#undef MGONGPU_NSIGHT_DEBUG // default +#undef MGONGPU_NSIGHT_DEBUG // default in CUDA //#define MGONGPU_NSIGHT_DEBUG 1 +#else +#undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif // SANITY CHECKS (floating point precision for everything but color algebra #537) @@ -86,17 +105,21 @@ #error You cannot use double precision for color algebra and single precision elsewhere #endif -// SANITY CHECKS (c++ complex number implementation) -#ifndef __CUDACC__ -#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and 
defined MGONGPU_CPPCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL +// SANITY CHECKS (CUDA complex number implementation) +#ifdef __CUDACC__ +#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX for CUDA +#elif defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CXSMPL for CUDA +#elif defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL for CUDA #endif #endif -// SANITY CHECKS (cuda complex number implementation) -#ifdef __CUDACC__ -#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL +// SANITY CHECKS (C++ complex number implementation) +#ifndef MGONGPUCPP_GPUIMPL +#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL for C++ #endif #endif @@ -134,7 +157,7 @@ namespace mgOnGpu // Alignment requirement for using reinterpret_cast with SIMD vectorized code // (using reinterpret_cast with non aligned memory may lead to segmentation faults!) // Only needed for C++ code but can be enforced also in NVCC builds of C++ code using CUDA>=11.2 and C++17 (#318, #319, #333) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL constexpr int cppAlign = 64; // alignment requirement for SIMD vectorization (64-byte i.e. 
512-bit) #endif @@ -145,7 +168,7 @@ using mgOnGpu::fptype; using mgOnGpu::fptype2; // C++ SIMD vectorization width (this will be used to set neppV) -#ifdef __CUDACC__ // CUDA implementation has no SIMD +#ifdef MGONGPUCPP_GPUIMPL // CUDA and HIP implementations have no SIMD #undef MGONGPU_CPPSIMD #elif defined __AVX512VL__ && defined MGONGPU_PVW512 // C++ "512z" AVX512 with 512 width (512-bit ie 64-byte): 8 (DOUBLE) or 16 (FLOAT) #ifdef MGONGPU_FPTYPE_DOUBLE @@ -175,9 +198,9 @@ using mgOnGpu::fptype2; #undef MGONGPU_CPPSIMD #endif -// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation // Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) -#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */ +#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */ #define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; #define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } #define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } @@ -189,8 +212,8 @@ using mgOnGpu::fptype2; #define mgDebugFinalise() { /*noop*/ } #endif /* clang-format on */ -// Define empty CUDA declaration specifiers for C++ -#ifndef __CUDACC__ +// Define empty CUDA/HIP declaration specifiers for C++ +#ifndef MGONGPUCPP_GPUIMPL #define __global__ #define __host__ #define __device__ diff --git a/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuCxtypes.h b/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuCxtypes.h index ca9a9f00c0..5532e22fa1 100644 --- a/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuCxtypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022, based on earlier work by D. Smith) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
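Note the logic of the new MGONGPUCPP_GPUIMPL block in mgOnGpuConfig.h above: the macro is defined (to the otherwise unused tokens cuda or hip) in GPU builds and left undefined in C++ builds, so all downstream code tests it with #ifdef/#ifndef rather than comparing its value. A minimal sketch of the usage pattern this enables, written here for illustration only (dummy function, not part of the patch):

// One source file, three compilation modes: nvcc defines __CUDACC__,
// hipcc defines __HIPCC__, a plain C++ compiler defines neither.
#include "mgOnGpuConfig.h"
#ifdef MGONGPUCPP_GPUIMPL
namespace mg5amcGpu // GPU build: symbols live in the GPU namespace
#else
namespace mg5amcCpu // C++ build: same code, separate namespace (see #318, #725)
#endif
{
  // __host__ and __device__ are defined away for C++ builds (see the hunk above)
  __host__ __device__ inline fptype addOne( const fptype x ) { return x + 1; }
}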
#ifndef MGONGPUCXTYPES_H #define MGONGPUCXTYPES_H 1 @@ -19,7 +19,7 @@ #include // Complex type in cuda: thrust or cucomplex or cxsmpl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #if defined MGONGPU_CUCXTYPE_THRUST #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wtautological-compare" // for icpx2021/clang13 (https://stackoverflow.com/a/15864661) @@ -82,7 +82,7 @@ namespace mgOnGpu /* clang-format off */ using mgOnGpu::cxsmpl; // Printout to stream for user defined types -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -92,7 +92,7 @@ namespace mg5amcCpu inline __host__ std::ostream& operator<<( std::ostream& out, const cxsmpl& c ) { - out << std::complex( c.real(), c.imag() ); + out << std::complex( c.real(), c.imag() ); return out; } @@ -215,14 +215,14 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu #endif { // --- Type definitions (complex type: cxtype) -#ifdef __CUDACC__ // cuda +#ifdef MGONGPUCPP_GPUIMPL // cuda #if defined MGONGPU_CUCXTYPE_THRUST typedef thrust::complex cxtype; #elif defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -255,7 +255,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -307,7 +307,7 @@ namespace mg5amcCpu //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust +#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust //------------------------------ // CUDA - using thrust::complex @@ -343,11 +343,11 @@ namespace mg5amcCpu return c; } -#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST +#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex +#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex //------------------------------ // CUDA - using cuComplex @@ -562,11 +562,11 @@ namespace mg5amcCpu return cxmake( c.real(), c.imag() ); } -#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX //========================================================================== -#if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex +#if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex //------------------------------ // C++ - using std::complex @@ -610,7 +610,7 @@ namespace mg5amcCpu } #endif -#endif // #if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX +#endif // #if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX //========================================================================== @@ -633,7 +633,7 @@ namespace mg5amcCpu //========================================================================== // 
NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuFptypes.h b/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuFptypes.h index 905c97d700..fa3a02664b 100644 --- a/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuFptypes.h +++ b/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuFptypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUFPTYPES_H #define MGONGPUFPTYPES_H 1 @@ -12,7 +12,7 @@ #include // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // cuda namespace mg5amcGpu #else namespace mg5amcCpu @@ -20,7 +20,7 @@ namespace mg5amcCpu { //========================================================================== -#ifdef __CUDACC__ // cuda +#ifdef MGONGPUCPP_GPUIMPL // cuda //------------------------------ // Floating point types - Cuda @@ -64,11 +64,11 @@ namespace mg5amcCpu #endif } -#endif // #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //------------------------------ // Floating point types - C++ @@ -92,7 +92,7 @@ namespace mg5amcCpu return std::sqrt( f ); } -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== diff --git a/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuVectors.h b/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuVectors.h index e1299ba81e..cdae04326b 100644 --- a/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuVectors.h @@ -32,7 +32,7 @@ #endif // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -131,7 +131,7 @@ namespace mg5amcCpu #endif #endif -#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef __CUDACC__) +#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef MGONGPUCPP_GPUIMPL) const int neppV = 1; @@ -153,13 +153,13 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu #endif { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Printout to stream for user defined types @@ -805,11 +805,11 @@ namespace mg5amcCpu #endif // #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //------------------------------ // Vector types - CUDA @@ -853,12 +853,12 @@ namespace mg5amcCpu 
return mask; } -#endif // #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== // Scalar-or-vector types: scalar in CUDA, vector or scalar in C++ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL typedef bool bool_sv; typedef fptype fptype_sv; typedef fptype2 fptype2_sv; @@ -879,7 +879,7 @@ namespace mg5amcCpu #endif // Scalar-or-vector zeros: scalar in CUDA, vector or scalar in C++ -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ inline __host__ __device__ cxtype cxzero_sv(){ return cxtype( 0, 0 ); } #elif defined MGONGPU_CPPSIMD inline cxtype_v cxzero_sv() { return cxtype_v(); } // RRRR=0000 IIII=0000 diff --git a/epochX/cudacpp/gg_tt01g.mad/src/rambo.h b/epochX/cudacpp/gg_tt01g.mad/src/rambo.h index e02ea52496..cd7e1008ea 100644 --- a/epochX/cudacpp/gg_tt01g.mad/src/rambo.h +++ b/epochX/cudacpp/gg_tt01g.mad/src/rambo.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -18,7 +18,7 @@ #include // Simplified rambo version for 2 to N (with N>=2) processes with massless particles -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +83,7 @@ namespace mg5amcCpu static bool first = true; if( first ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if constexpr( M_ACCESS::isOnDevice() ) // avoid { const int ievt0 = 0; @@ -166,7 +166,7 @@ namespace mg5amcCpu wt = po2log; if( nparf != 2 ) wt = ( 2. * nparf - 4. ) * log( energy ) + z[nparf - 1]; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // issue warnings if weight is too small or too large static int iwarn[5] = { 0, 0, 0, 0, 0 }; if( wt < -180. ) diff --git a/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt b/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt index 37ba5c7297..00ae96c5fb 100644 --- a/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt +++ b/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt @@ -52,7 +52,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt -No valid web browser found. Please set in ./input/mg5_configuration.txt +Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg.mg The import format was not given, so we guess it as command set stdout_level DEBUG @@ -62,7 +62,7 @@ generate g g > t t~ g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005791187286376953  +DEBUG: model prefixing takes 0.0055010318756103516  INFO: Restrict model sm with file models/sm/restrict_default.dat . 
DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -175,7 +175,7 @@ INFO: Generating Helas calls for process: g g > t t~ g WEIGHTED<=3 @1 INFO: Processing color information for process: g g > t t~ g @1 INFO: Creating files in directory P1_gg_ttxg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -191,14 +191,14 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. INFO: Generating Feynman diagrams for Process: g g > t t~ g WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxg Generated helas calls for 1 subprocesses (16 diagrams) in 0.039 s -Wrote files for 36 helas calls in 0.153 s +Wrote files for 36 helas calls in 0.184 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 5 routines in 0.331 s +ALOHA: aloha creates 5 routines in 0.325 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines @@ -206,7 +206,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 10 routines in 0.315 s +ALOHA: aloha creates 10 routines in 0.310 s VVV1 VVV1 FFV1 @@ -252,9 +252,9 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. quit -real 0m2.208s -user 0m1.988s -sys 0m0.221s +real 0m2.571s +user 0m1.941s +sys 0m0.238s Code generation completed in 2 seconds ************************************************************ * * @@ -281,7 +281,7 @@ INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amc INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards/me5_configuration.txt Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt -No valid web browser found. Please set in ./input/mg5_configuration.txt +Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards run quit INFO: @@ -311,7 +311,7 @@ INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amc INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards/me5_configuration.txt Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt -No valid web browser found. Please set in ./input/mg5_configuration.txt +Using default web browser "firefox". 
Set another one in ./input/mg5_configuration.txt treatcards param quit INFO: diff --git a/epochX/cudacpp/gg_ttg.mad/COPYRIGHT b/epochX/cudacpp/gg_ttg.mad/COPYRIGHT index a134b5fef9..84a883fbb0 100644 --- a/epochX/cudacpp/gg_ttg.mad/COPYRIGHT +++ b/epochX/cudacpp/gg_ttg.mad/COPYRIGHT @@ -15,6 +15,7 @@ The full development team currently includes the following authors : Stephan Hageboeck (CERN) Olivier Mattelaer (Universite Catholique de Louvain, original author) Stefan Roiser (CERN, original author) + Joergen Teig (CERN) Andrea Valassi (CERN, original author) Zenny Wettersten (CERN) See https://github.com/madgraph5/madgraph4gpu for more details. For the full diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/Bridge.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/Bridge.h index bf8b5e024d..89437b4c42 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/Bridge.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef BRIDGE_H #define BRIDGE_H 1 @@ -23,7 +23,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +83,7 @@ namespace mg5amcCpu Bridge& operator=( const Bridge& ) = delete; Bridge& operator=( Bridge&& ) = delete; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL /** * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != gpublocks*gputhreads * (this is needed for BridgeKernel tests rather than for actual production use in Fortran) @@ -150,7 +150,7 @@ namespace mg5amcCpu unsigned int m_nevt; // number of events int m_nGoodHel; // the number of good helicities (-1 initially when they have not yet been calculated) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL int m_gputhreads; // number of gpu threads (default set from number of events, can be modified) int m_gpublocks; // number of gpu blocks (default set from number of events, can be modified) DeviceBuffer m_devMomentaF; @@ -187,12 +187,12 @@ namespace mg5amcCpu // Forward declare transposition methods // -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL template void hst_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); @@ -209,7 +209,7 @@ namespace mg5amcCpu Bridge::Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ) : m_nevt( nevtF ) , m_nGoodHel( -1 ) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL , m_gputhreads( 256 ) // default number of gpu threads , m_gpublocks( m_nevt / m_gputhreads ) // this ensures m_nevt <= m_gpublocks*m_gputhreads , m_devMomentaF( m_nevt ) @@ -233,7 +233,7 @@ namespace mg5amcCpu { if( nparF != CPPProcess::npar ) throw std::runtime_error( "Bridge constructor: npar mismatch" ); if( np4F != CPPProcess::np4 ) throw std::runtime_error( "Bridge constructor: np4 mismatch" ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( ( m_nevt < s_gputhreadsmin ) || ( m_nevt % s_gputhreadsmin != 0 ) ) throw std::runtime_error( "Bridge constructor: nevt should be a multiple of " + std::to_string( s_gputhreadsmin ) ); while( m_nevt != m_gpublocks * m_gputhreads ) 
@@ -249,7 +249,7 @@ namespace mg5amcCpu #else std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate is called from several Fortran threads? @@ -262,7 +262,7 @@ namespace mg5amcCpu process.initProc( paramCard ); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template<typename FORTRANFPTYPE> void Bridge<FORTRANFPTYPE>::set_gpugrid( const int gpublocks, const int gputhreads ) { @@ -276,7 +276,7 @@ namespace mg5amcCpu } #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template<typename FORTRANFPTYPE> void Bridge<FORTRANFPTYPE>::gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -291,14 +291,14 @@ namespace mg5amcCpu constexpr int neppM = MemoryAccessMomenta::neppM; if constexpr( neppM == 1 && std::is_same_v<FORTRANFPTYPE, fptype> ) { - checkCuda( cudaMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), gpuMemcpyHostToDevice ); } else { - checkCuda( cudaMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), gpuMemcpyHostToDevice ); const int thrPerEvt = CPPProcess::npar * CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 event per thread) //const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... this seems slower - dev_transposeMomentaF2C<<<m_gpublocks * thrPerEvt, m_gputhreads>>>( m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); + gpuLaunchKernel( dev_transposeMomentaF2C, m_gpublocks * thrPerEvt, m_gputhreads, m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); } if constexpr( std::is_same_v<FORTRANFPTYPE, fptype> ) { @@ -341,7 +341,7 @@ namespace mg5amcCpu } #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL template<typename FORTRANFPTYPE> void Bridge<FORTRANFPTYPE>::cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -396,7 +396,7 @@ namespace mg5amcCpu // - C++ array: momenta[npagM][npar][np4][neppM] with nevt=npagM*neppM (AOSOA) // -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template<typename Tin, typename Tout> __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ) { diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/BridgeKernels.cc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/BridgeKernels.cc index d58066c9c1..eaf4037a24 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/BridgeKernels.cc +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/BridgeKernels.cc @@ -1,17 +1,18 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
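The gpu_sequence hunk above shows the two mechanical substitutions that recur throughout this patch: checkCuda( cudaMemcpy( ... ) ) becomes gpuMemcpy( ... ) (the error check moves inside the macro), and triple-chevron launches become gpuLaunchKernel( kernel, blocks, threads, args ). A self-contained illustration of the same pattern follows (hypothetical kernel and sizes, not code from the patch):

#include "GpuAbstraction.h"
#include "GpuRuntime.h" // defines checkGpu/assertGpu

__global__ void scaleByTwo( double* data, const int n )
{
  const int i = blockDim.x * blockIdx.x + threadIdx.x; // one element per thread
  if( i < n ) data[i] *= 2;
}

void scaleOnDevice( double* hstData, const int n )
{
  double* devData;
  gpuMalloc( &devData, n * sizeof( double ) ); // cudaMalloc or hipMalloc, wrapped in checkGpu
  gpuMemcpy( devData, hstData, n * sizeof( double ), gpuMemcpyHostToDevice );
  gpuLaunchKernel( scaleByTwo, ( n + 255 ) / 256, 256, devData, n ); // expands to scaleByTwo<<<blocks, threads>>>( devData, n )
  checkGpu( gpuPeekAtLastError() );
  gpuMemcpy( hstData, devData, n * sizeof( double ), gpuMemcpyDeviceToHost );
  gpuFree( devData );
}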
#include "BridgeKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMomenta.h" #include //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -45,7 +46,7 @@ namespace mg5amcCpu //============================================================================ -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu { @@ -96,7 +97,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/BridgeKernels.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/BridgeKernels.h index 15eb4bff4d..3efef8ce97 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/BridgeKernels.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/BridgeKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef BRIDGEKERNELS_H #define BRIDGEKERNELS_H 1 @@ -12,7 +12,7 @@ #include "MatrixElementKernels.h" #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -49,7 +49,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a CPU host class BridgeKernelHost final : public BridgeKernelBase { @@ -89,7 +89,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a GPU device class BridgeKernelDevice : public BridgeKernelBase { diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/CommonRandomNumberKernel.cc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/CommonRandomNumberKernel.cc index 985b39f576..010bc4cbd0 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/CommonRandomNumberKernel.cc +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/CommonRandomNumberKernel.cc @@ -1,15 +1,16 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "CommonRandomNumbers.h" +#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/CrossSectionKernels.cc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/CrossSectionKernels.cc index 0b355a3c8d..c15b39844d 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/CrossSectionKernels.cc +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/CrossSectionKernels.cc @@ -1,10 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. 
Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "CrossSectionKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessWeights.h" #include "MemoryBuffers.h" @@ -77,7 +78,7 @@ debug_me_is_abnormal( const fptype& me, size_t ievtALL ) //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -185,7 +186,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/CrossSectionKernels.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/CrossSectionKernels.h index 7933ca4bbf..4d9659e04e 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/CrossSectionKernels.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/CrossSectionKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef CROSSSECTIONKERNELS_H #define CROSSSECTIONKERNELS_H 1 @@ -13,7 +13,7 @@ //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -96,7 +96,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating the calculation of event statistics on a GPU device class CrossSectionKernelDevice : public CrossSectionKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/CudaRuntime.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/CudaRuntime.h deleted file mode 100644 index 64ce52f4b3..0000000000 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/CudaRuntime.h +++ /dev/null @@ -1,85 +0,0 @@ -// Copyright (C) 2020-2023 CERN and UCLouvain. -// Licensed under the GNU Lesser General Public License (version 3 or later). -// Created by: S. Roiser (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. - -#ifndef MG5AMC_CUDARUNTIME_H -#define MG5AMC_CUDARUNTIME_H 1 - -// MG5AMC on GPU uses the CUDA runtime API, not the lower level CUDA driver API -// See https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html#driver-vs-runtime-api - -#include -#include - -//-------------------------------------------------------------------------- - -// See https://stackoverflow.com/a/14038590 -#ifdef __CUDACC__ /* clang-format off */ -#define checkCuda( code ) { assertCuda( code, __FILE__, __LINE__ ); } -inline void assertCuda( cudaError_t code, const char* file, int line, bool abort = true ) -{ - if( code != cudaSuccess ) - { - printf( "ERROR! 
assertCuda: '%s' (%d) in %s:%d\n", cudaGetErrorString( code ), code, file, line ); - if( abort ) assert( code == cudaSuccess ); - } -} -#endif /* clang-format on */ - -//-------------------------------------------------------------------------- - -#ifdef __CUDACC__ -namespace mg5amcGpu -{ - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - // *** FIXME! This will all need to be designed differently when going to multi-GPU nodes! *** - struct CudaRuntime final - { - CudaRuntime( const bool debug = true ) - : m_debug( debug ) { setUp( m_debug ); } - ~CudaRuntime() { tearDown( m_debug ); } - CudaRuntime( const CudaRuntime& ) = delete; - CudaRuntime( CudaRuntime&& ) = delete; - CudaRuntime& operator=( const CudaRuntime& ) = delete; - CudaRuntime& operator=( CudaRuntime&& ) = delete; - bool m_debug; - - // Set up CUDA application - // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** - // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization - static void setUp( const bool debug = true ) - { - // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization - // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! - /* - // [We initially added cudaFree(0) to "ease profile analysis" only because it shows up as a big recognizable block!] - // No explicit initialization is needed: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#initialization - // It is not clear what cudaFree(0) does at all: https://stackoverflow.com/questions/69967813/ - if ( debug ) std::cout << "__CudaRuntime: calling cudaFree(0)" << std::endl; - checkCuda( cudaFree( 0 ) ); // SLOW! - */ - // Replace cudaFree(0) by cudaSetDevice(0), even if it is not really needed either - // (but see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs) - if( debug ) std::cout << "__CudaRuntime: calling cudaSetDevice(0)" << std::endl; - checkCuda( cudaSetDevice( 0 ) ); // SLOW! - } - - // Tear down CUDA application (call cudaDeviceReset) - // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** - // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck - // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking - static void tearDown( const bool debug = true ) - { - if( debug ) std::cout << "__CudaRuntime: calling cudaDeviceReset()" << std::endl; - checkCuda( cudaDeviceReset() ); - } - }; - -} -#endif - -//-------------------------------------------------------------------------- - -#endif // MG5AMC_CUDARUNTIME_H diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/CurandRandomNumberKernel.cc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/CurandRandomNumberKernel.cc index eb56333b03..08a16f6f2c 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/CurandRandomNumberKernel.cc +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/CurandRandomNumberKernel.cc @@ -1,9 +1,9 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. 
Valassi (2021-2023) for the MG5aMC CUDACPP plugin. -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" @@ -22,7 +22,7 @@ inline void assertCurand( curandStatus_t code, const char *file, int line, bool } #endif /* clang-format on */ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -36,7 +36,7 @@ namespace mg5amcCpu { if( m_isOnDevice ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !m_rnarray.isOnDevice() ) throw std::runtime_error( "CurandRandomNumberKernel on device with a host random number array" ); #else @@ -114,7 +114,7 @@ namespace mg5amcCpu /* printf( "\nCurandRandomNumberKernel::generateRnarray size = %d\n", (int)m_rnarray.size() ); fptype* data = m_rnarray.data(); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( m_rnarray.isOnDevice() ) { data = new fptype[m_rnarray.size()](); @@ -123,7 +123,7 @@ namespace mg5amcCpu #endif for( int i = 0; i < ( (int)m_rnarray.size() / 4 ); i++ ) printf( "[%4d] %f %f %f %f\n", i * 4, data[i * 4], data[i * 4 + 2], data[i * 4 + 2], data[i * 4 + 3] ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( m_rnarray.isOnDevice() ) delete[] data; #endif */ diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/EventStatistics.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/EventStatistics.h index 48b51e0a49..b425a5bade 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/EventStatistics.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/EventStatistics.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef EventStatistics_H #define EventStatistics_H 1 @@ -16,7 +16,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/GpuAbstraction.h new file mode 100644 index 0000000000..6a7d9c05c0 --- /dev/null +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/GpuAbstraction.h @@ -0,0 +1,71 @@ +// Copyright (C) 2020-2023 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: J. Teig (Jul 2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
+ +#ifndef MG5AMC_GPUABSTRACTION_H +#define MG5AMC_GPUABSTRACTION_H 1 + +#include <cassert> + +//-------------------------------------------------------------------------- + +#ifdef __CUDACC__ + +#define gpuError_t cudaError_t +#define gpuPeekAtLastError cudaPeekAtLastError +#define gpuGetErrorString cudaGetErrorString +#define gpuSuccess cudaSuccess + +#define gpuMallocHost( ptr, size ) checkGpu( cudaMallocHost( ptr, size ) ) +#define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) ) + +#define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( cudaMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemcpyHostToDevice cudaMemcpyHostToDevice +#define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost +#define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( cudaMemcpyToSymbol( type1, type2, size ) ) + +#define gpuFree( ptr ) checkGpu( cudaFree( ptr ) ) +#define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) ) + +#define gpuSetDevice cudaSetDevice +#define gpuDeviceSynchronize cudaDeviceSynchronize +#define gpuDeviceReset cudaDeviceReset + +#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ ) +#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ ) + +//-------------------------------------------------------------------------- + +#elif defined __HIPCC__ + +#include "hip/hip_runtime.h" + +#define gpuError_t hipError_t +#define gpuPeekAtLastError hipPeekAtLastError +#define gpuGetErrorString hipGetErrorString +#define gpuSuccess hipSuccess + +#define gpuMallocHost( ptr, size ) checkGpu( hipHostMalloc( ptr, size ) ) // hipHostMalloc preferred (hipMallocHost is deprecated) +#define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) ) + +#define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( hipMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemcpyHostToDevice hipMemcpyHostToDevice +#define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost +#define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( hipMemcpyToSymbol( type1, type2, size ) ) + +#define gpuFree( ptr ) checkGpu( hipFree( ptr ) ) +#define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) ) + +#define gpuSetDevice hipSetDevice +#define gpuDeviceSynchronize hipDeviceSynchronize +#define gpuDeviceReset hipDeviceReset + +#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ ) +#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ ) + +//-------------------------------------------------------------------------- + +#endif + +#endif // MG5AMC_GPUABSTRACTION_H diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/GpuRuntime.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/GpuRuntime.h new file mode 100644 index 0000000000..93579ef08b --- /dev/null +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/GpuRuntime.h @@ -0,0 +1,85 @@ +// Copyright (C) 2020-2023 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: J. Teig (Jun 2023, based on earlier work by S. Roiser) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
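GpuAbstraction.h is deliberately flat: each gpu* name maps one-to-one onto a cuda* or hip* symbol, and every call that returns an error code is wrapped in checkGpu (defined in GpuRuntime.h, next). Extending the abstraction means adding the same macro on both sides of the #elif; for example, a hypothetical gpuMemset (not part of this patch) would follow the established pattern:

#ifdef __CUDACC__
#define gpuMemset( ptr, value, bytes ) checkGpu( cudaMemset( ptr, value, bytes ) )
#elif defined __HIPCC__
#define gpuMemset( ptr, value, bytes ) checkGpu( hipMemset( ptr, value, bytes ) )
#endif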
+ +#ifndef MG5AMC_GPURUNTIME_H +#define MG5AMC_GPURUNTIME_H 1 + +// MG5AMC on GPU uses the CUDA runtime API, not the lower level CUDA driver API +// See https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html#driver-vs-runtime-api + +#include "GpuAbstraction.h" + +#include <iostream> + +//-------------------------------------------------------------------------- + +// See https://stackoverflow.com/a/14038590 +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#define checkGpu( code ) { assertGpu( code, __FILE__, __LINE__ ); } +inline void assertGpu( gpuError_t code, const char* file, int line, bool abort = true ) +{ + if( code != gpuSuccess ) + { + printf( "ERROR! assertGpu: '%s' (%d) in %s:%d\n", gpuGetErrorString( code ), code, file, line ); + if( abort ) assert( code == gpuSuccess ); + } +} +#endif /* clang-format on */ + +//-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +{ + // Instantiate a GpuRuntime at the beginning of the application's main to + // invoke gpuSetDevice(0) in the constructor and book a gpuDeviceReset() call in the destructor + // *** FIXME! This will all need to be designed differently when going to multi-GPU nodes! *** + struct GpuRuntime final + { + GpuRuntime( const bool debug = true ) + : m_debug( debug ) { setUp( m_debug ); } + ~GpuRuntime() { tearDown( m_debug ); } + GpuRuntime( const GpuRuntime& ) = delete; + GpuRuntime( GpuRuntime&& ) = delete; + GpuRuntime& operator=( const GpuRuntime& ) = delete; + GpuRuntime& operator=( GpuRuntime&& ) = delete; + bool m_debug; + + // Set up CUDA application + // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** + // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization + static void setUp( const bool debug = true ) + { + // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization + // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! + /* + // [We initially added cudaFree(0) to "ease profile analysis" only because it shows up as a big recognizable block!] + // No explicit initialization is needed: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#initialization + // It is not clear what cudaFree(0) does at all: https://stackoverflow.com/questions/69967813/ + if ( debug ) std::cout << "__CudaRuntime: calling cudaFree(0)" << std::endl; + checkCuda( cudaFree( 0 ) ); // SLOW! + */ + // Replace cudaFree(0) by cudaSetDevice(0), even if it is not really needed either + // (but see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs) + if( debug ) std::cout << "__GpuRuntime: calling gpuSetDevice(0)" << std::endl; + checkGpu( gpuSetDevice( 0 ) ); // SLOW! 
+ } + + // Tear down CUDA application (call cudaDeviceReset) + // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** + // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck + // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking + static void tearDown( const bool debug = true ) + { + if( debug ) std::cout << "__GpuRuntime: calling gpuDeviceReset()" << std::endl; + checkGpu( gpuDeviceReset() ); + } + }; +} +#endif + +//-------------------------------------------------------------------------- + +#endif // MG5AMC_GPURUNTIME_H diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MadgraphTest.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MadgraphTest.h index ef40624c88..a64c05c26a 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MadgraphTest.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MadgraphTest.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Dec 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #ifndef MADGRAPHTEST_H_ #define MADGRAPHTEST_H_ 1 @@ -22,7 +22,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; #endif @@ -201,7 +201,7 @@ class MadgraphTest : public testing::TestWithParam // Since we link both the CPU-only and GPU tests into the same executable, we prevent // a multiply defined symbol by only compiling this in the non-CUDA phase: -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL /// Compare momenta and matrix elements. /// This uses an implementation of TestDriverBase to run a madgraph workflow, @@ -307,6 +307,6 @@ TEST_P( MadgraphTest, CompareMomentaAndME ) } } -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL #endif /* MADGRAPHTEST_H_ */ diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MatrixElementKernels.cc index 74b5239ebf..81699dfea9 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MatrixElementKernels.cc @@ -1,12 +1,12 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "MatrixElementKernels.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" // Includes the abstraction for Nvidia/AMD compilation #include "MemoryAccessMomenta.h" #include "MemoryBuffers.h" @@ -14,7 +14,7 @@ //============================================================================ -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu { @@ -150,7 +150,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { @@ -209,13 +209,13 @@ namespace mg5amcGpu PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); DeviceBufferHelicityMask devIsGoodHel( ncomb ); // ... 0d1. 
Compute good helicity mask on the device - computeDependentCouplings<<<m_gpublocks, m_gputhreads>>>( m_gs.data(), m_couplings.data() ); + gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - sigmaKin_getGoodHel<<<m_gpublocks, m_gputhreads>>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); + gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); #else - sigmaKin_getGoodHel<<<m_gpublocks, m_gputhreads>>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); + gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); #endif - checkCuda( cudaPeekAtLastError() ); + checkGpu( gpuPeekAtLastError() ); // ... 0d2. Copy back good helicity mask to the host copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); // ... 0d3. Copy back good helicity list to constant memory on the device @@ -226,19 +226,19 @@ namespace mg5amcGpu void MatrixElementKernelDevice::computeMatrixElements( const unsigned int channelId ) { - computeDependentCouplings<<<m_gpublocks, m_gputhreads>>>( m_gs.data(), m_couplings.data() ); + gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); #ifndef MGONGPU_NSIGHT_DEBUG constexpr unsigned int sharedMemSize = 0; #else constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - sigmaKin<<<m_gpublocks, m_gputhreads, sharedMemSize>>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); + gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); #else - sigmaKin<<<m_gpublocks, m_gputhreads, sharedMemSize>>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); + gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); #endif - checkCuda( cudaPeekAtLastError() ); - checkCuda( cudaDeviceSynchronize() ); + checkGpu( gpuPeekAtLastError() ); + checkGpu( gpuDeviceSynchronize() ); } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MatrixElementKernels.h index 23e84757a2..72bd8f195b 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MatrixElementKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
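The device-side hunks above always follow a kernel launch with checkGpu( gpuPeekAtLastError() ), and computeMatrixElements additionally calls checkGpu( gpuDeviceSynchronize() ). The reason: launches are asynchronous and return no status, so the peek catches launch-configuration errors immediately, while the synchronize surfaces errors raised during kernel execution (computeGoodHelicities skips it, presumably because the subsequent device-to-host copy of the helicity mask synchronizes implicitly). A condensed sketch of the idiom with a dummy kernel (illustrative, not patch code):

__global__ void fillOnes( double* data, const int n )
{
  const int i = blockDim.x * blockIdx.x + threadIdx.x;
  if( i < n ) data[i] = 1;
}

void launchChecked( double* devData, const int n, const int gpublocks, const int gputhreads )
{
  gpuLaunchKernel( fillOnes, gpublocks, gputhreads, devData, n );
  checkGpu( gpuPeekAtLastError() );   // invalid grid/block configuration, etc.
  checkGpu( gpuDeviceSynchronize() ); // errors raised while the kernel was running
}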
#ifndef MATRIXELEMENTKERNELS_H #define MATRIXELEMENTKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -81,7 +81,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a CPU host class MatrixElementKernelHost final : public MatrixElementKernelBase, public NumberOfEvents { @@ -130,7 +130,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a GPU device class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessAmplitudes.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessAmplitudes.h index 573b3bbbc9..ffb76e93de 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessAmplitudes.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessAmplitudes.h @@ -15,7 +15,7 @@ #define MGONGPU_TRIVIAL_AMPLITUDES 1 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessCouplings.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessCouplings.h index 35a3af42e0..3afdf3e554 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessCouplings.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessCouplings.h @@ -15,7 +15,7 @@ #include "MemoryBuffers.h" // for HostBufferCouplings::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessCouplingsFixed.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessCouplingsFixed.h index dc0d93afff..ffcdf4dbef 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessCouplingsFixed.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessCouplingsFixed.h @@ -14,7 +14,7 @@ //#include "MemoryAccessHelpers.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessDenominators.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessDenominators.h index 3bce635718..66f2d32a6b 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessDenominators.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessDenominators.h @@ -10,7 +10,7 @@ #include "MemoryAccessGs.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessGs.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessGs.h index 31311aa375..4c726b30f3 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessGs.h +++ 
b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessGs.h @@ -13,7 +13,7 @@ #include "MemoryBuffers.h" // for HostBufferMatrixElements::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessHelpers.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessHelpers.h index c82a6c7635..db73e4e064 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessHelpers.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessHelpers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessHelpers_H #define MemoryAccessHelpers_H 1 @@ -105,7 +105,7 @@ class KernelAccessHelper : public MemoryAccessHelper } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid //printf( "kernelAccessRecord: ievt=%d threadId=%d\n", ievt, threadIdx.x ); return T::ieventAccessRecord( buffer, ievt ); // NB fptype and fptype_sv coincide for CUDA diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessMatrixElements.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessMatrixElements.h index f32e6fea5b..3741011971 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessMatrixElements.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessMatrixElements.h @@ -13,7 +13,7 @@ #include "MemoryBuffers.h" // for HostBufferMatrixElements::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessMomenta.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessMomenta.h index 29266de32c..3be229d392 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessMomenta.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessMomenta.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
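Every MemoryAccess* header in this series repeats the same four-line namespace switch. As the recurring NB comment explains (see #318 and #725), the same type names are defined differently for CPU and GPU builds, so each build keeps its symbols in its own namespace (mg5amcCpu or mg5amcGpu) and the CPU and GPU object files can be linked into one executable without one-definition-rule clashes. Schematically, under the stated assumption that fptype_sv is a SIMD vector on CPU and a scalar on GPU (the hunks further above say as much), with a hypothetical type for illustration:

#ifdef MGONGPUCPP_GPUIMPL
namespace mg5amcGpu
#else
namespace mg5amcCpu
#endif
{
  // Same name, different layout per build: isolating the namespaces keeps
  // the two definitions from ever meeting at link time.
  struct Momentum4 { fptype_sv p[4]; };
}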
#ifndef MemoryAccessMomenta_H #define MemoryAccessMomenta_H 1 @@ -13,7 +13,7 @@ #include "MemoryAccessVectors.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -30,7 +30,7 @@ namespace mg5amcCpu // Number of Events Per Page in the momenta AOSOA memory buffer layout // (these are all best kept as a compile-time constants: see issue #23) -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ // ----------------------------------------------------------------------------------------------- // --- GPUs: neppM is best set to a power of 2 times the number of fptype's in a 32-byte cacheline // --- This is relevant to ensure coalesced access to momenta in global memory diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessNumerators.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessNumerators.h index b152183b28..18991f4fa6 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessNumerators.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessNumerators.h @@ -10,7 +10,7 @@ #include "MemoryAccessGs.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessRandomNumbers.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessRandomNumbers.h index e2988d39f3..40cb089135 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessRandomNumbers.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessRandomNumbers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessRandomNumbers_H #define MemoryAccessRandomNumbers_H 1 @@ -11,7 +11,7 @@ #include "CPPProcess.h" #include "MemoryAccessHelpers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessVectors.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessVectors.h index e9b197368e..08faccff0f 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessVectors.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessVectors.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
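The neppM comment above is the heart of the momenta layout: the buffer is an AOSOA, momenta[npagM][npar][np4][neppM] with nevt = npagM * neppM (as spelled out in the Bridge.h hunk earlier), and on GPUs neppM is chosen so that the events of one page fill whole cachelines. A sketch of the flat-index arithmetic implied by that layout (helper written for illustration only; the real accessors live in MemoryAccessMomenta.h):

inline size_t momentaIndex( const size_t ievt, const int ipar, const int ip4,
                            const int npar, const int np4, const int neppM )
{
  const size_t ipagM = ievt / neppM; // page: outer AOS index
  const size_t ieppM = ievt % neppM; // event within page: inner SOA index
  return ( ( ipagM * npar + ipar ) * np4 + ip4 ) * neppM + ieppM;
}

Consecutive GPU threads handle consecutive ievt, hence consecutive ieppM for fixed (ipar, ip4): global-memory reads of one momentum component coalesce across a warp.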
#ifndef MemoryAccessVectors_H #define MemoryAccessVectors_H 1 @@ -10,7 +10,7 @@ #include "mgOnGpuVectors.h" -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu // this is only needed for CPU SIMD vectorization { diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessWavefunctions.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessWavefunctions.h index 5428aaf933..33bef4559e 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessWavefunctions.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessWavefunctions.h @@ -15,7 +15,7 @@ #define MGONGPU_TRIVIAL_WAVEFUNCTIONS 1 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryBuffers.h index 3093e6ed18..7756a71621 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryBuffers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021, based on earlier work by S. Hageboeck) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryBuffers_H #define MemoryBuffers_H 1 @@ -11,12 +11,12 @@ #include "mgOnGpuCxtypes.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "Parameters_sm.h" #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -87,7 +87,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL constexpr bool HostBufferALIGNED = false; // ismisaligned=false constexpr bool HostBufferMISALIGNED = true; // ismisaligned=true @@ -119,7 +119,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer template class PinnedHostBufferBase : public BufferBase @@ -128,18 +128,18 @@ namespace mg5amcCpu PinnedHostBufferBase( const size_t size ) : BufferBase( size, false ) { - checkCuda( cudaMallocHost( &( this->m_data ), this->bytes() ) ); + gpuMallocHost( &( this->m_data ), this->bytes() ); } virtual ~PinnedHostBufferBase() { - checkCuda( cudaFreeHost( this->m_data ) ); + gpuFreeHost( this->m_data ); } }; #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer template class DeviceBufferBase : public BufferBase @@ -148,18 +148,18 @@ namespace mg5amcCpu DeviceBufferBase( const size_t size ) : BufferBase( size, true ) { - checkCuda( cudaMalloc( &( this->m_data ), this->bytes() ) ); + gpuMalloc( &( this->m_data ), this->bytes() ); } virtual ~DeviceBufferBase() { - checkCuda( cudaFree( this->m_data ) ); + gpuFree( this->m_data ); } }; #endif //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for a given number of events template class 
HostBuffer : public HostBufferBase, virtual private NumberOfEvents @@ -175,7 +175,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer for a given number of events template class PinnedHostBuffer : public PinnedHostBufferBase, virtual private NumberOfEvents @@ -191,7 +191,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer for a given number of events template class DeviceBuffer : public DeviceBufferBase, virtual private NumberOfEvents @@ -213,7 +213,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta random numbers constexpr size_t sizePerEventRndNumMomenta = MemoryBuffers::np4 * MemoryBuffers::nparf; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta random numbers typedef HostBuffer HostBufferRndNumMomenta; #else @@ -232,7 +232,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer with ONE fptype per event constexpr size_t sizePerEventOneFp = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer with ONE fptype per event typedef HostBuffer HostBufferOneFp; #else @@ -257,7 +257,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for Gs constexpr size_t sizePerEventGs = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferGs; #else @@ -276,7 +276,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for numerators constexpr size_t sizePerEventNumerators = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferNumerators; #else @@ -296,7 +296,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for denominators constexpr size_t sizePerEventDenominators = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferDenominators; #else @@ -315,7 +315,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for random numbers constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferCouplings; #else @@ -333,7 +333,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta constexpr size_t sizePerEventMomenta = MemoryBuffers::np4 * MemoryBuffers::npar; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta typedef HostBuffer HostBufferMomenta; //typedef HostBuffer HostBufferMomenta; // TEST MISALIGNMENT! 
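The buffer classes above now call gpuMallocHost, gpuMalloc, gpuFree and gpuFreeHost (and, further down, gpuMemcpy) instead of the explicit checkCuda( cuda* ) pairs. The new GpuAbstraction.h header that defines these names is created by this patch but its contents are not shown in these hunks; the following is only a minimal sketch of the kind of CUDA/HIP mapping it could provide, where the checkGpu error-checking helper is an assumption named by analogy with the old checkCuda:

    #include <cassert>
    #if defined __CUDACC__ // CUDA build: map gpu* onto the cuda* runtime API
    inline void checkGpu( cudaError_t code ) { assert( code == cudaSuccess ); } // assumption: the real helper may print and abort instead
    #define gpuMalloc( ptr, bytes ) checkGpu( cudaMalloc( ptr, bytes ) )
    #define gpuMallocHost( ptr, bytes ) checkGpu( cudaMallocHost( ptr, bytes ) )
    #define gpuFree( ptr ) checkGpu( cudaFree( ptr ) )
    #define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) )
    #define gpuMemcpy( dst, src, bytes, kind ) checkGpu( cudaMemcpy( dst, src, bytes, kind ) )
    #define gpuMemcpyHostToDevice cudaMemcpyHostToDevice
    #define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost
    #elif defined __HIPCC__ // HIP build: map gpu* onto the hip* runtime API
    #include "hip/hip_runtime.h"
    inline void checkGpu( hipError_t code ) { assert( code == hipSuccess ); }
    #define gpuMalloc( ptr, bytes ) checkGpu( hipMalloc( ptr, bytes ) )
    #define gpuMallocHost( ptr, bytes ) checkGpu( hipHostMalloc( ptr, bytes ) ) // NB: hipMallocHost is deprecated
    #define gpuFree( ptr ) checkGpu( hipFree( ptr ) )
    #define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) )
    #define gpuMemcpy( dst, src, bytes, kind ) checkGpu( hipMemcpy( dst, src, bytes, kind ) )
    #define gpuMemcpyHostToDevice hipMemcpyHostToDevice
    #define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost
    #endif

With this indirection the buffer classes themselves need no #ifdef on the vendor API at all, only on MGONGPUCPP_GPUIMPL.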
@@ -352,7 +352,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for sampling weights constexpr size_t sizePerEventWeights = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for sampling weights typedef HostBuffer HostBufferWeights; #else @@ -370,7 +370,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for matrix elements constexpr size_t sizePerEventMatrixElements = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for matrix elements typedef HostBuffer HostBufferMatrixElements; #else @@ -385,7 +385,7 @@ namespace mg5amcCpu // A base class encapsulating a memory buffer for the helicity mask typedef BufferBase BufferHelicityMask; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for the helicity mask typedef HostBufferBase HostBufferHelicityMask; #else @@ -403,7 +403,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for wavefunctions constexpr size_t sizePerEventWavefunctions = MemoryBuffers::nw6 * MemoryBuffers::nx2; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for wavefunctions typedef HostBuffer HostBufferWavefunctions; #else @@ -421,7 +421,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity random numbers constexpr size_t sizePerEventRndNumHelicity = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity random numbers typedef HostBuffer HostBufferRndNumHelicity; #else @@ -439,7 +439,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color random numbers constexpr size_t sizePerEventRndNumColor = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color random numbers typedef HostBuffer HostBufferRndNumColor; #else @@ -457,7 +457,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity selection constexpr size_t sizePerEventSelectedHelicity = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity selection typedef HostBuffer HostBufferSelectedHelicity; #else @@ -475,7 +475,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color selection constexpr size_t sizePerEventSelectedColor = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color selection typedef HostBuffer HostBufferSelectedColor; #else @@ -487,7 +487,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy { @@ -504,13 +504,13 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyHostToDevice ); } #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void copyHostFromDevice( Tdst& dst, const Tsrc& src ) // keep the same order 
of arguments as in memcpy { @@ -527,7 +527,7 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyDeviceToHost ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyDeviceToHost ); } #endif diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc index afeebde3c6..0e4d5d1157 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -46,7 +45,7 @@ // Class member functions for calculating the matrix elements for // Process: g g > t t~ g WEIGHTED<=3 @1 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -80,7 +79,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -90,7 +89,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -118,13 +117,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -151,7 +150,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -187,7 +186,7 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity < 
nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings @@ -200,8 +199,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -505,7 +506,7 @@ namespace mg5amcCpu { 1, -8, 10, 1, 64, -8 }, { 10, 1, 1, -8, -8, 64 } }; // 2-D array[6][6] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -562,7 +563,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -621,7 +622,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -684,8 +685,8 @@ namespace mg5amcCpu { 1, 1, 1, 1, 1 }, { 1, 1, 1, -1, -1 }, { 1, 1, 1, -1, 1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -726,9 +727,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -765,7 +766,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] 
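Similarly, the constant-memory copies of cHel and cIPD above now go through gpuMemcpyToSymbol. A sketch of how that wrapper can dispatch (again an assumption, since GpuAbstraction.h itself is not part of these hunks); note that hipMemcpyToSymbol needs the symbol argument wrapped in HIP_SYMBOL:

    #if defined __CUDACC__
    #define gpuMemcpyToSymbol( symbol, src, bytes ) checkGpu( cudaMemcpyToSymbol( symbol, src, bytes ) )
    #elif defined __HIPCC__
    #define gpuMemcpyToSymbol( symbol, src, bytes ) checkGpu( hipMemcpyToSymbol( HIP_SYMBOL( symbol ), src, bytes ) )
    #endif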
// [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -830,12 +831,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -856,7 +857,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -982,9 +983,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -1008,7 +1009,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -1028,7 +1029,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 256 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -1042,9 +1043,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -1072,7 +1076,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -1282,7 +1286,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.h index 37d6ebe981..11f562273e 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -107,7 +107,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -120,7 +120,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -150,7 +150,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CudaRuntime.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/GpuAbstraction.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/GpuAbstraction.h new file mode 120000 index 0000000000..72054e19ba --- /dev/null +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/GpuAbstraction.h @@ -0,0 +1 @@ +../GpuAbstraction.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/GpuRuntime.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/GpuRuntime.h new file mode 120000 index 0000000000..3920e83be4 --- /dev/null +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/GpuRuntime.h @@ -0,0 +1 @@ +../GpuRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/check_sa.cc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/check_sa.cc index 3fbf0ffbee..7cac5ab47b 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/check_sa.cc +++ 
b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -65,7 +66,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -96,7 +97,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -134,9 +135,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784) + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) +#elif defined __HIPCC__ +#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random #elif defined __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU if build has curand + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #endif @@ -146,10 +149,10 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) 
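The preprocessor chain above encodes the default random-number mode for each build type. A standalone restatement (an illustration only, not part of the patch; enum values follow check_sa.cc):

    #include <cstdio>
    enum class RandomNumberMode{ CommonRandom = 0, CurandHost = 1, CurandDevice = 2 };
    constexpr RandomNumberMode defaultRndgen()
    {
    #ifdef MGONGPU_HAS_NO_CURAND
      return RandomNumberMode::CommonRandom; // only possible mode without curand (this covers all HIP builds)
    #elif defined __CUDACC__
      return RandomNumberMode::CurandDevice; // NVidia GPU build with curand
    #else
      return RandomNumberMode::CurandHost; // C++ build with curand
    #endif
    }
    int main() { std::printf( "default rndgen mode = %d\n", (int)defaultRndgen() ); return 0; }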
bool bridge = false; @@ -177,7 +180,7 @@ main( int argc, char** argv ) else if( arg == "--curdev" ) { #ifndef __CUDACC__ - throw std::runtime_error( "CurandDevice is not supported on CPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" ); #elif defined MGONGPU_HAS_NO_CURAND throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" ); #else @@ -198,7 +201,7 @@ main( int argc, char** argv ) } else if( arg == "--rmbdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -272,13 +275,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -296,14 +299,14 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL - // --- 00. Initialise cuda - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - const std::string cdinKey = "00 CudaInit"; + // --- 00. Initialise GPU + // Instantiate a GpuRuntime at the beginning of the application's main. + // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. + const std::string cdinKey = "00 GpuInit"; timermap.start( cdinKey ); - CudaRuntime cudaRuntime( debug ); + GpuRuntime gpuRuntime( debug ); #endif // --- 0a.

Initialise physics process @@ -325,7 +328,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -333,7 +336,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -341,7 +344,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -349,7 +352,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -366,7 +369,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -375,7 +378,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -384,7 +387,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -392,7 +395,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -400,7 +403,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -438,7 +441,7 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! 
CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } @@ -450,7 +453,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -461,7 +464,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -469,7 +472,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -511,7 +514,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -543,7 +546,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -588,7 +591,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -617,7 +620,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -760,18 +763,22 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; -#endif +#endif /* clang-format off */ // -- DOUBLE or FLOAT? #if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) @@ -781,7 +788,7 @@ main( int argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif +#endif /* clang-format on */ // -- CUCOMPLEX or THRUST or STD complex numbers? 
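The next hunk tags HIP builds as CXS:, i.e. the plugin's own cxsmpl complex class, since neither cucomplex nor thrust is available under hipcc. A sketch of the type behind each workflow tag (an assumption; the real selection logic lives in mgOnGpuCxtypes.h, and fptype is assumed already defined):

    #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST // THR: tag
    #include <thrust/complex.h>
    typedef thrust::complex<fptype> cxtype;
    #elif defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX // CUX: tag
    #include <cuComplex.h>
    typedef cuDoubleComplex cxtype; // (double precision case)
    #elif defined __HIPCC__ // CXS: tag (MGONGPU_CUCXTYPE_CXSMPL)
    typedef mgOnGpu::cxsmpl<fptype> cxtype;
    #else // STX: tag (MGONGPU_CPPCXTYPE_STDCOMPLEX)
    #include <complex>
    typedef std::complex<fptype> cxtype;
    #endif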
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -793,6 +800,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_CUCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -818,7 +831,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -874,7 +887,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -895,6 +908,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -921,21 +936,21 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? " == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -966,7 +981,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1062,14 +1077,14 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1077,7 +1092,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? 
" == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/RamboSamplingKernels.cc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/RamboSamplingKernels.cc index da68aa9255..79abbcc4f8 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/RamboSamplingKernels.cc +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/RamboSamplingKernels.cc @@ -1,11 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "RamboSamplingKernels.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessRandomNumbers.h" #include "MemoryAccessWeights.h" @@ -14,7 +14,7 @@ #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -92,7 +92,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingKernelDevice::RamboSamplingKernelDevice( const fptype energy, // input: energy const BufferRndNumMomenta& rndmom, // input: random numbers in [0,1] BufferMomenta& momenta, // output: momenta @@ -135,7 +135,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaInitialDevice( const fptype energy, fptype* momenta ) @@ -147,17 +147,17 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaInitial() { - getMomentaInitialDevice<<>>( m_energy, m_momenta.data() ); + gpuLaunchKernel( getMomentaInitialDevice, m_gpublocks, m_gputhreads, m_energy, m_momenta.data() ); } #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaFinalDevice( const fptype energy, const fptype* rndmom, @@ -171,11 +171,11 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaFinal() { - getMomentaFinalDevice<<>>( m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); + gpuLaunchKernel( getMomentaFinalDevice, m_gpublocks, m_gputhreads, m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); } #endif diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/RamboSamplingKernels.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/RamboSamplingKernels.h index 184089efd7..7c214cd74b 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/RamboSamplingKernels.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/RamboSamplingKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#ifndef RAMBOSAMPLINGKERNELS_H #define RAMBOSAMPLINGKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating RAMBO phase space sampling on a GPU device class RamboSamplingKernelDevice final : public SamplingKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/RandomNumberKernels.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/RandomNumberKernels.h index 188a72c2c9..21d63beeac 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/RandomNumberKernels.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/RandomNumberKernels.h @@ -1,14 +1,14 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef RANDOMNUMBERKERNELS_H #define RANDOMNUMBERKERNELS_H 1 #include "mgOnGpuConfig.h" -// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when __CUDACC__ is not defined +// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when MGONGPUCPP_GPUIMPL is not defined #ifndef MGONGPU_HAS_NO_CURAND //#include "curand.h" struct curandGenerator_st; // forward definition from curand.h @@ -16,7 +16,7 @@ struct curandGenerator_st; // forward definition from curand.h #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp.mk index 509307506b..f2cfa349da 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ # Copyright (C) 2020-2023 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +# Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts @@ -42,10 +42,10 @@ endif #------------------------------------------------------------------------------- -#=== Configure common compiler flags for C++ and CUDA +#=== Configure common compiler flags for C++ and CUDA/HIP INCFLAGS = -I. 
-OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here +OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here # Dependency on src directory MG5AMC_COMMONLIB = mg5amc_common @@ -121,24 +121,46 @@ endif #------------------------------------------------------------------------------- -#=== Configure the CUDA compiler +#=== Configure the GPU compiler (CUDA or HIP) -# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) -# This is because it is impossible to pass this to "CUFLAGS += -ccbin " below +# FIXME! (AV 24.01.2024) +# In the current implementation (without separate builds for C++ and CUDA/HIP), we first check for nvcc and hipcc in CUDA_HOME and HIP_HOME. +# If CUDA_HOME or HIP_HOME are not set, try to determine them from the path to nvcc and hipcc. +# While convoluted, this is currently necessary to allow disabling CUDA/HIP builds by setting CUDA_HOME or HIP_HOME to invalid paths. +# This will (probably?) be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775). + +# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA and HIP builds (issue #505) +# This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside - $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") + $(warning CUDA and HIP builds are not supported for multi-word CXX "$(CXX)") override CUDA_HOME=disabled + override HIP_HOME=disabled endif -# If CUDA_HOME is not set, try to set it from the location of nvcc +# If CUDA_HOME is not set, try to set it from the path to nvcc ifndef CUDA_HOME CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null)) $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") endif -# Set NVCC as $(CUDA_HOME)/bin/nvcc if it exists +# If HIP_HOME is not set, try to set it from the path to hipcc +ifndef HIP_HOME + HIP_HOME = $(patsubst %bin/hipcc,%,$(shell which hipcc 2>/dev/null)) + $(warning HIP_HOME was not set: using "$(HIP_HOME)") +endif + +# FIXME! (AV 24.01.2024) +# In the current implementation (without separate builds for C++ and CUDA/HIP), +# builds are performed for HIP only if CUDA is not found in the path. +# If both CUDA and HIP are installed, HIP builds can be triggered by unsetting CUDA_HOME. +# This will be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775). + +#--- Option 1: CUDA exists -> use CUDA + +# Set GPUCC as $(CUDA_HOME)/bin/nvcc if it exists ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) - NVCC = $(CUDA_HOME)/bin/nvcc + + GPUCC = $(CUDA_HOME)/bin/nvcc USE_NVTX ?=-DUSE_NVTX # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ @@ -158,41 +180,78 @@ ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here!
endif CUOPTFLAGS = -lineinfo - CUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math - ###CUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow - ###NVCC_VERSION = $(shell $(NVCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) - CUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h + ###GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + GPUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + ###GPUCC_VERSION = $(shell $(GPUCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) + GPUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + CUBUILDRULEFLAGS = -Xcompiler -fPIC -c + CCBUILDRULEFLAGS = -Xcompiler -fPIC -c -x cu + CUDATESTFLAGS = -lcuda + + # Set the host C++ compiler for GPUCC via "-ccbin " + # (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) + GPUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) + + # Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) + ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) + GPUFLAGS += -allow-unsupported-compiler + endif + else ifneq ($(origin REQUIRE_CUDA),undefined) + # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) - $(error No cuda installation found (set CUDA_HOME or make nvcc visible in PATH)) + $(error No cuda installation found (set CUDA_HOME or make nvcc visible in PATH)) + +#--- Option 2: CUDA does not exist, HIP exists -> use HIP + +# Set GPUCC as $(HIP_HOME)/bin/hipcc if it exists +else ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),) + + GPUCC = $(HIP_HOME)/bin/hipcc + #USE_NVTX ?=-DUSE_NVTX # should maybe find something equivalent to this in HIP?
+ HIPARCHFLAGS = -target x86_64-linux-gnu --offload-arch=gfx90a + HIPINC = -I$(HIP_HOME)/include/ + # Note: -DHIP_FAST_MATH is equivalent to -use_fast_math in HIP + # (but only for single precision line 208: https://rocm-developer-tools.github.io/HIP/hcc__detail_2math__functions_8h_source.html) + GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -DHIP_FAST_MATH -DHIP_PLATFORM=amd -fPIC + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + GPUFLAGS += -std=c++17 + ###GPUFLAGS+= --maxrregcount 255 # (AV: is this option valid on HIP and meaningful on AMD GPUs?) + CUBUILDRULEFLAGS = -fPIC -c + CCBUILDRULEFLAGS = -fPIC -c + +else ifneq ($(origin REQUIRE_HIP),undefined) + + # If REQUIRE_HIP is set but no HIP is found, stop here (e.g. for CI tests on GPU #443) + $(error No hip installation found (set HIP_HOME or make hipcc visible in PATH)) + +#--- Option 3: CUDA does not exist, HIP does not exist -> switch off both CUDA and HIP + else - # No cuda. Switch cuda compilation off and go to common random numbers in C++ + + # No nvcc and no hipcc: switch CUDA and HIP compilation off and go to common random numbers in C++ $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda) + $(warning HIP_HOME is not set or is invalid: export HIP_HOME to compile with hip) override GPUCC= override USE_NVTX= override CUINC= override CURANDLIBFLAGS= -endif -export NVCC -export CUFLAGS - -# Set the host C++ compiler for nvcc via "-ccbin " -# (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) -CUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) -# Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) -ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) -CUFLAGS += -allow-unsupported-compiler endif +# Export GPUCC (so that it can also be used in cudacpp_src.mk?) +export GPUCC +export GPUFLAGS + #------------------------------------------------------------------------------- -#=== Configure ccache for C++ and CUDA builds +#=== Configure ccache for C++ and CUDA/HIP builds # Enable ccache if USECCACHE=1 ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) @@ -201,15 +260,15 @@ endif #ifeq ($(USECCACHE)$(shell echo $(AR) | grep ccache),1) # override AR:=ccache $(AR) #endif -ifneq ($(NVCC),) - ifeq ($(USECCACHE)$(shell echo $(NVCC) | grep ccache),1) - override NVCC:=ccache $(NVCC) +ifneq ($(GPUCC),) + ifeq ($(USECCACHE)$(shell echo $(GPUCC) | grep ccache),1) + override GPUCC:=ccache $(GPUCC) endif endif #------------------------------------------------------------------------------- -#=== Configure PowerPC-specific compiler flags for C++ and CUDA +#=== Configure PowerPC-specific compiler flags for C++ and CUDA/HIP # PowerPC-specific CXX compiler flags (being reviewed) ifeq ($(UNAME_P),ppc64le) @@ -225,9 +284,9 @@ else ######CXXFLAGS+= -fno-semantic-interposition # no benefit (neither alone, nor combined with -flto) endif -# PowerPC-specific CUDA compiler flags (to be reviewed!) +# PowerPC-specific CUDA/HIP compiler flags (to be reviewed!)
ifeq ($(UNAME_P),ppc64le) - CUFLAGS+= -Xcompiler -mno-float128 + GPUFLAGS+= -Xcompiler -mno-float128 endif #------------------------------------------------------------------------------- @@ -237,7 +296,7 @@ endif # Set the default OMPFLAGS choice ifneq ($(shell $(CXX) --version | egrep '^Intel'),) override OMPFLAGS = -fopenmp -###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without nvcc but not ok with nvcc before #578) +###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without GPUCC but not ok with GPUCC before #578) else ifneq ($(shell $(CXX) --version | egrep '^(clang)'),) override OMPFLAGS = -fopenmp ###override OMPFLAGS = # disable OpenMP MT on clang (was not ok without or with nvcc before #578) @@ -293,7 +352,10 @@ endif # Set the default RNDGEN (random number generator) choice ifeq ($(RNDGEN),) - ifeq ($(NVCC),) + ifeq ($(GPUCC),) + override RNDGEN = hasNoCurand + # Edgecase for HIP compilation + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) override RNDGEN = hasNoCurand else ifeq ($(RNDGEN),) override RNDGEN = hasCurand @@ -310,7 +372,7 @@ export OMPFLAGS #------------------------------------------------------------------------------- -#=== Set the CUDA/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN +#=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN # Set the build flags appropriate to OMPFLAGS $(info OMPFLAGS=$(OMPFLAGS)) @@ -368,13 +430,13 @@ CXXFLAGS+= $(AVXFLAGS) $(info FPTYPE=$(FPTYPE)) ifeq ($(FPTYPE),d) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE - CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE else ifeq ($(FPTYPE),f) CXXFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT - CUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT else ifeq ($(FPTYPE),m) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT - CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT else $(error Unknown FPTYPE='$(FPTYPE)': only 'd', 'f' and 'm' are supported) endif @@ -383,7 +445,7 @@ endif $(info HELINL=$(HELINL)) ifeq ($(HELINL),1) CXXFLAGS += -DMGONGPU_INLINE_HELAMPS - CUFLAGS += -DMGONGPU_INLINE_HELAMPS + GPUFLAGS += -DMGONGPU_INLINE_HELAMPS else ifneq ($(HELINL),0) $(error Unknown HELINL='$(HELINL)': only '0' and '1' are supported) endif @@ -392,7 +454,7 @@ endif $(info HRDCOD=$(HRDCOD)) ifeq ($(HRDCOD),1) CXXFLAGS += -DMGONGPU_HARDCODE_PARAM - CUFLAGS += -DMGONGPU_HARDCODE_PARAM + GPUFLAGS += -DMGONGPU_HARDCODE_PARAM else ifneq ($(HRDCOD),0) $(error Unknown HRDCOD='$(HRDCOD)': only '0' and '1' are supported) endif @@ -444,11 +506,11 @@ ifeq ($(UNAME_S),Darwin) override CULIBFLAGSRPATH2 = else # RPATH to cuda/cpp libs when linking executables - override CXXLIBFLAGSRPATH = -Wl,-rpath,$(LIBDIRRPATH) - override CULIBFLAGSRPATH = -Xlinker -rpath,$(LIBDIRRPATH) + override CXXLIBFLAGSRPATH = -Wl,-rpath=$(LIBDIRRPATH) + override CULIBFLAGSRPATH = -Xlinker -rpath=$(LIBDIRRPATH) # RPATH to common lib when linking cuda/cpp libs - override CXXLIBFLAGSRPATH2 = -Wl,-rpath,'$$ORIGIN' - override CULIBFLAGSRPATH2 = -Xlinker -rpath,'$$ORIGIN' + override CXXLIBFLAGSRPATH2 = -Wl,-rpath='$$ORIGIN' + override CULIBFLAGSRPATH2 = -Xlinker -rpath='$$ORIGIN' endif # Setting LD_LIBRARY_PATH or DYLD_LIBRARY_PATH in the RUNTIME is no longer necessary 
(neither on Linux nor on Mac)
@@ -461,7 +523,7 @@ override RUNTIME =

cxx_main=$(BUILDDIR)/check.exe
fcxx_main=$(BUILDDIR)/fcheck.exe

-ifneq ($(NVCC),)
+ifneq ($(GPUCC),)
cu_main=$(BUILDDIR)/gcheck.exe
fcu_main=$(BUILDDIR)/fgcheck.exe
else
@@ -492,15 +554,16 @@ $(BUILDDIR)/.build.$(TAG):
 @touch $(BUILDDIR)/.build.$(TAG)

# Generic target and build rules: objects from CUDA compilation
-ifneq ($(NVCC),)
+ifneq ($(GPUCC),)
$(BUILDDIR)/%.o : %.cu *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
 @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
- $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c $< -o $@
+ $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CUBUILDRULEFLAGS) $< -o $@

$(BUILDDIR)/%_cu.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
 @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
- $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@
+ $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CCBUILDRULEFLAGS) $< -o $@
endif
+# (NB: in the CUDA case, '-x cu' is now carried by CCBUILDRULEFLAGS used in the %_cu.o rule above)

# Generic target and build rules: objects from C++ compilation
# (NB do not include CUINC here! add it only for NVTX or curand #679)
@@ -509,11 +572,14 @@ $(BUILDDIR)/%.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
 $(CXX) $(CPPFLAGS) $(CXXFLAGS) -fPIC -c $< -o $@

# Apply special build flags only to CrossSectionKernel.cc and gCrossSectionKernel.cu (no fast math, see #117 and #516)
+# Added edge case for HIP compilation
ifeq ($(shell $(CXX) --version | grep ^nvc++),)
$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS := $(filter-out -ffast-math,$(CXXFLAGS))
$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math
-ifneq ($(NVCC),)
-$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -fno-fast-math
+ifeq ($(findstring nvcc,$(GPUCC)),nvcc)
+ $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -fno-fast-math
+else
+ $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -fno-fast-math
endif
endif
@@ -530,10 +596,10 @@ ifeq ($(RNDGEN),hasCurand)
$(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CUINC)
endif

-# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in nvcc with icx2023 (#592)
+# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in GPUCC with icx2023 (#592)
ifneq ($(shell $(CXX) --version | egrep '^(Intel)'),)
-ifneq ($(NVCC),)
-CUFLAGS += -Xcompiler -Wno-deprecated-builtins
+ifneq ($(GPUCC),)
+GPUFLAGS += -Wno-deprecated-builtins
endif
endif
@@ -541,8 +607,8 @@ endif
# This patch does remove the warning, but I prefer to keep it disabled for the moment...
###ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang|Intel)'),) ###$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -Wno-overriding-t-option -###ifneq ($(NVCC),) -###$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -Wno-overriding-t-option +###ifneq ($(GPUCC),) +###$(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -Wno-overriding-t-option ###endif ###endif @@ -569,7 +635,7 @@ MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp cxx_objects_lib=$(BUILDDIR)/CPPProcess.o $(BUILDDIR)/MatrixElementKernels.o $(BUILDDIR)/BridgeKernels.o $(BUILDDIR)/CrossSectionKernels.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel.o $(BUILDDIR)/RamboSamplingKernels.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) MG5AMC_CULIB = mg5amc_$(processid_short)_cuda cu_objects_lib=$(BUILDDIR)/gCPPProcess.o $(BUILDDIR)/gMatrixElementKernels.o $(BUILDDIR)/gBridgeKernels.o $(BUILDDIR)/gCrossSectionKernels.o cu_objects_exe=$(BUILDDIR)/gCommonRandomNumberKernel.o $(BUILDDIR)/gRamboSamplingKernels.o @@ -581,11 +647,11 @@ $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: cxx_objects_lib += $(BUILDDIR)/fbridge.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(CXX) -shared -o $@ $(cxx_objects_lib) $(CXXLIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: cu_objects_lib += $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cu_objects_lib) - $(NVCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) endif #------------------------------------------------------------------------------- @@ -602,16 +668,16 @@ $(cxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PAT $(cxx_main): $(BUILDDIR)/check_sa.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CXX) -o $@ $(BUILDDIR)/check_sa.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CURANDLIBFLAGS) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(cu_main): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(cu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(cu_main): $(BUILDDIR)/gcheck_sa.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o - $(NVCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) + $(GPUCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) endif 
#------------------------------------------------------------------------------- @@ -637,17 +703,17 @@ $(fcxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PA $(fcxx_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') endif ifeq ($(UNAME_S),Darwin) $(fcu_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(fcu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(fcu_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) - $(NVCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) + $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) endif #------------------------------------------------------------------------------- @@ -659,7 +725,7 @@ $(BUILDDIR)/testxxx.o: testxxx_cc_ref.txt $(testmain): $(BUILDDIR)/testxxx.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testxxx.o # Comment out this line to skip the C++ test of xxx functions -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testxxx_cu.o: $(GTESTLIBS) $(BUILDDIR)/testxxx_cu.o: INCFLAGS += $(GTESTINC) $(BUILDDIR)/testxxx_cu.o: testxxx_cc_ref.txt @@ -672,7 +738,7 @@ $(BUILDDIR)/testmisc.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testmisc.o # Comment out this line to skip the C++ miscellaneous tests -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testmisc_cu.o: $(GTESTLIBS) $(BUILDDIR)/testmisc_cu.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc_cu.o @@ -684,12 +750,12 @@ $(BUILDDIR)/runTest.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/runTest.o $(testmain): cxx_objects_exe += $(BUILDDIR)/runTest.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/runTest_cu.o: $(GTESTLIBS) $(BUILDDIR)/runTest_cu.o: INCFLAGS += $(GTESTINC) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(testmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif @@ -713,14 +779,14 @@ $(testmain): LIBFLAGS += -lgomp endif endif -ifeq 
($(NVCC),) # link only runTest.o
+ifeq ($(GPUCC),) # link only runTest.o
$(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH
$(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS)
 $(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS)
else # link both runTest.o and runTest_cu.o
$(testmain): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH
$(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) $(GTESTLIBS)
- $(NVCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) -lcuda
+ $(GPUCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS)
endif

# Use target gtestlibs to build only googletest
@@ -829,9 +895,9 @@ ifeq ($(USECCACHE),1)
 ccache --version | head -1
endif
 @echo ""
- @echo NVCC=$(NVCC)
-ifneq ($(NVCC),)
- $(NVCC) --version
+ @echo GPUCC=$(GPUCC)
+ifneq ($(GPUCC),)
+ $(GPUCC) --version
endif
 @echo ""
 @echo CXX=$(CXX)
@@ -850,7 +916,7 @@ endif
# Target: check (run the C++ test executable)
# [NB THIS IS WHAT IS USED IN THE GITHUB CI!]
-ifneq ($(NVCC),)
+ifneq ($(GPUCC),)
check: runTest cmpFcheck cmpFGcheck
else
check: runTest cmpFcheck
diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/fbridge.cc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/fbridge.cc
index 2d2b36d560..22ce3f5115 100644
--- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/fbridge.cc
+++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/fbridge.cc
@@ -1,11 +1,11 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: S. Roiser (Oct 2021) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.

 #include "Bridge.h"
 #include "CPPProcess.h"
-#include "CudaRuntime.h"
+#include "GpuRuntime.h"

 extern "C"
 {
@@ -22,7 +22,7 @@ extern "C"
  * Using the same Fortran MadEvent code, linking to the heterogeneous library would allow access to both CPU and GPU implementations.
  * The specific heterogeneous configuration (how many GPUs, how many threads on each CPU, etc) could be loaded in CUDA/C++ from a data file.
*/ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -46,8 +46,8 @@ extern "C" */ void fbridgecreate_( CppObjectInFortran** ppbridge, const int* pnevtF, const int* pnparF, const int* pnp4F ) { -#ifdef __CUDACC__ - CudaRuntime::setUp(); +#ifdef MGONGPUCPP_GPUIMPL + GpuRuntime::setUp(); #endif // (NB: CPPProcess::initProc no longer needs to be executed here because it is called in the Bridge constructor) // FIXME: disable OMP in Bridge when called from Fortran @@ -65,8 +65,8 @@ extern "C" Bridge* pbridge = dynamic_cast*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgedelete_: invalid Bridge address" ); delete pbridge; -#ifdef __CUDACC__ - CudaRuntime::tearDown(); +#ifdef MGONGPUCPP_GPUIMPL + GpuRuntime::tearDown(); #endif } @@ -96,7 +96,7 @@ extern "C" { Bridge* pbridge = dynamic_cast*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgesequence_: invalid Bridge address" ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Use the device/GPU implementation in the CUDA library // (there is also a host implementation in this library) pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, *pchannelId, mes, selhel, selcol ); diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/fsampler.cc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/fsampler.cc index 2fb445372d..3743934f41 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/fsampler.cc +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/fsampler.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Feb 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "mgOnGpuConfig.h" @@ -13,7 +13,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -40,7 +40,7 @@ namespace mg5amcCpu private: const int m_nevt; // The number of events in each iteration int m_iiter; // The iteration counter (for random number seeding) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta m_hstRndmom; // Memory buffers for random numbers HostBufferMomenta m_hstMomenta; // Memory buffers for momenta HostBufferWeights m_hstWeights; // Memory buffers for sampling weights @@ -105,7 +105,7 @@ namespace mg5amcCpu extern "C" { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/runTest.cc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/runTest.cc index d4a760a71b..de327f2321 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/runTest.cc +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/runTest.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
//---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -18,7 +18,7 @@ #include "RandomNumberKernels.h" #include "epoch_process_id.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -35,7 +35,7 @@ struct CUDA_CPU_TestBase : public TestDriverBase : TestDriverBase( npar, refFileName ) {} }; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL struct CPUTest : public CUDA_CPU_TestBase { // Struct data members (process, and memory structures for random numbers, momenta, matrix elements and weights on host and device) @@ -119,7 +119,7 @@ struct CPUTest : public CUDA_CPU_TestBase }; #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL struct CUDATest : public CUDA_CPU_TestBase { // Reset the device when our test goes out of scope. Note that this should happen after @@ -128,7 +128,7 @@ struct CUDATest : public CUDA_CPU_TestBase { ~DeviceReset() { - checkCuda( cudaDeviceReset() ); // this is needed by cuda-memcheck --leak-check full + checkGpu( gpuDeviceReset() ); // this is needed by cuda-memcheck --leak-check full } } deviceResetter; @@ -256,7 +256,7 @@ INSTANTIATE_TEST_SUITE_P( prefix, \ test_suite_name, \ testing::Values( new CUDATest( MG_EPOCH_REFERENCE_FILE_NAME ) ) ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL MG_INSTANTIATE_TEST_SUITE_GPU( XTESTID_GPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); #else MG_INSTANTIATE_TEST_SUITE_CPU( XTESTID_CPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/testmisc.cc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/testmisc.cc index 895d6eeb56..ba9e59a8a3 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/testmisc.cc +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/testmisc.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*misc to run only testmisc.cc tests //---------------------------------------------------------------------------- @@ -17,7 +17,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_MISC #else #define TESTID( s ) s##_CPU_MISC @@ -26,7 +26,7 @@ #define XTESTID( s ) TESTID( s ) // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -59,7 +59,7 @@ namespace mg5amcCpu TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testmisc ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/testxxx.cc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/testxxx.cc index 3361fe5aa9..e5167de00c 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/testxxx.cc +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/testxxx.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). 
// Created by: A. Valassi (Apr 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -24,7 +24,7 @@ #include #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_XXX #else #define TESTID( s ) s##_CPU_XXX @@ -32,7 +32,7 @@ #define XTESTID( s ) TESTID( s ) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -42,7 +42,7 @@ namespace mg5amcCpu int FPEhandlerIevt = -1; inline void FPEhandler( int sig ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL std::cerr << "Floating Point Exception (GPU): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; #else std::cerr << "Floating Point Exception (CPU neppV=" << neppV << "): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; @@ -53,7 +53,7 @@ namespace mg5amcCpu TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -77,7 +77,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) assert( nevt % neppM == 0 ); // nevt must be a multiple of neppM assert( nevt % neppV == 0 ); // nevt must be a multiple of neppV // Fill in the input momenta -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL mg5amcGpu::PinnedHostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] #else mg5amcCpu::HostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] @@ -322,7 +322,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { for( int ievt = 0; ievt < nevt; ievt++ ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_ttg.mad/src/HelAmps_sm.h b/epochX/cudacpp/gg_ttg.mad/src/HelAmps_sm.h index 361b488401..0dd0f3ebba 100644 --- a/epochX/cudacpp/gg_ttg.mad/src/HelAmps_sm.h +++ b/epochX/cudacpp/gg_ttg.mad/src/HelAmps_sm.h @@ -5,7 +5,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -28,7 +28,7 @@ //#include //#include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttg.mad/src/Parameters_sm.cc b/epochX/cudacpp/gg_ttg.mad/src/Parameters_sm.cc index 64fc3fea62..067445b198 100644 --- a/epochX/cudacpp/gg_ttg.mad/src/Parameters_sm.cc +++ b/epochX/cudacpp/gg_ttg.mad/src/Parameters_sm.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. 
Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -17,7 +17,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_ttg.mad/src/Parameters_sm.h b/epochX/cudacpp/gg_ttg.mad/src/Parameters_sm.h index b6568d3761..9581d66e0e 100644 --- a/epochX/cudacpp/gg_ttg.mad/src/Parameters_sm.h +++ b/epochX/cudacpp/gg_ttg.mad/src/Parameters_sm.h @@ -27,7 +27,7 @@ #include "read_slha.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu #include // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -218,7 +218,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -239,7 +239,7 @@ namespace mg5amcCpu #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wunused-variable" // e.g. <> #pragma GCC diagnostic ignored "-Wunused-parameter" // e.g. <> -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma nv_diagnostic push #pragma nv_diag_suppress 177 // e.g. <> #endif @@ -267,7 +267,7 @@ namespace mg5amcCpu // End SM implementation - no special handling of vectors of floats as in EFT (#439) return out; } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma GCC diagnostic pop #pragma nv_diagnostic pop #endif diff --git a/epochX/cudacpp/gg_ttg.mad/src/cudacpp_src.mk b/epochX/cudacpp/gg_ttg.mad/src/cudacpp_src.mk index d4cc628aec..159e19a46d 100644 --- a/epochX/cudacpp/gg_ttg.mad/src/cudacpp_src.mk +++ b/epochX/cudacpp/gg_ttg.mad/src/cudacpp_src.mk @@ -19,7 +19,7 @@ SHELL := /bin/bash #=== Configure common compiler flags for CUDA and C++ INCFLAGS = -I. 
-OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here
+OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here

#-------------------------------------------------------------------------------
@@ -45,13 +45,13 @@ endif

#-------------------------------------------------------------------------------

-#=== Configure the CUDA compiler (note: NVCC is already exported including ccache)
+#=== Configure the CUDA compiler (note: GPUCC is already exported including ccache)

-###$(info NVCC=$(NVCC))
+###$(info GPUCC=$(GPUCC))

#-------------------------------------------------------------------------------

-#=== Configure ccache for C++ builds (note: NVCC is already exported including ccache)
+#=== Configure ccache for C++ builds (note: GPUCC is already exported including ccache)

# Enable ccache if USECCACHE=1
ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1)
@@ -92,6 +92,13 @@ endif
###$(info OMPFLAGS=$(OMPFLAGS))
CXXFLAGS += $(OMPFLAGS)

+# Add the build-rule flags appropriate to each GPU compiler (nvcc needs '-x cu' to compile .cc files as CUDA)
+ifeq ($(findstring nvcc,$(GPUCC)),nvcc)
+  GPUFLAGS += -Xcompiler -fPIC -c -x cu
+else ifeq ($(findstring hipcc,$(GPUCC)),hipcc)
+  GPUFLAGS += -fPIC -c
+endif
+
# Set the build flags appropriate to each AVX choice (example: "make AVX=none")
# [NB MGONGPU_PVW512 is needed because "-mprefer-vector-width=256" is not exposed in a macro]
# [See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=96476]
@@ -253,20 +260,20 @@ $(BUILDDIR)/%.o : %.cc *.h $(BUILDDIR)/.build.$(TAG)

# Generic target and build rules: objects from CUDA compilation
$(BUILDDIR)/%_cu.o : %.cc *.h $(BUILDDIR)/.build.$(TAG)
 @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
- $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@
+ $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $< -o $@

#-------------------------------------------------------------------------------

cxx_objects=$(addprefix $(BUILDDIR)/, Parameters_sm.o read_slha.o)
-ifneq ($(NVCC),)
+ifneq ($(GPUCC),)
cu_objects=$(addprefix $(BUILDDIR)/, Parameters_sm_cu.o)
endif

# Target (and build rules): common (src) library
-ifneq ($(NVCC),)
+ifneq ($(GPUCC),)
$(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) $(cu_objects)
 @if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi
- $(NVCC) -shared -o $@ $(cxx_objects) $(cu_objects) $(LDFLAGS)
+ $(GPUCC) -shared -o $@ $(cxx_objects) $(cu_objects) $(LDFLAGS)
else
$(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects)
 @if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi
diff --git a/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuConfig.h
index 80032e528b..55d03f1252 100644
--- a/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuConfig.h
+++ b/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuConfig.h
@@ -1,7 +1,7 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Jul 2020) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
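The generic %_cu.o rule above can now serve both backends because everything compiler-specific (for example '-x cu', which tells nvcc to treat a .cc file as CUDA) has moved into GPUFLAGS. On the source side, the matching single-source discipline is what the MGONGPUCPP_GPUIMPL macro in the hunks below enables: each translation unit is compiled once by $(CXX) and once by $(GPUCC), with one macro selecting the namespace so that CPU and GPU symbols never clash. A minimal sketch of that pattern, assuming only the macro and namespace names visible in this patch (backendName is a hypothetical helper for illustration, not a function from the plugin):

#ifdef MGONGPUCPP_GPUIMPL
namespace mg5amcGpu // GPU compilation (CUDA or HIP)
#else
namespace mg5amcCpu // plain C++ compilation
#endif
{
  // Hypothetical helper: the two builds of this file define distinct symbols,
  // mg5amcGpu::backendName and mg5amcCpu::backendName, which is why the cuda
  // and cpp shared libraries can be linked into one executable (as runTest is).
  inline const char* backendName()
  {
#ifdef MGONGPUCPP_GPUIMPL
    return "gpu";
#else
    return "cpp";
#endif
  }
}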
#ifndef MGONGPUCONFIG_H #define MGONGPUCONFIG_H 1 @@ -10,12 +10,25 @@ // There are two different code bases for standalone_cudacpp (without multichannel) and madevent+cudacpp (with multichannel) #define MGONGPU_SUPPORTS_MULTICHANNEL 1 +// Is this a GPU (CUDA, HIP) or CPU implementation? +#ifdef __CUDACC__ +#define MGONGPUCPP_GPUIMPL cuda +#elif defined __HIPCC__ +#define MGONGPUCPP_GPUIMPL hip +#else +#undef MGONGPUCPP_GPUIMPL +#endif + // ** NB1 Throughputs (e.g. 6.8E8) are events/sec for "./gcheck.exe -p 65536 128 12" // ** NB2 Baseline on b7g47n0004 fluctuates (probably depends on load on other VMs) // Choose if curand is supported for generating random numbers +// For HIP, by default, do not use curand (common random numbers will be used instead) // For both CUDA and C++, by default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_CURAND -// (there exist CUDA installations, e.g. using the HPC package, which do not include curand - see PR #784) +// (there exist CUDA installations, e.g. using the HPC package, which do not include curand - see PR #784 and #785) +#if defined __HIPCC__ +#define MGONGPU_HAS_NO_CURAND 1 +#else //#ifdef __CUDACC__ //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 @@ -23,6 +36,7 @@ //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 //#endif +#endif // Choose floating point precision (for everything but color algebra #537) // If one of these macros has been set from outside with e.g. -DMGONGPU_FPTYPE_FLOAT, nothing happens (issue #167) @@ -54,23 +68,28 @@ //#undef MGONGPU_HARDCODE_PARAM // default ////#define MGONGPU_HARDCODE_PARAM 1 -// Complex type in c++: std::complex or cxsmpl (CHOOSE ONLY ONE) -#ifndef __CUDACC__ -//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) -#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) -#endif - -// Complex type in cuda: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) +// Complex type in CUDA: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) #ifdef __CUDACC__ #define MGONGPU_CUCXTYPE_THRUST 1 // default (~1.15E9/double, ~3.2E9/float) //#define MGONGPU_CUCXTYPE_CUCOMPLEX 1 // ~10 percent slower (1.03E9/double, ~2.8E9/float) //#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) + +// Complex type in HIP: cxsmpl (ONLY ONE OPTION POSSIBLE) +#elif defined __HIPCC__ +#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) + +// Complex type in C++: std::complex or cxsmpl (CHOOSE ONLY ONE) +#else +//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) +#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif -// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ -#undef MGONGPU_NSIGHT_DEBUG // default +#undef MGONGPU_NSIGHT_DEBUG // default in CUDA //#define MGONGPU_NSIGHT_DEBUG 1 +#else +#undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif // SANITY CHECKS (floating point precision for everything but color algebra #537) @@ -86,17 +105,21 @@ #error You cannot use double precision for color algebra and single precision elsewhere #endif -// SANITY CHECKS (c++ complex number implementation) -#ifndef __CUDACC__ -#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and 
defined MGONGPU_CPPCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL +// SANITY CHECKS (CUDA complex number implementation) +#ifdef __CUDACC__ +#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX for CUDA +#elif defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CXSMPL for CUDA +#elif defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE OF MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL for CUDA #endif #endif -// SANITY CHECKS (cuda complex number implementation) -#ifdef __CUDACC__ -#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL +// SANITY CHECKS (C++ complex number implementation) +#ifndef MGONGPUCPP_GPUIMPL +#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL for C++ #endif #endif @@ -134,7 +157,7 @@ namespace mgOnGpu // Alignment requirement for using reinterpret_cast with SIMD vectorized code // (using reinterpret_cast with non aligned memory may lead to segmentation faults!) // Only needed for C++ code but can be enforced also in NVCC builds of C++ code using CUDA>=11.2 and C++17 (#318, #319, #333) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL constexpr int cppAlign = 64; // alignment requirement for SIMD vectorization (64-byte i.e. 
512-bit)
#endif
@@ -145,7 +168,7 @@ using mgOnGpu::fptype;
using mgOnGpu::fptype2;

// C++ SIMD vectorization width (this will be used to set neppV)
-#ifdef __CUDACC__ // CUDA implementation has no SIMD
+#ifdef MGONGPUCPP_GPUIMPL // CUDA and HIP implementations have no SIMD
#undef MGONGPU_CPPSIMD
#elif defined __AVX512VL__ && defined MGONGPU_PVW512 // C++ "512z" AVX512 with 512 width (512-bit ie 64-byte): 8 (DOUBLE) or 16 (FLOAT)
#ifdef MGONGPU_FPTYPE_DOUBLE
@@ -175,9 +198,9 @@ using mgOnGpu::fptype2;
#undef MGONGPU_CPPSIMD
#endif

-// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation
+// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation
// Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end)
 #if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */
#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX];
#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; }
#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; }
@@ -189,8 +212,8 @@ using mgOnGpu::fptype2;
#define mgDebugFinalise() { /*noop*/ }
#endif /* clang-format on */

-// Define empty CUDA declaration specifiers for C++
-#ifndef __CUDACC__
+// Define empty CUDA/HIP declaration specifiers for C++
+#ifndef MGONGPUCPP_GPUIMPL
#define __global__
#define __host__
#define __device__
diff --git a/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuCxtypes.h b/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuCxtypes.h
index ca9a9f00c0..5532e22fa1 100644
--- a/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuCxtypes.h
+++ b/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuCxtypes.h
@@ -1,7 +1,7 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Jan 2022, based on earlier work by D. Smith) for the MG5aMC CUDACPP plugin.
-// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
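The empty specifier definitions in the mgOnGpuConfig.h hunk above are what make the single-source model work on the CPU side: when neither nvcc nor hipcc is the compiler, __global__, __host__ and __device__ expand to nothing and the annotated code is ordinary C++. A self-contained sketch of the effect (fpmax2 is a hypothetical function for illustration; only the macros shown above are assumed):

// With nvcc or hipcc the specifiers are meaningful; with g++ or clang++ the
// fallback definitions strip them, so the same definition compiles everywhere.
#ifndef MGONGPUCPP_GPUIMPL
#define __global__
#define __host__
#define __device__
#endif

__host__ __device__ inline double fpmax2( const double a, const double b )
{
  return ( a > b ? a : b ); // callable from host code, device code, or both
}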
#ifndef MGONGPUCXTYPES_H #define MGONGPUCXTYPES_H 1 @@ -19,7 +19,7 @@ #include // Complex type in cuda: thrust or cucomplex or cxsmpl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #if defined MGONGPU_CUCXTYPE_THRUST #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wtautological-compare" // for icpx2021/clang13 (https://stackoverflow.com/a/15864661) @@ -82,7 +82,7 @@ namespace mgOnGpu /* clang-format off */ using mgOnGpu::cxsmpl; // Printout to stream for user defined types -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -92,7 +92,7 @@ namespace mg5amcCpu inline __host__ std::ostream& operator<<( std::ostream& out, const cxsmpl& c ) { - out << std::complex( c.real(), c.imag() ); + out << std::complex( c.real(), c.imag() ); return out; } @@ -215,14 +215,14 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu #endif { // --- Type definitions (complex type: cxtype) -#ifdef __CUDACC__ // cuda +#ifdef MGONGPUCPP_GPUIMPL // cuda #if defined MGONGPU_CUCXTYPE_THRUST typedef thrust::complex cxtype; #elif defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -255,7 +255,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -307,7 +307,7 @@ namespace mg5amcCpu //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust +#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust //------------------------------ // CUDA - using thrust::complex @@ -343,11 +343,11 @@ namespace mg5amcCpu return c; } -#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST +#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex +#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex //------------------------------ // CUDA - using cuComplex @@ -562,11 +562,11 @@ namespace mg5amcCpu return cxmake( c.real(), c.imag() ); } -#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX //========================================================================== -#if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex +#if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex //------------------------------ // C++ - using std::complex @@ -610,7 +610,7 @@ namespace mg5amcCpu } #endif -#endif // #if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX +#endif // #if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX //========================================================================== @@ -633,7 +633,7 @@ namespace mg5amcCpu //========================================================================== // 
NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuFptypes.h b/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuFptypes.h index 905c97d700..fa3a02664b 100644 --- a/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuFptypes.h +++ b/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuFptypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUFPTYPES_H #define MGONGPUFPTYPES_H 1 @@ -12,7 +12,7 @@ #include // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // cuda namespace mg5amcGpu #else namespace mg5amcCpu @@ -20,7 +20,7 @@ namespace mg5amcCpu { //========================================================================== -#ifdef __CUDACC__ // cuda +#ifdef MGONGPUCPP_GPUIMPL // cuda //------------------------------ // Floating point types - Cuda @@ -64,11 +64,11 @@ namespace mg5amcCpu #endif } -#endif // #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //------------------------------ // Floating point types - C++ @@ -92,7 +92,7 @@ namespace mg5amcCpu return std::sqrt( f ); } -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== diff --git a/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuVectors.h b/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuVectors.h index e1299ba81e..cdae04326b 100644 --- a/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuVectors.h @@ -32,7 +32,7 @@ #endif // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -131,7 +131,7 @@ namespace mg5amcCpu #endif #endif -#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef __CUDACC__) +#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef MGONGPUCPP_GPUIMPL) const int neppV = 1; @@ -153,13 +153,13 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu #endif { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Printout to stream for user defined types @@ -805,11 +805,11 @@ namespace mg5amcCpu #endif // #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //------------------------------ // Vector types - CUDA @@ -853,12 +853,12 @@ namespace mg5amcCpu return mask; } 
-#endif // #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== // Scalar-or-vector types: scalar in CUDA, vector or scalar in C++ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL typedef bool bool_sv; typedef fptype fptype_sv; typedef fptype2 fptype2_sv; @@ -879,7 +879,7 @@ namespace mg5amcCpu #endif // Scalar-or-vector zeros: scalar in CUDA, vector or scalar in C++ -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ inline __host__ __device__ cxtype cxzero_sv(){ return cxtype( 0, 0 ); } #elif defined MGONGPU_CPPSIMD inline cxtype_v cxzero_sv() { return cxtype_v(); } // RRRR=0000 IIII=0000 diff --git a/epochX/cudacpp/gg_ttg.mad/src/rambo.h b/epochX/cudacpp/gg_ttg.mad/src/rambo.h index e02ea52496..cd7e1008ea 100644 --- a/epochX/cudacpp/gg_ttg.mad/src/rambo.h +++ b/epochX/cudacpp/gg_ttg.mad/src/rambo.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -18,7 +18,7 @@ #include // Simplified rambo version for 2 to N (with N>=2) processes with massless particles -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +83,7 @@ namespace mg5amcCpu static bool first = true; if( first ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if constexpr( M_ACCESS::isOnDevice() ) // avoid { const int ievt0 = 0; @@ -166,7 +166,7 @@ namespace mg5amcCpu wt = po2log; if( nparf != 2 ) wt = ( 2. * nparf - 4. ) * log( energy ) + z[nparf - 1]; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // issue warnings if weight is too small or too large static int iwarn[5] = { 0, 0, 0, 0, 0 }; if( wt < -180. ) diff --git a/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt b/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt index adda711aad..ee1a51555d 100644 --- a/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt +++ b/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt @@ -52,7 +52,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt -No valid web browser found. Please set in ./input/mg5_configuration.txt +Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg.mg The import format was not given, so we guess it as command set stdout_level DEBUG @@ -62,7 +62,7 @@ generate g g > t t~ g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005533933639526367  +DEBUG: model prefixing takes 0.0054416656494140625  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,7 +155,7 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=3: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g WEIGHTED<=3 @1 INFO: Process has 16 diagrams -1 processes with 16 diagrams generated in 0.022 s +1 processes with 16 diagrams generated in 0.021 s Total: 1 processes with 16 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gg_ttg Load PLUGIN.CUDACPP_OUTPUT @@ -175,7 +175,7 @@ INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TM FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg/./CPPProcess.h FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg/./CPPProcess.cc INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg/. -Generated helas calls for 1 subprocesses (16 diagrams) in 0.037 s +Generated helas calls for 1 subprocesses (16 diagrams) in 0.038 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines @@ -183,7 +183,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 5 routines in 0.328 s +ALOHA: aloha creates 5 routines in 0.345 s VVV1 VVV1 FFV1 @@ -203,7 +203,7 @@ INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/. quit -real 0m0.787s -user 0m0.730s -sys 0m0.049s -Code generation completed in 0 seconds +real 0m0.803s +user 0m0.731s +sys 0m0.066s +Code generation completed in 1 seconds diff --git a/epochX/cudacpp/gg_ttg.sa/COPYRIGHT b/epochX/cudacpp/gg_ttg.sa/COPYRIGHT index a134b5fef9..84a883fbb0 100644 --- a/epochX/cudacpp/gg_ttg.sa/COPYRIGHT +++ b/epochX/cudacpp/gg_ttg.sa/COPYRIGHT @@ -15,6 +15,7 @@ The full development team currently includes the following authors : Stephan Hageboeck (CERN) Olivier Mattelaer (Universite Catholique de Louvain, original author) Stefan Roiser (CERN, original author) + Joergen Teig (CERN) Andrea Valassi (CERN, original author) Zenny Wettersten (CERN) See https://github.com/madgraph5/madgraph4gpu for more details. For the full diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/Bridge.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/Bridge.h index bf8b5e024d..89437b4c42 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/Bridge.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#ifndef BRIDGE_H #define BRIDGE_H 1 @@ -23,7 +23,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +83,7 @@ namespace mg5amcCpu Bridge& operator=( const Bridge& ) = delete; Bridge& operator=( Bridge&& ) = delete; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL /** * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != gpublocks*gputhreads * (this is needed for BridgeKernel tests rather than for actual production use in Fortran) @@ -150,7 +150,7 @@ namespace mg5amcCpu unsigned int m_nevt; // number of events int m_nGoodHel; // the number of good helicities (-1 initially when they have not yet been calculated) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL int m_gputhreads; // number of gpu threads (default set from number of events, can be modified) int m_gpublocks; // number of gpu blocks (default set from number of events, can be modified) DeviceBuffer m_devMomentaF; @@ -187,12 +187,12 @@ namespace mg5amcCpu // Forward declare transposition methods // -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL template void hst_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); @@ -209,7 +209,7 @@ namespace mg5amcCpu Bridge::Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ) : m_nevt( nevtF ) , m_nGoodHel( -1 ) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL , m_gputhreads( 256 ) // default number of gpu threads , m_gpublocks( m_nevt / m_gputhreads ) // this ensures m_nevt <= m_gpublocks*m_gputhreads , m_devMomentaF( m_nevt ) @@ -233,7 +233,7 @@ namespace mg5amcCpu { if( nparF != CPPProcess::npar ) throw std::runtime_error( "Bridge constructor: npar mismatch" ); if( np4F != CPPProcess::np4 ) throw std::runtime_error( "Bridge constructor: np4 mismatch" ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( ( m_nevt < s_gputhreadsmin ) || ( m_nevt % s_gputhreadsmin != 0 ) ) throw std::runtime_error( "Bridge constructor: nevt should be a multiple of " + std::to_string( s_gputhreadsmin ) ); while( m_nevt != m_gpublocks * m_gputhreads ) @@ -249,7 +249,7 @@ namespace mg5amcCpu #else std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate is called from several Fortran threads? 
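The Bridge constructor above derives m_gpublocks as m_nevt / m_gputhreads and then requires the product to reproduce m_nevt exactly, the same contract that set_gpugrid (next hunk) enforces for the BridgeKernel tests. A sketch of that invariant as a standalone helper (hypothetical name, for illustration only; the Bridge itself inlines this logic and adjusts the grid in a loop rather than throwing):

#include <stdexcept>
#include <string>

// Hypothetical sketch: nevt must be an exact multiple of the GPU thread count,
// so that nevt == gpublocks * gputhreads with no remainder.
inline unsigned int computeGpuBlocks( const unsigned int nevt, const unsigned int gputhreads )
{
  if( nevt < gputhreads || nevt % gputhreads != 0 )
    throw std::runtime_error( "nevt should be a multiple of gputhreads=" + std::to_string( gputhreads ) );
  return nevt / gputhreads;
}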
@@ -262,7 +262,7 @@ namespace mg5amcCpu process.initProc( paramCard ); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void Bridge::set_gpugrid( const int gpublocks, const int gputhreads ) { @@ -276,7 +276,7 @@ namespace mg5amcCpu } #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void Bridge::gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -291,14 +291,14 @@ namespace mg5amcCpu constexpr int neppM = MemoryAccessMomenta::neppM; if constexpr( neppM == 1 && std::is_same_v ) { - checkCuda( cudaMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), gpuMemcpyHostToDevice ); } else { - checkCuda( cudaMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), gpuMemcpyHostToDevice ); const int thrPerEvt = CPPProcess::npar * CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 event per thread) //const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... this seems slower - dev_transposeMomentaF2C<<>>( m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); + gpuLaunchKernel( dev_transposeMomentaF2C, m_gpublocks * thrPerEvt, m_gputhreads, m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); } if constexpr( std::is_same_v ) { @@ -341,7 +341,7 @@ namespace mg5amcCpu } #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL template void Bridge::cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -396,7 +396,7 @@ namespace mg5amcCpu // - C++ array: momenta[npagM][npar][np4][neppM] with nevt=npagM*neppM (AOSOA) // -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ) { diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/BridgeKernels.cc b/epochX/cudacpp/gg_ttg.sa/SubProcesses/BridgeKernels.cc index d58066c9c1..eaf4037a24 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/BridgeKernels.cc +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/BridgeKernels.cc @@ -1,17 +1,18 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "BridgeKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMomenta.h" #include //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -45,7 +46,7 @@ namespace mg5amcCpu //============================================================================ -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu { @@ -96,7 +97,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/BridgeKernels.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/BridgeKernels.h index 15eb4bff4d..3efef8ce97 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/BridgeKernels.h +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/BridgeKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. 
// Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef BRIDGEKERNELS_H #define BRIDGEKERNELS_H 1 @@ -12,7 +12,7 @@ #include "MatrixElementKernels.h" #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -49,7 +49,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a CPU host class BridgeKernelHost final : public BridgeKernelBase { @@ -89,7 +89,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a GPU device class BridgeKernelDevice : public BridgeKernelBase { diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/CommonRandomNumberKernel.cc b/epochX/cudacpp/gg_ttg.sa/SubProcesses/CommonRandomNumberKernel.cc index 985b39f576..010bc4cbd0 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/CommonRandomNumberKernel.cc +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/CommonRandomNumberKernel.cc @@ -1,15 +1,16 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "CommonRandomNumbers.h" +#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/CrossSectionKernels.cc b/epochX/cudacpp/gg_ttg.sa/SubProcesses/CrossSectionKernels.cc index 0b355a3c8d..c15b39844d 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/CrossSectionKernels.cc +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/CrossSectionKernels.cc @@ -1,10 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
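Throughout these hunks the kernel sources gain an #include "GpuAbstraction.h", the new header through which the gpu-prefixed names used by this patch (gpuMemcpy, gpuMemcpyHostToDevice, gpuDeviceReset, gpuLaunchKernel, checkGpu) resolve to either the CUDA or the HIP runtime. That header is not reproduced in this excerpt; the following is only a plausible sketch inferred from how those names are used, not the actual file. In particular, gpuMemcpy is assumed to fold in the error check (the Bridge hunk above replaced checkCuda( cudaMemcpy( ... ) ) with a bare gpuMemcpy( ... )), and checkGpu is assumed to be an assert-style wrapper analogous to the assertCuda visible in the deleted CudaRuntime.h below.

#ifdef __CUDACC__ // CUDA backend
#include <cuda_runtime.h>
#define gpuError_t cudaError_t
#define gpuSuccess cudaSuccess
#define gpuMemcpyHostToDevice cudaMemcpyHostToDevice
#define gpuMemcpy( dst, src, bytes, dir ) checkGpu( cudaMemcpy( dst, src, bytes, dir ) )
#define gpuDeviceReset() cudaDeviceReset()
#elif defined __HIPCC__ // HIP backend
#include <hip/hip_runtime.h>
#define gpuError_t hipError_t
#define gpuSuccess hipSuccess
#define gpuMemcpyHostToDevice hipMemcpyHostToDevice
#define gpuMemcpy( dst, src, bytes, dir ) checkGpu( hipMemcpy( dst, src, bytes, dir ) )
#define gpuDeviceReset() hipDeviceReset()
#endif

// Both nvcc and hipcc accept the triple-chevron launch syntax, so a single
// variadic wrapper can replace kernel<<<blocks, threads>>>( args ) calls.
#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ )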
#include "CrossSectionKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessWeights.h" #include "MemoryBuffers.h" @@ -77,7 +78,7 @@ debug_me_is_abnormal( const fptype& me, size_t ievtALL ) //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -185,7 +186,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/CrossSectionKernels.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/CrossSectionKernels.h index 7933ca4bbf..4d9659e04e 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/CrossSectionKernels.h +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/CrossSectionKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef CROSSSECTIONKERNELS_H #define CROSSSECTIONKERNELS_H 1 @@ -13,7 +13,7 @@ //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -96,7 +96,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating the calculation of event statistics on a GPU device class CrossSectionKernelDevice : public CrossSectionKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/CudaRuntime.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/CudaRuntime.h deleted file mode 100644 index 64ce52f4b3..0000000000 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/CudaRuntime.h +++ /dev/null @@ -1,85 +0,0 @@ -// Copyright (C) 2020-2023 CERN and UCLouvain. -// Licensed under the GNU Lesser General Public License (version 3 or later). -// Created by: S. Roiser (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. - -#ifndef MG5AMC_CUDARUNTIME_H -#define MG5AMC_CUDARUNTIME_H 1 - -// MG5AMC on GPU uses the CUDA runtime API, not the lower level CUDA driver API -// See https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html#driver-vs-runtime-api - -#include -#include - -//-------------------------------------------------------------------------- - -// See https://stackoverflow.com/a/14038590 -#ifdef __CUDACC__ /* clang-format off */ -#define checkCuda( code ) { assertCuda( code, __FILE__, __LINE__ ); } -inline void assertCuda( cudaError_t code, const char* file, int line, bool abort = true ) -{ - if( code != cudaSuccess ) - { - printf( "ERROR! 
assertCuda: '%s' (%d) in %s:%d\n", cudaGetErrorString( code ), code, file, line ); - if( abort ) assert( code == cudaSuccess ); - } -} -#endif /* clang-format on */ - -//-------------------------------------------------------------------------- - -#ifdef __CUDACC__ -namespace mg5amcGpu -{ - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - // *** FIXME! This will all need to be designed differently when going to multi-GPU nodes! *** - struct CudaRuntime final - { - CudaRuntime( const bool debug = true ) - : m_debug( debug ) { setUp( m_debug ); } - ~CudaRuntime() { tearDown( m_debug ); } - CudaRuntime( const CudaRuntime& ) = delete; - CudaRuntime( CudaRuntime&& ) = delete; - CudaRuntime& operator=( const CudaRuntime& ) = delete; - CudaRuntime& operator=( CudaRuntime&& ) = delete; - bool m_debug; - - // Set up CUDA application - // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** - // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization - static void setUp( const bool debug = true ) - { - // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization - // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! - /* - // [We initially added cudaFree(0) to "ease profile analysis" only because it shows up as a big recognizable block!] - // No explicit initialization is needed: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#initialization - // It is not clear what cudaFree(0) does at all: https://stackoverflow.com/questions/69967813/ - if ( debug ) std::cout << "__CudaRuntime: calling cudaFree(0)" << std::endl; - checkCuda( cudaFree( 0 ) ); // SLOW! - */ - // Replace cudaFree(0) by cudaSetDevice(0), even if it is not really needed either - // (but see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs) - if( debug ) std::cout << "__CudaRuntime: calling cudaSetDevice(0)" << std::endl; - checkCuda( cudaSetDevice( 0 ) ); // SLOW! - } - - // Tear down CUDA application (call cudaDeviceReset) - // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** - // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck - // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking - static void tearDown( const bool debug = true ) - { - if( debug ) std::cout << "__CudaRuntime: calling cudaDeviceReset()" << std::endl; - checkCuda( cudaDeviceReset() ); - } - }; - -} -#endif - -//-------------------------------------------------------------------------- - -#endif // MG5AMC_CUDARUNTIME_H diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/CurandRandomNumberKernel.cc b/epochX/cudacpp/gg_ttg.sa/SubProcesses/CurandRandomNumberKernel.cc index eb56333b03..08a16f6f2c 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/CurandRandomNumberKernel.cc +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/CurandRandomNumberKernel.cc @@ -1,9 +1,9 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. 
Valassi (2021-2023) for the MG5aMC CUDACPP plugin. -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" @@ -22,7 +22,7 @@ inline void assertCurand( curandStatus_t code, const char *file, int line, bool } #endif /* clang-format on */ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -36,7 +36,7 @@ namespace mg5amcCpu { if( m_isOnDevice ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !m_rnarray.isOnDevice() ) throw std::runtime_error( "CurandRandomNumberKernel on device with a host random number array" ); #else @@ -114,7 +114,7 @@ namespace mg5amcCpu /* printf( "\nCurandRandomNumberKernel::generateRnarray size = %d\n", (int)m_rnarray.size() ); fptype* data = m_rnarray.data(); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( m_rnarray.isOnDevice() ) { data = new fptype[m_rnarray.size()](); @@ -123,7 +123,7 @@ namespace mg5amcCpu #endif for( int i = 0; i < ( (int)m_rnarray.size() / 4 ); i++ ) printf( "[%4d] %f %f %f %f\n", i * 4, data[i * 4], data[i * 4 + 2], data[i * 4 + 2], data[i * 4 + 3] ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( m_rnarray.isOnDevice() ) delete[] data; #endif */ diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/EventStatistics.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/EventStatistics.h index 48b51e0a49..b425a5bade 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/EventStatistics.h +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/EventStatistics.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef EventStatistics_H #define EventStatistics_H 1 @@ -16,7 +16,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/GpuAbstraction.h new file mode 100644 index 0000000000..6a7d9c05c0 --- /dev/null +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/GpuAbstraction.h @@ -0,0 +1,71 @@ +// Copyright (C) 2020-2023 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: J. Teig (Jul 2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
+
+#ifndef MG5AMC_GPUABSTRACTION_H
+#define MG5AMC_GPUABSTRACTION_H 1
+
+#include <cassert>
+
+//--------------------------------------------------------------------------
+
+#ifdef __CUDACC__
+
+#define gpuError_t cudaError_t
+#define gpuPeekAtLastError cudaPeekAtLastError
+#define gpuGetErrorString cudaGetErrorString
+#define gpuSuccess cudaSuccess
+
+#define gpuMallocHost( ptr, size ) checkGpu( cudaMallocHost( ptr, size ) )
+#define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) )
+
+#define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( cudaMemcpy( dstData, srcData, srcBytes, func ) )
+#define gpuMemcpyHostToDevice cudaMemcpyHostToDevice
+#define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost
+#define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( cudaMemcpyToSymbol( type1, type2, size ) )
+
+#define gpuFree( ptr ) checkGpu( cudaFree( ptr ) )
+#define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) )
+
+#define gpuSetDevice cudaSetDevice
+#define gpuDeviceSynchronize cudaDeviceSynchronize
+#define gpuDeviceReset cudaDeviceReset
+
+#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ )
+#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ )
+
+//--------------------------------------------------------------------------
+
+#elif defined __HIPCC__
+
+#include "hip/hip_runtime.h"
+
+#define gpuError_t hipError_t
+#define gpuPeekAtLastError hipPeekAtLastError
+#define gpuGetErrorString hipGetErrorString
+#define gpuSuccess hipSuccess
+
+#define gpuMallocHost( ptr, size ) checkGpu( hipHostMalloc( ptr, size ) ) // hipHostMalloc better (hipMallocHost is deprecated)
+#define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) )
+
+#define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( hipMemcpy( dstData, srcData, srcBytes, func ) )
+#define gpuMemcpyHostToDevice hipMemcpyHostToDevice
+#define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost
+#define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( hipMemcpyToSymbol( type1, type2, size ) )
+
+#define gpuFree( ptr ) checkGpu( hipFree( ptr ) )
+#define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) )
+
+#define gpuSetDevice hipSetDevice
+#define gpuDeviceSynchronize hipDeviceSynchronize
+#define gpuDeviceReset hipDeviceReset
+
+#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ )
+#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ )
+
+//--------------------------------------------------------------------------
+
+#endif
+
+#endif // MG5AMC_GPUABSTRACTION_H
diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/GpuRuntime.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/GpuRuntime.h
new file mode 100644
index 0000000000..93579ef08b
--- /dev/null
+++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/GpuRuntime.h
@@ -0,0 +1,85 @@
+// Copyright (C) 2020-2023 CERN and UCLouvain.
+// Licensed under the GNU Lesser General Public License (version 3 or later).
+// Created by: J. Teig (Jun 2023, based on earlier work by S. Roiser) for the MG5aMC CUDACPP plugin.
+// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
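A hedged usage sketch of the GpuAbstraction.h macros above (hypothetical kernel and driver, not code from this patch); the same call site now builds with nvcc for CUDA or hipcc for HIP, with checkGpu provided by the GpuRuntime.h header that follows:

    #include "GpuAbstraction.h"
    #include "GpuRuntime.h" // provides checkGpu(), used inside the gpu* macros

    __global__ void scale( double* data, double factor ) // hypothetical kernel
    {
      const int i = blockDim.x * blockIdx.x + threadIdx.x;
      data[i] *= factor;
    }

    void example( const int blocks, const int threads ) // hypothetical driver
    {
      double* d = nullptr;
      gpuMalloc( &d, blocks * threads * sizeof( double ) ); // cudaMalloc or hipMalloc, error-checked
      gpuLaunchKernel( scale, blocks, threads, d, 2. );     // scale<<<blocks, threads>>>( d, 2. )
      checkGpu( gpuPeekAtLastError() );
      checkGpu( gpuDeviceSynchronize() );
      gpuFree( d );                                         // cudaFree or hipFree, error-checked
    }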
+
+#ifndef MG5AMC_GPURUNTIME_H
+#define MG5AMC_GPURUNTIME_H 1
+
+// MG5AMC on GPU uses the CUDA runtime API, not the lower level CUDA driver API
+// See https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html#driver-vs-runtime-api
+
+#include "GpuAbstraction.h"
+
+#include <iostream>
+
+//--------------------------------------------------------------------------
+
+// See https://stackoverflow.com/a/14038590
+#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */
+#define checkGpu( code ) { assertGpu( code, __FILE__, __LINE__ ); }
+inline void assertGpu( gpuError_t code, const char* file, int line, bool abort = true )
+{
+ if( code != gpuSuccess )
+ {
+ printf( "ERROR! assertGpu: '%s' (%d) in %s:%d\n", gpuGetErrorString( code ), code, file, line );
+ if( abort ) assert( code == gpuSuccess );
+ }
+}
+#endif /* clang-format on */
+
+//--------------------------------------------------------------------------
+
+#ifdef MGONGPUCPP_GPUIMPL
+namespace mg5amcGpu
+{
+ // Instantiate a GpuRuntime at the beginning of the application's main to
+ // invoke gpuSetDevice(0) in the constructor and book a gpuDeviceReset() call in the destructor
+ // *** FIXME! This will all need to be designed differently when going to multi-GPU nodes! ***
+ struct GpuRuntime final
+ {
+ GpuRuntime( const bool debug = true )
+ : m_debug( debug ) { setUp( m_debug ); }
+ ~GpuRuntime() { tearDown( m_debug ); }
+ GpuRuntime( const GpuRuntime& ) = delete;
+ GpuRuntime( GpuRuntime&& ) = delete;
+ GpuRuntime& operator=( const GpuRuntime& ) = delete;
+ GpuRuntime& operator=( GpuRuntime&& ) = delete;
+ bool m_debug;
+
+ // Set up CUDA application
+ // ** NB: strictly speaking this is not needed when using the CUDA runtime API **
+ // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization
+ static void setUp( const bool debug = true )
+ {
+ // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization
+ // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer!
+ /*
+ // [We initially added cudaFree(0) to "ease profile analysis" only because it shows up as a big recognizable block!]
+ // No explicit initialization is needed: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#initialization
+ // It is not clear what cudaFree(0) does at all: https://stackoverflow.com/questions/69967813/
+ if ( debug ) std::cout << "__CudaRuntime: calling cudaFree(0)" << std::endl;
+ checkCuda( cudaFree( 0 ) ); // SLOW!
+ */
+ // Replace cudaFree(0) by cudaSetDevice(0), even if it is not really needed either
+ // (but see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs)
+ if( debug ) std::cout << "__GpuRuntime: calling gpuSetDevice(0)" << std::endl;
+ checkGpu( gpuSetDevice( 0 ) ); // SLOW!
+ }
+
+ // Tear down CUDA application (call cudaDeviceReset)
+ // ** NB: strictly speaking this is not needed when using the CUDA runtime API **
+ // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck
+ // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking
+ static void tearDown( const bool debug = true )
+ {
+ if( debug ) std::cout << "__GpuRuntime: calling gpuDeviceReset()" << std::endl;
+ checkGpu( gpuDeviceReset() );
+ }
+ };
+}
+#endif
+
+//--------------------------------------------------------------------------
+
+#endif // MG5AMC_GPURUNTIME_H
diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MadgraphTest.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MadgraphTest.h
index ef40624c88..a64c05c26a 100644
--- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MadgraphTest.h
+++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MadgraphTest.h
@@ -1,7 +1,7 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: S. Hageboeck (Dec 2020) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Hageboeck, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Hageboeck, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
 
 #ifndef MADGRAPHTEST_H_
 #define MADGRAPHTEST_H_ 1
@@ -22,7 +22,7 @@
 #include
 #include
 
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 using mg5amcGpu::CPPProcess;
 #else
 using mg5amcCpu::CPPProcess;
@@ -201,7 +201,7 @@ class MadgraphTest : public testing::TestWithParam
 
 // Since we link both the CPU-only and GPU tests into the same executable, we prevent
 // a multiply defined symbol by only compiling this in the non-CUDA phase:
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
 
 /// Compare momenta and matrix elements.
 /// This uses an implementation of TestDriverBase to run a madgraph workflow,
@@ -307,6 +307,6 @@ TEST_P( MadgraphTest, CompareMomentaAndME )
 }
 }
 
-#endif // __CUDACC__
+#endif // MGONGPUCPP_GPUIMPL
 
 #endif /* MADGRAPHTEST_H_ */
diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MatrixElementKernels.cc
index 74b5239ebf..81699dfea9 100644
--- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MatrixElementKernels.cc
+++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MatrixElementKernels.cc
@@ -1,12 +1,12 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
 
 #include "MatrixElementKernels.h"
 
 #include "CPPProcess.h"
-#include "CudaRuntime.h"
+#include "GpuRuntime.h" // Includes the abstraction for Nvidia/AMD compilation
 #include "MemoryAccessMomenta.h"
 #include "MemoryBuffers.h"
 
@@ -14,7 +14,7 @@
 
 //============================================================================
 
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
 namespace mg5amcCpu
 {
@@ -150,7 +150,7 @@ namespace mg5amcCpu
 
 //============================================================================
 
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 namespace mg5amcGpu
 {
@@ -209,13 +209,13 @@ namespace mg5amcGpu
 PinnedHostBufferHelicityMask hstIsGoodHel( ncomb );
 DeviceBufferHelicityMask devIsGoodHel( ncomb );
 // ... 0d1. Compute good helicity mask on the device
-    computeDependentCouplings<<<m_gpublocks, m_gputhreads>>>( m_gs.data(), m_couplings.data() );
+    gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    sigmaKin_getGoodHel<<<m_gpublocks, m_gputhreads>>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() );
+    gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() );
 #else
-    sigmaKin_getGoodHel<<<m_gpublocks, m_gputhreads>>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() );
+    gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() );
 #endif
-    checkCuda( cudaPeekAtLastError() );
+    checkGpu( gpuPeekAtLastError() );
 // ... 0d2. Copy back good helicity mask to the host
 copyHostFromDevice( hstIsGoodHel, devIsGoodHel );
 // ... 0d3. Copy back good helicity list to constant memory on the device
@@ -226,19 +226,19 @@ namespace mg5amcGpu
 
 void MatrixElementKernelDevice::computeMatrixElements( const unsigned int channelId )
 {
-    computeDependentCouplings<<<m_gpublocks, m_gputhreads>>>( m_gs.data(), m_couplings.data() );
+    gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() );
 #ifndef MGONGPU_NSIGHT_DEBUG
 constexpr unsigned int sharedMemSize = 0;
 #else
 constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float );
 #endif
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    sigmaKin<<<m_gpublocks, m_gputhreads, sharedMemSize>>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() );
+    gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() );
 #else
-    sigmaKin<<<m_gpublocks, m_gputhreads, sharedMemSize>>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() );
+    gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() );
 #endif
-    checkCuda( cudaPeekAtLastError() );
-    checkCuda( cudaDeviceSynchronize() );
+    checkGpu( gpuPeekAtLastError() );
+    checkGpu( gpuDeviceSynchronize() );
 }
 
 //--------------------------------------------------------------------------
diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MatrixElementKernels.h
index 23e84757a2..72bd8f195b 100644
--- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MatrixElementKernels.h
+++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MatrixElementKernels.h
@@ -1,7 +1,7 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
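The launch-and-check pattern converted above is mechanical; a hedged sketch (hypothetical kernel, not code from this patch) of what it expands to under nvcc:

    __global__ void poke( int* flag ) { *flag = 1; } // hypothetical kernel

    void launchChecked( int* devFlag )
    {
      gpuLaunchKernel( poke, 1, 32, devFlag ); // expands to poke<<<1, 32>>>( devFlag )
      checkGpu( gpuPeekAtLastError() );        // catches synchronous launch-configuration errors
      checkGpu( gpuDeviceSynchronize() );      // surfaces asynchronous errors from the kernel itself
    }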
#ifndef MATRIXELEMENTKERNELS_H #define MATRIXELEMENTKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -81,7 +81,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a CPU host class MatrixElementKernelHost final : public MatrixElementKernelBase, public NumberOfEvents { @@ -130,7 +130,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a GPU device class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessAmplitudes.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessAmplitudes.h index 573b3bbbc9..ffb76e93de 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessAmplitudes.h +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessAmplitudes.h @@ -15,7 +15,7 @@ #define MGONGPU_TRIVIAL_AMPLITUDES 1 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessCouplings.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessCouplings.h index 35a3af42e0..3afdf3e554 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessCouplings.h +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessCouplings.h @@ -15,7 +15,7 @@ #include "MemoryBuffers.h" // for HostBufferCouplings::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessCouplingsFixed.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessCouplingsFixed.h index dc0d93afff..ffcdf4dbef 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessCouplingsFixed.h +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessCouplingsFixed.h @@ -14,7 +14,7 @@ //#include "MemoryAccessHelpers.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessDenominators.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessDenominators.h index 3bce635718..66f2d32a6b 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessDenominators.h +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessDenominators.h @@ -10,7 +10,7 @@ #include "MemoryAccessGs.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessGs.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessGs.h index 31311aa375..4c726b30f3 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessGs.h +++ 
b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessGs.h @@ -13,7 +13,7 @@ #include "MemoryBuffers.h" // for HostBufferMatrixElements::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessHelpers.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessHelpers.h index c82a6c7635..db73e4e064 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessHelpers.h +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessHelpers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessHelpers_H #define MemoryAccessHelpers_H 1 @@ -105,7 +105,7 @@ class KernelAccessHelper : public MemoryAccessHelper } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid //printf( "kernelAccessRecord: ievt=%d threadId=%d\n", ievt, threadIdx.x ); return T::ieventAccessRecord( buffer, ievt ); // NB fptype and fptype_sv coincide for CUDA diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessMatrixElements.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessMatrixElements.h index f32e6fea5b..3741011971 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessMatrixElements.h +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessMatrixElements.h @@ -13,7 +13,7 @@ #include "MemoryBuffers.h" // for HostBufferMatrixElements::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessMomenta.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessMomenta.h index 29266de32c..3be229d392 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessMomenta.h +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessMomenta.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
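The KernelAccessHelper hunk above preserves the indexing convention of the codebase; a minimal sketch of it (hypothetical accessor, not from this patch): GPU builds map one event to one thread, while CPU builds receive an explicit event index from the caller's event loop.

    #ifdef MGONGPUCPP_GPUIMPL
    __device__ double& eventRecord( double* buffer ) // GPU: one event per thread
    {
      const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid
      return buffer[ievt];
    }
    #else
    double& eventRecord( double* buffer, const int ievt ) // CPU: explicit event index
    {
      return buffer[ievt];
    }
    #endif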
#ifndef MemoryAccessMomenta_H #define MemoryAccessMomenta_H 1 @@ -13,7 +13,7 @@ #include "MemoryAccessVectors.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -30,7 +30,7 @@ namespace mg5amcCpu // Number of Events Per Page in the momenta AOSOA memory buffer layout // (these are all best kept as a compile-time constants: see issue #23) -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ // ----------------------------------------------------------------------------------------------- // --- GPUs: neppM is best set to a power of 2 times the number of fptype's in a 32-byte cacheline // --- This is relevant to ensure coalesced access to momenta in global memory diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessNumerators.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessNumerators.h index b152183b28..18991f4fa6 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessNumerators.h +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessNumerators.h @@ -10,7 +10,7 @@ #include "MemoryAccessGs.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessRandomNumbers.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessRandomNumbers.h index e2988d39f3..40cb089135 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessRandomNumbers.h +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessRandomNumbers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessRandomNumbers_H #define MemoryAccessRandomNumbers_H 1 @@ -11,7 +11,7 @@ #include "CPPProcess.h" #include "MemoryAccessHelpers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessVectors.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessVectors.h index e9b197368e..08faccff0f 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessVectors.h +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessVectors.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
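Note how the guards divide the work after this change; a hedged sketch of the intended layering (illustrative only):

    #ifdef MGONGPUCPP_GPUIMPL // any GPU implementation (set for both nvcc and hipcc builds)
    #ifdef __CUDACC__
    // nvcc-only paths, e.g. curand device random numbers
    #elif defined __HIPCC__
    // hipcc-only paths, e.g. the hip/hip_runtime.h include in GpuAbstraction.h
    #endif
    #else
    // C++-only paths, e.g. the SIMD vector access in MemoryAccessVectors.h
    #endif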
#ifndef MemoryAccessVectors_H
#define MemoryAccessVectors_H 1
@@ -10,7 +10,7 @@
 
 #include "mgOnGpuVectors.h"
 
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
 namespace mg5amcCpu // this is only needed for CPU SIMD vectorization
 {
diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessWavefunctions.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessWavefunctions.h
index 5428aaf933..33bef4559e 100644
--- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessWavefunctions.h
+++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessWavefunctions.h
@@ -15,7 +15,7 @@
 #define MGONGPU_TRIVIAL_WAVEFUNCTIONS 1
 
 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725)
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 namespace mg5amcGpu
 #else
 namespace mg5amcCpu
diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryBuffers.h
index 3093e6ed18..7756a71621 100644
--- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryBuffers.h
+++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryBuffers.h
@@ -1,7 +1,7 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Dec 2021, based on earlier work by S. Hageboeck) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.
 
 #ifndef MemoryBuffers_H
 #define MemoryBuffers_H 1
@@ -11,12 +11,12 @@
 #include "mgOnGpuCxtypes.h"
 
 #include "CPPProcess.h"
-#include "CudaRuntime.h"
+#include "GpuRuntime.h"
 #include "Parameters_sm.h"
 
 #include <sstream>
 
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 namespace mg5amcGpu
 #else
 namespace mg5amcCpu
@@ -87,7 +87,7 @@ namespace mg5amcCpu
 
 //--------------------------------------------------------------------------
 
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
 constexpr bool HostBufferALIGNED = false;   // ismisaligned=false
 constexpr bool HostBufferMISALIGNED = true; // ismisaligned=true
 
@@ -119,7 +119,7 @@ namespace mg5amcCpu
 
 //--------------------------------------------------------------------------
 
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 // A class encapsulating a CUDA pinned host buffer
 template<typename T>
 class PinnedHostBufferBase : public BufferBase<T>
 {
@@ -128,18 +128,18 @@ namespace mg5amcCpu
 PinnedHostBufferBase( const size_t size )
 : BufferBase<T>( size, false )
 {
-      checkCuda( cudaMallocHost( &( this->m_data ), this->bytes() ) );
+      gpuMallocHost( &( this->m_data ), this->bytes() );
 }
 virtual ~PinnedHostBufferBase()
 {
-      checkCuda( cudaFreeHost( this->m_data ) );
+      gpuFreeHost( this->m_data );
 }
 };
 #endif
 
 //--------------------------------------------------------------------------
 
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 // A class encapsulating a CUDA device buffer
 template<typename T>
 class DeviceBufferBase : public BufferBase<T>
 {
@@ -148,18 +148,18 @@ namespace mg5amcCpu
 DeviceBufferBase( const size_t size )
 : BufferBase<T>( size, true )
 {
-      checkCuda( cudaMalloc( &( this->m_data ), this->bytes() ) );
+      gpuMalloc( &( this->m_data ), this->bytes() );
 }
 virtual ~DeviceBufferBase()
 {
-      checkCuda( cudaFree( this->m_data ) );
+      gpuFree( this->m_data );
 }
 };
 #endif
 
 //--------------------------------------------------------------------------
 
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
 // A class encapsulating a C++ host buffer for a given number of events
 template<typename T, size_t sizePerEvent, bool ismisaligned>
 class HostBuffer : public HostBufferBase<T, ismisaligned>, virtual private NumberOfEvents
@@ -175,7 +175,7 @@ namespace mg5amcCpu
 
 //--------------------------------------------------------------------------
 
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 // A class encapsulating a CUDA pinned host buffer for a given number of events
 template<typename T, size_t sizePerEvent>
 class PinnedHostBuffer : public PinnedHostBufferBase<T>, virtual private NumberOfEvents
@@ -191,7 +191,7 @@ namespace mg5amcCpu
 
 //--------------------------------------------------------------------------
 
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 // A class encapsulating a CUDA device buffer for a given number of events
 template<typename T, size_t sizePerEvent>
 class DeviceBuffer : public DeviceBufferBase<T>, virtual private NumberOfEvents
@@ -213,7 +213,7 @@ namespace mg5amcCpu
 
 // The size (number of elements) per event in a memory buffer for momenta random numbers
 constexpr size_t sizePerEventRndNumMomenta = MemoryBuffers::np4 * MemoryBuffers::nparf;
 
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
 // A class encapsulating a C++ host buffer for momenta random numbers
 typedef HostBuffer<fptype, sizePerEventRndNumMomenta, HostBufferALIGNED> HostBufferRndNumMomenta;
 #else
@@ -232,7 +232,7 @@ namespace mg5amcCpu
 
 // The size (number of elements) per event in a memory buffer with ONE fptype per event
 constexpr size_t sizePerEventOneFp = 1;
 
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
 // A class encapsulating a C++ host buffer with ONE fptype per event
 typedef HostBuffer<fptype, sizePerEventOneFp, HostBufferALIGNED> HostBufferOneFp;
 #else
@@ -257,7 +257,7 @@ namespace mg5amcCpu
 
 // The size (number of elements) per event in a memory buffer for Gs
 constexpr size_t sizePerEventGs = 1;
 
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
 // A class encapsulating a C++ host buffer for gs
 typedef HostBuffer<fptype, sizePerEventGs, HostBufferALIGNED> HostBufferGs;
 #else
@@ -276,7 +276,7 @@ namespace mg5amcCpu
 
 // The size (number of elements) per event in a memory buffer for numerators
 constexpr size_t sizePerEventNumerators = 1;
 
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
 // A class encapsulating a C++ host buffer for gs
 typedef HostBuffer<fptype, sizePerEventNumerators, HostBufferALIGNED> HostBufferNumerators;
 #else
@@ -296,7 +296,7 @@ namespace mg5amcCpu
 
 // The size (number of elements) per event in a memory buffer for denominators
 constexpr size_t sizePerEventDenominators = 1;
 
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
 // A class encapsulating a C++ host buffer for gs
 typedef HostBuffer<fptype, sizePerEventDenominators, HostBufferALIGNED> HostBufferDenominators;
 #else
@@ -315,7 +315,7 @@ namespace mg5amcCpu
 
 // The size (number of elements) per event in a memory buffer for random numbers
 constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2;
 
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
 // A class encapsulating a C++ host buffer for gs
 typedef HostBuffer<fptype, sizePerEventCouplings, HostBufferALIGNED> HostBufferCouplings;
 #else
@@ -333,7 +333,7 @@ namespace mg5amcCpu
 
 // The size (number of elements) per event in a memory buffer for momenta
 constexpr size_t sizePerEventMomenta = MemoryBuffers::np4 * MemoryBuffers::npar;
 
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
 // A class encapsulating a C++ host buffer for momenta
 typedef HostBuffer<fptype, sizePerEventMomenta, HostBufferALIGNED> HostBufferMomenta;
 //typedef HostBuffer<fptype, sizePerEventMomenta, HostBufferMISALIGNED> HostBufferMomenta; // TEST MISALIGNMENT!
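A hedged usage sketch of the buffer families above (hypothetical driver; typedefs and copy helper as declared in MemoryBuffers.h): allocation and release are RAII, with the backing store chosen by the build type.

    void exampleBuffers( const size_t nevt ) // hypothetical driver
    {
    #ifndef MGONGPUCPP_GPUIMPL
      HostBufferMomenta hstMomenta( nevt );         // aligned new[] on the host
    #else
      PinnedHostBufferMomenta hstMomenta( nevt );   // gpuMallocHost: pinned host memory
      DeviceBufferMomenta devMomenta( nevt );       // gpuMalloc: device global memory
      copyDeviceFromHost( devMomenta, hstMomenta ); // size-checked gpuMemcpy HostToDevice
    #endif
    } // destructors release the memory (delete[], gpuFreeHost or gpuFree)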
@@ -352,7 +352,7 @@ namespace mg5amcCpu
 
 // The size (number of elements) per event in a memory buffer for sampling weights
 constexpr size_t sizePerEventWeights = 1;
 
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
 // A class encapsulating a C++ host buffer for sampling weights
 typedef HostBuffer<fptype, sizePerEventWeights, HostBufferALIGNED> HostBufferWeights;
 #else
@@ -370,7 +370,7 @@ namespace mg5amcCpu
 
 // The size (number of elements) per event in a memory buffer for matrix elements
 constexpr size_t sizePerEventMatrixElements = 1;
 
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
 // A class encapsulating a C++ host buffer for matrix elements
 typedef HostBuffer<fptype, sizePerEventMatrixElements, HostBufferALIGNED> HostBufferMatrixElements;
 #else
@@ -385,7 +385,7 @@ namespace mg5amcCpu
 
 // A base class encapsulating a memory buffer for the helicity mask
 typedef BufferBase<bool> BufferHelicityMask;
 
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
 // A class encapsulating a C++ host buffer for the helicity mask
 typedef HostBufferBase<bool, HostBufferALIGNED> HostBufferHelicityMask;
 #else
@@ -403,7 +403,7 @@ namespace mg5amcCpu
 
 // The size (number of elements) per event in a memory buffer for wavefunctions
 constexpr size_t sizePerEventWavefunctions = MemoryBuffers::nw6 * MemoryBuffers::nx2;
 
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
 // A class encapsulating a C++ host buffer for wavefunctions
 typedef HostBuffer<fptype, sizePerEventWavefunctions, HostBufferALIGNED> HostBufferWavefunctions;
 #else
@@ -421,7 +421,7 @@ namespace mg5amcCpu
 
 // The size (number of elements) per event in a memory buffer for helicity random numbers
 constexpr size_t sizePerEventRndNumHelicity = 1;
 
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
 // A class encapsulating a C++ host buffer for helicity random numbers
 typedef HostBuffer<fptype, sizePerEventRndNumHelicity, HostBufferALIGNED> HostBufferRndNumHelicity;
 #else
@@ -439,7 +439,7 @@ namespace mg5amcCpu
 
 // The size (number of elements) per event in a memory buffer for color random numbers
 constexpr size_t sizePerEventRndNumColor = 1;
 
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
 // A class encapsulating a C++ host buffer for color random numbers
 typedef HostBuffer<fptype, sizePerEventRndNumColor, HostBufferALIGNED> HostBufferRndNumColor;
 #else
@@ -457,7 +457,7 @@ namespace mg5amcCpu
 
 // The size (number of elements) per event in a memory buffer for helicity selection
 constexpr size_t sizePerEventSelectedHelicity = 1;
 
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
 // A class encapsulating a C++ host buffer for helicity selection
 typedef HostBuffer<int, sizePerEventSelectedHelicity, HostBufferALIGNED> HostBufferSelectedHelicity;
 #else
@@ -475,7 +475,7 @@ namespace mg5amcCpu
 
 // The size (number of elements) per event in a memory buffer for color selection
 constexpr size_t sizePerEventSelectedColor = 1;
 
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
 // A class encapsulating a C++ host buffer for color selection
 typedef HostBuffer<int, sizePerEventSelectedColor, HostBufferALIGNED> HostBufferSelectedColor;
 #else
@@ -487,7 +487,7 @@ namespace mg5amcCpu
 
 //--------------------------------------------------------------------------
 
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 template<class Tdst, class Tsrc>
 void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy
 {
@@ -504,13 +504,13 @@ namespace mg5amcCpu
 throw std::runtime_error( sstr.str() );
 }
 // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array
-    checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyHostToDevice ) );
+    gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyHostToDevice );
 }
 #endif
 
 //--------------------------------------------------------------------------
 
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 template<class Tdst, class Tsrc>
 void copyHostFromDevice( Tdst& dst, const Tsrc& src ) // keep the same order
of arguments as in memcpy { @@ -527,7 +527,7 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyDeviceToHost ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyDeviceToHost ); } #endif diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.cc b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.cc index 2988a13b82..2e02593919 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.cc +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -46,7 +45,7 @@ // Class member functions for calculating the matrix elements for // Process: g g > t t~ g WEIGHTED<=3 @1 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -80,7 +79,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -90,7 +89,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -118,13 +117,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -151,7 +150,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -187,7 +186,7 @@ namespace mg5amcCpu #endif for( 
int iParity = 0; iParity < nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings @@ -200,8 +199,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -499,7 +500,7 @@ namespace mg5amcCpu { 1, -8, 10, 1, 64, -8 }, { 10, 1, 1, -8, -8, 64 } }; // 2-D array[6][6] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -556,7 +557,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -615,7 +616,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -678,8 +679,8 @@ namespace mg5amcCpu { 1, 1, 1, 1, 1 }, { 1, 1, 1, -1, -1 }, { 1, 1, 1, -1, 1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -720,9 +721,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -759,7 +760,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] 
// [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -824,12 +825,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -850,7 +851,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -976,9 +977,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -1002,7 +1003,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -1022,7 +1023,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 256 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -1036,9 +1037,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -1066,7 +1070,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -1276,7 +1280,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.h index 37d6ebe981..11f562273e 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.h +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -107,7 +107,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -120,7 +120,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -150,7 +150,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CudaRuntime.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/GpuAbstraction.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/GpuAbstraction.h new file mode 120000 index 0000000000..72054e19ba --- /dev/null +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/GpuAbstraction.h @@ -0,0 +1 @@ +../GpuAbstraction.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/GpuRuntime.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/GpuRuntime.h new file mode 120000 index 0000000000..3920e83be4 --- /dev/null +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/GpuRuntime.h @@ -0,0 +1 @@ +../GpuRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/check_sa.cc b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/check_sa.cc index 
3fbf0ffbee..7cac5ab47b 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/check_sa.cc +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -65,7 +66,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -96,7 +97,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -134,9 +135,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784) + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) +#elif defined __HIPCC__ +#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random #elif defined __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU if build has curand + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #endif @@ -146,10 +149,10 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) 
bool bridge = false; @@ -177,7 +180,7 @@ main( int argc, char** argv ) else if( arg == "--curdev" ) { #ifndef __CUDACC__ - throw std::runtime_error( "CurandDevice is not supported on CPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" ); #elif defined MGONGPU_HAS_NO_CURAND throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" ); #else @@ -198,7 +201,7 @@ main( int argc, char** argv ) } else if( arg == "--rmbdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -272,13 +275,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -296,14 +299,14 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL - // --- 00. Initialise cuda - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - const std::string cdinKey = "00 CudaInit"; + // --- 00. Initialise GPU + // Instantiate a GpuRuntime at the beginning of the application's main. + // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. + const std::string cdinKey = "00 GpuInit"; timermap.start( cdinKey ); - CudaRuntime cudaRuntime( debug ); + GpuRuntime gpuRuntime( debug ); #endif // --- 0a.
Initialise physics process @@ -325,7 +328,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -333,7 +336,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -341,7 +344,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -349,7 +352,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -366,7 +369,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -375,7 +378,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -384,7 +387,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -392,7 +395,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -400,7 +403,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -438,7 +441,7 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! 
CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } @@ -450,7 +453,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -461,7 +464,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -469,7 +472,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -511,7 +514,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -543,7 +546,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -588,7 +591,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -617,7 +620,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -760,18 +763,22 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; -#endif +#endif /* clang-format off */ // -- DOUBLE or FLOAT? #if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) @@ -781,7 +788,7 @@ main( int argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif +#endif /* clang-format on */ // -- CUCOMPLEX or THRUST or STD complex numbers? 
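The GpuRuntime object instantiated in "step 00" of check_sa.cc above follows the classic RAII pattern: device selection in the constructor, device reset booked for the destructor. A minimal sketch of that idea, assuming the CUDA backend (the real class lives in the new GpuRuntime.h, which is not shown in this hunk, and GpuRuntimeSketch is a hypothetical name used only for illustration):

    #include <cuda_runtime.h>
    #include <stdexcept>

    struct GpuRuntimeSketch // hypothetical name, for illustration only
    {
      GpuRuntimeSketch()
      {
        // select GPU 0 for all subsequent work in this process
        if( cudaSetDevice( 0 ) != cudaSuccess )
          throw std::runtime_error( "GpuRuntimeSketch: cudaSetDevice( 0 ) failed" );
      }
      ~GpuRuntimeSketch()
      {
        cudaDeviceReset(); // runs when the object in main() goes out of scope
      }
    };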
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -793,6 +800,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_CUCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -818,7 +831,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -874,7 +887,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -895,6 +908,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -921,21 +936,21 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? " == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -966,7 +981,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1062,14 +1077,14 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1077,7 +1092,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? 
" == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/RamboSamplingKernels.cc b/epochX/cudacpp/gg_ttg.sa/SubProcesses/RamboSamplingKernels.cc index da68aa9255..79abbcc4f8 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/RamboSamplingKernels.cc +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/RamboSamplingKernels.cc @@ -1,11 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "RamboSamplingKernels.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessRandomNumbers.h" #include "MemoryAccessWeights.h" @@ -14,7 +14,7 @@ #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -92,7 +92,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingKernelDevice::RamboSamplingKernelDevice( const fptype energy, // input: energy const BufferRndNumMomenta& rndmom, // input: random numbers in [0,1] BufferMomenta& momenta, // output: momenta @@ -135,7 +135,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaInitialDevice( const fptype energy, fptype* momenta ) @@ -147,17 +147,17 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaInitial() { - getMomentaInitialDevice<<>>( m_energy, m_momenta.data() ); + gpuLaunchKernel( getMomentaInitialDevice, m_gpublocks, m_gputhreads, m_energy, m_momenta.data() ); } #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaFinalDevice( const fptype energy, const fptype* rndmom, @@ -171,11 +171,11 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaFinal() { - getMomentaFinalDevice<<>>( m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); + gpuLaunchKernel( getMomentaFinalDevice, m_gpublocks, m_gputhreads, m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); } #endif diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/RamboSamplingKernels.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/RamboSamplingKernels.h index 184089efd7..7c214cd74b 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/RamboSamplingKernels.h +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/RamboSamplingKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#ifndef RAMBOSAMPLINGKERNELS_H #define RAMBOSAMPLINGKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating RAMBO phase space sampling on a GPU device class RamboSamplingKernelDevice final : public SamplingKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/RandomNumberKernels.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/RandomNumberKernels.h index 188a72c2c9..21d63beeac 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/RandomNumberKernels.h +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/RandomNumberKernels.h @@ -1,14 +1,14 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef RANDOMNUMBERKERNELS_H #define RANDOMNUMBERKERNELS_H 1 #include "mgOnGpuConfig.h" -// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when __CUDACC__ is not defined +// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when MGONGPUCPP_GPUIMPL is not defined #ifndef MGONGPU_HAS_NO_CURAND //#include "curand.h" struct curandGenerator_st; // forward definition from curand.h @@ -16,7 +16,7 @@ struct curandGenerator_st; // forward definition from curand.h #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp.mk index 509307506b..f2cfa349da 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ # Copyright (C) 2020-2023 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +# Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts @@ -42,10 +42,10 @@ endif #------------------------------------------------------------------------------- -#=== Configure common compiler flags for C++ and CUDA +#=== Configure common compiler flags for C++ and CUDA/HIP INCFLAGS = -I. 
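RandomNumberKernels.h above forward-declares curandGenerator_st instead of including curand.h, so the header still parses in builds without curand; only the translation unit that actually drives curand needs the real header. A sketch of the idiom, with a hypothetical class name:

    // Forward-declare curand's opaque generator type instead of including curand.h.
    struct curandGenerator_st; // full definition only in curand.h
    class CurandKernelSketch // hypothetical, for illustration only
    {
    private:
      curandGenerator_st* m_rnGen = nullptr; // a pointer to an incomplete type is legal here
    };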
-OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here +OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here # Dependency on src directory MG5AMC_COMMONLIB = mg5amc_common @@ -121,24 +121,46 @@ endif #------------------------------------------------------------------------------- -#=== Configure the CUDA compiler +#=== Configure the GPU compiler (CUDA or HIP) -# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) -# This is because it is impossible to pass this to "CUFLAGS += -ccbin " below +# FIXME! (AV 24.01.2024) +# In the current implementation (without separate builds for C++ and CUDA/HIP), we first check for nvcc and hipcc in CUDA_HOME and HIP_HOME. +# If CUDA_HOME or HIP_HOME are not set, try to determine them from the path to nvcc and hipcc. +# While convoluted, this is currently necessary to allow disabling CUDA/HIP builds by setting CUDA_HOME or HIP_HOME to invalid paths. +# This will (probably?) be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775). + +# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA and HIP builds (issue #505) +# This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside - $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") + $(warning CUDA and HIP builds are not supported for multi-word CXX "$(CXX)") override CUDA_HOME=disabled + override HIP_HOME=disabled endif -# If CUDA_HOME is not set, try to set it from the location of nvcc +# If CUDA_HOME is not set, try to set it from the path to nvcc ifndef CUDA_HOME CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null)) $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") endif -# Set NVCC as $(CUDA_HOME)/bin/nvcc if it exists +# If HIP_HOME is not set, try to set it from the path to hipcc +ifndef HIP_HOME + HIP_HOME = $(patsubst %bin/hipcc,%,$(shell which hipcc 2>/dev/null)) + $(warning HIP_HOME was not set: using "$(HIP_HOME)") +endif + +# FIXME! (AV 24.01.2024) +# In the current implementation (without separate builds for C++ and CUDA/HIP), +# builds are performed for HIP only if CUDA is not found in the path. +# If both CUDA and HIP are installed, HIP builds can be triggered by unsetting CUDA_HOME. +# This will be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775). + +#--- Option 1: CUDA exists -> use CUDA + +# Set GPUCC as $(CUDA_HOME)/bin/nvcc if it exists ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) - NVCC = $(CUDA_HOME)/bin/nvcc + + GPUCC = $(CUDA_HOME)/bin/nvcc USE_NVTX ?=-DUSE_NVTX # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ @@ -158,41 +180,78 @@ ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here!
endif CUOPTFLAGS = -lineinfo - CUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math - ###CUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow - ###NVCC_VERSION = $(shell $(NVCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) - CUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h + ###GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + GPUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + ###GPUCC_VERSION = $(shell $(GPUCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) + GPUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + CUBUILDRULEFLAGS = -Xcompiler -fPIC -c + CCBUILDRULEFLAGS = -Xcompiler -fPIC -c -x cu + CUDATESTFLAGS = -lcuda + + # Set the host C++ compiler for GPUCC via "-ccbin " + # (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) + GPUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) + + # Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) + ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) + GPUFLAGS += -allow-unsupported-compiler + endif + else ifneq ($(origin REQUIRE_CUDA),undefined) + # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) - $(error No cuda installation found (set CUDA_HOME or make nvcc visible in PATH)) + $(error No cuda installation found (set CUDA_HOME or make nvcc (GPUCC) visible in PATH)) + +#--- Option 2: CUDA does not exist, HIP exists -> use HIP + +# Set GPUCC as $(HIP_HOME)/bin/hipcc if it exists +else ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),) + + GPUCC = $(HIP_HOME)/bin/hipcc + #USE_NVTX ?=-DUSE_NVTX # should maybe find something equivalent to this in HIP?
+ HIPARCHFLAGS = -target x86_64-linux-gnu --offload-arch=gfx90a + HIPINC = -I$(HIP_HOME)/include/ + # Note: -DHIP_FAST_MATH is equivalent to -use_fast_math in HIP + # (but only for single precision; see line 208 of https://rocm-developer-tools.github.io/HIP/hcc__detail_2math__functions_8h_source.html) + GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -DHIP_FAST_MATH -DHIP_PLATFORM=amd -fPIC + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + GPUFLAGS += -std=c++17 + ###GPUFLAGS+= --maxrregcount 255 # (AV: is this option valid on HIP and meaningful on AMD GPUs?) + CUBUILDRULEFLAGS = -fPIC -c + CCBUILDRULEFLAGS = -fPIC -c + +else ifneq ($(origin REQUIRE_HIP),undefined) + + # If REQUIRE_HIP is set but no HIP is found, stop here (e.g. for CI tests on GPU #443) + $(error No hip installation found (set HIP_HOME or make hipcc (GPUCC) visible in PATH)) + +#--- Option 3: CUDA does not exist, HIP does not exist -> switch off both CUDA and HIP + else - # No cuda. Switch cuda compilation off and go to common random numbers in C++ + + # No nvcc and no hipcc: switch CUDA and HIP compilation off and go to common random numbers in C++ $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda) + $(warning HIP_HOME is not set or is invalid: export HIP_HOME to compile with hip) + override GPUCC= override USE_NVTX= override CUINC= override CURANDLIBFLAGS= -endif -export NVCC -export CUFLAGS - -# Set the host C++ compiler for nvcc via "-ccbin " -# (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) -CUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) -# Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) -ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) -CUFLAGS += -allow-unsupported-compiler endif +# Export GPUCC (so that it can also be used in cudacpp_src.mk?) +export GPUCC +export GPUFLAGS + #------------------------------------------------------------------------------- -#=== Configure ccache for C++ and CUDA builds +#=== Configure ccache for C++ and CUDA/HIP builds # Enable ccache if USECCACHE=1 ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) @@ -201,15 +260,15 @@ endif #ifeq ($(USECCACHE)$(shell echo $(AR) | grep ccache),1) # override AR:=ccache $(AR) #endif -ifneq ($(NVCC),) - ifeq ($(USECCACHE)$(shell echo $(NVCC) | grep ccache),1) - override NVCC:=ccache $(NVCC) +ifneq ($(GPUCC),) + ifeq ($(USECCACHE)$(shell echo $(GPUCC) | grep ccache),1) + override GPUCC:=ccache $(GPUCC) endif endif #------------------------------------------------------------------------------- -#=== Configure PowerPC-specific compiler flags for C++ and CUDA +#=== Configure PowerPC-specific compiler flags for C++ and CUDA/HIP # PowerPC-specific CXX compiler flags (being reviewed) ifeq ($(UNAME_P),ppc64le) @@ -225,9 +284,9 @@ else ######CXXFLAGS+= -fno-semantic-interposition # no benefit (neither alone, nor combined with -flto) endif -# PowerPC-specific CUDA compiler flags (to be reviewed!) +# PowerPC-specific CUDA/HIP compiler flags (to be reviewed!)
ifeq ($(UNAME_P),ppc64le) - CUFLAGS+= -Xcompiler -mno-float128 + GPUFLAGS+= -Xcompiler -mno-float128 endif #------------------------------------------------------------------------------- @@ -237,7 +296,7 @@ endif # Set the default OMPFLAGS choice ifneq ($(shell $(CXX) --version | egrep '^Intel'),) override OMPFLAGS = -fopenmp -###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without nvcc but not ok with nvcc before #578) +###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without GPUCC but not ok with GPUCC before #578) else ifneq ($(shell $(CXX) --version | egrep '^(clang)'),) override OMPFLAGS = -fopenmp ###override OMPFLAGS = # disable OpenMP MT on clang (was not ok without or with nvcc before #578) @@ -293,7 +352,10 @@ endif # Set the default RNDGEN (random number generator) choice ifeq ($(RNDGEN),) - ifeq ($(NVCC),) + ifeq ($(GPUCC),) + override RNDGEN = hasNoCurand + # Edge case for HIP compilation + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) override RNDGEN = hasNoCurand else ifeq ($(RNDGEN),) override RNDGEN = hasCurand @@ -310,7 +372,7 @@ export OMPFLAGS #------------------------------------------------------------------------------- -#=== Set the CUDA/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN +#=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN # Set the build flags appropriate to OMPFLAGS $(info OMPFLAGS=$(OMPFLAGS)) @@ -368,13 +430,13 @@ CXXFLAGS+= $(AVXFLAGS) $(info FPTYPE=$(FPTYPE)) ifeq ($(FPTYPE),d) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE - CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE else ifeq ($(FPTYPE),f) CXXFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT - CUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT else ifeq ($(FPTYPE),m) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT - CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT else $(error Unknown FPTYPE='$(FPTYPE)': only 'd', 'f' and 'm' are supported) endif @@ -383,7 +445,7 @@ endif $(info HELINL=$(HELINL)) ifeq ($(HELINL),1) CXXFLAGS += -DMGONGPU_INLINE_HELAMPS - CUFLAGS += -DMGONGPU_INLINE_HELAMPS + GPUFLAGS += -DMGONGPU_INLINE_HELAMPS else ifneq ($(HELINL),0) $(error Unknown HELINL='$(HELINL)': only '0' and '1' are supported) endif @@ -392,7 +454,7 @@ endif $(info HRDCOD=$(HRDCOD)) ifeq ($(HRDCOD),1) CXXFLAGS += -DMGONGPU_HARDCODE_PARAM - CUFLAGS += -DMGONGPU_HARDCODE_PARAM + GPUFLAGS += -DMGONGPU_HARDCODE_PARAM else ifneq ($(HRDCOD),0) $(error Unknown HRDCOD='$(HRDCOD)': only '0' and '1' are supported) endif @@ -444,11 +506,11 @@ ifeq ($(UNAME_S),Darwin) override CULIBFLAGSRPATH2 = else # RPATH to cuda/cpp libs when linking executables - override CXXLIBFLAGSRPATH = -Wl,-rpath,$(LIBDIRRPATH) - override CULIBFLAGSRPATH = -Xlinker -rpath,$(LIBDIRRPATH) + override CXXLIBFLAGSRPATH = -Wl,-rpath=$(LIBDIRRPATH) + override CULIBFLAGSRPATH = -Xlinker -rpath=$(LIBDIRRPATH) # RPATH to common lib when linking cuda/cpp libs - override CXXLIBFLAGSRPATH2 = -Wl,-rpath,'$$ORIGIN' - override CULIBFLAGSRPATH2 = -Xlinker -rpath,'$$ORIGIN' + override CXXLIBFLAGSRPATH2 = -Wl,-rpath='$$ORIGIN' + override CULIBFLAGSRPATH2 = -Xlinker -rpath='$$ORIGIN' endif # Setting LD_LIBRARY_PATH or DYLD_LIBRARY_PATH in the RUNTIME is no longer necessary
(neither on Linux nor on Mac) @@ -461,7 +523,7 @@ override RUNTIME = cxx_main=$(BUILDDIR)/check.exe fcxx_main=$(BUILDDIR)/fcheck.exe -ifneq ($(NVCC),) +ifneq ($(GPUCC),) cu_main=$(BUILDDIR)/gcheck.exe fcu_main=$(BUILDDIR)/fgcheck.exe else @@ -492,15 +554,16 @@ $(BUILDDIR)/.build.$(TAG): @touch $(BUILDDIR)/.build.$(TAG) # Generic target and build rules: objects from CUDA compilation -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/%.o : %.cu *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CUBUILDRULEFLAGS) $< -o $@ $(BUILDDIR)/%_cu.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CCBUILDRULEFLAGS) $< -o $@ endif +# NB: for CUDA builds, the '-x cu' flag that compiles .cc files as CUDA is included via CCBUILDRULEFLAGS in the rule above # Generic target and build rules: objects from C++ compilation # (NB do not include CUINC here! add it only for NVTX or curand #679) @@ -509,11 +572,14 @@ $(BUILDDIR)/%.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) $(CXX) $(CPPFLAGS) $(CXXFLAGS) -fPIC -c $< -o $@ # Apply special build flags only to CrossSectionKernel.cc and gCrossSectionKernel.cu (no fast math, see #117 and #516) +# Added an edge case for HIP compilation ifeq ($(shell $(CXX) --version | grep ^nvc++),) $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS := $(filter-out -ffast-math,$(CXXFLAGS)) $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math -ifneq ($(NVCC),) -$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -fno-fast-math +ifeq ($(findstring nvcc,$(GPUCC)),nvcc) + $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -fno-fast-math +else + $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -fno-fast-math endif endif @@ -530,10 +596,10 @@ ifeq ($(RNDGEN),hasCurand) $(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CUINC) endif -# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in nvcc with icx2023 (#592) +# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in GPUCC with icx2023 (#592) ifneq ($(shell $(CXX) --version | egrep '^(Intel)'),) -ifneq ($(NVCC),) -CUFLAGS += -Xcompiler -Wno-deprecated-builtins +ifneq ($(GPUCC),) +GPUFLAGS += -Wno-deprecated-builtins endif endif @@ -541,8 +607,8 @@ endif # This patch does remove the warning, but I prefer to keep it disabled for the moment...
###ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang|Intel)'),) ###$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -Wno-overriding-t-option -###ifneq ($(NVCC),) -###$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -Wno-overriding-t-option +###ifneq ($(GPUCC),) +###$(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -Wno-overriding-t-option ###endif ###endif @@ -569,7 +635,7 @@ MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp cxx_objects_lib=$(BUILDDIR)/CPPProcess.o $(BUILDDIR)/MatrixElementKernels.o $(BUILDDIR)/BridgeKernels.o $(BUILDDIR)/CrossSectionKernels.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel.o $(BUILDDIR)/RamboSamplingKernels.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) MG5AMC_CULIB = mg5amc_$(processid_short)_cuda cu_objects_lib=$(BUILDDIR)/gCPPProcess.o $(BUILDDIR)/gMatrixElementKernels.o $(BUILDDIR)/gBridgeKernels.o $(BUILDDIR)/gCrossSectionKernels.o cu_objects_exe=$(BUILDDIR)/gCommonRandomNumberKernel.o $(BUILDDIR)/gRamboSamplingKernels.o @@ -581,11 +647,11 @@ $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: cxx_objects_lib += $(BUILDDIR)/fbridge.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(CXX) -shared -o $@ $(cxx_objects_lib) $(CXXLIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: cu_objects_lib += $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cu_objects_lib) - $(NVCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) endif #------------------------------------------------------------------------------- @@ -602,16 +668,16 @@ $(cxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PAT $(cxx_main): $(BUILDDIR)/check_sa.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CXX) -o $@ $(BUILDDIR)/check_sa.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CURANDLIBFLAGS) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(cu_main): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(cu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(cu_main): $(BUILDDIR)/gcheck_sa.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o - $(NVCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) + $(GPUCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) endif 
#------------------------------------------------------------------------------- @@ -637,17 +703,17 @@ $(fcxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PA $(fcxx_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') endif ifeq ($(UNAME_S),Darwin) $(fcu_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(fcu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(fcu_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) - $(NVCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) + $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) endif #------------------------------------------------------------------------------- @@ -659,7 +725,7 @@ $(BUILDDIR)/testxxx.o: testxxx_cc_ref.txt $(testmain): $(BUILDDIR)/testxxx.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testxxx.o # Comment out this line to skip the C++ test of xxx functions -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testxxx_cu.o: $(GTESTLIBS) $(BUILDDIR)/testxxx_cu.o: INCFLAGS += $(GTESTINC) $(BUILDDIR)/testxxx_cu.o: testxxx_cc_ref.txt @@ -672,7 +738,7 @@ $(BUILDDIR)/testmisc.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testmisc.o # Comment out this line to skip the C++ miscellaneous tests -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testmisc_cu.o: $(GTESTLIBS) $(BUILDDIR)/testmisc_cu.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc_cu.o @@ -684,12 +750,12 @@ $(BUILDDIR)/runTest.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/runTest.o $(testmain): cxx_objects_exe += $(BUILDDIR)/runTest.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/runTest_cu.o: $(GTESTLIBS) $(BUILDDIR)/runTest_cu.o: INCFLAGS += $(GTESTINC) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(testmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif @@ -713,14 +779,14 @@ $(testmain): LIBFLAGS += -lgomp endif endif -ifeq 
($(NVCC),) # link only runTest.o +ifeq ($(GPUCC),) # link only runTest.o $(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS) $(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS) else # link both runTest.o and runTest_cu.o $(testmain): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) $(GTESTLIBS) - $(NVCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) -lcuda + $(GPUCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS) endif # Use target gtestlibs to build only googletest @@ -829,9 +895,9 @@ ifeq ($(USECCACHE),1) ccache --version | head -1 endif @echo "" - @echo NVCC=$(NVCC) -ifneq ($(NVCC),) - $(NVCC) --version + @echo GPUCC=$(GPUCC) +ifneq ($(GPUCC),) + $(GPUCC) --version endif @echo "" @echo CXX=$(CXX) @@ -850,7 +916,7 @@ endif # Target: check (run the C++ test executable) # [NB THIS IS WHAT IS USED IN THE GITHUB CI!] -ifneq ($(NVCC),) +ifneq ($(GPUCC),) check: runTest cmpFcheck cmpFGcheck else check: runTest cmpFcheck diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/fbridge.cc b/epochX/cudacpp/gg_ttg.sa/SubProcesses/fbridge.cc index 2d2b36d560..22ce3f5115 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/fbridge.cc +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/fbridge.cc @@ -1,11 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Oct 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "Bridge.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" extern "C" { @@ -22,7 +22,7 @@ extern "C" * Using the same Fortran MadEvent code, linking to the heterogeneous library would allow access to both CPU and GPU implementations. * The specific heterogeneous configuration (how many GPUs, how many threads on each CPU, etc) could be loaded in CUDA/C++ from a data file.
*/ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -46,8 +46,8 @@ extern "C" */ void fbridgecreate_( CppObjectInFortran** ppbridge, const int* pnevtF, const int* pnparF, const int* pnp4F ) { -#ifdef __CUDACC__ - CudaRuntime::setUp(); +#ifdef MGONGPUCPP_GPUIMPL + GpuRuntime::setUp(); #endif // (NB: CPPProcess::initProc no longer needs to be executed here because it is called in the Bridge constructor) // FIXME: disable OMP in Bridge when called from Fortran @@ -65,8 +65,8 @@ extern "C" Bridge* pbridge = dynamic_cast*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgedelete_: invalid Bridge address" ); delete pbridge; -#ifdef __CUDACC__ - CudaRuntime::tearDown(); +#ifdef MGONGPUCPP_GPUIMPL + GpuRuntime::tearDown(); #endif } @@ -96,7 +96,7 @@ extern "C" { Bridge* pbridge = dynamic_cast*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgesequence_: invalid Bridge address" ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Use the device/GPU implementation in the CUDA library // (there is also a host implementation in this library) pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, *pchannelId, mes, selhel, selcol ); diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/fsampler.cc b/epochX/cudacpp/gg_ttg.sa/SubProcesses/fsampler.cc index 2fb445372d..3743934f41 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/fsampler.cc +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/fsampler.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Feb 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "mgOnGpuConfig.h" @@ -13,7 +13,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -40,7 +40,7 @@ namespace mg5amcCpu private: const int m_nevt; // The number of events in each iteration int m_iiter; // The iteration counter (for random number seeding) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta m_hstRndmom; // Memory buffers for random numbers HostBufferMomenta m_hstMomenta; // Memory buffers for momenta HostBufferWeights m_hstWeights; // Memory buffers for sampling weights @@ -105,7 +105,7 @@ namespace mg5amcCpu extern "C" { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/runTest.cc b/epochX/cudacpp/gg_ttg.sa/SubProcesses/runTest.cc index d4a760a71b..de327f2321 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/runTest.cc +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/runTest.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
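fbridgecreate_ and fbridgedelete_ above call GpuRuntime::setUp() and GpuRuntime::tearDown() rather than holding a stack object, because the Bridge lifetime is driven from Fortran. A sketch of that static interface, assuming it mirrors the constructor/destructor logic of the RAII sketch shown earlier (the real code is in the new GpuRuntime.h; the name below is hypothetical):

    #include <cuda_runtime.h>
    #include <stdexcept>

    struct GpuRuntimeStaticSketch // hypothetical stand-in for the patch's GpuRuntime
    {
      static void setUp()
      {
        if( cudaSetDevice( 0 ) != cudaSuccess ) // same bring-up as the RAII constructor
          throw std::runtime_error( "setUp: cudaSetDevice( 0 ) failed" );
      }
      static void tearDown()
      {
        cudaDeviceReset(); // same tear-down as the RAII destructor
      }
    };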
//---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -18,7 +18,7 @@ #include "RandomNumberKernels.h" #include "epoch_process_id.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -35,7 +35,7 @@ struct CUDA_CPU_TestBase : public TestDriverBase : TestDriverBase( npar, refFileName ) {} }; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL struct CPUTest : public CUDA_CPU_TestBase { // Struct data members (process, and memory structures for random numbers, momenta, matrix elements and weights on host and device) @@ -119,7 +119,7 @@ struct CPUTest : public CUDA_CPU_TestBase }; #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL struct CUDATest : public CUDA_CPU_TestBase { // Reset the device when our test goes out of scope. Note that this should happen after @@ -128,7 +128,7 @@ struct CUDATest : public CUDA_CPU_TestBase { ~DeviceReset() { - checkCuda( cudaDeviceReset() ); // this is needed by cuda-memcheck --leak-check full + checkGpu( gpuDeviceReset() ); // this is needed by cuda-memcheck --leak-check full } } deviceResetter; @@ -256,7 +256,7 @@ INSTANTIATE_TEST_SUITE_P( prefix, \ test_suite_name, \ testing::Values( new CUDATest( MG_EPOCH_REFERENCE_FILE_NAME ) ) ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL MG_INSTANTIATE_TEST_SUITE_GPU( XTESTID_GPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); #else MG_INSTANTIATE_TEST_SUITE_CPU( XTESTID_CPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/testmisc.cc b/epochX/cudacpp/gg_ttg.sa/SubProcesses/testmisc.cc index 895d6eeb56..ba9e59a8a3 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/testmisc.cc +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/testmisc.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*misc to run only testmisc.cc tests //---------------------------------------------------------------------------- @@ -17,7 +17,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_MISC #else #define TESTID( s ) s##_CPU_MISC @@ -26,7 +26,7 @@ #define XTESTID( s ) TESTID( s ) // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -59,7 +59,7 @@ namespace mg5amcCpu TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testmisc ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/testxxx.cc b/epochX/cudacpp/gg_ttg.sa/SubProcesses/testxxx.cc index 3361fe5aa9..e5167de00c 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/testxxx.cc +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/testxxx.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. 
Valassi (Apr 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -24,7 +24,7 @@ #include #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_XXX #else #define TESTID( s ) s##_CPU_XXX @@ -32,7 +32,7 @@ #define XTESTID( s ) TESTID( s ) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -42,7 +42,7 @@ namespace mg5amcCpu int FPEhandlerIevt = -1; inline void FPEhandler( int sig ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL std::cerr << "Floating Point Exception (GPU): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; #else std::cerr << "Floating Point Exception (CPU neppV=" << neppV << "): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; @@ -53,7 +53,7 @@ namespace mg5amcCpu TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -77,7 +77,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) assert( nevt % neppM == 0 ); // nevt must be a multiple of neppM assert( nevt % neppV == 0 ); // nevt must be a multiple of neppV // Fill in the input momenta -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL mg5amcGpu::PinnedHostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] #else mg5amcCpu::HostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] @@ -322,7 +322,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { for( int ievt = 0; ievt < nevt; ievt++ ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_ttg.sa/src/HelAmps_sm.h b/epochX/cudacpp/gg_ttg.sa/src/HelAmps_sm.h index 361b488401..0dd0f3ebba 100644 --- a/epochX/cudacpp/gg_ttg.sa/src/HelAmps_sm.h +++ b/epochX/cudacpp/gg_ttg.sa/src/HelAmps_sm.h @@ -5,7 +5,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -28,7 +28,7 @@ //#include //#include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttg.sa/src/Parameters_sm.cc b/epochX/cudacpp/gg_ttg.sa/src/Parameters_sm.cc index 64fc3fea62..067445b198 100644 --- a/epochX/cudacpp/gg_ttg.sa/src/Parameters_sm.cc +++ b/epochX/cudacpp/gg_ttg.sa/src/Parameters_sm.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. 
Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -17,7 +17,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_ttg.sa/src/Parameters_sm.h b/epochX/cudacpp/gg_ttg.sa/src/Parameters_sm.h index b6568d3761..9581d66e0e 100644 --- a/epochX/cudacpp/gg_ttg.sa/src/Parameters_sm.h +++ b/epochX/cudacpp/gg_ttg.sa/src/Parameters_sm.h @@ -27,7 +27,7 @@ #include "read_slha.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu #include // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -218,7 +218,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -239,7 +239,7 @@ namespace mg5amcCpu #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wunused-variable" // e.g. <> #pragma GCC diagnostic ignored "-Wunused-parameter" // e.g. <> -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma nv_diagnostic push #pragma nv_diag_suppress 177 // e.g. <> #endif @@ -267,7 +267,7 @@ namespace mg5amcCpu // End SM implementation - no special handling of vectors of floats as in EFT (#439) return out; } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma GCC diagnostic pop #pragma nv_diagnostic pop #endif diff --git a/epochX/cudacpp/gg_ttg.sa/src/cudacpp_src.mk b/epochX/cudacpp/gg_ttg.sa/src/cudacpp_src.mk index d4cc628aec..159e19a46d 100644 --- a/epochX/cudacpp/gg_ttg.sa/src/cudacpp_src.mk +++ b/epochX/cudacpp/gg_ttg.sa/src/cudacpp_src.mk @@ -19,7 +19,7 @@ SHELL := /bin/bash #=== Configure common compiler flags for CUDA and C++ INCFLAGS = -I. 
-OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here +OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here #------------------------------------------------------------------------------- @@ -45,13 +45,13 @@ endif #------------------------------------------------------------------------------- -#=== Configure the CUDA compiler (note: NVCC is already exported including ccache) +#=== Configure the CUDA compiler (note: GPUCC is already exported including ccache) -###$(info NVCC=$(NVCC)) +###$(info GPUCC=$(GPUCC)) #------------------------------------------------------------------------------- -#=== Configure ccache for C++ builds (note: NVCC is already exported including ccache) +#=== Configure ccache for C++ builds (note: GPUCC is already exported including ccache) # Enable ccache if USECCACHE=1 ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) @@ -92,6 +92,13 @@ endif ###$(info OMPFLAGS=$(OMPFLAGS)) CXXFLAGS += $(OMPFLAGS) +# Add the compiler-specific flags for the generic *_cu.o build rule below (for nvcc this includes '-x cu' to compile .cc files as CUDA) +ifeq ($(findstring nvcc,$(GPUCC)),nvcc) + GPUFLAGS += -Xcompiler -fPIC -c -x cu +else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) + GPUFLAGS += -fPIC -c +endif + # Set the build flags appropriate to each AVX choice (example: "make AVX=none") # [NB MGONGPU_PVW512 is needed because "-mprefer-vector-width=256" is not exposed in a macro] # [See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=96476] @@ -253,20 +260,20 @@ $(BUILDDIR)/%.o : %.cc *.h $(BUILDDIR)/.build.$(TAG) # Generic target and build rules: objects from CUDA compilation $(BUILDDIR)/%_cu.o : %.cc *.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $< -o $@ #------------------------------------------------------------------------------- cxx_objects=$(addprefix $(BUILDDIR)/, Parameters_sm.o read_slha.o) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) cu_objects=$(addprefix $(BUILDDIR)/, Parameters_sm_cu.o) endif # Target (and build rules): common (src) library -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) $(cu_objects) @if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi - $(NVCC) -shared -o $@ $(cxx_objects) $(cu_objects) $(LDFLAGS) + $(GPUCC) -shared -o $@ $(cxx_objects) $(cu_objects) $(LDFLAGS) else $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) @if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi diff --git a/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuConfig.h index b247654dcf..da4ba36ad8 100644 --- a/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuConfig.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
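The mgOnGpuConfig.h hunk below introduces MGONGPUCPP_GPUIMPL as the single "some GPU implementation is active" switch, while __CUDACC__ and __HIPCC__ remain available for backend-specific choices. The companion GpuAbstraction.h added by this patch presumably aliases the runtime API in the same spirit; a sketch under that assumption (the gpu* names are assumed, except gpuDeviceReset, which appears in the runTest.cc hunk above):

    #ifdef __CUDACC__
    #include <cuda_runtime.h>
    #define gpuError_t cudaError_t
    #define gpuSuccess cudaSuccess
    #define gpuDeviceReset cudaDeviceReset
    #define gpuGetErrorString cudaGetErrorString
    #elif defined __HIPCC__
    #include "hip/hip_runtime.h"
    #define gpuError_t hipError_t
    #define gpuSuccess hipSuccess
    #define gpuDeviceReset hipDeviceReset
    #define gpuGetErrorString hipGetErrorString
    #endif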
#ifndef MGONGPUCONFIG_H #define MGONGPUCONFIG_H 1 @@ -10,12 +10,25 @@ // There are two different code bases for standalone_cudacpp (without multichannel) and madevent+cudacpp (with multichannel) #undef MGONGPU_SUPPORTS_MULTICHANNEL +// Is this a GPU (CUDA, HIP) or CPU implementation? +#ifdef __CUDACC__ +#define MGONGPUCPP_GPUIMPL cuda +#elif defined __HIPCC__ +#define MGONGPUCPP_GPUIMPL hip +#else +#undef MGONGPUCPP_GPUIMPL +#endif + // ** NB1 Throughputs (e.g. 6.8E8) are events/sec for "./gcheck.exe -p 65536 128 12" // ** NB2 Baseline on b7g47n0004 fluctuates (probably depends on load on other VMs) // Choose if curand is supported for generating random numbers +// For HIP, by default, do not use curand (common random numbers will be used instead) // For both CUDA and C++, by default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_CURAND -// (there exist CUDA installations, e.g. using the HPC package, which do not include curand - see PR #784) +// (there exist CUDA installations, e.g. using the HPC package, which do not include curand - see PR #784 and #785) +#if defined __HIPCC__ +#define MGONGPU_HAS_NO_CURAND 1 +#else //#ifdef __CUDACC__ //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 @@ -23,6 +36,7 @@ //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 //#endif +#endif // Choose floating point precision (for everything but color algebra #537) // If one of these macros has been set from outside with e.g. -DMGONGPU_FPTYPE_FLOAT, nothing happens (issue #167) @@ -54,23 +68,28 @@ //#undef MGONGPU_HARDCODE_PARAM // default ////#define MGONGPU_HARDCODE_PARAM 1 -// Complex type in c++: std::complex or cxsmpl (CHOOSE ONLY ONE) -#ifndef __CUDACC__ -//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) -#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) -#endif - -// Complex type in cuda: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) +// Complex type in CUDA: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) #ifdef __CUDACC__ #define MGONGPU_CUCXTYPE_THRUST 1 // default (~1.15E9/double, ~3.2E9/float) //#define MGONGPU_CUCXTYPE_CUCOMPLEX 1 // ~10 percent slower (1.03E9/double, ~2.8E9/float) //#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) + +// Complex type in HIP: cxsmpl (ONLY ONE OPTION POSSIBLE) +#elif defined __HIPCC__ +#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) + +// Complex type in C++: std::complex or cxsmpl (CHOOSE ONLY ONE) +#else +//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) +#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif -// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ -#undef MGONGPU_NSIGHT_DEBUG // default +#undef MGONGPU_NSIGHT_DEBUG // default in CUDA //#define MGONGPU_NSIGHT_DEBUG 1 +#else +#undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif // SANITY CHECKS (floating point precision for everything but color algebra #537) @@ -86,17 +105,21 @@ #error You cannot use double precision for color algebra and single precision elsewhere #endif -// SANITY CHECKS (c++ complex number implementation) -#ifndef __CUDACC__ -#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and 
defined MGONGPU_CPPCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL +// SANITY CHECKS (CUDA complex number implementation) +#ifdef __CUDACC__ +#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX for CUDA +#elif defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CXSMPL for CUDA +#elif defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE OF MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL for CUDA #endif #endif -// SANITY CHECKS (cuda complex number implementation) -#ifdef __CUDACC__ -#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL +// SANITY CHECKS (C++ complex number implementation) +#ifndef MGONGPUCPP_GPUIMPL +#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL for C++ #endif #endif @@ -134,7 +157,7 @@ namespace mgOnGpu // Alignment requirement for using reinterpret_cast with SIMD vectorized code // (using reinterpret_cast with non aligned memory may lead to segmentation faults!) // Only needed for C++ code but can be enforced also in NVCC builds of C++ code using CUDA>=11.2 and C++17 (#318, #319, #333) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL constexpr int cppAlign = 64; // alignment requirement for SIMD vectorization (64-byte i.e. 
512-bit) #endif @@ -145,7 +168,7 @@ using mgOnGpu::fptype; using mgOnGpu::fptype2; // C++ SIMD vectorization width (this will be used to set neppV) -#ifdef __CUDACC__ // CUDA implementation has no SIMD +#ifdef MGONGPUCPP_GPUIMPL // CUDA and HIP implementations have no SIMD #undef MGONGPU_CPPSIMD #elif defined __AVX512VL__ && defined MGONGPU_PVW512 // C++ "512z" AVX512 with 512 width (512-bit ie 64-byte): 8 (DOUBLE) or 16 (FLOAT) #ifdef MGONGPU_FPTYPE_DOUBLE @@ -175,9 +198,9 @@ using mgOnGpu::fptype2; #undef MGONGPU_CPPSIMD #endif -// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation // Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) -#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */ +#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */ #define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; #define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } #define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } @@ -189,8 +212,8 @@ using mgOnGpu::fptype2; #define mgDebugFinalise() { /*noop*/ } #endif /* clang-format on */ -// Define empty CUDA declaration specifiers for C++ -#ifndef __CUDACC__ +// Define empty CUDA/HIP declaration specifiers for C++ +#ifndef MGONGPUCPP_GPUIMPL #define __global__ #define __host__ #define __device__ diff --git a/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuCxtypes.h b/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuCxtypes.h index ca9a9f00c0..5532e22fa1 100644 --- a/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuCxtypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022, based on earlier work by D. Smith) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
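A short sketch of what the empty declaration specifiers at the end of this hunk buy (the helper function is illustrative, not from the patch): since C++-only builds define __global__, __host__ and __device__ to nothing, a single annotated function compiles unchanged as CUDA device code, HIP device code, or plain host C++.

#include "mgOnGpuConfig.h" // defines MGONGPUCPP_GPUIMPL and, for C++ builds, the empty specifiers
// Hypothetical helper: callable from GPU kernels in CUDA/HIP builds and from host loops in C++ builds.
__host__ __device__ inline double pSquared( const double px, const double py, const double pz )
{
  return px * px + py * py + pz * pz;
}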
#ifndef MGONGPUCXTYPES_H #define MGONGPUCXTYPES_H 1 @@ -19,7 +19,7 @@ #include // Complex type in cuda: thrust or cucomplex or cxsmpl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #if defined MGONGPU_CUCXTYPE_THRUST #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wtautological-compare" // for icpx2021/clang13 (https://stackoverflow.com/a/15864661) @@ -82,7 +82,7 @@ namespace mgOnGpu /* clang-format off */ using mgOnGpu::cxsmpl; // Printout to stream for user defined types -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -92,7 +92,7 @@ namespace mg5amcCpu inline __host__ std::ostream& operator<<( std::ostream& out, const cxsmpl& c ) { - out << std::complex( c.real(), c.imag() ); + out << std::complex( c.real(), c.imag() ); return out; } @@ -215,14 +215,14 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu #endif { // --- Type definitions (complex type: cxtype) -#ifdef __CUDACC__ // cuda +#ifdef MGONGPUCPP_GPUIMPL // cuda #if defined MGONGPU_CUCXTYPE_THRUST typedef thrust::complex cxtype; #elif defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -255,7 +255,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -307,7 +307,7 @@ namespace mg5amcCpu //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust +#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust //------------------------------ // CUDA - using thrust::complex @@ -343,11 +343,11 @@ namespace mg5amcCpu return c; } -#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST +#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex +#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex //------------------------------ // CUDA - using cuComplex @@ -562,11 +562,11 @@ namespace mg5amcCpu return cxmake( c.real(), c.imag() ); } -#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX //========================================================================== -#if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex +#if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex //------------------------------ // C++ - using std::complex @@ -610,7 +610,7 @@ namespace mg5amcCpu } #endif -#endif // #if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX +#endif // #if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX //========================================================================== @@ -633,7 +633,7 @@ namespace mg5amcCpu //========================================================================== // 
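A hedged usage sketch of the unified complex type selected by the guards in this hunk (the timesI helper is ours, not the patch's): cxtype resolves to thrust::complex under CUDA, cxsmpl under HIP and, per the defaults above, cxsmpl under C++, but call sites are written once against cxtype and cxmake.

#include "mgOnGpuCxtypes.h"
#ifdef MGONGPUCPP_GPUIMPL
using namespace mg5amcGpu;
#else
using namespace mg5amcCpu;
#endif
// Multiply by the imaginary unit: identical source for all three cxtype backends.
__host__ __device__ inline cxtype timesI( const cxtype& z )
{
  return z * cxmake( 0, 1 );
}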
NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuFptypes.h b/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuFptypes.h index 905c97d700..fa3a02664b 100644 --- a/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuFptypes.h +++ b/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuFptypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUFPTYPES_H #define MGONGPUFPTYPES_H 1 @@ -12,7 +12,7 @@ #include // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // cuda namespace mg5amcGpu #else namespace mg5amcCpu @@ -20,7 +20,7 @@ namespace mg5amcCpu { //========================================================================== -#ifdef __CUDACC__ // cuda +#ifdef MGONGPUCPP_GPUIMPL // cuda //------------------------------ // Floating point types - Cuda @@ -64,11 +64,11 @@ namespace mg5amcCpu #endif } -#endif // #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //------------------------------ // Floating point types - C++ @@ -92,7 +92,7 @@ namespace mg5amcCpu return std::sqrt( f ); } -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== diff --git a/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuVectors.h b/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuVectors.h index e1299ba81e..cdae04326b 100644 --- a/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuVectors.h @@ -32,7 +32,7 @@ #endif // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -131,7 +131,7 @@ namespace mg5amcCpu #endif #endif -#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef __CUDACC__) +#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef MGONGPUCPP_GPUIMPL) const int neppV = 1; @@ -153,13 +153,13 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu #endif { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Printout to stream for user defined types @@ -805,11 +805,11 @@ namespace mg5amcCpu #endif // #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //------------------------------ // Vector types - CUDA @@ -853,12 +853,12 @@ namespace mg5amcCpu return mask; } -#endif 
// #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== // Scalar-or-vector types: scalar in CUDA, vector or scalar in C++ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL typedef bool bool_sv; typedef fptype fptype_sv; typedef fptype2 fptype2_sv; @@ -879,7 +879,7 @@ namespace mg5amcCpu #endif // Scalar-or-vector zeros: scalar in CUDA, vector or scalar in C++ -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ inline __host__ __device__ cxtype cxzero_sv(){ return cxtype( 0, 0 ); } #elif defined MGONGPU_CPPSIMD inline cxtype_v cxzero_sv() { return cxtype_v(); } // RRRR=0000 IIII=0000 diff --git a/epochX/cudacpp/gg_ttg.sa/src/rambo.h b/epochX/cudacpp/gg_ttg.sa/src/rambo.h index e02ea52496..cd7e1008ea 100644 --- a/epochX/cudacpp/gg_ttg.sa/src/rambo.h +++ b/epochX/cudacpp/gg_ttg.sa/src/rambo.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -18,7 +18,7 @@ #include // Simplified rambo version for 2 to N (with N>=2) processes with massless particles -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +83,7 @@ namespace mg5amcCpu static bool first = true; if( first ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if constexpr( M_ACCESS::isOnDevice() ) // avoid { const int ievt0 = 0; @@ -166,7 +166,7 @@ namespace mg5amcCpu wt = po2log; if( nparf != 2 ) wt = ( 2. * nparf - 4. ) * log( energy ) + z[nparf - 1]; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // issue warnings if weight is too small or too large static int iwarn[5] = { 0, 0, 0, 0, 0 }; if( wt < -180. ) diff --git a/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt b/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt index 2c2fae1608..3a2b1ad647 100644 --- a/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt +++ b/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt @@ -52,7 +52,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt -No valid web browser found. Please set in ./input/mg5_configuration.txt +Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg.mg The import format was not given, so we guess it as command set stdout_level DEBUG @@ -62,7 +62,7 @@ generate g g > t t~ g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.0057299137115478516  +DEBUG: model prefixing takes 0.0053348541259765625  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,7 +155,7 @@ INFO: Please specify coupling orders to bypass this step. 
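An illustrative event loop over the scalar-or-vector types defined in the mgOnGpuVectors.h hunk above (function name and preconditions are our assumptions): neppV is 1 on GPU and the SIMD width in vectorized C++, and fptype_sv is correspondingly a scalar or a compiler vector, so one loop body serves every build; the reinterpret_cast relies on the mgOnGpu::cppAlign alignment requirement quoted earlier.

#include "mgOnGpuVectors.h"
#ifdef MGONGPUCPP_GPUIMPL
using namespace mg5amcGpu;
#else
using namespace mg5amcCpu;
#endif
// Scale all matrix elements by a constant factor; assumes nevt is a multiple of neppV
// and that allMEs is suitably aligned in SIMD builds.
inline void scaleAllMEs( fptype* allMEs, const int nevt, const fptype factor )
{
  for( int ipagV = 0; ipagV < nevt / neppV; ipagV++ )
  {
    fptype_sv& mes = *reinterpret_cast<fptype_sv*>( allMEs + ipagV * neppV );
    mes = mes * factor; // one event (GPU or scalar C++) or neppV events (SIMD C++) per iteration
  }
}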
INFO: Trying coupling order WEIGHTED<=4: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Process has 123 diagrams -1 processes with 123 diagrams generated in 0.163 s +1 processes with 123 diagrams generated in 0.156 s Total: 1 processes with 123 diagrams output madevent ../TMPOUT/CODEGEN_mad_gg_ttgg --hel_recycling=False --vector_size=32 --me_exporter=standalone_cudacpp Load PLUGIN.CUDACPP_OUTPUT @@ -175,7 +175,7 @@ INFO: Generating Helas calls for process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Processing color information for process: g g > t t~ g g @1 INFO: Creating files in directory P1_gg_ttxgg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -190,15 +190,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxgg -Generated helas calls for 1 subprocesses (123 diagrams) in 0.433 s -Wrote files for 222 helas calls in 0.711 s +Generated helas calls for 1 subprocesses (123 diagrams) in 0.437 s +Wrote files for 222 helas calls in 0.735 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.336 s +ALOHA: aloha creates 5 routines in 0.441 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines @@ -206,7 +206,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 10 routines in 0.327 s +ALOHA: aloha creates 10 routines in 0.309 s VVV1 VVV1 FFV1 @@ -255,10 +255,10 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. quit -real 0m3.329s -user 0m3.091s -sys 0m0.226s -Code generation completed in 4 seconds +real 0m3.582s +user 0m3.061s +sys 0m0.243s +Code generation completed in 3 seconds ************************************************************ * * * W E L C O M E to * @@ -284,7 +284,7 @@ INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amc INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards/me5_configuration.txt Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt -No valid web browser found. Please set in ./input/mg5_configuration.txt +Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards run quit INFO: @@ -314,7 +314,7 @@ INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amc INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards/me5_configuration.txt Using default text editor "vi". 
Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt -No valid web browser found. Please set in ./input/mg5_configuration.txt +Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards param quit INFO: diff --git a/epochX/cudacpp/gg_ttgg.mad/COPYRIGHT b/epochX/cudacpp/gg_ttgg.mad/COPYRIGHT index a134b5fef9..84a883fbb0 100644 --- a/epochX/cudacpp/gg_ttgg.mad/COPYRIGHT +++ b/epochX/cudacpp/gg_ttgg.mad/COPYRIGHT @@ -15,6 +15,7 @@ The full development team currently includes the following authors : Stephan Hageboeck (CERN) Olivier Mattelaer (Universite Catholique de Louvain, original author) Stefan Roiser (CERN, original author) + Joergen Teig (CERN) Andrea Valassi (CERN, original author) Zenny Wettersten (CERN) See https://github.com/madgraph5/madgraph4gpu for more details. For the full diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/Bridge.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/Bridge.h index bf8b5e024d..89437b4c42 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/Bridge.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef BRIDGE_H #define BRIDGE_H 1 @@ -23,7 +23,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +83,7 @@ namespace mg5amcCpu Bridge& operator=( const Bridge& ) = delete; Bridge& operator=( Bridge&& ) = delete; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL /** * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != gpublocks*gputhreads * (this is needed for BridgeKernel tests rather than for actual production use in Fortran) @@ -150,7 +150,7 @@ namespace mg5amcCpu unsigned int m_nevt; // number of events int m_nGoodHel; // the number of good helicities (-1 initially when they have not yet been calculated) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL int m_gputhreads; // number of gpu threads (default set from number of events, can be modified) int m_gpublocks; // number of gpu blocks (default set from number of events, can be modified) DeviceBuffer m_devMomentaF; @@ -187,12 +187,12 @@ namespace mg5amcCpu // Forward declare transposition methods // -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL template void hst_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); @@ -209,7 +209,7 @@ namespace mg5amcCpu Bridge::Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ) : m_nevt( nevtF ) , m_nGoodHel( -1 ) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL , m_gputhreads( 256 ) // default number of gpu threads , m_gpublocks( m_nevt / m_gputhreads ) // this ensures m_nevt <= m_gpublocks*m_gputhreads , m_devMomentaF( m_nevt ) @@ -233,7 +233,7 @@ namespace mg5amcCpu { if( nparF != CPPProcess::npar ) throw std::runtime_error( "Bridge constructor: npar mismatch" ); if( np4F != CPPProcess::np4 ) throw std::runtime_error( "Bridge constructor: np4 mismatch" ); -#ifdef __CUDACC__ +#ifdef 
MGONGPUCPP_GPUIMPL if( ( m_nevt < s_gputhreadsmin ) || ( m_nevt % s_gputhreadsmin != 0 ) ) throw std::runtime_error( "Bridge constructor: nevt should be a multiple of " + std::to_string( s_gputhreadsmin ) ); while( m_nevt != m_gpublocks * m_gputhreads ) @@ -249,7 +249,7 @@ namespace mg5amcCpu #else std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate is called from several Fortran threads? @@ -262,7 +262,7 @@ namespace mg5amcCpu process.initProc( paramCard ); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void Bridge::set_gpugrid( const int gpublocks, const int gputhreads ) { @@ -276,7 +276,7 @@ namespace mg5amcCpu } #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void Bridge::gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -291,14 +291,14 @@ namespace mg5amcCpu constexpr int neppM = MemoryAccessMomenta::neppM; if constexpr( neppM == 1 && std::is_same_v ) { - checkCuda( cudaMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), gpuMemcpyHostToDevice ); } else { - checkCuda( cudaMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), gpuMemcpyHostToDevice ); const int thrPerEvt = CPPProcess::npar * CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 event per thread) //const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... this seems slower - dev_transposeMomentaF2C<<>>( m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); + gpuLaunchKernel( dev_transposeMomentaF2C, m_gpublocks * thrPerEvt, m_gputhreads, m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); } if constexpr( std::is_same_v ) { @@ -341,7 +341,7 @@ namespace mg5amcCpu } #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL template void Bridge::cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -396,7 +396,7 @@ namespace mg5amcCpu // - C++ array: momenta[npagM][npar][np4][neppM] with nevt=npagM*neppM (AOSOA) // -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ) { diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/BridgeKernels.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/BridgeKernels.cc index d58066c9c1..eaf4037a24 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/BridgeKernels.cc +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/BridgeKernels.cc @@ -1,17 +1,18 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
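A hedged host-side reference for the momenta transposition this hunk launches on the device (loop structure and index order are illustrative; the patch's dev_transposeMomentaF2C performs the same shuffle with one element per thread): the Fortran side supplies an AOS array, while the C++/CUDA side wants the AOSOA layout momenta[npagM][npar][np4][neppM] quoted above.

// Hypothetical scalar reference implementation of the F2C transposition.
template<typename Tin, typename Tout>
void transposeMomentaF2C_ref( const Tin* in, Tout* out, const int nevt, const int npar, const int np4, const int neppM )
{
  for( int ievt = 0; ievt < nevt; ievt++ )
    for( int ipar = 0; ipar < npar; ipar++ )
      for( int ip4 = 0; ip4 < np4; ip4++ )
      {
        const int ipagM = ievt / neppM; // AOSOA page and position within the page
        const int ieppM = ievt % neppM;
        out[( ( ipagM * npar + ipar ) * np4 + ip4 ) * neppM + ieppM] =
          in[( ievt * npar + ipar ) * np4 + ip4]; // AOS input: in[ievt][ipar][ip4]
      }
}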
#include "BridgeKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMomenta.h" #include //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -45,7 +46,7 @@ namespace mg5amcCpu //============================================================================ -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu { @@ -96,7 +97,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/BridgeKernels.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/BridgeKernels.h index 15eb4bff4d..3efef8ce97 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/BridgeKernels.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/BridgeKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef BRIDGEKERNELS_H #define BRIDGEKERNELS_H 1 @@ -12,7 +12,7 @@ #include "MatrixElementKernels.h" #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -49,7 +49,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a CPU host class BridgeKernelHost final : public BridgeKernelBase { @@ -89,7 +89,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a GPU device class BridgeKernelDevice : public BridgeKernelBase { diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/CommonRandomNumberKernel.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/CommonRandomNumberKernel.cc index 985b39f576..010bc4cbd0 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/CommonRandomNumberKernel.cc +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/CommonRandomNumberKernel.cc @@ -1,15 +1,16 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "CommonRandomNumbers.h" +#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/CrossSectionKernels.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/CrossSectionKernels.cc index 0b355a3c8d..c15b39844d 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/CrossSectionKernels.cc +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/CrossSectionKernels.cc @@ -1,10 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. 
Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "CrossSectionKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessWeights.h" #include "MemoryBuffers.h" @@ -77,7 +78,7 @@ debug_me_is_abnormal( const fptype& me, size_t ievtALL ) //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -185,7 +186,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/CrossSectionKernels.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/CrossSectionKernels.h index 7933ca4bbf..4d9659e04e 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/CrossSectionKernels.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/CrossSectionKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef CROSSSECTIONKERNELS_H #define CROSSSECTIONKERNELS_H 1 @@ -13,7 +13,7 @@ //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -96,7 +96,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating the calculation of event statistics on a GPU device class CrossSectionKernelDevice : public CrossSectionKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/CudaRuntime.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/CudaRuntime.h deleted file mode 100644 index 64ce52f4b3..0000000000 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/CudaRuntime.h +++ /dev/null @@ -1,85 +0,0 @@ -// Copyright (C) 2020-2023 CERN and UCLouvain. -// Licensed under the GNU Lesser General Public License (version 3 or later). -// Created by: S. Roiser (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. - -#ifndef MG5AMC_CUDARUNTIME_H -#define MG5AMC_CUDARUNTIME_H 1 - -// MG5AMC on GPU uses the CUDA runtime API, not the lower level CUDA driver API -// See https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html#driver-vs-runtime-api - -#include -#include - -//-------------------------------------------------------------------------- - -// See https://stackoverflow.com/a/14038590 -#ifdef __CUDACC__ /* clang-format off */ -#define checkCuda( code ) { assertCuda( code, __FILE__, __LINE__ ); } -inline void assertCuda( cudaError_t code, const char* file, int line, bool abort = true ) -{ - if( code != cudaSuccess ) - { - printf( "ERROR! 
assertCuda: '%s' (%d) in %s:%d\n", cudaGetErrorString( code ), code, file, line ); - if( abort ) assert( code == cudaSuccess ); - } -} -#endif /* clang-format on */ - -//-------------------------------------------------------------------------- - -#ifdef __CUDACC__ -namespace mg5amcGpu -{ - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - // *** FIXME! This will all need to be designed differently when going to multi-GPU nodes! *** - struct CudaRuntime final - { - CudaRuntime( const bool debug = true ) - : m_debug( debug ) { setUp( m_debug ); } - ~CudaRuntime() { tearDown( m_debug ); } - CudaRuntime( const CudaRuntime& ) = delete; - CudaRuntime( CudaRuntime&& ) = delete; - CudaRuntime& operator=( const CudaRuntime& ) = delete; - CudaRuntime& operator=( CudaRuntime&& ) = delete; - bool m_debug; - - // Set up CUDA application - // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** - // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization - static void setUp( const bool debug = true ) - { - // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization - // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! - /* - // [We initially added cudaFree(0) to "ease profile analysis" only because it shows up as a big recognizable block!] - // No explicit initialization is needed: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#initialization - // It is not clear what cudaFree(0) does at all: https://stackoverflow.com/questions/69967813/ - if ( debug ) std::cout << "__CudaRuntime: calling cudaFree(0)" << std::endl; - checkCuda( cudaFree( 0 ) ); // SLOW! - */ - // Replace cudaFree(0) by cudaSetDevice(0), even if it is not really needed either - // (but see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs) - if( debug ) std::cout << "__CudaRuntime: calling cudaSetDevice(0)" << std::endl; - checkCuda( cudaSetDevice( 0 ) ); // SLOW! - } - - // Tear down CUDA application (call cudaDeviceReset) - // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** - // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck - // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking - static void tearDown( const bool debug = true ) - { - if( debug ) std::cout << "__CudaRuntime: calling cudaDeviceReset()" << std::endl; - checkCuda( cudaDeviceReset() ); - } - }; - -} -#endif - -//-------------------------------------------------------------------------- - -#endif // MG5AMC_CUDARUNTIME_H diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/CurandRandomNumberKernel.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/CurandRandomNumberKernel.cc index eb56333b03..08a16f6f2c 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/CurandRandomNumberKernel.cc +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/CurandRandomNumberKernel.cc @@ -1,9 +1,9 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. 
Valassi (2021-2023) for the MG5aMC CUDACPP plugin. -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" @@ -22,7 +22,7 @@ inline void assertCurand( curandStatus_t code, const char *file, int line, bool } #endif /* clang-format on */ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -36,7 +36,7 @@ namespace mg5amcCpu { if( m_isOnDevice ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !m_rnarray.isOnDevice() ) throw std::runtime_error( "CurandRandomNumberKernel on device with a host random number array" ); #else @@ -114,7 +114,7 @@ namespace mg5amcCpu /* printf( "\nCurandRandomNumberKernel::generateRnarray size = %d\n", (int)m_rnarray.size() ); fptype* data = m_rnarray.data(); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( m_rnarray.isOnDevice() ) { data = new fptype[m_rnarray.size()](); @@ -123,7 +123,7 @@ namespace mg5amcCpu #endif for( int i = 0; i < ( (int)m_rnarray.size() / 4 ); i++ ) printf( "[%4d] %f %f %f %f\n", i * 4, data[i * 4], data[i * 4 + 2], data[i * 4 + 2], data[i * 4 + 3] ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( m_rnarray.isOnDevice() ) delete[] data; #endif */ diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/EventStatistics.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/EventStatistics.h index 48b51e0a49..b425a5bade 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/EventStatistics.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/EventStatistics.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef EventStatistics_H #define EventStatistics_H 1 @@ -16,7 +16,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/GpuAbstraction.h index 2f000e33d1..6a7d9c05c0 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/GpuAbstraction.h @@ -1,24 +1,17 @@ +// Copyright (C) 2020-2023 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: J. Teig (Jul 2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
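A minimal round trip through the gpu* aliases that GpuAbstraction.h maps onto the CUDA or HIP runtime (hypothetical test program; it assumes, as the bare calls elsewhere in this patch suggest, that gpuMalloc, gpuMemcpy and gpuFree embed their own error checking):

// roundtrip.cc (hypothetical): allocate, copy and free through the abstraction layer.
// Under nvcc the gpu* names expand to cuda* calls, under hipcc to hip* calls.
#include "GpuAbstraction.h"
#include "GpuRuntime.h"
#include <cstdio>
int main()
{
  double hst[4] = { 1, 2, 3, 4 };
  double* dev = nullptr;
  gpuMalloc( &dev, sizeof( hst ) );                            // cudaMalloc or hipMalloc
  gpuMemcpy( dev, hst, sizeof( hst ), gpuMemcpyHostToDevice ); // cudaMemcpy or hipMemcpy
  gpuFree( dev );                                              // cudaFree or hipFree
  std::printf( "device round trip completed\n" );
  return 0;
}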
+ #ifndef MG5AMC_GPUABSTRACTION_H #define MG5AMC_GPUABSTRACTION_H 1 #include -#ifdef MGONGPUCPP_GPUIMPL -#define MGONGPUCPP_CUDACC 1 -#endif - -#ifdef __HIPCC__ -#include "hip/hip_runtime.h" -#define MGONGPUCPP_HIPCC 1 -#endif - -#ifdef MGONGPUCPP_CUDACC - -// Defines correct compiler -#define MGONGPUCPP_GPUIMPL MGONGPUCPP_GPUIMPL - //-------------------------------------------------------------------------- +#ifdef __CUDACC__ + #define gpuError_t cudaError_t #define gpuPeekAtLastError cudaPeekAtLastError #define gpuGetErrorString cudaGetErrorString @@ -44,12 +37,9 @@ //-------------------------------------------------------------------------- -#elif defined MGONGPUCPP_HIPCC +#elif defined __HIPCC__ -// Defines correct compiler -#define MGONGPUCPP_GPUIMPL __HCC__ - -//-------------------------------------------------------------------------- +#include "hip/hip_runtime.h" #define gpuError_t hipError_t #define gpuPeekAtLastError hipPeekAtLastError @@ -74,6 +64,8 @@ #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ ) #define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ ) +//-------------------------------------------------------------------------- + #endif -#endif // MG5AMC_GPUABSTRACTION_H \ No newline at end of file +#endif // MG5AMC_GPUABSTRACTION_H diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/GpuRuntime.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/GpuRuntime.h new file mode 100644 index 0000000000..93579ef08b --- /dev/null +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/GpuRuntime.h @@ -0,0 +1,85 @@ +// Copyright (C) 2020-2023 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: J. Teig (Jun 2023, based on earlier work by S. Roiser) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. + +#ifndef MG5AMC_GPURUNTIME_H +#define MG5AMC_GPURUNTIME_H 1 + +// MG5AMC on GPU uses the CUDA runtime API, not the lower level CUDA driver API +// See https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html#driver-vs-runtime-api + +#include "GpuAbstraction.h" + +#include + +//-------------------------------------------------------------------------- + +// See https://stackoverflow.com/a/14038590 +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#define checkGpu( code ) { assertGpu( code, __FILE__, __LINE__ ); } +inline void assertGpu( gpuError_t code, const char* file, int line, bool abort = true ) +{ + if( code != gpuSuccess ) + { + printf( "ERROR! assertGpu: '%s' (%d) in %s:%d\n", gpuGetErrorString( code ), code, file, line ); + if( abort ) assert( code == gpuSuccess ); + } +} +#endif /* clang-format on */ + +//-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +{ + // Instantiate a GpuRuntime at the beginning of the application's main to + // invoke gpuSetDevice(0) in the constructor and book a gpuDeviceReset() call in the destructor + // *** FIXME! This will all need to be designed differently when going to multi-GPU nodes!
*** + struct GpuRuntime final + { + GpuRuntime( const bool debug = true ) + : m_debug( debug ) { setUp( m_debug ); } + ~GpuRuntime() { tearDown( m_debug ); } + GpuRuntime( const GpuRuntime& ) = delete; + GpuRuntime( GpuRuntime&& ) = delete; + GpuRuntime& operator=( const GpuRuntime& ) = delete; + GpuRuntime& operator=( GpuRuntime&& ) = delete; + bool m_debug; + + // Set up CUDA application + // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** + // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization + static void setUp( const bool debug = true ) + { + // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization + // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! + /* + // [We initially added cudaFree(0) to "ease profile analysis" only because it shows up as a big recognizable block!] + // No explicit initialization is needed: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#initialization + // It is not clear what cudaFree(0) does at all: https://stackoverflow.com/questions/69967813/ + if ( debug ) std::cout << "__CudaRuntime: calling cudaFree(0)" << std::endl; + checkCuda( cudaFree( 0 ) ); // SLOW! + */ + // Replace cudaFree(0) by cudaSetDevice(0), even if it is not really needed either + // (but see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs) + if( debug ) std::cout << "__GpuRuntime: calling gpuSetDevice(0)" << std::endl; + checkGpu( gpuSetDevice( 0 ) ); // SLOW! + } + + // Tear down CUDA application (call cudaDeviceReset) + // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** + // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck + // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking + static void tearDown( const bool debug = true ) + { + if( debug ) std::cout << "__GpuRuntime: calling gpuDeviceReset()" << std::endl; + checkGpu( gpuDeviceReset() ); + } + }; +} +#endif + +//-------------------------------------------------------------------------- + +#endif // MG5AMC_GPURUNTIME_H diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MadgraphTest.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MadgraphTest.h index ef40624c88..a64c05c26a 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MadgraphTest.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MadgraphTest.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Dec 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #ifndef MADGRAPHTEST_H_ #define MADGRAPHTEST_H_ 1 @@ -22,7 +22,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; @@ -201,7 +201,7 @@ class MadgraphTest : public testing::TestWithParam // Since we link both the CPU-only and GPU tests into the same executable, we prevent // a multiply defined symbol by only compiling this in the non-CUDA phase: -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL /// Compare momenta and matrix elements.
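An illustrative main() skeleton for the class above (hypothetical; the patch's check_sa.cc is the real consumer): constructing a GpuRuntime on the stack pins GPU initialization to a well-defined point for timing, and guarantees the gpuDeviceReset() wanted by leak checkers runs at scope exit.

#include "GpuRuntime.h"
int main()
{
#ifdef MGONGPUCPP_GPUIMPL
  mg5amcGpu::GpuRuntime gpuRuntime( /*debug=*/true ); // gpuSetDevice(0) now, gpuDeviceReset() at scope exit
#endif
  // ... event generation workload would go here ...
  return 0;
}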
/// This uses an implementation of TestDriverBase to run a madgraph workflow, @@ -307,6 +307,6 @@ TEST_P( MadgraphTest, CompareMomentaAndME ) } } -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL #endif /* MADGRAPHTEST_H_ */ diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MatrixElementKernels.cc index 3a957ee2ca..81699dfea9 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MatrixElementKernels.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "MatrixElementKernels.h" @@ -14,7 +14,7 @@ //============================================================================ -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu { @@ -150,7 +150,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { @@ -211,11 +211,11 @@ namespace mg5amcGpu // ... 0d1. Compute good helicity mask on the device gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - sigmaKin_getGoodHel<<>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); + gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); #else - sigmaKin_getGoodHel<<>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); + gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); #endif - checkCuda( cudaPeekAtLastError() ); + checkGpu( gpuPeekAtLastError() ); // ... 0d2. Copy back good helicity mask to the host copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); // ... 0d3. 
Copy back good helicity list to constant memory on the device @@ -226,19 +226,19 @@ namespace mg5amcGpu void MatrixElementKernelDevice::computeMatrixElements( const unsigned int channelId ) { - computeDependentCouplings<<>>( m_gs.data(), m_couplings.data() ); + gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); #ifndef MGONGPU_NSIGHT_DEBUG constexpr unsigned int sharedMemSize = 0; #else constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - sigmaKin<<>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); + gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); #else - sigmaKin<<>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); + gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); #endif - checkCuda( cudaPeekAtLastError() ); - checkCuda( cudaDeviceSynchronize() ); + checkGpu( gpuPeekAtLastError() ); + checkGpu( gpuDeviceSynchronize() ); } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MatrixElementKernels.h index 23e84757a2..72bd8f195b 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MatrixElementKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
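A self-contained launch sketch in the style this hunk converts to (the saxpy kernel is hypothetical, not part of MG5aMC): gpuLaunchKernel hides the triple-chevron syntax behind a portable macro, and the checkGpu calls mirror the peek/synchronize pairing used in computeMatrixElements above.

#include "GpuAbstraction.h"
#include "GpuRuntime.h" // for checkGpu in CUDA/HIP builds
__global__ void saxpy( const int n, const float a, const float* x, float* y )
{
  const int i = blockDim.x * blockIdx.x + threadIdx.x;
  if( i < n ) y[i] = a * x[i] + y[i]; // one element per thread
}
void runSaxpy( const int gpublocks, const int gputhreads, const float a, const float* devX, float* devY )
{
  const int n = gpublocks * gputhreads;
  gpuLaunchKernel( saxpy, gpublocks, gputhreads, n, a, devX, devY ); // i.e. saxpy<<<gpublocks, gputhreads>>>( ... )
  checkGpu( gpuPeekAtLastError() );
  checkGpu( gpuDeviceSynchronize() );
}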
#ifndef MATRIXELEMENTKERNELS_H #define MATRIXELEMENTKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -81,7 +81,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a CPU host class MatrixElementKernelHost final : public MatrixElementKernelBase, public NumberOfEvents { @@ -130,7 +130,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a GPU device class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessAmplitudes.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessAmplitudes.h index 573b3bbbc9..ffb76e93de 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessAmplitudes.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessAmplitudes.h @@ -15,7 +15,7 @@ #define MGONGPU_TRIVIAL_AMPLITUDES 1 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessCouplings.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessCouplings.h index 35a3af42e0..3afdf3e554 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessCouplings.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessCouplings.h @@ -15,7 +15,7 @@ #include "MemoryBuffers.h" // for HostBufferCouplings::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessCouplingsFixed.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessCouplingsFixed.h index dc0d93afff..ffcdf4dbef 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessCouplingsFixed.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessCouplingsFixed.h @@ -14,7 +14,7 @@ //#include "MemoryAccessHelpers.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessDenominators.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessDenominators.h index 3bce635718..66f2d32a6b 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessDenominators.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessDenominators.h @@ -10,7 +10,7 @@ #include "MemoryAccessGs.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessGs.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessGs.h index 31311aa375..4c726b30f3 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessGs.h +++ 
b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessGs.h @@ -13,7 +13,7 @@ #include "MemoryBuffers.h" // for HostBufferMatrixElements::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessHelpers.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessHelpers.h index c82a6c7635..db73e4e064 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessHelpers.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessHelpers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessHelpers_H #define MemoryAccessHelpers_H 1 @@ -105,7 +105,7 @@ class KernelAccessHelper : public MemoryAccessHelper } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid //printf( "kernelAccessRecord: ievt=%d threadId=%d\n", ievt, threadIdx.x ); return T::ieventAccessRecord( buffer, ievt ); // NB fptype and fptype_sv coincide for CUDA diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessMatrixElements.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessMatrixElements.h index f32e6fea5b..3741011971 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessMatrixElements.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessMatrixElements.h @@ -13,7 +13,7 @@ #include "MemoryBuffers.h" // for HostBufferMatrixElements::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessMomenta.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessMomenta.h index 29266de32c..3be229d392 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessMomenta.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessMomenta.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#ifndef MemoryAccessMomenta_H #define MemoryAccessMomenta_H 1 @@ -13,7 +13,7 @@ #include "MemoryAccessVectors.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -30,7 +30,7 @@ namespace mg5amcCpu // Number of Events Per Page in the momenta AOSOA memory buffer layout // (these are all best kept as a compile-time constants: see issue #23) -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ // ----------------------------------------------------------------------------------------------- // --- GPUs: neppM is best set to a power of 2 times the number of fptype's in a 32-byte cacheline // --- This is relevant to ensure coalesced access to momenta in global memory diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessNumerators.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessNumerators.h index b152183b28..18991f4fa6 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessNumerators.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessNumerators.h @@ -10,7 +10,7 @@ #include "MemoryAccessGs.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessRandomNumbers.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessRandomNumbers.h index e2988d39f3..40cb089135 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessRandomNumbers.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessRandomNumbers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessRandomNumbers_H #define MemoryAccessRandomNumbers_H 1 @@ -11,7 +11,7 @@ #include "CPPProcess.h" #include "MemoryAccessHelpers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessVectors.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessVectors.h index e9b197368e..08faccff0f 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessVectors.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessVectors.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
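// (Worked example of the neppM guidance in the MemoryAccessMomenta.h hunk above:
// a 32-byte cacheline holds 32/8 = 4 doubles or 32/4 = 8 floats, so a
// coalescing-friendly neppM is 4 * 2^k for fptype=double, e.g. 4, 8 or 16,
// and 8 * 2^k for fptype=float.)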
#ifndef MemoryAccessVectors_H #define MemoryAccessVectors_H 1 @@ -10,7 +10,7 @@ #include "mgOnGpuVectors.h" -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu // this is only needed for CPU SIMD vectorization { diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessWavefunctions.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessWavefunctions.h index 5428aaf933..33bef4559e 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessWavefunctions.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessWavefunctions.h @@ -15,7 +15,7 @@ #define MGONGPU_TRIVIAL_WAVEFUNCTIONS 1 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryBuffers.h index 3093e6ed18..7756a71621 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryBuffers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021, based on earlier work by S. Hageboeck) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryBuffers_H #define MemoryBuffers_H 1 @@ -11,12 +11,12 @@ #include "mgOnGpuCxtypes.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "Parameters_sm.h" #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -87,7 +87,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL constexpr bool HostBufferALIGNED = false; // ismisaligned=false constexpr bool HostBufferMISALIGNED = true; // ismisaligned=true @@ -119,7 +119,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer template class PinnedHostBufferBase : public BufferBase @@ -128,18 +128,18 @@ namespace mg5amcCpu PinnedHostBufferBase( const size_t size ) : BufferBase( size, false ) { - checkCuda( cudaMallocHost( &( this->m_data ), this->bytes() ) ); + gpuMallocHost( &( this->m_data ), this->bytes() ); } virtual ~PinnedHostBufferBase() { - checkCuda( cudaFreeHost( this->m_data ) ); + gpuFreeHost( this->m_data ); } }; #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer template class DeviceBufferBase : public BufferBase @@ -148,18 +148,18 @@ namespace mg5amcCpu DeviceBufferBase( const size_t size ) : BufferBase( size, true ) { - checkCuda( cudaMalloc( &( this->m_data ), this->bytes() ) ); + gpuMalloc( &( this->m_data ), this->bytes() ); } virtual ~DeviceBufferBase() { - checkCuda( cudaFree( this->m_data ) ); + gpuFree( this->m_data ); } }; #endif //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for a given number of events template 
class HostBuffer : public HostBufferBase, virtual private NumberOfEvents @@ -175,7 +175,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer for a given number of events template class PinnedHostBuffer : public PinnedHostBufferBase, virtual private NumberOfEvents @@ -191,7 +191,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer for a given number of events template class DeviceBuffer : public DeviceBufferBase, virtual private NumberOfEvents @@ -213,7 +213,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta random numbers constexpr size_t sizePerEventRndNumMomenta = MemoryBuffers::np4 * MemoryBuffers::nparf; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta random numbers typedef HostBuffer HostBufferRndNumMomenta; #else @@ -232,7 +232,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer with ONE fptype per event constexpr size_t sizePerEventOneFp = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer with ONE fptype per event typedef HostBuffer HostBufferOneFp; #else @@ -257,7 +257,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for Gs constexpr size_t sizePerEventGs = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferGs; #else @@ -276,7 +276,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for numerators constexpr size_t sizePerEventNumerators = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferNumerators; #else @@ -296,7 +296,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for denominators constexpr size_t sizePerEventDenominators = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferDenominators; #else @@ -315,7 +315,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for random numbers constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferCouplings; #else @@ -333,7 +333,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta constexpr size_t sizePerEventMomenta = MemoryBuffers::np4 * MemoryBuffers::npar; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta typedef HostBuffer HostBufferMomenta; //typedef HostBuffer HostBufferMomenta; // TEST MISALIGNMENT! 
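// (Illustrative sketch only, not part of this patch's hunks: the gpu* calls used
// in MemoryBuffers.h above are assumed to be macros from the new GpuAbstraction.h,
// mapping one spelling onto either runtime API; checkGpu is a hypothetical
// error-check wrapper playing the role of the old explicit checkCuda.)
#ifdef __CUDACC__
#define gpuMallocHost( ptr, size ) checkGpu( cudaMallocHost( ptr, size ) )
#define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) )
#define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) )
#define gpuFree( ptr ) checkGpu( cudaFree( ptr ) )
#define gpuMemcpy( dst, src, bytes, kind ) checkGpu( cudaMemcpy( dst, src, bytes, kind ) )
#define gpuMemcpyToSymbol( sym, src, bytes ) checkGpu( cudaMemcpyToSymbol( sym, src, bytes ) )
#define gpuMemcpyHostToDevice cudaMemcpyHostToDevice
#define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost
#elif defined __HIPCC__
#define gpuMallocHost( ptr, size ) checkGpu( hipHostMalloc( ptr, size ) ) // NB hipMallocHost is deprecated
#define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) )
#define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) )
#define gpuFree( ptr ) checkGpu( hipFree( ptr ) )
#define gpuMemcpy( dst, src, bytes, kind ) checkGpu( hipMemcpy( dst, src, bytes, kind ) )
#define gpuMemcpyToSymbol( sym, src, bytes ) checkGpu( hipMemcpyToSymbol( HIP_SYMBOL( sym ), src, bytes ) )
#define gpuMemcpyHostToDevice hipMemcpyHostToDevice
#define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost
#endif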
@@ -352,7 +352,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for sampling weights constexpr size_t sizePerEventWeights = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for sampling weights typedef HostBuffer HostBufferWeights; #else @@ -370,7 +370,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for matrix elements constexpr size_t sizePerEventMatrixElements = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for matrix elements typedef HostBuffer HostBufferMatrixElements; #else @@ -385,7 +385,7 @@ namespace mg5amcCpu // A base class encapsulating a memory buffer for the helicity mask typedef BufferBase BufferHelicityMask; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for the helicity mask typedef HostBufferBase HostBufferHelicityMask; #else @@ -403,7 +403,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for wavefunctions constexpr size_t sizePerEventWavefunctions = MemoryBuffers::nw6 * MemoryBuffers::nx2; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for wavefunctions typedef HostBuffer HostBufferWavefunctions; #else @@ -421,7 +421,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity random numbers constexpr size_t sizePerEventRndNumHelicity = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity random numbers typedef HostBuffer HostBufferRndNumHelicity; #else @@ -439,7 +439,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color random numbers constexpr size_t sizePerEventRndNumColor = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color random numbers typedef HostBuffer HostBufferRndNumColor; #else @@ -457,7 +457,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity selection constexpr size_t sizePerEventSelectedHelicity = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity selection typedef HostBuffer HostBufferSelectedHelicity; #else @@ -475,7 +475,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color selection constexpr size_t sizePerEventSelectedColor = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color selection typedef HostBuffer HostBufferSelectedColor; #else @@ -487,7 +487,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy { @@ -504,13 +504,13 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyHostToDevice ); } #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void copyHostFromDevice( Tdst& dst, const Tsrc& src ) // keep the same order 
of arguments as in memcpy { @@ -527,7 +527,7 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyDeviceToHost ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyDeviceToHost ); } #endif diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.cc index 19bc1e7973..2f4b1f9d0e 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.cc +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -46,7 +45,7 @@ // Class member functions for calculating the matrix elements for // Process: g g > t t~ g g WEIGHTED<=4 @1 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -80,7 +79,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -90,7 +89,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -118,13 +117,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -151,7 +150,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -187,7 +186,7 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity 
< nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings @@ -200,8 +199,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -2417,7 +2418,7 @@ namespace mg5amcCpu { 62, 71, -10, 80, -1, 8, -28, 62, 62, -10, -10, -1, -1, 8, -10, -1, -64, 8, 8, -64, 80, 8, 512, -64 }, { -28, 62, 62, -10, -10, -1, 62, 71, -10, 80, -1, 8, -10, -1, -1, 8, 8, -64, 80, 8, 8, -64, -64, 512 } }; // 2-D array[24][24] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -2474,7 +2475,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -2533,7 +2534,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -2628,8 +2629,8 @@ namespace mg5amcCpu { 1, 1, 1, -1, -1, 1 }, { 1, 1, 1, -1, 1, -1 }, { 1, 1, 1, -1, 1, 1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -2671,9 +2672,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -2711,7 +2712,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] 
// [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -2776,12 +2777,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -2802,7 +2803,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -2928,9 +2929,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -2954,7 +2955,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -2974,7 +2975,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 512 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -2988,9 +2989,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -3018,7 +3022,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -3228,7 +3232,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.h index 04f7c62976..deb1358992 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -107,7 +107,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -120,7 +120,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -150,7 +150,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CudaRuntime.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/GpuAbstraction.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/GpuAbstraction.h new file mode 120000 index 0000000000..72054e19ba --- /dev/null +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/GpuAbstraction.h @@ -0,0 +1 @@ +../GpuAbstraction.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/GpuRuntime.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/GpuRuntime.h new file mode 120000 index 0000000000..3920e83be4 --- /dev/null +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/GpuRuntime.h @@ -0,0 +1 @@ +../GpuRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/check_sa.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/check_sa.cc index 8fe4c22145..7cac5ab47b 100644 --- 
a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/check_sa.cc +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -65,7 +66,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -96,7 +97,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -134,9 +135,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784) + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) +#elif defined __HIPCC__ +#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random #elif defined __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU if build has curand + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #endif @@ -146,10 +149,10 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) 
bool bridge = false; @@ -177,7 +180,7 @@ main( int argc, char** argv ) else if( arg == "--curdev" ) { #ifndef __CUDACC__ - throw std::runtime_error( "CurandDevice is not supported on CPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" ); #elif defined MGONGPU_HAS_NO_CURAND throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" ); #else @@ -198,7 +201,7 @@ main( int argc, char** argv ) } else if( arg == "--rmbdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -272,13 +275,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -296,14 +299,14 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL - // --- 00. Initialise cuda - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - const std::string cdinKey = "00 CudaInit"; + // --- 00. Initialise GPU + // Instantiate a GpuRuntime at the beginning of the application's main. + // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. + const std::string cdinKey = "00 GpuInit"; timermap.start( cdinKey ); - CudaRuntime cudaRuntime( debug ); + GpuRuntime gpuRuntime( debug ); #endif // --- 0a.
Initialise physics process @@ -325,7 +328,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -333,7 +336,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -341,7 +344,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -349,7 +352,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -366,7 +369,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -375,7 +378,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -384,7 +387,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -392,7 +395,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -400,7 +403,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -438,7 +441,7 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! 
CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } @@ -450,7 +453,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -461,7 +464,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -469,7 +472,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -511,7 +514,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -543,7 +546,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -588,7 +591,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -617,7 +620,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -760,20 +763,22 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; #elif defined __HIPCC__ wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; -#endif +#endif /* clang-format off */ // -- DOUBLE or FLOAT? #if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) @@ -783,7 +788,7 @@ main( int argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif +#endif /* clang-format on */ // -- CUCOMPLEX or THRUST or STD complex numbers? 
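// (Mapping assumed by the branches below: nvcc builds can use cuComplex or
// thrust::complex; hipcc builds fall back to cxsmpl, understood to be the
// plugin's own header-only complex class from mgOnGpuCxtypes.h, since neither
// CUDA type is available under HIP; plain C++ builds use std::complex or cxsmpl.)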
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -795,6 +800,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_CUCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -820,7 +831,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -876,7 +887,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -897,6 +908,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -923,21 +936,21 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? " == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -968,7 +981,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1064,12 +1077,12 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif +#elif defined MGONGPU_CUCXTYPE_CXSMPL + << "\"STD::COMPLEX\"," << std::endl #else << "\"???\"," << std::endl // no path to this statement... #endif @@ -1079,7 +1092,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? 
" == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/RamboSamplingKernels.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/RamboSamplingKernels.cc index da68aa9255..79abbcc4f8 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/RamboSamplingKernels.cc +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/RamboSamplingKernels.cc @@ -1,11 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "RamboSamplingKernels.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessRandomNumbers.h" #include "MemoryAccessWeights.h" @@ -14,7 +14,7 @@ #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -92,7 +92,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingKernelDevice::RamboSamplingKernelDevice( const fptype energy, // input: energy const BufferRndNumMomenta& rndmom, // input: random numbers in [0,1] BufferMomenta& momenta, // output: momenta @@ -135,7 +135,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaInitialDevice( const fptype energy, fptype* momenta ) @@ -147,17 +147,17 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaInitial() { - getMomentaInitialDevice<<>>( m_energy, m_momenta.data() ); + gpuLaunchKernel( getMomentaInitialDevice, m_gpublocks, m_gputhreads, m_energy, m_momenta.data() ); } #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaFinalDevice( const fptype energy, const fptype* rndmom, @@ -171,11 +171,11 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaFinal() { - getMomentaFinalDevice<<>>( m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); + gpuLaunchKernel( getMomentaFinalDevice, m_gpublocks, m_gputhreads, m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); } #endif diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/RamboSamplingKernels.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/RamboSamplingKernels.h index 184089efd7..7c214cd74b 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/RamboSamplingKernels.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/RamboSamplingKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#ifndef RAMBOSAMPLINGKERNELS_H #define RAMBOSAMPLINGKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating RAMBO phase space sampling on a GPU device class RamboSamplingKernelDevice final : public SamplingKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/RandomNumberKernels.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/RandomNumberKernels.h index 188a72c2c9..21d63beeac 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/RandomNumberKernels.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/RandomNumberKernels.h @@ -1,14 +1,14 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef RANDOMNUMBERKERNELS_H #define RANDOMNUMBERKERNELS_H 1 #include "mgOnGpuConfig.h" -// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when __CUDACC__ is not defined +// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when MGONGPUCPP_GPUIMPL is not defined #ifndef MGONGPU_HAS_NO_CURAND //#include "curand.h" struct curandGenerator_st; // forward definition from curand.h @@ -16,7 +16,7 @@ struct curandGenerator_st; // forward definition from curand.h #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp.mk index 509307506b..f2cfa349da 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ # Copyright (C) 2020-2023 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +# Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts @@ -42,10 +42,10 @@ endif #------------------------------------------------------------------------------- -#=== Configure common compiler flags for C++ and CUDA +#=== Configure common compiler flags for C++ and CUDA/HIP INCFLAGS = -I. 
-OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here +OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here # Dependency on src directory MG5AMC_COMMONLIB = mg5amc_common @@ -121,24 +146,46 @@ endif #------------------------------------------------------------------------------- -#=== Configure the CUDA compiler +#=== Configure the GPU compiler (CUDA or HIP) -# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) -# This is because it is impossible to pass this to "CUFLAGS += -ccbin " below +# FIXME! (AV 24.01.2024) +# In the current implementation (without separate builds for C++ and CUDA/HIP), we first check for nvcc and hipcc in CUDA_HOME and HIP_HOME. +# If CUDA_HOME or HIP_HOME are not set, try to determine them from the path to nvcc and hipcc. +# While convoluted, this is currently necessary to allow disabling CUDA/HIP builds by setting CUDA_HOME or HIP_HOME to invalid paths. +# This will (probably?) be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775). + +# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA and HIP builds (issue #505) +# This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside - $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") + $(warning CUDA and HIP builds are not supported for multi-word CXX "$(CXX)") override CUDA_HOME=disabled + override HIP_HOME=disabled endif -# If CUDA_HOME is not set, try to set it from the location of nvcc +# If CUDA_HOME is not set, try to set it from the path to nvcc ifndef CUDA_HOME CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null)) $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") endif -# Set NVCC as $(CUDA_HOME)/bin/nvcc if it exists +# If HIP_HOME is not set, try to set it from the path to hipcc +ifndef HIP_HOME + HIP_HOME = $(patsubst %bin/hipcc,%,$(HIP_COMPILER_PATH)) + $(warning HIP_HOME was not set: using "$(HIP_HOME)") +endif + +# FIXME! (AV 24.01.2024) +# In the current implementation (without separate builds for C++ and CUDA/HIP), +# builds are performed for HIP only if CUDA is not found in the path. +# If both CUDA and HIP are installed, HIP builds can be triggered by unsetting CUDA_HOME. +# This will be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775). + +#--- Option 1: CUDA exists -> use CUDA + +# Set GPUCC as $(CUDA_HOME)/bin/nvcc if it exists ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) - NVCC = $(CUDA_HOME)/bin/nvcc + + GPUCC = $(CUDA_HOME)/bin/nvcc USE_NVTX ?=-DUSE_NVTX # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ @@ -158,41 +180,78 @@ ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here!
endif CUOPTFLAGS = -lineinfo - CUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math - ###CUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow - ###NVCC_VERSION = $(shell $(NVCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) - CUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h + ###GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + GPUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + ###GPUCC_VERSION = $(shell $(GPUCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) + GPUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + CUBUILDRULEFLAGS = -Xcompiler -fPIC -c + CCBUILDRULEFLAGS = -Xcompiler -fPIC -c -x cu + CUDATESTFLAGS = -lcuda + + # Set the host C++ compiler for GPUCC via "-ccbin " + # (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) + GPUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) + + # Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) + ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) + GPUFLAGS += -allow-unsupported-compiler + endif + else ifneq ($(origin REQUIRE_CUDA),undefined) + # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) - $(error No cuda installation found (set CUDA_HOME or make nvcc visible in PATH)) + $(error No cuda installation found (set CUDA_HOME or make GPUCC visible in PATH)) + +#--- Option 2: CUDA does not exist, HIP exists -> use HIP + +# Set GPUCC as $(HIP_HOME)/bin/hipcc if it exists +else ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),) + + GPUCC = $(HIP_HOME)/bin/hipcc + #USE_NVTX ?=-DUSE_NVTX # should maybe find something equivalent to this in HIP? 
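# (Annotation: the hard-coded --offload-arch=gfx90a below is assumed to target
# AMD CDNA2 devices such as the MI200 series; other AMD GPUs would need a
# different offload arch here.)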
+ HIPARCHFLAGS = -target x86_64-linux-gnu --offload-arch=gfx90a + HIPINC = -I$(HIP_HOME)/include/ + # Note: -DHIP_FAST_MATH is equivalent to -use_fast_math in HIP + # (but only for single precision line 208: https://rocm-developer-tools.github.io/HIP/hcc__detail_2math__functions_8h_source.html) + GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -DHIP_FAST_MATH -DHIP_PLATFORM=amd -fPIC + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + GPUFLAGS += -std=c++17 + ###GPUFLAGS+= --maxrregcount 255 # (AV: is this option valid on HIP and meaningful on AMD GPUs?) + CUBUILDRULEFLAGS = -fPIC -c + CCBUILDRULEFLAGS = -fPIC -c + +else ifneq ($(origin REQUIRE_HIP),undefined) + + # If REQUIRE_HIP is set but no HIP is found, stop here (e.g. for CI tests on GPU #443) + $(error No hip installation found (set HIP_HOME or make GPUCC visible in PATH)) + +#--- Option 3: CUDA does not exist, HIP does not exist -> switch off both CUDA and HIP + else - # No cuda. Switch cuda compilation off and go to common random numbers in C++ + + # No cudacc and no hipcc: switch CUDA and HIP compilation off and go to common random numbers in C++ $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda) - override NVCC= + $(warning HIP_HOME is not set or is invalid: export HIP_HOME to compile with hip) + override GPUCC= override USE_NVTX= override CUINC= override CURANDLIBFLAGS= -endif -export NVCC -export CUFLAGS - -# Set the host C++ compiler for nvcc via "-ccbin " -# (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) -CUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) -# Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) -ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) -CUFLAGS += -allow-unsupported-compiler endif +# Export GPUCC (so that it can also be used in cudacpp_src.mk?) +export GPUCC +export GPUFLAGS + #------------------------------------------------------------------------------- -#=== Configure ccache for C++ and CUDA builds +#=== Configure ccache for C++ and CUDA/HIP builds # Enable ccache if USECCACHE=1 ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) @@ -201,15 +260,15 @@ endif #ifeq ($(USECCACHE)$(shell echo $(AR) | grep ccache),1) # override AR:=ccache $(AR) #endif -ifneq ($(NVCC),) - ifeq ($(USECCACHE)$(shell echo $(NVCC) | grep ccache),1) - override NVCC:=ccache $(NVCC) +ifneq ($(GPUCC),) + ifeq ($(USECCACHE)$(shell echo $(GPUCC) | grep ccache),1) + override GPUCC:=ccache $(GPUCC) endif endif #------------------------------------------------------------------------------- -#=== Configure PowerPC-specific compiler flags for C++ and CUDA +#=== Configure PowerPC-specific compiler flags for C++ and CUDA/HIP # PowerPC-specific CXX compiler flags (being reviewed) ifeq ($(UNAME_P),ppc64le) @@ -225,9 +284,9 @@ else ######CXXFLAGS+= -fno-semantic-interposition # no benefit (neither alone, nor combined with -flto) endif -# PowerPC-specific CUDA compiler flags (to be reviewed!) +# PowerPC-specific CUDA/HIP compiler flags (to be reviewed!) 
ifeq ($(UNAME_P),ppc64le) - CUFLAGS+= -Xcompiler -mno-float128 + GPUFLAGS+= -Xcompiler -mno-float128 endif #------------------------------------------------------------------------------- @@ -237,7 +296,7 @@ endif # Set the default OMPFLAGS choice ifneq ($(shell $(CXX) --version | egrep '^Intel'),) override OMPFLAGS = -fopenmp -###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without nvcc but not ok with nvcc before #578) +###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without GPUCC but not ok with GPUCC before #578) else ifneq ($(shell $(CXX) --version | egrep '^(clang)'),) override OMPFLAGS = -fopenmp ###override OMPFLAGS = # disable OpenMP MT on clang (was not ok without or with nvcc before #578) @@ -293,7 +352,10 @@ endif # Set the default RNDGEN (random number generator) choice ifeq ($(RNDGEN),) - ifeq ($(NVCC),) + ifeq ($(GPUCC),) + override RNDGEN = hasNoCurand + # Edgecase for HIP compilation + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) override RNDGEN = hasNoCurand else ifeq ($(RNDGEN),) override RNDGEN = hasCurand @@ -310,7 +372,7 @@ export OMPFLAGS #------------------------------------------------------------------------------- -#=== Set the CUDA/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN +#=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN # Set the build flags appropriate to OMPFLAGS $(info OMPFLAGS=$(OMPFLAGS)) @@ -368,13 +430,13 @@ CXXFLAGS+= $(AVXFLAGS) $(info FPTYPE=$(FPTYPE)) ifeq ($(FPTYPE),d) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE - CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE else ifeq ($(FPTYPE),f) CXXFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT - CUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT else ifeq ($(FPTYPE),m) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT - CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT else $(error Unknown FPTYPE='$(FPTYPE)': only 'd', 'f' and 'm' are supported) endif @@ -383,7 +445,7 @@ endif $(info HELINL=$(HELINL)) ifeq ($(HELINL),1) CXXFLAGS += -DMGONGPU_INLINE_HELAMPS - CUFLAGS += -DMGONGPU_INLINE_HELAMPS + GPUFLAGS += -DMGONGPU_INLINE_HELAMPS else ifneq ($(HELINL),0) $(error Unknown HELINL='$(HELINL)': only '0' and '1' are supported) endif @@ -392,7 +454,7 @@ endif $(info HRDCOD=$(HRDCOD)) ifeq ($(HRDCOD),1) CXXFLAGS += -DMGONGPU_HARDCODE_PARAM - CUFLAGS += -DMGONGPU_HARDCODE_PARAM + GPUFLAGS += -DMGONGPU_HARDCODE_PARAM else ifneq ($(HRDCOD),0) $(error Unknown HRDCOD='$(HRDCOD)': only '0' and '1' are supported) endif @@ -444,11 +506,11 @@ ifeq ($(UNAME_S),Darwin) override CULIBFLAGSRPATH2 = else # RPATH to cuda/cpp libs when linking executables - override CXXLIBFLAGSRPATH = -Wl,-rpath,$(LIBDIRRPATH) - override CULIBFLAGSRPATH = -Xlinker -rpath,$(LIBDIRRPATH) + override CXXLIBFLAGSRPATH = -Wl,-rpath=$(LIBDIRRPATH) + override CULIBFLAGSRPATH = -Xlinker -rpath=$(LIBDIRRPATH) # RPATH to common lib when linking cuda/cpp libs - override CXXLIBFLAGSRPATH2 = -Wl,-rpath,'$$ORIGIN' - override CULIBFLAGSRPATH2 = -Xlinker -rpath,'$$ORIGIN' + override CXXLIBFLAGSRPATH2 = -Wl,-rpath='$$ORIGIN' + override CULIBFLAGSRPATH2 = -Xlinker -rpath='$$ORIGIN' endif # Setting LD_LIBRARY_PATH or DYLD_LIBRARY_PATH in the RUNTIME is no longer necessary 
(neither on Linux nor on Mac)
@@ -461,7 +523,7 @@ override RUNTIME =

cxx_main=$(BUILDDIR)/check.exe
fcxx_main=$(BUILDDIR)/fcheck.exe

-ifneq ($(NVCC),)
+ifneq ($(GPUCC),)
cu_main=$(BUILDDIR)/gcheck.exe
fcu_main=$(BUILDDIR)/fgcheck.exe
else
@@ -492,15 +554,16 @@ $(BUILDDIR)/.build.$(TAG):
	@touch $(BUILDDIR)/.build.$(TAG)

# Generic target and build rules: objects from CUDA compilation
-ifneq ($(NVCC),)
+ifneq ($(GPUCC),)
$(BUILDDIR)/%.o : %.cu *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c $< -o $@
+	$(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CUBUILDRULEFLAGS) $< -o $@

$(BUILDDIR)/%_cu.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@
+	$(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CCBUILDRULEFLAGS) $< -o $@
endif
+# (NB: the -x cu flag needed to compile .cc files as CUDA is now part of CCBUILDRULEFLAGS in the rule above)

# Generic target and build rules: objects from C++ compilation
# (NB do not include CUINC here! add it only for NVTX or curand #679)
@@ -509,11 +572,14 @@ $(BUILDDIR)/%.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
	$(CXX) $(CPPFLAGS) $(CXXFLAGS) -fPIC -c $< -o $@

# Apply special build flags only to CrossSectionKernel.cc and gCrossSectionKernel.cu (no fast math, see #117 and #516)
+# Added an edge case for HIP compilation (nvcc needs -Xcompiler to forward -fno-fast-math to the host compiler, hipcc accepts it directly)
ifeq ($(shell $(CXX) --version | grep ^nvc++),)
$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS := $(filter-out -ffast-math,$(CXXFLAGS))
$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math
-ifneq ($(NVCC),)
-$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -fno-fast-math
+ifeq ($(findstring nvcc,$(GPUCC)),nvcc)
+  $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -fno-fast-math
+else
+  $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -fno-fast-math
endif
endif

@@ -530,10 +596,10 @@ ifeq ($(RNDGEN),hasCurand)
$(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CUINC)
endif

-# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in nvcc with icx2023 (#592)
+# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in GPUCC with icx2023 (#592)
ifneq ($(shell $(CXX) --version | egrep '^(Intel)'),)
-ifneq ($(NVCC),)
-CUFLAGS += -Xcompiler -Wno-deprecated-builtins
+ifneq ($(GPUCC),)
+GPUFLAGS += -Wno-deprecated-builtins
endif
endif

@@ -541,8 +607,8 @@ endif
# This patch does remove the warning, but I prefer to keep it disabled for the moment...
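# For reference, the backend-specific build-rule flags used in the generic
# CUDA/HIP object rules above (a condensed restatement of the Option 1 and
# Option 2 assignments, not new definitions):
#   nvcc:  CUBUILDRULEFLAGS = -Xcompiler -fPIC -c        (native .cu sources)
#          CCBUILDRULEFLAGS = -Xcompiler -fPIC -c -x cu  (-x cu makes nvcc treat .cc sources as CUDA)
#   hipcc: CUBUILDRULEFLAGS = -fPIC -c
#          CCBUILDRULEFLAGS = -fPIC -c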
###ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang|Intel)'),) ###$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -Wno-overriding-t-option -###ifneq ($(NVCC),) -###$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -Wno-overriding-t-option +###ifneq ($(GPUCC),) +###$(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -Wno-overriding-t-option ###endif ###endif @@ -569,7 +635,7 @@ MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp cxx_objects_lib=$(BUILDDIR)/CPPProcess.o $(BUILDDIR)/MatrixElementKernels.o $(BUILDDIR)/BridgeKernels.o $(BUILDDIR)/CrossSectionKernels.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel.o $(BUILDDIR)/RamboSamplingKernels.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) MG5AMC_CULIB = mg5amc_$(processid_short)_cuda cu_objects_lib=$(BUILDDIR)/gCPPProcess.o $(BUILDDIR)/gMatrixElementKernels.o $(BUILDDIR)/gBridgeKernels.o $(BUILDDIR)/gCrossSectionKernels.o cu_objects_exe=$(BUILDDIR)/gCommonRandomNumberKernel.o $(BUILDDIR)/gRamboSamplingKernels.o @@ -581,11 +647,11 @@ $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: cxx_objects_lib += $(BUILDDIR)/fbridge.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(CXX) -shared -o $@ $(cxx_objects_lib) $(CXXLIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: cu_objects_lib += $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cu_objects_lib) - $(NVCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) endif #------------------------------------------------------------------------------- @@ -602,16 +668,16 @@ $(cxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PAT $(cxx_main): $(BUILDDIR)/check_sa.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CXX) -o $@ $(BUILDDIR)/check_sa.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CURANDLIBFLAGS) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(cu_main): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(cu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(cu_main): $(BUILDDIR)/gcheck_sa.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o - $(NVCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) + $(GPUCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) endif 
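# Usage sketch: with a CUDA or HIP toolchain detected, the rules above build
# lib$(MG5AMC_CULIB).so and the gcheck.exe executable from the same sources as
# the C++ library. The throughput figures quoted in this makefile refer to runs
# such as
#   ./gcheck.exe -p 65536 128 12
# where reading the three -p arguments as gpublocks, gputhreads and iterations
# is an assumption based on the "(65536 128 12)" triplets in the maxrregcount
# notes above.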
#------------------------------------------------------------------------------- @@ -637,17 +703,17 @@ $(fcxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PA $(fcxx_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') endif ifeq ($(UNAME_S),Darwin) $(fcu_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(fcu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(fcu_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) - $(NVCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) + $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) endif #------------------------------------------------------------------------------- @@ -659,7 +725,7 @@ $(BUILDDIR)/testxxx.o: testxxx_cc_ref.txt $(testmain): $(BUILDDIR)/testxxx.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testxxx.o # Comment out this line to skip the C++ test of xxx functions -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testxxx_cu.o: $(GTESTLIBS) $(BUILDDIR)/testxxx_cu.o: INCFLAGS += $(GTESTINC) $(BUILDDIR)/testxxx_cu.o: testxxx_cc_ref.txt @@ -672,7 +738,7 @@ $(BUILDDIR)/testmisc.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testmisc.o # Comment out this line to skip the C++ miscellaneous tests -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testmisc_cu.o: $(GTESTLIBS) $(BUILDDIR)/testmisc_cu.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc_cu.o @@ -684,12 +750,12 @@ $(BUILDDIR)/runTest.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/runTest.o $(testmain): cxx_objects_exe += $(BUILDDIR)/runTest.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/runTest_cu.o: $(GTESTLIBS) $(BUILDDIR)/runTest_cu.o: INCFLAGS += $(GTESTINC) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(testmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif @@ -713,14 +779,14 @@ $(testmain): LIBFLAGS += -lgomp endif endif -ifeq 
($(NVCC),) # link only runTest.o
+ifeq ($(GPUCC),) # link only runTest.o
$(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH
$(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS)
	$(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS)
else # link both runTest.o and runTest_cu.o
$(testmain): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH
$(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) $(GTESTLIBS)
-	$(NVCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) -lcuda
+	$(GPUCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS)
endif

# Use target gtestlibs to build only googletest
@@ -829,9 +895,9 @@ ifeq ($(USECCACHE),1)
	ccache --version | head -1
endif
	@echo ""
-	@echo NVCC=$(NVCC)
-ifneq ($(NVCC),)
-	$(NVCC) --version
+	@echo GPUCC=$(GPUCC)
+ifneq ($(GPUCC),)
+	$(GPUCC) --version
endif
	@echo ""
	@echo CXX=$(CXX)
@@ -850,7 +916,7 @@ endif

# Target: check (run the C++ test executable)
# [NB THIS IS WHAT IS USED IN THE GITHUB CI!]
-ifneq ($(NVCC),)
+ifneq ($(GPUCC),)
check: runTest cmpFcheck cmpFGcheck
else
check: runTest cmpFcheck
endif
diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/fbridge.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/fbridge.cc
index 2d2b36d560..22ce3f5115 100644
--- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/fbridge.cc
+++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/fbridge.cc
@@ -1,11 +1,11 @@
// Copyright (C) 2020-2023 CERN and UCLouvain.
// Licensed under the GNU Lesser General Public License (version 3 or later).
// Created by: S. Roiser (Oct 2021) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.

#include "Bridge.h"
#include "CPPProcess.h"
-#include "CudaRuntime.h"
+#include "GpuRuntime.h"

extern "C"
{
@@ -22,7 +22,7 @@
 * Using the same Fortran MadEvent code, linking to the heterogeneous library would allow access to both CPU and GPU implementations.
 * The specific heterogeneous configuration (how many GPUs, how many threads on each CPU, etc) could be loaded in CUDA/C++ from a data file.
*/ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -46,8 +46,8 @@ extern "C" */ void fbridgecreate_( CppObjectInFortran** ppbridge, const int* pnevtF, const int* pnparF, const int* pnp4F ) { -#ifdef __CUDACC__ - CudaRuntime::setUp(); +#ifdef MGONGPUCPP_GPUIMPL + GpuRuntime::setUp(); #endif // (NB: CPPProcess::initProc no longer needs to be executed here because it is called in the Bridge constructor) // FIXME: disable OMP in Bridge when called from Fortran @@ -65,8 +65,8 @@ extern "C" Bridge* pbridge = dynamic_cast*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgedelete_: invalid Bridge address" ); delete pbridge; -#ifdef __CUDACC__ - CudaRuntime::tearDown(); +#ifdef MGONGPUCPP_GPUIMPL + GpuRuntime::tearDown(); #endif } @@ -96,7 +96,7 @@ extern "C" { Bridge* pbridge = dynamic_cast*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgesequence_: invalid Bridge address" ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Use the device/GPU implementation in the CUDA library // (there is also a host implementation in this library) pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, *pchannelId, mes, selhel, selcol ); diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/fsampler.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/fsampler.cc index 2fb445372d..3743934f41 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/fsampler.cc +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/fsampler.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Feb 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "mgOnGpuConfig.h" @@ -13,7 +13,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -40,7 +40,7 @@ namespace mg5amcCpu private: const int m_nevt; // The number of events in each iteration int m_iiter; // The iteration counter (for random number seeding) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta m_hstRndmom; // Memory buffers for random numbers HostBufferMomenta m_hstMomenta; // Memory buffers for momenta HostBufferWeights m_hstWeights; // Memory buffers for sampling weights @@ -105,7 +105,7 @@ namespace mg5amcCpu extern "C" { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/runTest.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/runTest.cc index 904cb78a72..de327f2321 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/runTest.cc +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/runTest.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
//---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -18,7 +18,7 @@ #include "RandomNumberKernels.h" #include "epoch_process_id.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -35,7 +35,7 @@ struct CUDA_CPU_TestBase : public TestDriverBase : TestDriverBase( npar, refFileName ) {} }; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL struct CPUTest : public CUDA_CPU_TestBase { // Struct data members (process, and memory structures for random numbers, momenta, matrix elements and weights on host and device) @@ -119,7 +119,7 @@ struct CPUTest : public CUDA_CPU_TestBase }; #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL struct CUDATest : public CUDA_CPU_TestBase { // Reset the device when our test goes out of scope. Note that this should happen after @@ -256,7 +256,7 @@ INSTANTIATE_TEST_SUITE_P( prefix, \ test_suite_name, \ testing::Values( new CUDATest( MG_EPOCH_REFERENCE_FILE_NAME ) ) ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL MG_INSTANTIATE_TEST_SUITE_GPU( XTESTID_GPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); #else MG_INSTANTIATE_TEST_SUITE_CPU( XTESTID_CPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/testmisc.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/testmisc.cc index 895d6eeb56..ba9e59a8a3 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/testmisc.cc +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/testmisc.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*misc to run only testmisc.cc tests //---------------------------------------------------------------------------- @@ -17,7 +17,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_MISC #else #define TESTID( s ) s##_CPU_MISC @@ -26,7 +26,7 @@ #define XTESTID( s ) TESTID( s ) // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -59,7 +59,7 @@ namespace mg5amcCpu TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testmisc ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/testxxx.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/testxxx.cc index 3361fe5aa9..e5167de00c 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/testxxx.cc +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/testxxx.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Apr 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
//---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -24,7 +24,7 @@ #include #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_XXX #else #define TESTID( s ) s##_CPU_XXX @@ -32,7 +32,7 @@ #define XTESTID( s ) TESTID( s ) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -42,7 +42,7 @@ namespace mg5amcCpu int FPEhandlerIevt = -1; inline void FPEhandler( int sig ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL std::cerr << "Floating Point Exception (GPU): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; #else std::cerr << "Floating Point Exception (CPU neppV=" << neppV << "): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; @@ -53,7 +53,7 @@ namespace mg5amcCpu TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -77,7 +77,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) assert( nevt % neppM == 0 ); // nevt must be a multiple of neppM assert( nevt % neppV == 0 ); // nevt must be a multiple of neppV // Fill in the input momenta -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL mg5amcGpu::PinnedHostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] #else mg5amcCpu::HostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] @@ -322,7 +322,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { for( int ievt = 0; ievt < nevt; ievt++ ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_ttgg.mad/src/HelAmps_sm.h b/epochX/cudacpp/gg_ttgg.mad/src/HelAmps_sm.h index 8df465ad6d..8b4ad719be 100644 --- a/epochX/cudacpp/gg_ttgg.mad/src/HelAmps_sm.h +++ b/epochX/cudacpp/gg_ttgg.mad/src/HelAmps_sm.h @@ -5,7 +5,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -28,7 +28,7 @@ //#include //#include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttgg.mad/src/Parameters_sm.cc b/epochX/cudacpp/gg_ttgg.mad/src/Parameters_sm.cc index 64fc3fea62..067445b198 100644 --- a/epochX/cudacpp/gg_ttgg.mad/src/Parameters_sm.cc +++ b/epochX/cudacpp/gg_ttgg.mad/src/Parameters_sm.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
//========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -17,7 +17,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_ttgg.mad/src/Parameters_sm.h b/epochX/cudacpp/gg_ttgg.mad/src/Parameters_sm.h index b6568d3761..9581d66e0e 100644 --- a/epochX/cudacpp/gg_ttgg.mad/src/Parameters_sm.h +++ b/epochX/cudacpp/gg_ttgg.mad/src/Parameters_sm.h @@ -27,7 +27,7 @@ #include "read_slha.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu #include // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -218,7 +218,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -239,7 +239,7 @@ namespace mg5amcCpu #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wunused-variable" // e.g. <> #pragma GCC diagnostic ignored "-Wunused-parameter" // e.g. <> -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma nv_diagnostic push #pragma nv_diag_suppress 177 // e.g. <> #endif @@ -267,7 +267,7 @@ namespace mg5amcCpu // End SM implementation - no special handling of vectors of floats as in EFT (#439) return out; } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma GCC diagnostic pop #pragma nv_diagnostic pop #endif diff --git a/epochX/cudacpp/gg_ttgg.mad/src/cudacpp_src.mk b/epochX/cudacpp/gg_ttgg.mad/src/cudacpp_src.mk index d4cc628aec..159e19a46d 100644 --- a/epochX/cudacpp/gg_ttgg.mad/src/cudacpp_src.mk +++ b/epochX/cudacpp/gg_ttgg.mad/src/cudacpp_src.mk @@ -19,7 +19,7 @@ SHELL := /bin/bash #=== Configure common compiler flags for CUDA and C++ INCFLAGS = -I. 
-OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here
+OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here

#-------------------------------------------------------------------------------
@@ -45,13 +45,13 @@ endif

#-------------------------------------------------------------------------------

-#=== Configure the CUDA compiler (note: NVCC is already exported including ccache)
+#=== Configure the CUDA compiler (note: GPUCC is already exported including ccache)

-###$(info NVCC=$(NVCC))
+###$(info GPUCC=$(GPUCC))

#-------------------------------------------------------------------------------

-#=== Configure ccache for C++ builds (note: NVCC is already exported including ccache)
+#=== Configure ccache for C++ builds (note: GPUCC is already exported including ccache)

# Enable ccache if USECCACHE=1
ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1)
@@ -92,6 +92,13 @@ endif
###$(info OMPFLAGS=$(OMPFLAGS))
CXXFLAGS += $(OMPFLAGS)

+# Add the nvcc- or hipcc-specific compile flags (nvcc needs -x cu to compile .cc files as CUDA; for hipcc plain -fPIC -c suffices)
+ifeq ($(findstring nvcc,$(GPUCC)),nvcc)
+  GPUFLAGS += -Xcompiler -fPIC -c -x cu
+else ifeq ($(findstring hipcc,$(GPUCC)),hipcc)
+  GPUFLAGS += -fPIC -c
+endif
+
# Set the build flags appropriate to each AVX choice (example: "make AVX=none")
# [NB MGONGPU_PVW512 is needed because "-mprefer-vector-width=256" is not exposed in a macro]
# [See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=96476]
@@ -253,20 +260,20 @@ $(BUILDDIR)/%.o : %.cc *.h $(BUILDDIR)/.build.$(TAG)

# Generic target and build rules: objects from CUDA compilation
$(BUILDDIR)/%_cu.o : %.cc *.h $(BUILDDIR)/.build.$(TAG)
	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@
+	$(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $< -o $@

#-------------------------------------------------------------------------------

cxx_objects=$(addprefix $(BUILDDIR)/, Parameters_sm.o read_slha.o)
-ifneq ($(NVCC),)
+ifneq ($(GPUCC),)
cu_objects=$(addprefix $(BUILDDIR)/, Parameters_sm_cu.o)
endif

# Target (and build rules): common (src) library
-ifneq ($(NVCC),)
+ifneq ($(GPUCC),)
$(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) $(cu_objects)
	@if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi
-	$(NVCC) -shared -o $@ $(cxx_objects) $(cu_objects) $(LDFLAGS)
+	$(GPUCC) -shared -o $@ $(cxx_objects) $(cu_objects) $(LDFLAGS)
else
$(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects)
	@if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi
diff --git a/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuConfig.h
index 88173dcc94..55d03f1252 100644
--- a/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuConfig.h
+++ b/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuConfig.h
@@ -1,7 +1,7 @@
// Copyright (C) 2020-2023 CERN and UCLouvain.
// Licensed under the GNU Lesser General Public License (version 3 or later).
// Created by: A. Valassi (Jul 2020) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
#ifndef MGONGPUCONFIG_H #define MGONGPUCONFIG_H 1 @@ -10,12 +10,25 @@ // There are two different code bases for standalone_cudacpp (without multichannel) and madevent+cudacpp (with multichannel) #define MGONGPU_SUPPORTS_MULTICHANNEL 1 +// Is this a GPU (CUDA, HIP) or CPU implementation? +#ifdef __CUDACC__ +#define MGONGPUCPP_GPUIMPL cuda +#elif defined __HIPCC__ +#define MGONGPUCPP_GPUIMPL hip +#else +#undef MGONGPUCPP_GPUIMPL +#endif + // ** NB1 Throughputs (e.g. 6.8E8) are events/sec for "./gcheck.exe -p 65536 128 12" // ** NB2 Baseline on b7g47n0004 fluctuates (probably depends on load on other VMs) // Choose if curand is supported for generating random numbers +// For HIP, by default, do not use curand (common random numbers will be used instead) // For both CUDA and C++, by default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_CURAND -// (there exist CUDA installations, e.g. using the HPC package, which do not include curand - see PR #784) +// (there exist CUDA installations, e.g. using the HPC package, which do not include curand - see PR #784 and #785) +#if defined __HIPCC__ +#define MGONGPU_HAS_NO_CURAND 1 +#else //#ifdef __CUDACC__ //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 @@ -23,6 +36,7 @@ //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 //#endif +#endif // Choose floating point precision (for everything but color algebra #537) // If one of these macros has been set from outside with e.g. -DMGONGPU_FPTYPE_FLOAT, nothing happens (issue #167) @@ -54,22 +68,25 @@ //#undef MGONGPU_HARDCODE_PARAM // default ////#define MGONGPU_HARDCODE_PARAM 1 -// Complex type in c++: std::complex or cxsmpl (CHOOSE ONLY ONE) -#ifndef __CUDACC__ -//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) -#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) -#endif - -// Complex type in cuda: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) +// Complex type in CUDA: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) #ifdef __CUDACC__ #define MGONGPU_CUCXTYPE_THRUST 1 // default (~1.15E9/double, ~3.2E9/float) //#define MGONGPU_CUCXTYPE_CUCOMPLEX 1 // ~10 percent slower (1.03E9/double, ~2.8E9/float) //#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) + +// Complex type in HIP: cxsmpl (ONLY ONE OPTION POSSIBLE) +#elif defined __HIPCC__ +#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) + +// Complex type in C++: std::complex or cxsmpl (CHOOSE ONLY ONE) +#else +//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) +#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif -// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ -#undef MGONGPU_NSIGHT_DEBUG // default +#undef MGONGPU_NSIGHT_DEBUG // default in CUDA //#define MGONGPU_NSIGHT_DEBUG 1 #else #undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ @@ -88,17 +105,21 @@ #error You cannot use double precision for color algebra and single precision elsewhere #endif -// SANITY CHECKS (c++ complex number implementation) -#ifndef __CUDACC__ -#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of 
MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL
+// SANITY CHECKS (CUDA complex number implementation)
+#ifdef __CUDACC__
+#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX
+#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX for CUDA
+#elif defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CXSMPL
+#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CXSMPL for CUDA
+#elif defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL
+#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL for CUDA
#endif
#endif

-// SANITY CHECKS (cuda complex number implementation)
-#ifdef __CUDACC__
-#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL
-#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL
+// SANITY CHECKS (C++ complex number implementation)
+#ifndef MGONGPUCPP_GPUIMPL
+#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL
+#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL for C++
#endif
#endif

@@ -136,7 +157,7 @@ namespace mgOnGpu
// Alignment requirement for using reinterpret_cast with SIMD vectorized code
// (using reinterpret_cast with non aligned memory may lead to segmentation faults!)
// Only needed for C++ code but can be enforced also in NVCC builds of C++ code using CUDA>=11.2 and C++17 (#318, #319, #333)
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
constexpr int cppAlign = 64; // alignment requirement for SIMD vectorization (64-byte i.e. 512-bit)
#endif

@@ -147,7 +168,7 @@ using mgOnGpu::fptype;
using mgOnGpu::fptype2;

// C++ SIMD vectorization width (this will be used to set neppV)
-#ifdef __CUDACC__ // CUDA implementation has no SIMD
+#ifdef MGONGPUCPP_GPUIMPL // CUDA and HIP implementations have no SIMD
#undef MGONGPU_CPPSIMD
#elif defined __AVX512VL__ && defined MGONGPU_PVW512 // C++ "512z" AVX512 with 512 width (512-bit ie 64-byte): 8 (DOUBLE) or 16 (FLOAT)
#ifdef MGONGPU_FPTYPE_DOUBLE
@@ -177,9 +198,9 @@ using mgOnGpu::fptype2;
#undef MGONGPU_CPPSIMD
#endif

-// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation
+// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation
// Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end)
#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */
#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX];
#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; }
#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; }
@@ -191,8 +212,8 @@ using mgOnGpu::fptype2;
#define mgDebugFinalise() { /*noop*/ }
#endif /* clang-format on */

-// Define empty CUDA declaration specifiers for C++
-#ifndef __CUDACC__
+// Define empty CUDA/HIP declaration specifiers for C++
+#ifndef MGONGPUCPP_GPUIMPL
#define __global__
#define __host__
#define __device__
diff --git a/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuCxtypes.h b/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuCxtypes.h
index ca9a9f00c0..5532e22fa1 100644
--- a/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuCxtypes.h
+++ b/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuCxtypes.h
@@ -1,7 +1,7 @@
// Copyright (C) 2020-2023 CERN
and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022, based on earlier work by D. Smith) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUCXTYPES_H #define MGONGPUCXTYPES_H 1 @@ -19,7 +19,7 @@ #include // Complex type in cuda: thrust or cucomplex or cxsmpl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #if defined MGONGPU_CUCXTYPE_THRUST #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wtautological-compare" // for icpx2021/clang13 (https://stackoverflow.com/a/15864661) @@ -82,7 +82,7 @@ namespace mgOnGpu /* clang-format off */ using mgOnGpu::cxsmpl; // Printout to stream for user defined types -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -92,7 +92,7 @@ namespace mg5amcCpu inline __host__ std::ostream& operator<<( std::ostream& out, const cxsmpl& c ) { - out << std::complex( c.real(), c.imag() ); + out << std::complex( c.real(), c.imag() ); return out; } @@ -215,14 +215,14 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu #endif { // --- Type definitions (complex type: cxtype) -#ifdef __CUDACC__ // cuda +#ifdef MGONGPUCPP_GPUIMPL // cuda #if defined MGONGPU_CUCXTYPE_THRUST typedef thrust::complex cxtype; #elif defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -255,7 +255,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -307,7 +307,7 @@ namespace mg5amcCpu //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust +#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust //------------------------------ // CUDA - using thrust::complex @@ -343,11 +343,11 @@ namespace mg5amcCpu return c; } -#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST +#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex +#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex //------------------------------ // CUDA - using cuComplex @@ -562,11 +562,11 @@ namespace mg5amcCpu return cxmake( c.real(), c.imag() ); } -#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX //========================================================================== -#if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex +#if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex //------------------------------ // C++ - using std::complex @@ -610,7 +610,7 @@ namespace mg5amcCpu } #endif 
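// For orientation, the complex-type selection implemented by this header after
// the patch can be summarised as follows (a condensed sketch, not the verbatim
// code above and below):
//   #ifdef MGONGPUCPP_GPUIMPL                      // CUDA or HIP build
//     MGONGPU_CUCXTYPE_THRUST    -> cxtype = thrust::complex<fptype>   (CUDA only)
//     MGONGPU_CUCXTYPE_CUCOMPLEX -> cxtype based on cuComplex          (CUDA only)
//     MGONGPU_CUCXTYPE_CXSMPL    -> cxtype = mgOnGpu::cxsmpl<fptype>   (the only option on HIP)
//   #else                                          // C++ build
//     MGONGPU_CPPCXTYPE_STDCOMPLEX -> cxtype = std::complex<fptype>
//     MGONGPU_CPPCXTYPE_CXSMPL     -> cxtype = mgOnGpu::cxsmpl<fptype> (the default)
//   #endif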
-#endif // #if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX +#endif // #if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX //========================================================================== @@ -633,7 +633,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuFptypes.h b/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuFptypes.h index 905c97d700..fa3a02664b 100644 --- a/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuFptypes.h +++ b/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuFptypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUFPTYPES_H #define MGONGPUFPTYPES_H 1 @@ -12,7 +12,7 @@ #include // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // cuda namespace mg5amcGpu #else namespace mg5amcCpu @@ -20,7 +20,7 @@ namespace mg5amcCpu { //========================================================================== -#ifdef __CUDACC__ // cuda +#ifdef MGONGPUCPP_GPUIMPL // cuda //------------------------------ // Floating point types - Cuda @@ -64,11 +64,11 @@ namespace mg5amcCpu #endif } -#endif // #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //------------------------------ // Floating point types - C++ @@ -92,7 +92,7 @@ namespace mg5amcCpu return std::sqrt( f ); } -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== diff --git a/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuVectors.h b/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuVectors.h index e91f5927d6..cdae04326b 100644 --- a/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuVectors.h @@ -9,8 +9,6 @@ #include "mgOnGpuCxtypes.h" #include "mgOnGpuFptypes.h" -#include "GpuAbstraction.h" - #include //========================================================================== @@ -34,7 +32,7 @@ #endif // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -133,7 +131,7 @@ namespace mg5amcCpu #endif #endif -#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef __CUDACC__) +#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef MGONGPUCPP_GPUIMPL) const int neppV = 1; @@ -155,13 +153,13 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace 
mg5amcCpu #endif { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Printout to stream for user defined types @@ -807,11 +805,11 @@ namespace mg5amcCpu #endif // #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //------------------------------ // Vector types - CUDA @@ -855,12 +853,12 @@ namespace mg5amcCpu return mask; } -#endif // #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== // Scalar-or-vector types: scalar in CUDA, vector or scalar in C++ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL typedef bool bool_sv; typedef fptype fptype_sv; typedef fptype2 fptype2_sv; @@ -881,7 +879,7 @@ namespace mg5amcCpu #endif // Scalar-or-vector zeros: scalar in CUDA, vector or scalar in C++ -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ inline __host__ __device__ cxtype cxzero_sv(){ return cxtype( 0, 0 ); } #elif defined MGONGPU_CPPSIMD inline cxtype_v cxzero_sv() { return cxtype_v(); } // RRRR=0000 IIII=0000 diff --git a/epochX/cudacpp/gg_ttgg.mad/src/rambo.h b/epochX/cudacpp/gg_ttgg.mad/src/rambo.h index e02ea52496..cd7e1008ea 100644 --- a/epochX/cudacpp/gg_ttgg.mad/src/rambo.h +++ b/epochX/cudacpp/gg_ttgg.mad/src/rambo.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -18,7 +18,7 @@ #include // Simplified rambo version for 2 to N (with N>=2) processes with massless particles -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +83,7 @@ namespace mg5amcCpu static bool first = true; if( first ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if constexpr( M_ACCESS::isOnDevice() ) // avoid { const int ievt0 = 0; @@ -166,7 +166,7 @@ namespace mg5amcCpu wt = po2log; if( nparf != 2 ) wt = ( 2. * nparf - 4. ) * log( energy ) + z[nparf - 1]; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // issue warnings if weight is too small or too large static int iwarn[5] = { 0, 0, 0, 0, 0 }; if( wt < -180. ) diff --git a/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt b/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt index 3c3686e228..1b6c420503 100644 --- a/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt +++ b/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt @@ -52,7 +52,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt -No valid web browser found. Please set in ./input/mg5_configuration.txt +Using default web browser "firefox". 
Set another one in ./input/mg5_configuration.txt import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg.mg The import format was not given, so we guess it as command set stdout_level DEBUG @@ -62,7 +62,7 @@ generate g g > t t~ g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005596637725830078  +DEBUG: model prefixing takes 0.005376100540161133  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,7 +155,7 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=4: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Process has 123 diagrams -1 processes with 123 diagrams generated in 0.166 s +1 processes with 123 diagrams generated in 0.156 s Total: 1 processes with 123 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gg_ttgg Load PLUGIN.CUDACPP_OUTPUT @@ -175,7 +175,7 @@ INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TM FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg/./CPPProcess.h FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg/./CPPProcess.cc INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg/. -Generated helas calls for 1 subprocesses (123 diagrams) in 0.442 s +Generated helas calls for 1 subprocesses (123 diagrams) in 0.427 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines @@ -183,7 +183,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.337 s +ALOHA: aloha creates 5 routines in 0.319 s VVV1 VVV1 FFV1 @@ -206,7 +206,7 @@ INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/. quit -real 0m1.506s -user 0m1.438s -sys 0m0.059s -Code generation completed in 2 seconds +real 0m1.461s +user 0m1.381s +sys 0m0.050s +Code generation completed in 1 seconds diff --git a/epochX/cudacpp/gg_ttgg.sa/COPYRIGHT b/epochX/cudacpp/gg_ttgg.sa/COPYRIGHT index a134b5fef9..84a883fbb0 100644 --- a/epochX/cudacpp/gg_ttgg.sa/COPYRIGHT +++ b/epochX/cudacpp/gg_ttgg.sa/COPYRIGHT @@ -15,6 +15,7 @@ The full development team currently includes the following authors : Stephan Hageboeck (CERN) Olivier Mattelaer (Universite Catholique de Louvain, original author) Stefan Roiser (CERN, original author) + Joergen Teig (CERN) Andrea Valassi (CERN, original author) Zenny Wettersten (CERN) See https://github.com/madgraph5/madgraph4gpu for more details. 
For the full diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/Bridge.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/Bridge.h index bf8b5e024d..89437b4c42 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/Bridge.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef BRIDGE_H #define BRIDGE_H 1 @@ -23,7 +23,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +83,7 @@ namespace mg5amcCpu Bridge& operator=( const Bridge& ) = delete; Bridge& operator=( Bridge&& ) = delete; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL /** * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != gpublocks*gputhreads * (this is needed for BridgeKernel tests rather than for actual production use in Fortran) @@ -150,7 +150,7 @@ namespace mg5amcCpu unsigned int m_nevt; // number of events int m_nGoodHel; // the number of good helicities (-1 initially when they have not yet been calculated) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL int m_gputhreads; // number of gpu threads (default set from number of events, can be modified) int m_gpublocks; // number of gpu blocks (default set from number of events, can be modified) DeviceBuffer m_devMomentaF; @@ -187,12 +187,12 @@ namespace mg5amcCpu // Forward declare transposition methods // -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL template void hst_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); @@ -209,7 +209,7 @@ namespace mg5amcCpu Bridge::Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ) : m_nevt( nevtF ) , m_nGoodHel( -1 ) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL , m_gputhreads( 256 ) // default number of gpu threads , m_gpublocks( m_nevt / m_gputhreads ) // this ensures m_nevt <= m_gpublocks*m_gputhreads , m_devMomentaF( m_nevt ) @@ -233,7 +233,7 @@ namespace mg5amcCpu { if( nparF != CPPProcess::npar ) throw std::runtime_error( "Bridge constructor: npar mismatch" ); if( np4F != CPPProcess::np4 ) throw std::runtime_error( "Bridge constructor: np4 mismatch" ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( ( m_nevt < s_gputhreadsmin ) || ( m_nevt % s_gputhreadsmin != 0 ) ) throw std::runtime_error( "Bridge constructor: nevt should be a multiple of " + std::to_string( s_gputhreadsmin ) ); while( m_nevt != m_gpublocks * m_gputhreads ) @@ -249,7 +249,7 @@ namespace mg5amcCpu #else std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate is called from several Fortran threads? 
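// GpuAbstraction.h itself is not shown in this part of the patch; a minimal
// sketch of the mapping it must provide, consistent with the gpuMemcpy and
// gpuLaunchKernel call sites in the gpu_sequence hunks below (the macro names
// come from those call sites, the bodies are assumed for illustration only):
//
//   #ifdef __CUDACC__
//   #define gpuMemcpyHostToDevice cudaMemcpyHostToDevice
//   #define gpuMemcpy( dst, src, bytes, dir ) checkGpu( cudaMemcpy( dst, src, bytes, dir ) )
//   #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<( blocks ), ( threads )>>>( __VA_ARGS__ )
//   #elif defined __HIPCC__
//   #define gpuMemcpyHostToDevice hipMemcpyHostToDevice
//   #define gpuMemcpy( dst, src, bytes, dir ) checkGpu( hipMemcpy( dst, src, bytes, dir ) )
//   #define gpuLaunchKernel( kernel, blocks, threads, ... ) hipLaunchKernelGGL( kernel, dim3( blocks ), dim3( threads ), 0, 0, __VA_ARGS__ )
//   #endif
//
// where checkGpu stands in for the error-checking wrapper (hypothetical name;
// the real header may handle return codes differently).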
@@ -262,7 +262,7 @@ namespace mg5amcCpu process.initProc( paramCard ); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void Bridge::set_gpugrid( const int gpublocks, const int gputhreads ) { @@ -276,7 +276,7 @@ namespace mg5amcCpu } #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void Bridge::gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -291,14 +291,14 @@ namespace mg5amcCpu constexpr int neppM = MemoryAccessMomenta::neppM; if constexpr( neppM == 1 && std::is_same_v ) { - checkCuda( cudaMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), gpuMemcpyHostToDevice ); } else { - checkCuda( cudaMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), gpuMemcpyHostToDevice ); const int thrPerEvt = CPPProcess::npar * CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 event per thread) //const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... this seems slower - dev_transposeMomentaF2C<<>>( m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); + gpuLaunchKernel( dev_transposeMomentaF2C, m_gpublocks * thrPerEvt, m_gputhreads, m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); } if constexpr( std::is_same_v ) { @@ -341,7 +341,7 @@ namespace mg5amcCpu } #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL template void Bridge::cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -396,7 +396,7 @@ namespace mg5amcCpu // - C++ array: momenta[npagM][npar][np4][neppM] with nevt=npagM*neppM (AOSOA) // -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ) { diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/BridgeKernels.cc b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/BridgeKernels.cc index d58066c9c1..eaf4037a24 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/BridgeKernels.cc +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/BridgeKernels.cc @@ -1,17 +1,18 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "BridgeKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMomenta.h" #include //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -45,7 +46,7 @@ namespace mg5amcCpu //============================================================================ -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu { @@ -96,7 +97,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/BridgeKernels.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/BridgeKernels.h index 15eb4bff4d..3efef8ce97 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/BridgeKernels.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/BridgeKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. 
// Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef BRIDGEKERNELS_H #define BRIDGEKERNELS_H 1 @@ -12,7 +12,7 @@ #include "MatrixElementKernels.h" #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -49,7 +49,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a CPU host class BridgeKernelHost final : public BridgeKernelBase { @@ -89,7 +89,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a GPU device class BridgeKernelDevice : public BridgeKernelBase { diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/CommonRandomNumberKernel.cc b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/CommonRandomNumberKernel.cc index 985b39f576..010bc4cbd0 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/CommonRandomNumberKernel.cc +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/CommonRandomNumberKernel.cc @@ -1,15 +1,16 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "CommonRandomNumbers.h" +#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/CrossSectionKernels.cc b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/CrossSectionKernels.cc index 0b355a3c8d..c15b39844d 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/CrossSectionKernels.cc +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/CrossSectionKernels.cc @@ -1,10 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
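// Every file touched above swaps the same guard in the same place: the sources are
// compiled twice from a single tree, once into namespace mg5amcGpu (GPU) and once into
// namespace mg5amcCpu (C++/SIMD host), and the selector changes from the CUDA-only
// __CUDACC__ to the backend-neutral MGONGPUCPP_GPUIMPL. A minimal sketch of this
// recurring pattern (illustrative only; fooKernelLauncher is a hypothetical name):

#ifdef MGONGPUCPP_GPUIMPL
namespace mg5amcGpu // GPU build: CUDA or HIP
#else
namespace mg5amcCpu // host build: C++ with optional SIMD vectorization
#endif
{
  // NB: the same type names get different definitions in the two namespaces (see #318 and #725)
  void fooKernelLauncher();
}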
#include "CrossSectionKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessWeights.h" #include "MemoryBuffers.h" @@ -77,7 +78,7 @@ debug_me_is_abnormal( const fptype& me, size_t ievtALL ) //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -185,7 +186,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/CrossSectionKernels.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/CrossSectionKernels.h index 7933ca4bbf..4d9659e04e 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/CrossSectionKernels.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/CrossSectionKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef CROSSSECTIONKERNELS_H #define CROSSSECTIONKERNELS_H 1 @@ -13,7 +13,7 @@ //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -96,7 +96,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating the calculation of event statistics on a GPU device class CrossSectionKernelDevice : public CrossSectionKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/CudaRuntime.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/CudaRuntime.h deleted file mode 100644 index 64ce52f4b3..0000000000 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/CudaRuntime.h +++ /dev/null @@ -1,85 +0,0 @@ -// Copyright (C) 2020-2023 CERN and UCLouvain. -// Licensed under the GNU Lesser General Public License (version 3 or later). -// Created by: S. Roiser (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. - -#ifndef MG5AMC_CUDARUNTIME_H -#define MG5AMC_CUDARUNTIME_H 1 - -// MG5AMC on GPU uses the CUDA runtime API, not the lower level CUDA driver API -// See https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html#driver-vs-runtime-api - -#include -#include - -//-------------------------------------------------------------------------- - -// See https://stackoverflow.com/a/14038590 -#ifdef __CUDACC__ /* clang-format off */ -#define checkCuda( code ) { assertCuda( code, __FILE__, __LINE__ ); } -inline void assertCuda( cudaError_t code, const char* file, int line, bool abort = true ) -{ - if( code != cudaSuccess ) - { - printf( "ERROR! 
assertCuda: '%s' (%d) in %s:%d\n", cudaGetErrorString( code ), code, file, line ); - if( abort ) assert( code == cudaSuccess ); - } -} -#endif /* clang-format on */ - -//-------------------------------------------------------------------------- - -#ifdef __CUDACC__ -namespace mg5amcGpu -{ - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - // *** FIXME! This will all need to be designed differently when going to multi-GPU nodes! *** - struct CudaRuntime final - { - CudaRuntime( const bool debug = true ) - : m_debug( debug ) { setUp( m_debug ); } - ~CudaRuntime() { tearDown( m_debug ); } - CudaRuntime( const CudaRuntime& ) = delete; - CudaRuntime( CudaRuntime&& ) = delete; - CudaRuntime& operator=( const CudaRuntime& ) = delete; - CudaRuntime& operator=( CudaRuntime&& ) = delete; - bool m_debug; - - // Set up CUDA application - // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** - // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization - static void setUp( const bool debug = true ) - { - // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization - // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! - /* - // [We initially added cudaFree(0) to "ease profile analysis" only because it shows up as a big recognizable block!] - // No explicit initialization is needed: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#initialization - // It is not clear what cudaFree(0) does at all: https://stackoverflow.com/questions/69967813/ - if ( debug ) std::cout << "__CudaRuntime: calling cudaFree(0)" << std::endl; - checkCuda( cudaFree( 0 ) ); // SLOW! - */ - // Replace cudaFree(0) by cudaSetDevice(0), even if it is not really needed either - // (but see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs) - if( debug ) std::cout << "__CudaRuntime: calling cudaSetDevice(0)" << std::endl; - checkCuda( cudaSetDevice( 0 ) ); // SLOW! - } - - // Tear down CUDA application (call cudaDeviceReset) - // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** - // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck - // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking - static void tearDown( const bool debug = true ) - { - if( debug ) std::cout << "__CudaRuntime: calling cudaDeviceReset()" << std::endl; - checkCuda( cudaDeviceReset() ); - } - }; - -} -#endif - -//-------------------------------------------------------------------------- - -#endif // MG5AMC_CUDARUNTIME_H diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/CurandRandomNumberKernel.cc b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/CurandRandomNumberKernel.cc index eb56333b03..08a16f6f2c 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/CurandRandomNumberKernel.cc +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/CurandRandomNumberKernel.cc @@ -1,9 +1,9 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. 
Valassi (2021-2023) for the MG5aMC CUDACPP plugin. -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" @@ -22,7 +22,7 @@ inline void assertCurand( curandStatus_t code, const char *file, int line, bool } #endif /* clang-format on */ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -36,7 +36,7 @@ namespace mg5amcCpu { if( m_isOnDevice ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !m_rnarray.isOnDevice() ) throw std::runtime_error( "CurandRandomNumberKernel on device with a host random number array" ); #else @@ -114,7 +114,7 @@ namespace mg5amcCpu /* printf( "\nCurandRandomNumberKernel::generateRnarray size = %d\n", (int)m_rnarray.size() ); fptype* data = m_rnarray.data(); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( m_rnarray.isOnDevice() ) { data = new fptype[m_rnarray.size()](); @@ -123,7 +123,7 @@ namespace mg5amcCpu #endif for( int i = 0; i < ( (int)m_rnarray.size() / 4 ); i++ ) printf( "[%4d] %f %f %f %f\n", i * 4, data[i * 4], data[i * 4 + 2], data[i * 4 + 2], data[i * 4 + 3] ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( m_rnarray.isOnDevice() ) delete[] data; #endif */ diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/EventStatistics.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/EventStatistics.h index 48b51e0a49..b425a5bade 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/EventStatistics.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/EventStatistics.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef EventStatistics_H #define EventStatistics_H 1 @@ -16,7 +16,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/GpuAbstraction.h new file mode 100644 index 0000000000..6a7d9c05c0 --- /dev/null +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/GpuAbstraction.h @@ -0,0 +1,71 @@ +// Copyright (C) 2020-2023 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: J. Teig (Jul 2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
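// NB: MGONGPUCPP_GPUIMPL itself is not defined in GpuAbstraction.h below; per the
// diffstat it comes from the modified mgOnGpuConfig.h, which is not shown in this
// excerpt. The following is a plausible sketch of that definition, consistent with
// every hunk above (an assumption for illustration, not a quote from the patch):

#if defined __CUDACC__ || defined __HIPCC__
#define MGONGPUCPP_GPUIMPL 1 // GPU build: the gpu* macros resolve to CUDA or HIP
#endif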
+ +#ifndef MG5AMC_GPUABSTRACTION_H +#define MG5AMC_GPUABSTRACTION_H 1 + +#include + +//-------------------------------------------------------------------------- + +#ifdef __CUDACC__ + +#define gpuError_t cudaError_t +#define gpuPeekAtLastError cudaPeekAtLastError +#define gpuGetErrorString cudaGetErrorString +#define gpuSuccess cudaSuccess + +#define gpuMallocHost( ptr, size ) checkGpu( cudaMallocHost( ptr, size ) ) +#define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) ) + +#define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( cudaMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemcpyHostToDevice cudaMemcpyHostToDevice +#define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost +#define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( cudaMemcpyToSymbol( type1, type2, size ) ) + +#define gpuFree( ptr ) checkGpu( cudaFree( ptr ) ) +#define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) ) + +#define gpuSetDevice cudaSetDevice +#define gpuDeviceSynchronize cudaDeviceSynchronize +#define gpuDeviceReset cudaDeviceReset + +#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ ) +#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ ) + +//-------------------------------------------------------------------------- + +#elif defined __HIPCC__ + +#include "hip/hip_runtime.h" + +#define gpuError_t hipError_t +#define gpuPeekAtLastError hipPeekAtLastError +#define gpuGetErrorString hipGetErrorString +#define gpuSuccess hipSuccess + +#define gpuMallocHost( ptr, size ) checkGpu( hipHostMalloc( ptr, size ) ) // HostMalloc better +#define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) ) + +#define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( hipMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemcpyHostToDevice hipMemcpyHostToDevice +#define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost +#define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( hipMemcpyToSymbol( type1, type2, size ) ) + +#define gpuFree( ptr ) checkGpu( hipFree( ptr ) ) +#define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) ) + +#define gpuSetDevice hipSetDevice +#define gpuDeviceSynchronize hipDeviceSynchronize +#define gpuDeviceReset hipDeviceReset + +#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ ) +#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ ) + +//-------------------------------------------------------------------------- + +#endif + +#endif // MG5AMC_GPUABSTRACTION_H diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/GpuRuntime.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/GpuRuntime.h new file mode 100644 index 0000000000..93579ef08b --- /dev/null +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/GpuRuntime.h @@ -0,0 +1,85 @@ +// Copyright (C) 2020-2023 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: J. Teig (Jun 2023, based on earlier work by S. Roiser) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
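// What one abstracted call expands to under each backend (a sketch mirroring the macro
// table above; pinned-memory allocation is where the two runtime APIs diverge most in
// naming, cudaMallocHost vs hipHostMalloc, hence the "HostMalloc better" note). The
// function pinnedExample is illustrative only, not part of the patch:

#include "GpuRuntime.h" // for checkGpu, which the gpu* macros expand into
#ifdef MGONGPUCPP_GPUIMPL
void pinnedExample()
{
  double* p = nullptr;
  gpuMallocHost( &p, 1024 * sizeof( double ) ); // nvcc: checkGpu( cudaMallocHost( &p, ... ) ); hipcc: checkGpu( hipHostMalloc( &p, ... ) )
  gpuFreeHost( p );                             // nvcc: cudaFreeHost; hipcc: hipHostFree
}
#endif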
+ +#ifndef MG5AMC_GPURUNTIME_H +#define MG5AMC_GPURUNTIME_H 1 + +// MG5AMC on GPU uses the CUDA runtime API, not the lower level CUDA driver API +// See https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html#driver-vs-runtime-api + +#include "GpuAbstraction.h" + +#include + +//-------------------------------------------------------------------------- + +// See https://stackoverflow.com/a/14038590 +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#define checkGpu( code ) { assertGpu( code, __FILE__, __LINE__ ); } +inline void assertGpu( gpuError_t code, const char* file, int line, bool abort = true ) +{ + if( code != gpuSuccess ) + { + printf( "ERROR! assertGpu: '%s' (%d) in %s:%d\n", gpuGetErrorString( code ), code, file, line ); + if( abort ) assert( code == gpuSuccess ); + } +} +#endif /* clang-format on */ + +//-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +{ + // Instantiate a GpuRuntime at the beginnining of the application's main to + // invoke gpuSetDevice(0) in the constructor and book a gpuDeviceReset() call in the destructor + // *** FIXME! This will all need to be designed differently when going to multi-GPU nodes! *** + struct GpuRuntime final + { + GpuRuntime( const bool debug = true ) + : m_debug( debug ) { setUp( m_debug ); } + ~GpuRuntime() { tearDown( m_debug ); } + GpuRuntime( const GpuRuntime& ) = delete; + GpuRuntime( GpuRuntime&& ) = delete; + GpuRuntime& operator=( const GpuRuntime& ) = delete; + GpuRuntime& operator=( GpuRuntime&& ) = delete; + bool m_debug; + + // Set up CUDA application + // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** + // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization + static void setUp( const bool debug = true ) + { + // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization + // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! + /* + // [We initially added cudaFree(0) to "ease profile analysis" only because it shows up as a big recognizable block!] + // No explicit initialization is needed: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#initialization + // It is not clear what cudaFree(0) does at all: https://stackoverflow.com/questions/69967813/ + if ( debug ) std::cout << "__CudaRuntime: calling cudaFree(0)" << std::endl; + checkCuda( cudaFree( 0 ) ); // SLOW! + */ + // Replace cudaFree(0) by cudaSetDevice(0), even if it is not really needed either + // (but see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs) + if( debug ) std::cout << "__GpuRuntime: calling GpuSetDevice(0)" << std::endl; + checkGpu( gpuSetDevice( 0 ) ); // SLOW! 
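+ // (Illustrative aside: with this setUp/tearDown pair, client applications get GPU
+ // initialization via simple RAII, e.g. in check_sa.cc further below:
+ //   GpuRuntime gpuRuntime( debug ); // ctor calls setUp -> gpuSetDevice(0)
+ //   ...                            // dtor calls tearDown -> gpuDeviceReset()
+ // so no explicit cudaSetDevice/hipSetDevice call site survives outside this header,
+ // and any failing gpu* call aborts through assertGpu with file/line context.)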
+ } + + // Tear down CUDA application (call cudaDeviceReset) + // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** + // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck + // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking + static void tearDown( const bool debug = true ) + { + if( debug ) std::cout << "__GpuRuntime: calling GpuDeviceReset()" << std::endl; + checkGpu( gpuDeviceReset() ); + } + }; +} +#endif + +//-------------------------------------------------------------------------- + +#endif // MG5AMC_GPURUNTIME_H diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MadgraphTest.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MadgraphTest.h index ef40624c88..a64c05c26a 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MadgraphTest.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MadgraphTest.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Dec 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #ifndef MADGRAPHTEST_H_ #define MADGRAPHTEST_H_ 1 @@ -22,7 +22,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; @@ -201,7 +201,7 @@ class MadgraphTest : public testing::TestWithParam // Since we link both the CPU-only and GPU tests into the same executable, we prevent // a multiply defined symbol by only compiling this in the non-CUDA phase: -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL /// Compare momenta and matrix elements. /// This uses an implementation of TestDriverBase to run a madgraph workflow, @@ -307,6 +307,6 @@ TEST_P( MadgraphTest, CompareMomentaAndME ) } } -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL #endif /* MADGRAPHTEST_H_ */ diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MatrixElementKernels.cc index 74b5239ebf..81699dfea9 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MatrixElementKernels.cc @@ -1,12 +1,12 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "MatrixElementKernels.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" // Includes the abstraction for Nvidia/AMD compilation #include "MemoryAccessMomenta.h" #include "MemoryBuffers.h" @@ -14,7 +14,7 @@ //============================================================================ -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu { @@ -150,7 +150,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { @@ -209,13 +209,13 @@ namespace mg5amcGpu PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); DeviceBufferHelicityMask devIsGoodHel( ncomb ); // ... 0d1. 
Compute good helicity mask on the device - computeDependentCouplings<<<m_gpublocks, m_gputhreads>>>( m_gs.data(), m_couplings.data() ); + gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - sigmaKin_getGoodHel<<<m_gpublocks, m_gputhreads>>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); + gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); #else - sigmaKin_getGoodHel<<<m_gpublocks, m_gputhreads>>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); + gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); #endif - checkCuda( cudaPeekAtLastError() ); + checkGpu( gpuPeekAtLastError() ); // ... 0d2. Copy back good helicity mask to the host copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); // ... 0d3. Copy back good helicity list to constant memory on the device @@ -226,19 +226,19 @@ namespace mg5amcGpu void MatrixElementKernelDevice::computeMatrixElements( const unsigned int channelId ) { - computeDependentCouplings<<<m_gpublocks, m_gputhreads>>>( m_gs.data(), m_couplings.data() ); + gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); #ifndef MGONGPU_NSIGHT_DEBUG constexpr unsigned int sharedMemSize = 0; #else constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - sigmaKin<<<m_gpublocks, m_gputhreads, sharedMemSize>>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); + gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); #else - sigmaKin<<<m_gpublocks, m_gputhreads, sharedMemSize>>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); + gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); #endif - checkCuda( cudaPeekAtLastError() ); - checkCuda( cudaDeviceSynchronize() ); + checkGpu( gpuPeekAtLastError() ); + checkGpu( gpuDeviceSynchronize() ); } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MatrixElementKernels.h index 23e84757a2..72bd8f195b 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MatrixElementKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
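// The computeMatrixElements hunk above exercises both launch forms of the abstraction:
// gpuLaunchKernel( k, blocks, threads, args... ) and
// gpuLaunchKernelSharedMem( k, blocks, threads, nbytes, args... ), the latter forwarding
// nbytes as the dynamic shared memory size, i.e. k<<<blocks, threads, nbytes>>>.
// A minimal illustrative sketch with a hypothetical one-block reduction kernel k_sum
// (not part of the patch):

#include "GpuRuntime.h"
#ifdef MGONGPUCPP_GPUIMPL

__global__ void k_sum( const double* in, double* out ) // launched with a single block
{
  extern __shared__ double buf[]; // dynamic shared memory, sized at launch time
  buf[threadIdx.x] = in[threadIdx.x];
  __syncthreads();
  if( threadIdx.x == 0 )
  {
    double s = 0;
    for( unsigned int i = 0; i < blockDim.x; i++ ) s += buf[i];
    *out = s;
  }
}

void sumOnDevice( const double* devIn, double* devOut, const int nthreads )
{
  gpuLaunchKernelSharedMem( k_sum, 1, nthreads, nthreads * sizeof( double ), devIn, devOut );
  checkGpu( gpuPeekAtLastError() );   // same error-checking idiom as in the hunk above
  checkGpu( gpuDeviceSynchronize() );
}

#endif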
#ifndef MATRIXELEMENTKERNELS_H #define MATRIXELEMENTKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -81,7 +81,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a CPU host class MatrixElementKernelHost final : public MatrixElementKernelBase, public NumberOfEvents { @@ -130,7 +130,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a GPU device class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessAmplitudes.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessAmplitudes.h index 573b3bbbc9..ffb76e93de 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessAmplitudes.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessAmplitudes.h @@ -15,7 +15,7 @@ #define MGONGPU_TRIVIAL_AMPLITUDES 1 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessCouplings.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessCouplings.h index 35a3af42e0..3afdf3e554 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessCouplings.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessCouplings.h @@ -15,7 +15,7 @@ #include "MemoryBuffers.h" // for HostBufferCouplings::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessCouplingsFixed.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessCouplingsFixed.h index dc0d93afff..ffcdf4dbef 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessCouplingsFixed.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessCouplingsFixed.h @@ -14,7 +14,7 @@ //#include "MemoryAccessHelpers.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessDenominators.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessDenominators.h index 3bce635718..66f2d32a6b 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessDenominators.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessDenominators.h @@ -10,7 +10,7 @@ #include "MemoryAccessGs.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessGs.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessGs.h index 31311aa375..4c726b30f3 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessGs.h +++ 
b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessGs.h @@ -13,7 +13,7 @@ #include "MemoryBuffers.h" // for HostBufferMatrixElements::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessHelpers.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessHelpers.h index c82a6c7635..db73e4e064 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessHelpers.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessHelpers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessHelpers_H #define MemoryAccessHelpers_H 1 @@ -105,7 +105,7 @@ class KernelAccessHelper : public MemoryAccessHelper } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid //printf( "kernelAccessRecord: ievt=%d threadId=%d\n", ievt, threadIdx.x ); return T::ieventAccessRecord( buffer, ievt ); // NB fptype and fptype_sv coincide for CUDA diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessMatrixElements.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessMatrixElements.h index f32e6fea5b..3741011971 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessMatrixElements.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessMatrixElements.h @@ -13,7 +13,7 @@ #include "MemoryBuffers.h" // for HostBufferMatrixElements::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessMomenta.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessMomenta.h index 29266de32c..3be229d392 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessMomenta.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessMomenta.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#ifndef MemoryAccessMomenta_H #define MemoryAccessMomenta_H 1 @@ -13,7 +13,7 @@ #include "MemoryAccessVectors.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -30,7 +30,7 @@ namespace mg5amcCpu // Number of Events Per Page in the momenta AOSOA memory buffer layout // (these are all best kept as a compile-time constants: see issue #23) -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ // ----------------------------------------------------------------------------------------------- // --- GPUs: neppM is best set to a power of 2 times the number of fptype's in a 32-byte cacheline // --- This is relevant to ensure coalesced access to momenta in global memory diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessNumerators.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessNumerators.h index b152183b28..18991f4fa6 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessNumerators.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessNumerators.h @@ -10,7 +10,7 @@ #include "MemoryAccessGs.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessRandomNumbers.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessRandomNumbers.h index e2988d39f3..40cb089135 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessRandomNumbers.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessRandomNumbers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessRandomNumbers_H #define MemoryAccessRandomNumbers_H 1 @@ -11,7 +11,7 @@ #include "CPPProcess.h" #include "MemoryAccessHelpers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessVectors.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessVectors.h index e9b197368e..08faccff0f 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessVectors.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessVectors.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
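// The neppM discussion above concerns the AOSOA layout that all these accessor classes
// hide: momenta are stored as momenta[npagM][npar][np4][neppM] with nevt = npagM * neppM
// (see the Bridge.h transposition comment earlier in this patch). An illustrative index
// computation, NOT the real accessor code:

#include <cstddef>

inline std::size_t momentumIndex( std::size_t ievt, std::size_t ipar, std::size_t ip4,
                                  std::size_t npar, std::size_t np4, std::size_t neppM )
{
  const std::size_t ipagM = ievt / neppM; // page: each page packs neppM events contiguously
  const std::size_t ieppM = ievt % neppM; // position of this event within its page
  return ( ( ipagM * npar + ipar ) * np4 + ip4 ) * neppM + ieppM;
}

// With neppM a power of 2 matching the 32-byte cacheline (GPU) or the SIMD vector width
// (C++), consecutive events hit consecutive fptype's: coalesced GPU loads and
// vectorizable CPU loops fall out of the same layout.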
#ifndef MemoryAccessVectors_H #define MemoryAccessVectors_H 1 @@ -10,7 +10,7 @@ #include "mgOnGpuVectors.h" -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu // this is only needed for CPU SIMD vectorization { diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessWavefunctions.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessWavefunctions.h index 5428aaf933..33bef4559e 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessWavefunctions.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessWavefunctions.h @@ -15,7 +15,7 @@ #define MGONGPU_TRIVIAL_WAVEFUNCTIONS 1 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryBuffers.h index 3093e6ed18..7756a71621 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryBuffers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021, based on earlier work by S. Hageboeck) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryBuffers_H #define MemoryBuffers_H 1 @@ -11,12 +11,12 @@ #include "mgOnGpuCxtypes.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "Parameters_sm.h" #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -87,7 +87,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL constexpr bool HostBufferALIGNED = false; // ismisaligned=false constexpr bool HostBufferMISALIGNED = true; // ismisaligned=true @@ -119,7 +119,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer template class PinnedHostBufferBase : public BufferBase @@ -128,18 +128,18 @@ namespace mg5amcCpu PinnedHostBufferBase( const size_t size ) : BufferBase( size, false ) { - checkCuda( cudaMallocHost( &( this->m_data ), this->bytes() ) ); + gpuMallocHost( &( this->m_data ), this->bytes() ); } virtual ~PinnedHostBufferBase() { - checkCuda( cudaFreeHost( this->m_data ) ); + gpuFreeHost( this->m_data ); } }; #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer template class DeviceBufferBase : public BufferBase @@ -148,18 +148,18 @@ namespace mg5amcCpu DeviceBufferBase( const size_t size ) : BufferBase( size, true ) { - checkCuda( cudaMalloc( &( this->m_data ), this->bytes() ) ); + gpuMalloc( &( this->m_data ), this->bytes() ); } virtual ~DeviceBufferBase() { - checkCuda( cudaFree( this->m_data ) ); + gpuFree( this->m_data ); } }; #endif //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for a given number of events template class 
HostBuffer : public HostBufferBase, virtual private NumberOfEvents @@ -175,7 +175,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer for a given number of events template class PinnedHostBuffer : public PinnedHostBufferBase, virtual private NumberOfEvents @@ -191,7 +191,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer for a given number of events template class DeviceBuffer : public DeviceBufferBase, virtual private NumberOfEvents @@ -213,7 +213,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta random numbers constexpr size_t sizePerEventRndNumMomenta = MemoryBuffers::np4 * MemoryBuffers::nparf; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta random numbers typedef HostBuffer HostBufferRndNumMomenta; #else @@ -232,7 +232,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer with ONE fptype per event constexpr size_t sizePerEventOneFp = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer with ONE fptype per event typedef HostBuffer HostBufferOneFp; #else @@ -257,7 +257,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for Gs constexpr size_t sizePerEventGs = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferGs; #else @@ -276,7 +276,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for numerators constexpr size_t sizePerEventNumerators = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferNumerators; #else @@ -296,7 +296,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for denominators constexpr size_t sizePerEventDenominators = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferDenominators; #else @@ -315,7 +315,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for random numbers constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferCouplings; #else @@ -333,7 +333,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta constexpr size_t sizePerEventMomenta = MemoryBuffers::np4 * MemoryBuffers::npar; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta typedef HostBuffer HostBufferMomenta; //typedef HostBuffer HostBufferMomenta; // TEST MISALIGNMENT! 
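// Each buffer family above follows the same triple: HostBuffer in ordinary host memory
// for CPU builds, plus PinnedHostBuffer (gpuMallocHost) and DeviceBuffer (gpuMalloc)
// for GPU builds, each sized as nevt * sizePerEventX. A usage sketch, assuming the
// momenta typedefs follow the pattern shown above (illustrative only):

#include "MemoryBuffers.h"
#include <cstddef>

void makeMomentaBuffers( const std::size_t nevt )
{
#ifdef MGONGPUCPP_GPUIMPL
  mg5amcGpu::PinnedHostBufferMomenta hstMomenta( nevt ); // page-locked host staging buffer
  mg5amcGpu::DeviceBufferMomenta devMomenta( nevt );     // nevt * np4 * npar fptype's on device
#else
  mg5amcCpu::HostBufferMomenta hstMomenta( nevt );       // plain host buffer
#endif
  // the buffers release their memory in their destructors (gpuFreeHost/gpuFree on the GPU side, as shown above)
}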
@@ -352,7 +352,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for sampling weights constexpr size_t sizePerEventWeights = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for sampling weights typedef HostBuffer HostBufferWeights; #else @@ -370,7 +370,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for matrix elements constexpr size_t sizePerEventMatrixElements = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for matrix elements typedef HostBuffer HostBufferMatrixElements; #else @@ -385,7 +385,7 @@ namespace mg5amcCpu // A base class encapsulating a memory buffer for the helicity mask typedef BufferBase BufferHelicityMask; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for the helicity mask typedef HostBufferBase HostBufferHelicityMask; #else @@ -403,7 +403,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for wavefunctions constexpr size_t sizePerEventWavefunctions = MemoryBuffers::nw6 * MemoryBuffers::nx2; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for wavefunctions typedef HostBuffer HostBufferWavefunctions; #else @@ -421,7 +421,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity random numbers constexpr size_t sizePerEventRndNumHelicity = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity random numbers typedef HostBuffer HostBufferRndNumHelicity; #else @@ -439,7 +439,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color random numbers constexpr size_t sizePerEventRndNumColor = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color random numbers typedef HostBuffer HostBufferRndNumColor; #else @@ -457,7 +457,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity selection constexpr size_t sizePerEventSelectedHelicity = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity selection typedef HostBuffer HostBufferSelectedHelicity; #else @@ -475,7 +475,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color selection constexpr size_t sizePerEventSelectedColor = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color selection typedef HostBuffer HostBufferSelectedColor; #else @@ -487,7 +487,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy { @@ -504,13 +504,13 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyHostToDevice ); } #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void copyHostFromDevice( Tdst& dst, const Tsrc& src ) // keep the same order 
of arguments as in memcpy { @@ -527,7 +527,7 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyDeviceToHost ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyDeviceToHost ); } #endif diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CPPProcess.cc b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CPPProcess.cc index f9016eaa88..d59cc349e3 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CPPProcess.cc +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -46,7 +45,7 @@ // Class member functions for calculating the matrix elements for // Process: g g > t t~ g g WEIGHTED<=4 @1 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -80,7 +79,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -90,7 +89,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -118,13 +117,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -151,7 +150,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -187,7 +186,7 @@ namespace mg5amcCpu 
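// (Illustrative aside on the copyDeviceFromHost/copyHostFromDevice helpers from the
// MemoryBuffers.h hunks above: they keep the memcpy argument order "dst, src", guard
// against mismatched buffer sizes by throwing std::runtime_error, and now expand to
// gpuMemcpy. A usage sketch, reusing the hypothetical buffers of the previous sketch:
//   copyDeviceFromHost( devMomenta, hstMomenta ); // gpuMemcpy( ..., gpuMemcpyHostToDevice )
//   ...launch kernels on devMomenta.data()...
//   copyHostFromDevice( hstMomenta, devMomenta ); // gpuMemcpy( ..., gpuMemcpyDeviceToHost )
// NB, per PR #45 quoted above: the copy goes through an intermediate pinned staging
// area unless the host buffer is itself pinned.)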
#endif for( int iParity = 0; iParity < nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings @@ -200,8 +199,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -2474,7 +2475,7 @@ namespace mg5amcCpu { 62, 71, -10, 80, -1, 8, -28, 62, 62, -10, -10, -1, -1, 8, -10, -1, -64, 8, 8, -64, 80, 8, 512, -64 }, { -28, 62, 62, -10, -10, -1, 62, 71, -10, 80, -1, 8, -10, -1, -1, 8, 8, -64, 80, 8, 8, -64, -64, 512 } }; // 2-D array[24][24] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -2531,7 +2532,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -2590,7 +2591,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -2685,8 +2686,8 @@ namespace mg5amcCpu { 1, 1, 1, -1, -1, 1 }, { 1, 1, 1, -1, 1, -1 }, { 1, 1, 1, -1, 1, 1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -2728,9 +2729,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -2768,7 +2769,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] 
// [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -2833,12 +2834,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -2859,7 +2860,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -2985,9 +2986,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -3011,7 +3012,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -3031,7 +3032,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 512 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -3045,9 +3046,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -3075,7 +3079,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -3285,7 +3289,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CPPProcess.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CPPProcess.h index 04f7c62976..deb1358992 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CPPProcess.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -107,7 +107,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -120,7 +120,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -150,7 +150,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CudaRuntime.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/GpuAbstraction.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/GpuAbstraction.h new file mode 120000 index 0000000000..72054e19ba --- /dev/null +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/GpuAbstraction.h @@ -0,0 +1 @@ +../GpuAbstraction.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/GpuRuntime.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/GpuRuntime.h new file mode 120000 index 0000000000..3920e83be4 --- /dev/null +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/GpuRuntime.h @@ -0,0 +1 @@ +../GpuRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/check_sa.cc 
b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/check_sa.cc index 3fbf0ffbee..7cac5ab47b 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/check_sa.cc +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -65,7 +66,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -96,7 +97,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -134,9 +135,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784) + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) +#elif defined __HIPCC__ +#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random #elif defined __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU if build has curand + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #endif @@ -146,10 +149,10 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) 
bool bridge = false;
@@ -177,7 +180,7 @@ main( int argc, char** argv )
else if( arg == "--curdev" )
{
#ifndef __CUDACC__
- throw std::runtime_error( "CurandDevice is not supported on CPUs" );
+ throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" );
#elif defined MGONGPU_HAS_NO_CURAND
throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" );
#else
@@ -198,7 +201,7 @@ main( int argc, char** argv )
}
else if( arg == "--rmbdev" )
{
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
rmbsmp = RamboSamplingMode::RamboDevice;
#else
throw std::runtime_error( "RamboDevice is not supported on CPUs" );
@@ -272,13 +275,13 @@ main( int argc, char** argv )
return usage( argv[0] );
}
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
#ifdef _OPENMP
ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1)
#endif
#endif
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
// Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation
// Note: this prevents a crash on pmpe04 but not on some github CI nodes?
// [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!]
@@ -296,14 +299,14 @@ main( int argc, char** argv )
// === STEP 0 - INITIALISE
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
- // --- 00. Initialise cuda
- // Instantiate a CudaRuntime at the beginnining of the application's main to
- // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor
- const std::string cdinKey = "00 CudaInit";
+ // --- 00. Initialise GPU
+ // Instantiate a GpuRuntime at the beginning of the application's main.
+ // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor.
+ const std::string cdinKey = "00 GpuInit";
timermap.start( cdinKey );
- CudaRuntime cudaRuntime( debug );
+ GpuRuntime gpuRuntime( debug );
#endif
// --- 0a.
Initialise physics process @@ -325,7 +328,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -333,7 +336,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -341,7 +344,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -349,7 +352,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -366,7 +369,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -375,7 +378,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -384,7 +387,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -392,7 +395,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -400,7 +403,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -438,7 +441,7 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! 
CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } @@ -450,7 +453,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -461,7 +464,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -469,7 +472,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -511,7 +514,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -543,7 +546,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -588,7 +591,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -617,7 +620,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -760,18 +763,22 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; -#endif +#endif /* clang-format off */ // -- DOUBLE or FLOAT? #if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) @@ -781,7 +788,7 @@ main( int argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif +#endif /* clang-format on */ // -- CUCOMPLEX or THRUST or STD complex numbers? 
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -793,6 +800,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_CUCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -818,7 +831,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -874,7 +887,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -895,6 +908,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -921,21 +936,21 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? " == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -966,7 +981,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1062,14 +1077,14 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1077,7 +1092,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? 
" == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/RamboSamplingKernels.cc b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/RamboSamplingKernels.cc index da68aa9255..79abbcc4f8 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/RamboSamplingKernels.cc +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/RamboSamplingKernels.cc @@ -1,11 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "RamboSamplingKernels.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessRandomNumbers.h" #include "MemoryAccessWeights.h" @@ -14,7 +14,7 @@ #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -92,7 +92,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingKernelDevice::RamboSamplingKernelDevice( const fptype energy, // input: energy const BufferRndNumMomenta& rndmom, // input: random numbers in [0,1] BufferMomenta& momenta, // output: momenta @@ -135,7 +135,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaInitialDevice( const fptype energy, fptype* momenta ) @@ -147,17 +147,17 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaInitial() { - getMomentaInitialDevice<<>>( m_energy, m_momenta.data() ); + gpuLaunchKernel( getMomentaInitialDevice, m_gpublocks, m_gputhreads, m_energy, m_momenta.data() ); } #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaFinalDevice( const fptype energy, const fptype* rndmom, @@ -171,11 +171,11 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaFinal() { - getMomentaFinalDevice<<>>( m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); + gpuLaunchKernel( getMomentaFinalDevice, m_gpublocks, m_gputhreads, m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); } #endif diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/RamboSamplingKernels.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/RamboSamplingKernels.h index 184089efd7..7c214cd74b 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/RamboSamplingKernels.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/RamboSamplingKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#ifndef RAMBOSAMPLINGKERNELS_H #define RAMBOSAMPLINGKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating RAMBO phase space sampling on a GPU device class RamboSamplingKernelDevice final : public SamplingKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/RandomNumberKernels.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/RandomNumberKernels.h index 188a72c2c9..21d63beeac 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/RandomNumberKernels.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/RandomNumberKernels.h @@ -1,14 +1,14 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef RANDOMNUMBERKERNELS_H #define RANDOMNUMBERKERNELS_H 1 #include "mgOnGpuConfig.h" -// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when __CUDACC__ is not defined +// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when MGONGPUCPP_GPUIMPL is not defined #ifndef MGONGPU_HAS_NO_CURAND //#include "curand.h" struct curandGenerator_st; // forward definition from curand.h @@ -16,7 +16,7 @@ struct curandGenerator_st; // forward definition from curand.h #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp.mk index 509307506b..f2cfa349da 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ # Copyright (C) 2020-2023 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +# Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts @@ -42,10 +42,10 @@ endif #------------------------------------------------------------------------------- -#=== Configure common compiler flags for C++ and CUDA +#=== Configure common compiler flags for C++ and CUDA/HIP INCFLAGS = -I. 
-OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here
+OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here
# Dependency on src directory
MG5AMC_COMMONLIB = mg5amc_common
@@ -121,24 +121,46 @@ endif
#-------------------------------------------------------------------------------
-#=== Configure the CUDA compiler
+#=== Configure the GPU compiler (CUDA or HIP)
-# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505)
-# This is because it is impossible to pass this to "CUFLAGS += -ccbin " below
+# FIXME! (AV 24.01.2024)
+# In the current implementation (without separate builds for C++ and CUDA/HIP), we first check for nvcc and hipcc in CUDA_HOME and HIP_HOME.
+# If CUDA_HOME or HIP_HOME are not set, try to determine them from the path to nvcc and hipcc.
+# While convoluted, this is currently necessary to allow disabling CUDA/HIP builds by setting CUDA_HOME or HIP_HOME to invalid paths.
+# This will (probably?) be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775).
+
+# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA and HIP builds (issue #505)
+# This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below
ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside
- $(warning CUDA builds are not supported for multi-word CXX "$(CXX)")
+ $(warning CUDA and HIP builds are not supported for multi-word CXX "$(CXX)")
override CUDA_HOME=disabled
+ override HIP_HOME=disabled
endif
-# If CUDA_HOME is not set, try to set it from the path to nvcc
ifndef CUDA_HOME
CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null))
$(warning CUDA_HOME was not set: using "$(CUDA_HOME)")
endif
-# Set NVCC as $(CUDA_HOME)/bin/nvcc if it exists
+# If HIP_HOME is not set, try to set it from the path to hipcc
ifndef HIP_HOME
+ HIP_HOME = $(patsubst %bin/hipcc,%,$(HIP_COMPILER_PATH))
+ $(warning HIP_HOME was not set: using "$(HIP_HOME)")
+endif
+
+# FIXME! (AV 24.01.2024)
+# In the current implementation (without separate builds for C++ and CUDA/HIP),
+# builds are performed for HIP only if CUDA is not found in the path.
+# If both CUDA and HIP are installed, HIP builds can be triggered by unsetting CUDA_HOME.
+# This will be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775).
+
+#--- Option 1: CUDA exists -> use CUDA
+
+# Set GPUCC as $(CUDA_HOME)/bin/nvcc if it exists
ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),)
- NVCC = $(CUDA_HOME)/bin/nvcc
+
+ GPUCC = $(CUDA_HOME)/bin/nvcc
USE_NVTX ?=-DUSE_NVTX
# See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html
# See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/
@@ -158,41 +180,78 @@ ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),)
CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here!
endif CUOPTFLAGS = -lineinfo - CUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math - ###CUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow - ###NVCC_VERSION = $(shell $(NVCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) - CUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h + ###GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + GPUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + ###GPUCC_VERSION = $(shell $(GPUCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) + GPUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + CUBUILDRULEFLAGS = -Xcompiler -fPIC -c + CCBUILDRULEFLAGS = -Xcompiler -fPIC -c -x cu + CUDATESTFLAGS = -lcuda + + # Set the host C++ compiler for GPUCC via "-ccbin " + # (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) + GPUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) + + # Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) + ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) + GPUFLAGS += -allow-unsupported-compiler + endif + else ifneq ($(origin REQUIRE_CUDA),undefined) + # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) - $(error No cuda installation found (set CUDA_HOME or make nvcc visible in PATH)) + $(error No cuda installation found (set CUDA_HOME or make GPUCC visible in PATH)) + +#--- Option 2: CUDA does not exist, HIP exists -> use HIP + +# Set GPUCC as $(HIP_HOME)/bin/hipcc if it exists +else ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),) + + GPUCC = $(HIP_HOME)/bin/hipcc + #USE_NVTX ?=-DUSE_NVTX # should maybe find something equivalent to this in HIP? 
+ HIPARCHFLAGS = -target x86_64-linux-gnu --offload-arch=gfx90a
+ HIPINC = -I$(HIP_HOME)/include/
+ # Note: -DHIP_FAST_MATH is equivalent to -use_fast_math in HIP
+ # (but only for single precision line 208: https://rocm-developer-tools.github.io/HIP/hcc__detail_2math__functions_8h_source.html)
+ GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -DHIP_FAST_MATH -DHIP_PLATFORM=amd -fPIC
+ ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow
+ GPUFLAGS += -std=c++17
+ ###GPUFLAGS+= --maxrregcount 255 # (AV: is this option valid on HIP and meaningful on AMD GPUs?)
+ CUBUILDRULEFLAGS = -fPIC -c
+ CCBUILDRULEFLAGS = -fPIC -c
+
+else ifneq ($(origin REQUIRE_HIP),undefined)
+
+ # If REQUIRE_HIP is set but no HIP is found, stop here (e.g. for CI tests on GPU #443)
+ $(error No hip installation found (set HIP_HOME or make GPUCC visible in PATH))
+
+#--- Option 3: CUDA does not exist, HIP does not exist -> switch off both CUDA and HIP
+
else
- # No cuda. Switch cuda compilation off and go to common random numbers in C++
+
+ # No nvcc and no hipcc: switch CUDA and HIP compilation off and go to common random numbers in C++
$(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda)
+ $(warning HIP_HOME is not set or is invalid: export HIP_HOME to compile with hip)
override GPUCC=
override USE_NVTX=
override CUINC=
override CURANDLIBFLAGS=
-endif
-export NVCC
-export CUFLAGS
-
-# Set the host C++ compiler for nvcc via "-ccbin "
-# (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported)
-CUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX)))
-# Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504)
-ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined)
-CUFLAGS += -allow-unsupported-compiler
endif
+# Export GPUCC (so that it can also be used in cudacpp_src.mk?)
+export GPUCC
+export GPUFLAGS
+
#-------------------------------------------------------------------------------
-#=== Configure ccache for C++ and CUDA builds
+#=== Configure ccache for C++ and CUDA/HIP builds
# Enable ccache if USECCACHE=1
ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1)
@@ -201,15 +260,15 @@ endif
#ifeq ($(USECCACHE)$(shell echo $(AR) | grep ccache),1)
# override AR:=ccache $(AR)
#endif
-ifneq ($(NVCC),)
- ifeq ($(USECCACHE)$(shell echo $(NVCC) | grep ccache),1)
- override NVCC:=ccache $(NVCC)
+ifneq ($(GPUCC),)
+ ifeq ($(USECCACHE)$(shell echo $(GPUCC) | grep ccache),1)
+ override GPUCC:=ccache $(GPUCC)
endif
endif
#-------------------------------------------------------------------------------
-#=== Configure PowerPC-specific compiler flags for C++ and CUDA
+#=== Configure PowerPC-specific compiler flags for C++ and CUDA/HIP
# PowerPC-specific CXX compiler flags (being reviewed)
ifeq ($(UNAME_P),ppc64le)
@@ -225,9 +284,9 @@ else
######CXXFLAGS+= -fno-semantic-interposition # no benefit (neither alone, nor combined with -flto)
endif
-# PowerPC-specific CUDA compiler flags (to be reviewed!)
+# PowerPC-specific CUDA/HIP compiler flags (to be reviewed!)
ifeq ($(UNAME_P),ppc64le) - CUFLAGS+= -Xcompiler -mno-float128 + GPUFLAGS+= -Xcompiler -mno-float128 endif #------------------------------------------------------------------------------- @@ -237,7 +296,7 @@ endif # Set the default OMPFLAGS choice ifneq ($(shell $(CXX) --version | egrep '^Intel'),) override OMPFLAGS = -fopenmp -###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without nvcc but not ok with nvcc before #578) +###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without GPUCC but not ok with GPUCC before #578) else ifneq ($(shell $(CXX) --version | egrep '^(clang)'),) override OMPFLAGS = -fopenmp ###override OMPFLAGS = # disable OpenMP MT on clang (was not ok without or with nvcc before #578) @@ -293,7 +352,10 @@ endif # Set the default RNDGEN (random number generator) choice ifeq ($(RNDGEN),) - ifeq ($(NVCC),) + ifeq ($(GPUCC),) + override RNDGEN = hasNoCurand + # Edgecase for HIP compilation + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) override RNDGEN = hasNoCurand else ifeq ($(RNDGEN),) override RNDGEN = hasCurand @@ -310,7 +372,7 @@ export OMPFLAGS #------------------------------------------------------------------------------- -#=== Set the CUDA/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN +#=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN # Set the build flags appropriate to OMPFLAGS $(info OMPFLAGS=$(OMPFLAGS)) @@ -368,13 +430,13 @@ CXXFLAGS+= $(AVXFLAGS) $(info FPTYPE=$(FPTYPE)) ifeq ($(FPTYPE),d) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE - CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE else ifeq ($(FPTYPE),f) CXXFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT - CUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT else ifeq ($(FPTYPE),m) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT - CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT else $(error Unknown FPTYPE='$(FPTYPE)': only 'd', 'f' and 'm' are supported) endif @@ -383,7 +445,7 @@ endif $(info HELINL=$(HELINL)) ifeq ($(HELINL),1) CXXFLAGS += -DMGONGPU_INLINE_HELAMPS - CUFLAGS += -DMGONGPU_INLINE_HELAMPS + GPUFLAGS += -DMGONGPU_INLINE_HELAMPS else ifneq ($(HELINL),0) $(error Unknown HELINL='$(HELINL)': only '0' and '1' are supported) endif @@ -392,7 +454,7 @@ endif $(info HRDCOD=$(HRDCOD)) ifeq ($(HRDCOD),1) CXXFLAGS += -DMGONGPU_HARDCODE_PARAM - CUFLAGS += -DMGONGPU_HARDCODE_PARAM + GPUFLAGS += -DMGONGPU_HARDCODE_PARAM else ifneq ($(HRDCOD),0) $(error Unknown HRDCOD='$(HRDCOD)': only '0' and '1' are supported) endif @@ -444,11 +506,11 @@ ifeq ($(UNAME_S),Darwin) override CULIBFLAGSRPATH2 = else # RPATH to cuda/cpp libs when linking executables - override CXXLIBFLAGSRPATH = -Wl,-rpath,$(LIBDIRRPATH) - override CULIBFLAGSRPATH = -Xlinker -rpath,$(LIBDIRRPATH) + override CXXLIBFLAGSRPATH = -Wl,-rpath=$(LIBDIRRPATH) + override CULIBFLAGSRPATH = -Xlinker -rpath=$(LIBDIRRPATH) # RPATH to common lib when linking cuda/cpp libs - override CXXLIBFLAGSRPATH2 = -Wl,-rpath,'$$ORIGIN' - override CULIBFLAGSRPATH2 = -Xlinker -rpath,'$$ORIGIN' + override CXXLIBFLAGSRPATH2 = -Wl,-rpath='$$ORIGIN' + override CULIBFLAGSRPATH2 = -Xlinker -rpath='$$ORIGIN' endif # Setting LD_LIBRARY_PATH or DYLD_LIBRARY_PATH in the RUNTIME is no longer necessary 
(neither on Linux nor on Mac) @@ -461,7 +523,7 @@ override RUNTIME = cxx_main=$(BUILDDIR)/check.exe fcxx_main=$(BUILDDIR)/fcheck.exe -ifneq ($(NVCC),) +ifneq ($(GPUCC),) cu_main=$(BUILDDIR)/gcheck.exe fcu_main=$(BUILDDIR)/fgcheck.exe else @@ -492,15 +554,16 @@ $(BUILDDIR)/.build.$(TAG): @touch $(BUILDDIR)/.build.$(TAG) # Generic target and build rules: objects from CUDA compilation -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/%.o : %.cu *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CUBUILDRULEFLAGS) $< -o $@ $(BUILDDIR)/%_cu.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CCBUILDRULEFLAGS) $< -o $@ endif +# -x cu in line above # Generic target and build rules: objects from C++ compilation # (NB do not include CUINC here! add it only for NVTX or curand #679) @@ -509,11 +572,14 @@ $(BUILDDIR)/%.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) $(CXX) $(CPPFLAGS) $(CXXFLAGS) -fPIC -c $< -o $@ # Apply special build flags only to CrossSectionKernel.cc and gCrossSectionKernel.cu (no fast math, see #117 and #516) +# Added edgecase for HIP compilation ifeq ($(shell $(CXX) --version | grep ^nvc++),) $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS := $(filter-out -ffast-math,$(CXXFLAGS)) $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math -ifneq ($(NVCC),) -$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -fno-fast-math +ifeq ($(findstring nvcc,$(GPUCC)),nvcc) + $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -fno-fast-math +else + $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -fno-fast-math endif endif @@ -530,10 +596,10 @@ ifeq ($(RNDGEN),hasCurand) $(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CUINC) endif -# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in nvcc with icx2023 (#592) +# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in GPUCC with icx2023 (#592) ifneq ($(shell $(CXX) --version | egrep '^(Intel)'),) -ifneq ($(NVCC),) -CUFLAGS += -Xcompiler -Wno-deprecated-builtins +ifneq ($(GPUCC),) +GPUFLAGS += -Wno-deprecated-builtins endif endif @@ -541,8 +607,8 @@ endif # This patch does remove the warning, but I prefer to keep it disabled for the moment... 
###ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang|Intel)'),) ###$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -Wno-overriding-t-option -###ifneq ($(NVCC),) -###$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -Wno-overriding-t-option +###ifneq ($(GPUCC),) +###$(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -Wno-overriding-t-option ###endif ###endif @@ -569,7 +635,7 @@ MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp cxx_objects_lib=$(BUILDDIR)/CPPProcess.o $(BUILDDIR)/MatrixElementKernels.o $(BUILDDIR)/BridgeKernels.o $(BUILDDIR)/CrossSectionKernels.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel.o $(BUILDDIR)/RamboSamplingKernels.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) MG5AMC_CULIB = mg5amc_$(processid_short)_cuda cu_objects_lib=$(BUILDDIR)/gCPPProcess.o $(BUILDDIR)/gMatrixElementKernels.o $(BUILDDIR)/gBridgeKernels.o $(BUILDDIR)/gCrossSectionKernels.o cu_objects_exe=$(BUILDDIR)/gCommonRandomNumberKernel.o $(BUILDDIR)/gRamboSamplingKernels.o @@ -581,11 +647,11 @@ $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: cxx_objects_lib += $(BUILDDIR)/fbridge.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(CXX) -shared -o $@ $(cxx_objects_lib) $(CXXLIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: cu_objects_lib += $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cu_objects_lib) - $(NVCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) endif #------------------------------------------------------------------------------- @@ -602,16 +668,16 @@ $(cxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PAT $(cxx_main): $(BUILDDIR)/check_sa.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CXX) -o $@ $(BUILDDIR)/check_sa.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CURANDLIBFLAGS) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(cu_main): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(cu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(cu_main): $(BUILDDIR)/gcheck_sa.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o - $(NVCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) + $(GPUCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) endif 
#------------------------------------------------------------------------------- @@ -637,17 +703,17 @@ $(fcxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PA $(fcxx_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') endif ifeq ($(UNAME_S),Darwin) $(fcu_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(fcu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(fcu_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) - $(NVCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) + $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) endif #------------------------------------------------------------------------------- @@ -659,7 +725,7 @@ $(BUILDDIR)/testxxx.o: testxxx_cc_ref.txt $(testmain): $(BUILDDIR)/testxxx.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testxxx.o # Comment out this line to skip the C++ test of xxx functions -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testxxx_cu.o: $(GTESTLIBS) $(BUILDDIR)/testxxx_cu.o: INCFLAGS += $(GTESTINC) $(BUILDDIR)/testxxx_cu.o: testxxx_cc_ref.txt @@ -672,7 +738,7 @@ $(BUILDDIR)/testmisc.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testmisc.o # Comment out this line to skip the C++ miscellaneous tests -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testmisc_cu.o: $(GTESTLIBS) $(BUILDDIR)/testmisc_cu.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc_cu.o @@ -684,12 +750,12 @@ $(BUILDDIR)/runTest.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/runTest.o $(testmain): cxx_objects_exe += $(BUILDDIR)/runTest.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/runTest_cu.o: $(GTESTLIBS) $(BUILDDIR)/runTest_cu.o: INCFLAGS += $(GTESTINC) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(testmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif @@ -713,14 +779,14 @@ $(testmain): LIBFLAGS += -lgomp endif endif -ifeq 
($(NVCC),) # link only runTest.o
+ifeq ($(GPUCC),) # link only runTest.o
$(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH
$(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS)
$(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS)
else # link both runTest.o and runTest_cu.o
$(testmain): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH
$(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) $(GTESTLIBS)
- $(NVCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) -lcuda
+ $(GPUCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS)
endif
# Use target gtestlibs to build only googletest
@@ -829,9 +895,9 @@ ifeq ($(USECCACHE),1)
ccache --version | head -1
endif
@echo ""
- @echo NVCC=$(NVCC)
-ifneq ($(NVCC),)
- $(NVCC) --version
+ @echo GPUCC=$(GPUCC)
+ifneq ($(GPUCC),)
+ $(GPUCC) --version
endif
@echo ""
@echo CXX=$(CXX)
@@ -850,7 +916,7 @@ endif
# Target: check (run the C++ test executable)
# [NB THIS IS WHAT IS USED IN THE GITHUB CI!]
-ifneq ($(NVCC),)
+ifneq ($(GPUCC),)
check: runTest cmpFcheck cmpFGcheck
else
check: runTest cmpFcheck
diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/fbridge.cc b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/fbridge.cc
index 2d2b36d560..22ce3f5115 100644
--- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/fbridge.cc
+++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/fbridge.cc
@@ -1,11 +1,11 @@
// Copyright (C) 2020-2023 CERN and UCLouvain.
// Licensed under the GNU Lesser General Public License (version 3 or later).
// Created by: S. Roiser (Oct 2021) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.
#include "Bridge.h"
#include "CPPProcess.h"
-#include "CudaRuntime.h"
+#include "GpuRuntime.h"
extern "C"
{
@@ -22,7 +22,7 @@ extern "C"
* Using the same Fortran MadEvent code, linking to the heterogeneous library would allow access to both CPU and GPU implementations.
* The specific heterogeneous configuration (how many GPUs, how many threads on each CPU, etc) could be loaded in CUDA/C++ from a data file.
*/ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -46,8 +46,8 @@ extern "C" */ void fbridgecreate_( CppObjectInFortran** ppbridge, const int* pnevtF, const int* pnparF, const int* pnp4F ) { -#ifdef __CUDACC__ - CudaRuntime::setUp(); +#ifdef MGONGPUCPP_GPUIMPL + GpuRuntime::setUp(); #endif // (NB: CPPProcess::initProc no longer needs to be executed here because it is called in the Bridge constructor) // FIXME: disable OMP in Bridge when called from Fortran @@ -65,8 +65,8 @@ extern "C" Bridge* pbridge = dynamic_cast*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgedelete_: invalid Bridge address" ); delete pbridge; -#ifdef __CUDACC__ - CudaRuntime::tearDown(); +#ifdef MGONGPUCPP_GPUIMPL + GpuRuntime::tearDown(); #endif } @@ -96,7 +96,7 @@ extern "C" { Bridge* pbridge = dynamic_cast*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgesequence_: invalid Bridge address" ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Use the device/GPU implementation in the CUDA library // (there is also a host implementation in this library) pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, *pchannelId, mes, selhel, selcol ); diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/fsampler.cc b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/fsampler.cc index 2fb445372d..3743934f41 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/fsampler.cc +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/fsampler.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Feb 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "mgOnGpuConfig.h" @@ -13,7 +13,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -40,7 +40,7 @@ namespace mg5amcCpu private: const int m_nevt; // The number of events in each iteration int m_iiter; // The iteration counter (for random number seeding) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta m_hstRndmom; // Memory buffers for random numbers HostBufferMomenta m_hstMomenta; // Memory buffers for momenta HostBufferWeights m_hstWeights; // Memory buffers for sampling weights @@ -105,7 +105,7 @@ namespace mg5amcCpu extern "C" { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/runTest.cc b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/runTest.cc index d4a760a71b..de327f2321 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/runTest.cc +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/runTest.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
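The GpuRuntime::setUp() and GpuRuntime::tearDown() calls above replace the former CudaRuntime static methods. A minimal sketch of the RAII class shape they imply, assuming gpuSetDevice and gpuDeviceReset are the abstracted names for cudaSetDevice/hipSetDevice and cudaDeviceReset/hipDeviceReset (gpuDeviceReset and checkGpu appear in runTest.cc below; gpuSetDevice is an assumption here):

// Minimal sketch, not necessarily the exact GpuRuntime.h added by this patch.
struct GpuRuntime final
{
  GpuRuntime( const bool debug = false ) { setUp( debug ); }
  ~GpuRuntime() { tearDown(); }
  // Set up the GPU: choose device 0 (cudaSetDevice on CUDA, hipSetDevice on HIP)
  static void setUp( const bool /*debug*/ = false ) { checkGpu( gpuSetDevice( 0 ) ); }
  // Tear down the GPU: reset the device (needed by cuda-memcheck --leak-check full)
  static void tearDown() { checkGpu( gpuDeviceReset() ); }
};

fbridgecreate_ and fbridgedelete_ call the two static methods directly, while check_sa.cc relies on the constructor/destructor pair of a stack instance.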
//---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -18,7 +18,7 @@ #include "RandomNumberKernels.h" #include "epoch_process_id.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -35,7 +35,7 @@ struct CUDA_CPU_TestBase : public TestDriverBase : TestDriverBase( npar, refFileName ) {} }; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL struct CPUTest : public CUDA_CPU_TestBase { // Struct data members (process, and memory structures for random numbers, momenta, matrix elements and weights on host and device) @@ -119,7 +119,7 @@ struct CPUTest : public CUDA_CPU_TestBase }; #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL struct CUDATest : public CUDA_CPU_TestBase { // Reset the device when our test goes out of scope. Note that this should happen after @@ -128,7 +128,7 @@ struct CUDATest : public CUDA_CPU_TestBase { ~DeviceReset() { - checkCuda( cudaDeviceReset() ); // this is needed by cuda-memcheck --leak-check full + checkGpu( gpuDeviceReset() ); // this is needed by cuda-memcheck --leak-check full } } deviceResetter; @@ -256,7 +256,7 @@ INSTANTIATE_TEST_SUITE_P( prefix, \ test_suite_name, \ testing::Values( new CUDATest( MG_EPOCH_REFERENCE_FILE_NAME ) ) ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL MG_INSTANTIATE_TEST_SUITE_GPU( XTESTID_GPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); #else MG_INSTANTIATE_TEST_SUITE_CPU( XTESTID_CPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/testmisc.cc b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/testmisc.cc index 895d6eeb56..ba9e59a8a3 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/testmisc.cc +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/testmisc.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*misc to run only testmisc.cc tests //---------------------------------------------------------------------------- @@ -17,7 +17,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_MISC #else #define TESTID( s ) s##_CPU_MISC @@ -26,7 +26,7 @@ #define XTESTID( s ) TESTID( s ) // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -59,7 +59,7 @@ namespace mg5amcCpu TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testmisc ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/testxxx.cc b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/testxxx.cc index 3361fe5aa9..e5167de00c 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/testxxx.cc +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/testxxx.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). 
// Created by: A. Valassi (Apr 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -24,7 +24,7 @@ #include #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_XXX #else #define TESTID( s ) s##_CPU_XXX @@ -32,7 +32,7 @@ #define XTESTID( s ) TESTID( s ) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -42,7 +42,7 @@ namespace mg5amcCpu int FPEhandlerIevt = -1; inline void FPEhandler( int sig ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL std::cerr << "Floating Point Exception (GPU): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; #else std::cerr << "Floating Point Exception (CPU neppV=" << neppV << "): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; @@ -53,7 +53,7 @@ namespace mg5amcCpu TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -77,7 +77,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) assert( nevt % neppM == 0 ); // nevt must be a multiple of neppM assert( nevt % neppV == 0 ); // nevt must be a multiple of neppV // Fill in the input momenta -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL mg5amcGpu::PinnedHostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] #else mg5amcCpu::HostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] @@ -322,7 +322,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { for( int ievt = 0; ievt < nevt; ievt++ ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_ttgg.sa/src/HelAmps_sm.h b/epochX/cudacpp/gg_ttgg.sa/src/HelAmps_sm.h index 8df465ad6d..8b4ad719be 100644 --- a/epochX/cudacpp/gg_ttgg.sa/src/HelAmps_sm.h +++ b/epochX/cudacpp/gg_ttgg.sa/src/HelAmps_sm.h @@ -5,7 +5,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -28,7 +28,7 @@ //#include //#include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttgg.sa/src/Parameters_sm.cc b/epochX/cudacpp/gg_ttgg.sa/src/Parameters_sm.cc index 64fc3fea62..067445b198 100644 --- a/epochX/cudacpp/gg_ttgg.sa/src/Parameters_sm.cc +++ b/epochX/cudacpp/gg_ttgg.sa/src/Parameters_sm.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. 
Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -17,7 +17,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_ttgg.sa/src/Parameters_sm.h b/epochX/cudacpp/gg_ttgg.sa/src/Parameters_sm.h index b6568d3761..9581d66e0e 100644 --- a/epochX/cudacpp/gg_ttgg.sa/src/Parameters_sm.h +++ b/epochX/cudacpp/gg_ttgg.sa/src/Parameters_sm.h @@ -27,7 +27,7 @@ #include "read_slha.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu #include // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -218,7 +218,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -239,7 +239,7 @@ namespace mg5amcCpu #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wunused-variable" // e.g. <> #pragma GCC diagnostic ignored "-Wunused-parameter" // e.g. <> -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma nv_diagnostic push #pragma nv_diag_suppress 177 // e.g. <> #endif @@ -267,7 +267,7 @@ namespace mg5amcCpu // End SM implementation - no special handling of vectors of floats as in EFT (#439) return out; } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma GCC diagnostic pop #pragma nv_diagnostic pop #endif diff --git a/epochX/cudacpp/gg_ttgg.sa/src/cudacpp_src.mk b/epochX/cudacpp/gg_ttgg.sa/src/cudacpp_src.mk index d4cc628aec..159e19a46d 100644 --- a/epochX/cudacpp/gg_ttgg.sa/src/cudacpp_src.mk +++ b/epochX/cudacpp/gg_ttgg.sa/src/cudacpp_src.mk @@ -19,7 +19,7 @@ SHELL := /bin/bash #=== Configure common compiler flags for CUDA and C++ INCFLAGS = -I. 
-OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here
+OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here

 #-------------------------------------------------------------------------------

@@ -45,13 +45,13 @@ endif

 #-------------------------------------------------------------------------------

-#=== Configure the CUDA compiler (note: NVCC is already exported including ccache)
+#=== Configure the CUDA compiler (note: GPUCC is already exported including ccache)

-###$(info NVCC=$(NVCC))
+###$(info GPUCC=$(GPUCC))

 #-------------------------------------------------------------------------------

-#=== Configure ccache for C++ builds (note: NVCC is already exported including ccache)
+#=== Configure ccache for C++ builds (note: GPUCC is already exported including ccache)

 # Enable ccache if USECCACHE=1
 ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1)
@@ -92,6 +92,13 @@ endif
 ###$(info OMPFLAGS=$(OMPFLAGS))
 CXXFLAGS += $(OMPFLAGS)

+# Add the correct -fPIC and device compilation flags when compiling with nvcc (CUDA) or hipcc (HIP)
+ifeq ($(findstring nvcc,$(GPUCC)),nvcc)
+  GPUFLAGS += -Xcompiler -fPIC -c -x cu
+else ifeq ($(findstring hipcc,$(GPUCC)),hipcc)
+  GPUFLAGS += -fPIC -c
+endif
+
 # Set the build flags appropriate to each AVX choice (example: "make AVX=none")
 # [NB MGONGPU_PVW512 is needed because "-mprefer-vector-width=256" is not exposed in a macro]
 # [See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=96476]
@@ -253,20 +260,20 @@ $(BUILDDIR)/%.o : %.cc *.h $(BUILDDIR)/.build.$(TAG)

 # Generic target and build rules: objects from CUDA compilation
 $(BUILDDIR)/%_cu.o : %.cc *.h $(BUILDDIR)/.build.$(TAG)
 	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@
+	$(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $< -o $@

 #-------------------------------------------------------------------------------

 cxx_objects=$(addprefix $(BUILDDIR)/, Parameters_sm.o read_slha.o)
-ifneq ($(NVCC),)
+ifneq ($(GPUCC),)
 cu_objects=$(addprefix $(BUILDDIR)/, Parameters_sm_cu.o)
 endif

 # Target (and build rules): common (src) library
-ifneq ($(NVCC),)
+ifneq ($(GPUCC),)
 $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) $(cu_objects)
 	@if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi
-	$(NVCC) -shared -o $@ $(cxx_objects) $(cu_objects) $(LDFLAGS)
+	$(GPUCC) -shared -o $@ $(cxx_objects) $(cu_objects) $(LDFLAGS)
 else
 $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects)
 	@if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi
diff --git a/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuConfig.h
index b247654dcf..da4ba36ad8 100644
--- a/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuConfig.h
+++ b/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuConfig.h
@@ -1,7 +1,7 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Jul 2020) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
 #ifndef MGONGPUCONFIG_H
 #define MGONGPUCONFIG_H 1

@@ -10,12 +10,25 @@
 // There are two different code bases for standalone_cudacpp (without multichannel) and madevent+cudacpp (with multichannel)
 #undef MGONGPU_SUPPORTS_MULTICHANNEL

+// Is this a GPU (CUDA, HIP) or CPU implementation?
+#ifdef __CUDACC__
+#define MGONGPUCPP_GPUIMPL cuda
+#elif defined __HIPCC__
+#define MGONGPUCPP_GPUIMPL hip
+#else
+#undef MGONGPUCPP_GPUIMPL
+#endif
+
 // ** NB1 Throughputs (e.g. 6.8E8) are events/sec for "./gcheck.exe -p 65536 128 12"
 // ** NB2 Baseline on b7g47n0004 fluctuates (probably depends on load on other VMs)

 // Choose if curand is supported for generating random numbers
+// For HIP, by default, do not use curand (common random numbers will be used instead)
 // For both CUDA and C++, by default, use curand, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_CURAND
-// (there exist CUDA installations, e.g. using the HPC package, which do not include curand - see PR #784)
+// (there exist CUDA installations, e.g. using the HPC package, which do not include curand - see PR #784 and #785)
+#if defined __HIPCC__
+#define MGONGPU_HAS_NO_CURAND 1
+#else
 //#ifdef __CUDACC__
 //#undef MGONGPU_HAS_NO_CURAND // default
 ////#define MGONGPU_HAS_NO_CURAND 1
@@ -23,6 +36,7 @@
 //#undef MGONGPU_HAS_NO_CURAND // default
 ////#define MGONGPU_HAS_NO_CURAND 1
 //#endif
+#endif

 // Choose floating point precision (for everything but color algebra #537)
 // If one of these macros has been set from outside with e.g. -DMGONGPU_FPTYPE_FLOAT, nothing happens (issue #167)
@@ -54,23 +68,28 @@
 //#undef MGONGPU_HARDCODE_PARAM // default
 ////#define MGONGPU_HARDCODE_PARAM 1

-// Complex type in c++: std::complex or cxsmpl (CHOOSE ONLY ONE)
-#ifndef __CUDACC__
-//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float)
-#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float)
-#endif
-
-// Complex type in cuda: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE)
+// Complex type in CUDA: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE)
 #ifdef __CUDACC__
 #define MGONGPU_CUCXTYPE_THRUST 1 // default (~1.15E9/double, ~3.2E9/float)
 //#define MGONGPU_CUCXTYPE_CUCOMPLEX 1 // ~10 percent slower (1.03E9/double, ~2.8E9/float)
 //#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float)
+
+// Complex type in HIP: cxsmpl (ONLY ONE OPTION POSSIBLE)
+#elif defined __HIPCC__
+#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float)
+
+// Complex type in C++: std::complex or cxsmpl (CHOOSE ONLY ONE)
+#else
+//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float)
+#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float)
 #endif

-// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation
+// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation
 #ifdef __CUDACC__
-#undef MGONGPU_NSIGHT_DEBUG // default
+#undef MGONGPU_NSIGHT_DEBUG // default in CUDA
 //#define MGONGPU_NSIGHT_DEBUG 1
+#else
+#undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++
 #endif

 // SANITY CHECKS (floating point precision for everything but color algebra #537)
@@ -86,17 +105,21 @@
 #error You cannot use double precision for color algebra and single precision elsewhere
 #endif

-// SANITY CHECKS (c++ complex number implementation)
-#ifndef __CUDACC__
-#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL
-#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL
+// SANITY CHECKS (CUDA complex number implementation)
+#ifdef __CUDACC__
+#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX
+#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX for CUDA
+#elif defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CXSMPL
+#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CXSMPL for CUDA
+#elif defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL
+#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL for CUDA
 #endif
 #endif

-// SANITY CHECKS (cuda complex number implementation)
-#ifdef __CUDACC__
-#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL
-#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL
+// SANITY CHECKS (C++ complex number implementation)
+#ifndef MGONGPUCPP_GPUIMPL
+#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL
+#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL for C++
 #endif
 #endif

@@ -134,7 +157,7 @@ namespace mgOnGpu
 // Alignment requirement for using reinterpret_cast with SIMD vectorized code
 // (using reinterpret_cast with non aligned memory may lead to segmentation faults!)
 // Only needed for C++ code but can be enforced also in NVCC builds of C++ code using CUDA>=11.2 and C++17 (#318, #319, #333)
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
 constexpr int cppAlign = 64; // alignment requirement for SIMD vectorization (64-byte i.e. 512-bit)
 #endif

@@ -145,7 +168,7 @@ using mgOnGpu::fptype;
 using mgOnGpu::fptype2;

 // C++ SIMD vectorization width (this will be used to set neppV)
-#ifdef __CUDACC__ // CUDA implementation has no SIMD
+#ifdef MGONGPUCPP_GPUIMPL // CUDA and HIP implementations have no SIMD
 #undef MGONGPU_CPPSIMD
 #elif defined __AVX512VL__ && defined MGONGPU_PVW512 // C++ "512z" AVX512 with 512 width (512-bit ie 64-byte): 8 (DOUBLE) or 16 (FLOAT)
 #ifdef MGONGPU_FPTYPE_DOUBLE
@@ -175,9 +198,9 @@ using mgOnGpu::fptype2;
 #undef MGONGPU_CPPSIMD
 #endif

-// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation
+// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation
 // Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end)
 #if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */
 #define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX];
 #define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; }
 #define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; }
@@ -189,8 +212,8 @@ using mgOnGpu::fptype2;
 #define mgDebugFinalise() { /*noop*/ }
 #endif /* clang-format on */

-// Define empty CUDA declaration specifiers for C++
-#ifndef __CUDACC__
+// Define empty CUDA/HIP declaration specifiers for C++
+#ifndef MGONGPUCPP_GPUIMPL
 #define __global__
 #define __host__
 #define __device__
diff --git a/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuCxtypes.h b/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuCxtypes.h
index ca9a9f00c0..5532e22fa1 100644
--- a/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuCxtypes.h
+++ b/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuCxtypes.h
@@ -1,7 +1,7 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Jan 2022, based on earlier work by D. Smith) for the MG5aMC CUDACPP plugin.
-// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
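A minimal illustrative sketch (not part of this patch; the kernel name dummyKernel and its body are hypothetical) of how the new MGONGPUCPP_GPUIMPL switch from mgOnGpuConfig.h is meant to be consumed downstream:

// Sketch only: one translation unit, compiled three ways (nvcc, hipcc or plain C++).
// mgOnGpuConfig.h defines MGONGPUCPP_GPUIMPL in GPU builds, and defines empty
// __global__/__host__/__device__ tokens in CPU builds, so the same source parses everywhere.
#include "mgOnGpuConfig.h"
#ifdef MGONGPUCPP_GPUIMPL
namespace mg5amcGpu // GPU build (CUDA or HIP)
#else
namespace mg5amcCpu // CPU build
#endif
{
  __global__ void dummyKernel( fptype* out ) // hypothetical kernel, for illustration only
  {
#ifdef MGONGPUCPP_GPUIMPL
    out[threadIdx.x] = 0; // GPU: one thread per element
#else
    out[0] = 0; // CPU: __global__ expands to an empty token, this is a plain function
#endif
  }
}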
#ifndef MGONGPUCXTYPES_H #define MGONGPUCXTYPES_H 1 @@ -19,7 +19,7 @@ #include // Complex type in cuda: thrust or cucomplex or cxsmpl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #if defined MGONGPU_CUCXTYPE_THRUST #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wtautological-compare" // for icpx2021/clang13 (https://stackoverflow.com/a/15864661) @@ -82,7 +82,7 @@ namespace mgOnGpu /* clang-format off */ using mgOnGpu::cxsmpl; // Printout to stream for user defined types -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -92,7 +92,7 @@ namespace mg5amcCpu inline __host__ std::ostream& operator<<( std::ostream& out, const cxsmpl& c ) { - out << std::complex( c.real(), c.imag() ); + out << std::complex( c.real(), c.imag() ); return out; } @@ -215,14 +215,14 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu #endif { // --- Type definitions (complex type: cxtype) -#ifdef __CUDACC__ // cuda +#ifdef MGONGPUCPP_GPUIMPL // cuda #if defined MGONGPU_CUCXTYPE_THRUST typedef thrust::complex cxtype; #elif defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -255,7 +255,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -307,7 +307,7 @@ namespace mg5amcCpu //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust +#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust //------------------------------ // CUDA - using thrust::complex @@ -343,11 +343,11 @@ namespace mg5amcCpu return c; } -#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST +#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex +#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex //------------------------------ // CUDA - using cuComplex @@ -562,11 +562,11 @@ namespace mg5amcCpu return cxmake( c.real(), c.imag() ); } -#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX //========================================================================== -#if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex +#if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex //------------------------------ // C++ - using std::complex @@ -610,7 +610,7 @@ namespace mg5amcCpu } #endif -#endif // #if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX +#endif // #if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX //========================================================================== @@ -633,7 +633,7 @@ namespace mg5amcCpu //========================================================================== // 
NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuFptypes.h b/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuFptypes.h index 905c97d700..fa3a02664b 100644 --- a/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuFptypes.h +++ b/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuFptypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUFPTYPES_H #define MGONGPUFPTYPES_H 1 @@ -12,7 +12,7 @@ #include // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // cuda namespace mg5amcGpu #else namespace mg5amcCpu @@ -20,7 +20,7 @@ namespace mg5amcCpu { //========================================================================== -#ifdef __CUDACC__ // cuda +#ifdef MGONGPUCPP_GPUIMPL // cuda //------------------------------ // Floating point types - Cuda @@ -64,11 +64,11 @@ namespace mg5amcCpu #endif } -#endif // #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //------------------------------ // Floating point types - C++ @@ -92,7 +92,7 @@ namespace mg5amcCpu return std::sqrt( f ); } -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== diff --git a/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuVectors.h b/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuVectors.h index e1299ba81e..cdae04326b 100644 --- a/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuVectors.h @@ -32,7 +32,7 @@ #endif // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -131,7 +131,7 @@ namespace mg5amcCpu #endif #endif -#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef __CUDACC__) +#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef MGONGPUCPP_GPUIMPL) const int neppV = 1; @@ -153,13 +153,13 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu #endif { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Printout to stream for user defined types @@ -805,11 +805,11 @@ namespace mg5amcCpu #endif // #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //------------------------------ // Vector types - CUDA @@ -853,12 +853,12 @@ namespace mg5amcCpu return mask; } 
-#endif // #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== // Scalar-or-vector types: scalar in CUDA, vector or scalar in C++ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL typedef bool bool_sv; typedef fptype fptype_sv; typedef fptype2 fptype2_sv; @@ -879,7 +879,7 @@ namespace mg5amcCpu #endif // Scalar-or-vector zeros: scalar in CUDA, vector or scalar in C++ -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ inline __host__ __device__ cxtype cxzero_sv(){ return cxtype( 0, 0 ); } #elif defined MGONGPU_CPPSIMD inline cxtype_v cxzero_sv() { return cxtype_v(); } // RRRR=0000 IIII=0000 diff --git a/epochX/cudacpp/gg_ttgg.sa/src/rambo.h b/epochX/cudacpp/gg_ttgg.sa/src/rambo.h index e02ea52496..cd7e1008ea 100644 --- a/epochX/cudacpp/gg_ttgg.sa/src/rambo.h +++ b/epochX/cudacpp/gg_ttgg.sa/src/rambo.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -18,7 +18,7 @@ #include // Simplified rambo version for 2 to N (with N>=2) processes with massless particles -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +83,7 @@ namespace mg5amcCpu static bool first = true; if( first ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if constexpr( M_ACCESS::isOnDevice() ) // avoid { const int ievt0 = 0; @@ -166,7 +166,7 @@ namespace mg5amcCpu wt = po2log; if( nparf != 2 ) wt = ( 2. * nparf - 4. ) * log( energy ) + z[nparf - 1]; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // issue warnings if weight is too small or too large static int iwarn[5] = { 0, 0, 0, 0, 0 }; if( wt < -180. ) diff --git a/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt b/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt index 2480a22f8d..f222e5a6b5 100644 --- a/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt +++ b/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt @@ -52,7 +52,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt -No valid web browser found. Please set in ./input/mg5_configuration.txt +Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg.mg The import format was not given, so we guess it as command set stdout_level DEBUG @@ -62,7 +62,7 @@ generate g g > t t~ g g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005784511566162109  +DEBUG: model prefixing takes 0.005517005920410156  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,7 +155,7 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=5: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Process has 1240 diagrams -1 processes with 1240 diagrams generated in 1.929 s +1 processes with 1240 diagrams generated in 1.861 s Total: 1 processes with 1240 diagrams output madevent ../TMPOUT/CODEGEN_mad_gg_ttggg --hel_recycling=False --vector_size=32 --me_exporter=standalone_cudacpp Load PLUGIN.CUDACPP_OUTPUT @@ -177,7 +177,7 @@ INFO: Creating files in directory P1_gg_ttxggg INFO: Computing Color-Flow optimization [15120 term] INFO: Color-Flow passed to 1630 term in 8s. Introduce 3030 contraction DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -192,15 +192,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxggg -Generated helas calls for 1 subprocesses (1240 diagrams) in 6.718 s -Wrote files for 2281 helas calls in 18.893 s +Generated helas calls for 1 subprocesses (1240 diagrams) in 6.528 s +Wrote files for 2281 helas calls in 18.450 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.322 s +ALOHA: aloha creates 5 routines in 0.314 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines @@ -208,7 +208,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 10 routines in 0.319 s +ALOHA: aloha creates 10 routines in 0.309 s VVV1 VVV1 FFV1 @@ -257,9 +257,9 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. quit -real 0m29.815s -user 0m29.332s -sys 0m0.380s +real 0m29.049s +user 0m28.554s +sys 0m0.393s Code generation completed in 30 seconds ************************************************************ * * @@ -286,7 +286,7 @@ INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amc INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt -No valid web browser found. Please set in ./input/mg5_configuration.txt +Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards run quit INFO: @@ -316,7 +316,7 @@ INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amc INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". 
Set another one in ./input/mg5_configuration.txt -No valid web browser found. Please set in ./input/mg5_configuration.txt +Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards param quit INFO: diff --git a/epochX/cudacpp/gg_ttggg.mad/COPYRIGHT b/epochX/cudacpp/gg_ttggg.mad/COPYRIGHT index a134b5fef9..84a883fbb0 100644 --- a/epochX/cudacpp/gg_ttggg.mad/COPYRIGHT +++ b/epochX/cudacpp/gg_ttggg.mad/COPYRIGHT @@ -15,6 +15,7 @@ The full development team currently includes the following authors : Stephan Hageboeck (CERN) Olivier Mattelaer (Universite Catholique de Louvain, original author) Stefan Roiser (CERN, original author) + Joergen Teig (CERN) Andrea Valassi (CERN, original author) Zenny Wettersten (CERN) See https://github.com/madgraph5/madgraph4gpu for more details. For the full diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/Bridge.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/Bridge.h index bf8b5e024d..89437b4c42 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/Bridge.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef BRIDGE_H #define BRIDGE_H 1 @@ -23,7 +23,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +83,7 @@ namespace mg5amcCpu Bridge& operator=( const Bridge& ) = delete; Bridge& operator=( Bridge&& ) = delete; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL /** * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != gpublocks*gputhreads * (this is needed for BridgeKernel tests rather than for actual production use in Fortran) @@ -150,7 +150,7 @@ namespace mg5amcCpu unsigned int m_nevt; // number of events int m_nGoodHel; // the number of good helicities (-1 initially when they have not yet been calculated) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL int m_gputhreads; // number of gpu threads (default set from number of events, can be modified) int m_gpublocks; // number of gpu blocks (default set from number of events, can be modified) DeviceBuffer m_devMomentaF; @@ -187,12 +187,12 @@ namespace mg5amcCpu // Forward declare transposition methods // -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL template void hst_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); @@ -209,7 +209,7 @@ namespace mg5amcCpu Bridge::Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ) : m_nevt( nevtF ) , m_nGoodHel( -1 ) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL , m_gputhreads( 256 ) // default number of gpu threads , m_gpublocks( m_nevt / m_gputhreads ) // this ensures m_nevt <= m_gpublocks*m_gputhreads , m_devMomentaF( m_nevt ) @@ -233,7 +233,7 @@ namespace mg5amcCpu { if( nparF != CPPProcess::npar ) throw std::runtime_error( "Bridge constructor: npar mismatch" ); if( np4F != CPPProcess::np4 ) throw std::runtime_error( "Bridge constructor: np4 mismatch" ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( ( m_nevt < s_gputhreadsmin ) || ( m_nevt % 
s_gputhreadsmin != 0 ) )
       throw std::runtime_error( "Bridge constructor: nevt should be a multiple of " + std::to_string( s_gputhreadsmin ) );
     while( m_nevt != m_gpublocks * m_gputhreads )
@@ -249,7 +249,7 @@ namespace mg5amcCpu
 #else
     std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl;
     m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) );
-#endif // __CUDACC__
+#endif // MGONGPUCPP_GPUIMPL
     // Create a process object, read param card and set parameters
     // FIXME: the process instance can happily go out of scope because it is only needed to read parameters?
     // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate is called from several Fortran threads?
@@ -262,7 +262,7 @@ namespace mg5amcCpu
     process.initProc( paramCard );
   }

-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
   template
   void Bridge::set_gpugrid( const int gpublocks, const int gputhreads )
   {
@@ -276,7 +276,7 @@ namespace mg5amcCpu
   }
 #endif

-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
   template
   void Bridge::gpu_sequence( const FORTRANFPTYPE* momenta,
                              const FORTRANFPTYPE* gs,
@@ -291,14 +291,14 @@ namespace mg5amcCpu
     constexpr int neppM = MemoryAccessMomenta::neppM;
     if constexpr( neppM == 1 && std::is_same_v )
     {
-      checkCuda( cudaMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), cudaMemcpyHostToDevice ) );
+      gpuMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), gpuMemcpyHostToDevice );
     }
     else
     {
-      checkCuda( cudaMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), cudaMemcpyHostToDevice ) );
+      gpuMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), gpuMemcpyHostToDevice );
       const int thrPerEvt = CPPProcess::npar * CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 event per thread)
       //const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... this seems slower
-      dev_transposeMomentaF2C<<<m_gpublocks * thrPerEvt, m_gputhreads>>>( m_devMomentaF.data(), m_devMomentaC.data(), m_nevt );
+      gpuLaunchKernel( dev_transposeMomentaF2C, m_gpublocks * thrPerEvt, m_gputhreads, m_devMomentaF.data(), m_devMomentaC.data(), m_nevt );
     }
     if constexpr( std::is_same_v )
     {
@@ -341,7 +341,7 @@ namespace mg5amcCpu
   }
 #endif

-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
   template
   void Bridge::cpu_sequence( const FORTRANFPTYPE* momenta,
                              const FORTRANFPTYPE* gs,
@@ -396,7 +396,7 @@ namespace mg5amcCpu
   // - C++ array: momenta[npagM][npar][np4][neppM] with nevt=npagM*neppM (AOSOA)
   //

-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
   template
   __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt )
   {
diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/BridgeKernels.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/BridgeKernels.cc
index d58066c9c1..eaf4037a24 100644
--- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/BridgeKernels.cc
+++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/BridgeKernels.cc
@@ -1,17 +1,18 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
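For reference, a minimal sketch of the copy idiom that Bridge::gpu_sequence now uses (illustrative only; the helper name and its arguments are hypothetical, and this only compiles in GPU builds):

// Sketch only: gpuMemcpy expands to checkGpu( cudaMemcpy( ... ) ) under nvcc and to
// checkGpu( hipMemcpy( ... ) ) under hipcc, so Bridge.h no longer names the CUDA API directly.
#include "GpuRuntime.h" // pulls in GpuAbstraction.h and defines checkGpu
void copyMomentaToDevice( const double* hstMomenta, double* devMomenta, size_t nbytes ) // hypothetical helper
{
  gpuMemcpy( devMomenta, hstMomenta, nbytes, gpuMemcpyHostToDevice );
}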
#include "BridgeKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMomenta.h" #include //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -45,7 +46,7 @@ namespace mg5amcCpu //============================================================================ -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu { @@ -96,7 +97,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/BridgeKernels.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/BridgeKernels.h index 15eb4bff4d..3efef8ce97 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/BridgeKernels.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/BridgeKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef BRIDGEKERNELS_H #define BRIDGEKERNELS_H 1 @@ -12,7 +12,7 @@ #include "MatrixElementKernels.h" #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -49,7 +49,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a CPU host class BridgeKernelHost final : public BridgeKernelBase { @@ -89,7 +89,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a GPU device class BridgeKernelDevice : public BridgeKernelBase { diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/CommonRandomNumberKernel.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/CommonRandomNumberKernel.cc index 985b39f576..010bc4cbd0 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/CommonRandomNumberKernel.cc +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/CommonRandomNumberKernel.cc @@ -1,15 +1,16 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "CommonRandomNumbers.h" +#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/CrossSectionKernels.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/CrossSectionKernels.cc index 0b355a3c8d..c15b39844d 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/CrossSectionKernels.cc +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/CrossSectionKernels.cc @@ -1,10 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. 
Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "CrossSectionKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessWeights.h" #include "MemoryBuffers.h" @@ -77,7 +78,7 @@ debug_me_is_abnormal( const fptype& me, size_t ievtALL ) //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -185,7 +186,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/CrossSectionKernels.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/CrossSectionKernels.h index 7933ca4bbf..4d9659e04e 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/CrossSectionKernels.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/CrossSectionKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef CROSSSECTIONKERNELS_H #define CROSSSECTIONKERNELS_H 1 @@ -13,7 +13,7 @@ //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -96,7 +96,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating the calculation of event statistics on a GPU device class CrossSectionKernelDevice : public CrossSectionKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/CudaRuntime.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/CudaRuntime.h deleted file mode 100644 index 64ce52f4b3..0000000000 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/CudaRuntime.h +++ /dev/null @@ -1,85 +0,0 @@ -// Copyright (C) 2020-2023 CERN and UCLouvain. -// Licensed under the GNU Lesser General Public License (version 3 or later). -// Created by: S. Roiser (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. - -#ifndef MG5AMC_CUDARUNTIME_H -#define MG5AMC_CUDARUNTIME_H 1 - -// MG5AMC on GPU uses the CUDA runtime API, not the lower level CUDA driver API -// See https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html#driver-vs-runtime-api - -#include -#include - -//-------------------------------------------------------------------------- - -// See https://stackoverflow.com/a/14038590 -#ifdef __CUDACC__ /* clang-format off */ -#define checkCuda( code ) { assertCuda( code, __FILE__, __LINE__ ); } -inline void assertCuda( cudaError_t code, const char* file, int line, bool abort = true ) -{ - if( code != cudaSuccess ) - { - printf( "ERROR! 
assertCuda: '%s' (%d) in %s:%d\n", cudaGetErrorString( code ), code, file, line ); - if( abort ) assert( code == cudaSuccess ); - } -} -#endif /* clang-format on */ - -//-------------------------------------------------------------------------- - -#ifdef __CUDACC__ -namespace mg5amcGpu -{ - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - // *** FIXME! This will all need to be designed differently when going to multi-GPU nodes! *** - struct CudaRuntime final - { - CudaRuntime( const bool debug = true ) - : m_debug( debug ) { setUp( m_debug ); } - ~CudaRuntime() { tearDown( m_debug ); } - CudaRuntime( const CudaRuntime& ) = delete; - CudaRuntime( CudaRuntime&& ) = delete; - CudaRuntime& operator=( const CudaRuntime& ) = delete; - CudaRuntime& operator=( CudaRuntime&& ) = delete; - bool m_debug; - - // Set up CUDA application - // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** - // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization - static void setUp( const bool debug = true ) - { - // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization - // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! - /* - // [We initially added cudaFree(0) to "ease profile analysis" only because it shows up as a big recognizable block!] - // No explicit initialization is needed: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#initialization - // It is not clear what cudaFree(0) does at all: https://stackoverflow.com/questions/69967813/ - if ( debug ) std::cout << "__CudaRuntime: calling cudaFree(0)" << std::endl; - checkCuda( cudaFree( 0 ) ); // SLOW! - */ - // Replace cudaFree(0) by cudaSetDevice(0), even if it is not really needed either - // (but see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs) - if( debug ) std::cout << "__CudaRuntime: calling cudaSetDevice(0)" << std::endl; - checkCuda( cudaSetDevice( 0 ) ); // SLOW! - } - - // Tear down CUDA application (call cudaDeviceReset) - // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** - // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck - // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking - static void tearDown( const bool debug = true ) - { - if( debug ) std::cout << "__CudaRuntime: calling cudaDeviceReset()" << std::endl; - checkCuda( cudaDeviceReset() ); - } - }; - -} -#endif - -//-------------------------------------------------------------------------- - -#endif // MG5AMC_CUDARUNTIME_H diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/CurandRandomNumberKernel.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/CurandRandomNumberKernel.cc index eb56333b03..08a16f6f2c 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/CurandRandomNumberKernel.cc +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/CurandRandomNumberKernel.cc @@ -1,9 +1,9 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. 
Valassi (2021-2023) for the MG5aMC CUDACPP plugin. -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" @@ -22,7 +22,7 @@ inline void assertCurand( curandStatus_t code, const char *file, int line, bool } #endif /* clang-format on */ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -36,7 +36,7 @@ namespace mg5amcCpu { if( m_isOnDevice ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !m_rnarray.isOnDevice() ) throw std::runtime_error( "CurandRandomNumberKernel on device with a host random number array" ); #else @@ -114,7 +114,7 @@ namespace mg5amcCpu /* printf( "\nCurandRandomNumberKernel::generateRnarray size = %d\n", (int)m_rnarray.size() ); fptype* data = m_rnarray.data(); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( m_rnarray.isOnDevice() ) { data = new fptype[m_rnarray.size()](); @@ -123,7 +123,7 @@ namespace mg5amcCpu #endif for( int i = 0; i < ( (int)m_rnarray.size() / 4 ); i++ ) printf( "[%4d] %f %f %f %f\n", i * 4, data[i * 4], data[i * 4 + 2], data[i * 4 + 2], data[i * 4 + 3] ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( m_rnarray.isOnDevice() ) delete[] data; #endif */ diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/EventStatistics.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/EventStatistics.h index 48b51e0a49..b425a5bade 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/EventStatistics.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/EventStatistics.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef EventStatistics_H #define EventStatistics_H 1 @@ -16,7 +16,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/GpuAbstraction.h new file mode 100644 index 0000000000..6a7d9c05c0 --- /dev/null +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/GpuAbstraction.h @@ -0,0 +1,71 @@ +// Copyright (C) 2020-2023 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: J. Teig (Jul 2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
+
+#ifndef MG5AMC_GPUABSTRACTION_H
+#define MG5AMC_GPUABSTRACTION_H 1
+
+#include
+
+//--------------------------------------------------------------------------
+
+#ifdef __CUDACC__
+
+#define gpuError_t cudaError_t
+#define gpuPeekAtLastError cudaPeekAtLastError
+#define gpuGetErrorString cudaGetErrorString
+#define gpuSuccess cudaSuccess
+
+#define gpuMallocHost( ptr, size ) checkGpu( cudaMallocHost( ptr, size ) )
+#define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) )
+
+#define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( cudaMemcpy( dstData, srcData, srcBytes, func ) )
+#define gpuMemcpyHostToDevice cudaMemcpyHostToDevice
+#define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost
+#define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( cudaMemcpyToSymbol( type1, type2, size ) )
+
+#define gpuFree( ptr ) checkGpu( cudaFree( ptr ) )
+#define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) )
+
+#define gpuSetDevice cudaSetDevice
+#define gpuDeviceSynchronize cudaDeviceSynchronize
+#define gpuDeviceReset cudaDeviceReset
+
+#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ )
+#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ )
+
+//--------------------------------------------------------------------------
+
+#elif defined __HIPCC__
+
+#include "hip/hip_runtime.h"
+
+#define gpuError_t hipError_t
+#define gpuPeekAtLastError hipPeekAtLastError
+#define gpuGetErrorString hipGetErrorString
+#define gpuSuccess hipSuccess
+
+#define gpuMallocHost( ptr, size ) checkGpu( hipHostMalloc( ptr, size ) ) // NB: hipHostMalloc is the current API (hipMallocHost is deprecated)
+#define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) )
+
+#define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( hipMemcpy( dstData, srcData, srcBytes, func ) )
+#define gpuMemcpyHostToDevice hipMemcpyHostToDevice
+#define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost
+#define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( hipMemcpyToSymbol( type1, type2, size ) )
+
+#define gpuFree( ptr ) checkGpu( hipFree( ptr ) )
+#define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) )
+
+#define gpuSetDevice hipSetDevice
+#define gpuDeviceSynchronize hipDeviceSynchronize
+#define gpuDeviceReset hipDeviceReset
+
+#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ )
+#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ )
+
+//--------------------------------------------------------------------------
+
+#endif
+
+#endif // MG5AMC_GPUABSTRACTION_H
diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/GpuRuntime.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/GpuRuntime.h
new file mode 100644
index 0000000000..93579ef08b
--- /dev/null
+++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/GpuRuntime.h
@@ -0,0 +1,85 @@
+// Copyright (C) 2020-2023 CERN and UCLouvain.
+// Licensed under the GNU Lesser General Public License (version 3 or later).
+// Created by: J. Teig (Jun 2023, based on earlier work by S. Roiser) for the MG5aMC CUDACPP plugin.
+// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
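To make the two macro sets above concrete, a self-contained sketch of a call site that compiles under either nvcc or hipcc (illustrative only, not from this patch; saxpy, runSaxpy and the 256-thread block size are hypothetical):

// Sketch only: the same source drives cudaMalloc/cudaMemcpy/<<<...>>> or their HIP
// equivalents, depending on which compiler expands the gpu* macros.
#include "GpuRuntime.h" // brings in GpuAbstraction.h and defines checkGpu

__global__ void saxpy( int n, float a, const float* x, float* y ) // hypothetical kernel
{
  const int i = blockIdx.x * blockDim.x + threadIdx.x;
  if( i < n ) y[i] = a * x[i] + y[i];
}

void runSaxpy( int n, float a, const float* hstX, float* hstY ) // hypothetical driver
{
  float *devX, *devY;
  gpuMalloc( &devX, n * sizeof( float ) ); // cudaMalloc or hipMalloc, wrapped in checkGpu
  gpuMalloc( &devY, n * sizeof( float ) );
  gpuMemcpy( devX, hstX, n * sizeof( float ), gpuMemcpyHostToDevice );
  gpuMemcpy( devY, hstY, n * sizeof( float ), gpuMemcpyHostToDevice );
  gpuLaunchKernel( saxpy, ( n + 255 ) / 256, 256, n, a, devX, devY ); // saxpy<<<blocks, threads>>>( n, a, devX, devY )
  checkGpu( gpuPeekAtLastError() );
  gpuMemcpy( hstY, devY, n * sizeof( float ), gpuMemcpyDeviceToHost );
  gpuFree( devX );
  gpuFree( devY );
}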
+
+#ifndef MG5AMC_GPURUNTIME_H
+#define MG5AMC_GPURUNTIME_H 1
+
+// MG5AMC on GPU uses the CUDA runtime API, not the lower level CUDA driver API
+// See https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html#driver-vs-runtime-api
+
+#include "GpuAbstraction.h"
+
+#include
+
+//--------------------------------------------------------------------------
+
+// See https://stackoverflow.com/a/14038590
+#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */
+#define checkGpu( code ) { assertGpu( code, __FILE__, __LINE__ ); }
+inline void assertGpu( gpuError_t code, const char* file, int line, bool abort = true )
+{
+  if( code != gpuSuccess )
+  {
+    printf( "ERROR! assertGpu: '%s' (%d) in %s:%d\n", gpuGetErrorString( code ), code, file, line );
+    if( abort ) assert( code == gpuSuccess );
+  }
+}
+#endif /* clang-format on */
+
+//--------------------------------------------------------------------------
+
+#ifdef MGONGPUCPP_GPUIMPL
+namespace mg5amcGpu
+{
+  // Instantiate a GpuRuntime at the beginning of the application's main to
+  // invoke gpuSetDevice(0) in the constructor and book a gpuDeviceReset() call in the destructor
+  // *** FIXME! This will all need to be designed differently when going to multi-GPU nodes! ***
+  struct GpuRuntime final
+  {
+    GpuRuntime( const bool debug = true )
+      : m_debug( debug ) { setUp( m_debug ); }
+    ~GpuRuntime() { tearDown( m_debug ); }
+    GpuRuntime( const GpuRuntime& ) = delete;
+    GpuRuntime( GpuRuntime&& ) = delete;
+    GpuRuntime& operator=( const GpuRuntime& ) = delete;
+    GpuRuntime& operator=( GpuRuntime&& ) = delete;
+    bool m_debug;
+
+    // Set up CUDA application
+    // ** NB: strictly speaking this is not needed when using the CUDA runtime API **
+    // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization
+    static void setUp( const bool debug = true )
+    {
+      // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization
+      // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer!
+      /*
+      // [We initially added cudaFree(0) to "ease profile analysis" only because it shows up as a big recognizable block!]
+      // No explicit initialization is needed: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#initialization
+      // It is not clear what cudaFree(0) does at all: https://stackoverflow.com/questions/69967813/
+      if ( debug ) std::cout << "__CudaRuntime: calling cudaFree(0)" << std::endl;
+      checkCuda( cudaFree( 0 ) ); // SLOW!
+      */
+      // Replace cudaFree(0) by cudaSetDevice(0), even if it is not really needed either
+      // (but see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs)
+      if( debug ) std::cout << "__GpuRuntime: calling GpuSetDevice(0)" << std::endl;
+      checkGpu( gpuSetDevice( 0 ) ); // SLOW!
+ } + + // Tear down CUDA application (call cudaDeviceReset) + // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** + // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck + // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking + static void tearDown( const bool debug = true ) + { + if( debug ) std::cout << "__GpuRuntime: calling GpuDeviceReset()" << std::endl; + checkGpu( gpuDeviceReset() ); + } + }; +} +#endif + +//-------------------------------------------------------------------------- + +#endif // MG5AMC_GPURUNTIME_H diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MadgraphTest.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MadgraphTest.h index ef40624c88..a64c05c26a 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MadgraphTest.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MadgraphTest.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Dec 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #ifndef MADGRAPHTEST_H_ #define MADGRAPHTEST_H_ 1 @@ -22,7 +22,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; @@ -201,7 +201,7 @@ class MadgraphTest : public testing::TestWithParam // Since we link both the CPU-only and GPU tests into the same executable, we prevent // a multiply defined symbol by only compiling this in the non-CUDA phase: -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL /// Compare momenta and matrix elements. /// This uses an implementation of TestDriverBase to run a madgraph workflow, @@ -307,6 +307,6 @@ TEST_P( MadgraphTest, CompareMomentaAndME ) } } -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL #endif /* MADGRAPHTEST_H_ */ diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MatrixElementKernels.cc index 74b5239ebf..81699dfea9 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MatrixElementKernels.cc @@ -1,12 +1,12 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "MatrixElementKernels.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" // Includes the abstraction for Nvidia/AMD compilation #include "MemoryAccessMomenta.h" #include "MemoryBuffers.h" @@ -14,7 +14,7 @@ //============================================================================ -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu { @@ -150,7 +150,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { @@ -209,13 +209,13 @@ namespace mg5amcGpu PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); DeviceBufferHelicityMask devIsGoodHel( ncomb ); // ... 0d1. 
Compute good helicity mask on the device
-    computeDependentCouplings<<<m_gpublocks, m_gputhreads>>>( m_gs.data(), m_couplings.data() );
+    gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    sigmaKin_getGoodHel<<<m_gpublocks, m_gputhreads>>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() );
+    gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() );
 #else
-    sigmaKin_getGoodHel<<<m_gpublocks, m_gputhreads>>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() );
+    gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() );
 #endif
-    checkCuda( cudaPeekAtLastError() );
+    checkGpu( gpuPeekAtLastError() );
     // ... 0d2. Copy back good helicity mask to the host
     copyHostFromDevice( hstIsGoodHel, devIsGoodHel );
     // ... 0d3. Copy back good helicity list to constant memory on the device
@@ -226,19 +226,19 @@ namespace mg5amcGpu
   void MatrixElementKernelDevice::computeMatrixElements( const unsigned int channelId )
   {
-    computeDependentCouplings<<<m_gpublocks, m_gputhreads>>>( m_gs.data(), m_couplings.data() );
+    gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() );
 #ifndef MGONGPU_NSIGHT_DEBUG
     constexpr unsigned int sharedMemSize = 0;
 #else
     constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float );
 #endif
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    sigmaKin<<<m_gpublocks, m_gputhreads, sharedMemSize>>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() );
+    gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() );
 #else
-    sigmaKin<<<m_gpublocks, m_gputhreads, sharedMemSize>>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() );
+    gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() );
 #endif
-    checkCuda( cudaPeekAtLastError() );
-    checkCuda( cudaDeviceSynchronize() );
+    checkGpu( gpuPeekAtLastError() );
+    checkGpu( gpuDeviceSynchronize() );
   }

   //--------------------------------------------------------------------------
diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MatrixElementKernels.h
index 23e84757a2..72bd8f195b 100644
--- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MatrixElementKernels.h
+++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MatrixElementKernels.h
@@ -1,7 +1,7 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
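The mechanical mapping applied in MatrixElementKernels.cc above, summarized as a sketch (illustrative only; the main function is hypothetical and not part of this patch):

// Before (CUDA-only):
//   sigmaKin<<<m_gpublocks, m_gputhreads, sharedMemSize>>>( ... );
//   checkCuda( cudaPeekAtLastError() );
// After (CUDA or HIP, via GpuAbstraction.h):
//   gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, ... );
//   checkGpu( gpuPeekAtLastError() );
// An application main then only needs the RAII wrapper from GpuRuntime.h:
#include "GpuRuntime.h"
int main()
{
  mg5amcGpu::GpuRuntime runtime; // gpuSetDevice(0) now, gpuDeviceReset() in the destructor
  // ... allocate buffers and launch kernels via gpuLaunchKernel etc ...
  return 0;
}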
#ifndef MATRIXELEMENTKERNELS_H #define MATRIXELEMENTKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -81,7 +81,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a CPU host class MatrixElementKernelHost final : public MatrixElementKernelBase, public NumberOfEvents { @@ -130,7 +130,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a GPU device class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessAmplitudes.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessAmplitudes.h index 573b3bbbc9..ffb76e93de 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessAmplitudes.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessAmplitudes.h @@ -15,7 +15,7 @@ #define MGONGPU_TRIVIAL_AMPLITUDES 1 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessCouplings.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessCouplings.h index 35a3af42e0..3afdf3e554 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessCouplings.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessCouplings.h @@ -15,7 +15,7 @@ #include "MemoryBuffers.h" // for HostBufferCouplings::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessCouplingsFixed.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessCouplingsFixed.h index dc0d93afff..ffcdf4dbef 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessCouplingsFixed.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessCouplingsFixed.h @@ -14,7 +14,7 @@ //#include "MemoryAccessHelpers.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessDenominators.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessDenominators.h index 3bce635718..66f2d32a6b 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessDenominators.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessDenominators.h @@ -10,7 +10,7 @@ #include "MemoryAccessGs.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessGs.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessGs.h index 31311aa375..4c726b30f3 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessGs.h 
+++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessGs.h @@ -13,7 +13,7 @@ #include "MemoryBuffers.h" // for HostBufferMatrixElements::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessHelpers.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessHelpers.h index c82a6c7635..db73e4e064 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessHelpers.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessHelpers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessHelpers_H #define MemoryAccessHelpers_H 1 @@ -105,7 +105,7 @@ class KernelAccessHelper : public MemoryAccessHelper } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid //printf( "kernelAccessRecord: ievt=%d threadId=%d\n", ievt, threadIdx.x ); return T::ieventAccessRecord( buffer, ievt ); // NB fptype and fptype_sv coincide for CUDA diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessMatrixElements.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessMatrixElements.h index f32e6fea5b..3741011971 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessMatrixElements.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessMatrixElements.h @@ -13,7 +13,7 @@ #include "MemoryBuffers.h" // for HostBufferMatrixElements::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessMomenta.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessMomenta.h index 29266de32c..3be229d392 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessMomenta.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessMomenta.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#ifndef MemoryAccessMomenta_H #define MemoryAccessMomenta_H 1 @@ -13,7 +13,7 @@ #include "MemoryAccessVectors.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -30,7 +30,7 @@ namespace mg5amcCpu // Number of Events Per Page in the momenta AOSOA memory buffer layout // (these are all best kept as a compile-time constants: see issue #23) -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ // ----------------------------------------------------------------------------------------------- // --- GPUs: neppM is best set to a power of 2 times the number of fptype's in a 32-byte cacheline // --- This is relevant to ensure coalesced access to momenta in global memory diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessNumerators.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessNumerators.h index b152183b28..18991f4fa6 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessNumerators.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessNumerators.h @@ -10,7 +10,7 @@ #include "MemoryAccessGs.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessRandomNumbers.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessRandomNumbers.h index e2988d39f3..40cb089135 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessRandomNumbers.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessRandomNumbers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessRandomNumbers_H #define MemoryAccessRandomNumbers_H 1 @@ -11,7 +11,7 @@ #include "CPPProcess.h" #include "MemoryAccessHelpers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessVectors.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessVectors.h index e9b197368e..08faccff0f 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessVectors.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessVectors.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
#ifndef MemoryAccessVectors_H #define MemoryAccessVectors_H 1 @@ -10,7 +10,7 @@ #include "mgOnGpuVectors.h" -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu // this is only needed for CPU SIMD vectorization { diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessWavefunctions.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessWavefunctions.h index 5428aaf933..33bef4559e 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessWavefunctions.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessWavefunctions.h @@ -15,7 +15,7 @@ #define MGONGPU_TRIVIAL_WAVEFUNCTIONS 1 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryBuffers.h index 3093e6ed18..7756a71621 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryBuffers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021, based on earlier work by S. Hageboeck) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryBuffers_H #define MemoryBuffers_H 1 @@ -11,12 +11,12 @@ #include "mgOnGpuCxtypes.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "Parameters_sm.h" #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -87,7 +87,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL constexpr bool HostBufferALIGNED = false; // ismisaligned=false constexpr bool HostBufferMISALIGNED = true; // ismisaligned=true @@ -119,7 +119,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer template class PinnedHostBufferBase : public BufferBase @@ -128,18 +128,18 @@ namespace mg5amcCpu PinnedHostBufferBase( const size_t size ) : BufferBase( size, false ) { - checkCuda( cudaMallocHost( &( this->m_data ), this->bytes() ) ); + gpuMallocHost( &( this->m_data ), this->bytes() ); } virtual ~PinnedHostBufferBase() { - checkCuda( cudaFreeHost( this->m_data ) ); + gpuFreeHost( this->m_data ); } }; #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer template class DeviceBufferBase : public BufferBase @@ -148,18 +148,18 @@ namespace mg5amcCpu DeviceBufferBase( const size_t size ) : BufferBase( size, true ) { - checkCuda( cudaMalloc( &( this->m_data ), this->bytes() ) ); + gpuMalloc( &( this->m_data ), this->bytes() ); } virtual ~DeviceBufferBase() { - checkCuda( cudaFree( this->m_data ) ); + gpuFree( this->m_data ); } }; #endif //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for a given number of events 
template class HostBuffer : public HostBufferBase, virtual private NumberOfEvents @@ -175,7 +175,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer for a given number of events template class PinnedHostBuffer : public PinnedHostBufferBase, virtual private NumberOfEvents @@ -191,7 +191,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer for a given number of events template class DeviceBuffer : public DeviceBufferBase, virtual private NumberOfEvents @@ -213,7 +213,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta random numbers constexpr size_t sizePerEventRndNumMomenta = MemoryBuffers::np4 * MemoryBuffers::nparf; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta random numbers typedef HostBuffer HostBufferRndNumMomenta; #else @@ -232,7 +232,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer with ONE fptype per event constexpr size_t sizePerEventOneFp = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer with ONE fptype per event typedef HostBuffer HostBufferOneFp; #else @@ -257,7 +257,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for Gs constexpr size_t sizePerEventGs = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferGs; #else @@ -276,7 +276,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for numerators constexpr size_t sizePerEventNumerators = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferNumerators; #else @@ -296,7 +296,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for denominators constexpr size_t sizePerEventDenominators = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferDenominators; #else @@ -315,7 +315,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for random numbers constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferCouplings; #else @@ -333,7 +333,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta constexpr size_t sizePerEventMomenta = MemoryBuffers::np4 * MemoryBuffers::npar; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta typedef HostBuffer HostBufferMomenta; //typedef HostBuffer HostBufferMomenta; // TEST MISALIGNMENT! 
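In the PinnedHostBufferBase and DeviceBufferBase constructors and destructors above, the explicit checkCuda( cudaMallocHost( ... ) ) wrappers disappear from the call sites, which suggests that the error check moves inside the new allocation macros. A plausible sketch under that assumption (note that on HIP the pinned-memory API is hipHostMalloc/hipHostFree, not a literal s/cuda/hip/ rename):

// Sketch only: allocation wrappers inferred from the MemoryBuffers.h call sites above
#if defined __CUDACC__
#define gpuMallocHost( ptr, size ) checkGpu( cudaMallocHost( ptr, size ) ) // pinned host memory
#define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) ) // device memory
#define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) )
#define gpuFree( ptr ) checkGpu( cudaFree( ptr ) )
#elif defined __HIPCC__
#define gpuMallocHost( ptr, size ) checkGpu( hipHostMalloc( ptr, size ) ) // NB: hipMallocHost is deprecated in HIP
#define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) )
#define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) )
#define gpuFree( ptr ) checkGpu( hipFree( ptr ) )
#endif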
@@ -352,7 +352,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for sampling weights constexpr size_t sizePerEventWeights = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for sampling weights typedef HostBuffer HostBufferWeights; #else @@ -370,7 +370,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for matrix elements constexpr size_t sizePerEventMatrixElements = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for matrix elements typedef HostBuffer HostBufferMatrixElements; #else @@ -385,7 +385,7 @@ namespace mg5amcCpu // A base class encapsulating a memory buffer for the helicity mask typedef BufferBase BufferHelicityMask; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for the helicity mask typedef HostBufferBase HostBufferHelicityMask; #else @@ -403,7 +403,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for wavefunctions constexpr size_t sizePerEventWavefunctions = MemoryBuffers::nw6 * MemoryBuffers::nx2; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for wavefunctions typedef HostBuffer HostBufferWavefunctions; #else @@ -421,7 +421,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity random numbers constexpr size_t sizePerEventRndNumHelicity = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity random numbers typedef HostBuffer HostBufferRndNumHelicity; #else @@ -439,7 +439,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color random numbers constexpr size_t sizePerEventRndNumColor = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color random numbers typedef HostBuffer HostBufferRndNumColor; #else @@ -457,7 +457,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity selection constexpr size_t sizePerEventSelectedHelicity = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity selection typedef HostBuffer HostBufferSelectedHelicity; #else @@ -475,7 +475,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color selection constexpr size_t sizePerEventSelectedColor = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color selection typedef HostBuffer HostBufferSelectedColor; #else @@ -487,7 +487,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy { @@ -504,13 +504,13 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyHostToDevice ); } #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void copyHostFromDevice( Tdst& dst, const Tsrc& src ) // keep the same order 
of arguments as in memcpy { @@ -527,7 +527,7 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyDeviceToHost ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyDeviceToHost ); } #endif diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.cc index 19e6cd201c..a478ecb28e 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.cc +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -46,7 +45,7 @@ // Class member functions for calculating the matrix elements for // Process: g g > t t~ g g g WEIGHTED<=5 @1 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -80,7 +79,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -90,7 +89,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -118,13 +117,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -151,7 +150,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -187,7 +186,7 @@ namespace mg5amcCpu #endif for( int iParity = 
0; iParity < nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings @@ -200,8 +199,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -30018,7 +30019,7 @@ namespace mg5amcCpu { -116, 442, 442, -134, -134, 505, -44, -134, 28, -224, -62, 496, -53, 19, -62, 496, 10, -80, -62, -71, 10, -80, 1, -8, 136, -116, -116, -44, -44, 514, -116, 442, -44, 28, -53, -62, -44, -53, 514, -62, 100, 10, 28, -62, -62, 10, 10, 1, 514, 505, -62, 496, -71, 568, -44, -134, -53, -62, 19, -71, 10, -80, 100, 10, 640, -80, 1, -8, 10, 1, 64, -8, -62, -71, 10, -80, 1, -8, 28, -62, -62, 10, 10, 1, 1, -8, 10, 1, 64, -8, -8, 64, -80, -8, -512, 64, 496, 568, -80, 640, -8, 64, -224, 496, 496, -80, -80, -8, -8, 64, -80, -8, -512, 64, 64, -512, 640, 64, 4096, -512 }, { 136, -116, -116, -44, -44, 514, -116, 442, -44, 28, -53, -62, -44, -53, 514, -62, 100, 10, 28, -62, -62, 10, 10, 1, -116, 442, 442, -134, -134, 505, -44, -134, 28, -224, -62, 496, -53, 19, -62, 496, 10, -80, -62, -71, 10, -80, 1, -8, -44, -134, -53, -62, 19, -71, 514, 505, -62, 496, -71, 568, 100, 10, 10, -80, -80, 640, 10, 1, 1, -8, -8, 64, 28, -62, -62, 10, 10, 1, -62, -71, 10, -80, 1, -8, 10, 1, 1, -8, -8, 64, -80, -8, -8, 64, 64, -512, -224, 496, 496, -80, -80, -8, 496, 568, -80, 640, -8, 64, -80, -8, -8, 64, 64, -512, 640, 64, 64, -512, -512, 4096 } }; // 2-D array[120][120] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -30075,7 +30076,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -30134,7 +30135,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -30293,8 +30294,8 @@ namespace mg5amcCpu { 1, 1, 1, -1, 1, -1, 1 }, { 1, 1, 1, -1, 1, 1, -1 }, { 1, 1, 1, -1, 1, 1, 1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -30337,9 +30338,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... 
}; // nicoup=0 -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -30378,7 +30379,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] // [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -30443,12 +30444,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -30469,7 +30470,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -30595,9 +30596,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -30621,7 +30622,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -30641,7 +30642,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 1536 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -30655,9 +30656,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -30685,7 +30689,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) 
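The data transfers follow the same pattern: gpuMemcpy and gpuMemcpyToSymbol, with the gpuMemcpyHostToDevice / gpuMemcpyDeviceToHost kinds, replace the checkCuda-wrapped cudaMemcpy and cudaMemcpyToSymbol calls in copyDeviceFromHost/copyHostFromDevice and in the cHel/cIPD/cGoodHel constant-memory updates above. A sketch of what these wrappers presumably expand to (one assumed HIP-specific detail: hipMemcpyToSymbol expects the __constant__ symbol to be wrapped in HIP_SYMBOL):

// Sketch only: copy wrappers inferred from the call sites in this patch
#if defined __CUDACC__
#define gpuMemcpyHostToDevice cudaMemcpyHostToDevice
#define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost
#define gpuMemcpy( dst, src, bytes, kind ) checkGpu( cudaMemcpy( dst, src, bytes, kind ) )
#define gpuMemcpyToSymbol( symbol, src, bytes ) checkGpu( cudaMemcpyToSymbol( symbol, src, bytes ) )
#elif defined __HIPCC__
#define gpuMemcpyHostToDevice hipMemcpyHostToDevice
#define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost
#define gpuMemcpy( dst, src, bytes, kind ) checkGpu( hipMemcpy( dst, src, bytes, kind ) )
#define gpuMemcpyToSymbol( symbol, src, bytes ) checkGpu( hipMemcpyToSymbol( HIP_SYMBOL( symbol ), src, bytes ) )
#endif

For example, gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) above would then expand to exactly the cudaMemcpyToSymbol call that it replaces in a CUDA build.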
-#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -30895,7 +30899,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.h index 2565923dde..fff95b66e2 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -107,7 +107,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -120,7 +120,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -150,7 +150,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CudaRuntime.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/GpuAbstraction.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/GpuAbstraction.h new file mode 120000 index 0000000000..72054e19ba --- /dev/null +++ 
b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/GpuAbstraction.h @@ -0,0 +1 @@ +../GpuAbstraction.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/GpuRuntime.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/GpuRuntime.h new file mode 120000 index 0000000000..3920e83be4 --- /dev/null +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/GpuRuntime.h @@ -0,0 +1 @@ +../GpuRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/check_sa.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/check_sa.cc index 3fbf0ffbee..7cac5ab47b 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/check_sa.cc +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -65,7 +66,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -96,7 +97,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) 
-#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; #endif @@ -134,9 +135,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784) + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) +#elif defined __HIPCC__ +#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random #elif defined __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU if build has curand + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #endif @@ -146,10 +149,10 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) bool bridge = false; @@ -177,7 +180,7 @@ main( int argc, char** argv ) else if( arg == "--curdev" ) { #ifndef __CUDACC__ - throw std::runtime_error( "CurandDevice is not supported on CPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" ); #elif defined MGONGPU_HAS_NO_CURAND throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" ); #else @@ -198,7 +201,7 @@ main( int argc, char** argv ) } else if( arg == "--rmbdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -272,13 +275,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -296,14 +299,14 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL - // --- 00. Initialise cuda - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - const std::string cdinKey = "00 CudaInit"; + // --- 00. Initialise GPU + // Instantiate a GpuRuntime at the beginning of the application's main. + // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. + const std::string cdinKey = "00 GpuInit"; timermap.start( cdinKey ); - CudaRuntime cudaRuntime( debug ); + GpuRuntime gpuRuntime( debug ); #endif // --- 0a.
Initialise physics process @@ -325,7 +328,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -333,7 +336,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -341,7 +344,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -349,7 +352,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -366,7 +369,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -375,7 +378,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -384,7 +387,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -392,7 +395,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -400,7 +403,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -438,7 +441,7 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! 
CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } @@ -450,7 +453,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -461,7 +464,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -469,7 +472,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -511,7 +514,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -543,7 +546,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -588,7 +591,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -617,7 +620,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -760,18 +763,22 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; -#endif +#endif /* clang-format off */ // -- DOUBLE or FLOAT? #if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) @@ -781,7 +788,7 @@ main( int argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif +#endif /* clang-format on */ // -- CUCOMPLEX or THRUST or STD complex numbers? 
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -793,6 +800,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_CUCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -818,7 +831,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -874,7 +887,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -895,6 +908,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -921,21 +936,21 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? " == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -966,7 +981,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1062,14 +1077,14 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1077,7 +1092,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? 
" == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/RamboSamplingKernels.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/RamboSamplingKernels.cc index da68aa9255..79abbcc4f8 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/RamboSamplingKernels.cc +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/RamboSamplingKernels.cc @@ -1,11 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "RamboSamplingKernels.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessRandomNumbers.h" #include "MemoryAccessWeights.h" @@ -14,7 +14,7 @@ #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -92,7 +92,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingKernelDevice::RamboSamplingKernelDevice( const fptype energy, // input: energy const BufferRndNumMomenta& rndmom, // input: random numbers in [0,1] BufferMomenta& momenta, // output: momenta @@ -135,7 +135,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaInitialDevice( const fptype energy, fptype* momenta ) @@ -147,17 +147,17 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaInitial() { - getMomentaInitialDevice<<>>( m_energy, m_momenta.data() ); + gpuLaunchKernel( getMomentaInitialDevice, m_gpublocks, m_gputhreads, m_energy, m_momenta.data() ); } #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaFinalDevice( const fptype energy, const fptype* rndmom, @@ -171,11 +171,11 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaFinal() { - getMomentaFinalDevice<<>>( m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); + gpuLaunchKernel( getMomentaFinalDevice, m_gpublocks, m_gputhreads, m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); } #endif diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/RamboSamplingKernels.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/RamboSamplingKernels.h index 184089efd7..7c214cd74b 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/RamboSamplingKernels.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/RamboSamplingKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#ifndef RAMBOSAMPLINGKERNELS_H #define RAMBOSAMPLINGKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating RAMBO phase space sampling on a GPU device class RamboSamplingKernelDevice final : public SamplingKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/RandomNumberKernels.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/RandomNumberKernels.h index 188a72c2c9..21d63beeac 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/RandomNumberKernels.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/RandomNumberKernels.h @@ -1,14 +1,14 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef RANDOMNUMBERKERNELS_H #define RANDOMNUMBERKERNELS_H 1 #include "mgOnGpuConfig.h" -// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when __CUDACC__ is not defined +// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when MGONGPUCPP_GPUIMPL is not defined #ifndef MGONGPU_HAS_NO_CURAND //#include "curand.h" struct curandGenerator_st; // forward definition from curand.h @@ -16,7 +16,7 @@ struct curandGenerator_st; // forward definition from curand.h #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp.mk index 509307506b..f2cfa349da 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ # Copyright (C) 2020-2023 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +# Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts @@ -42,10 +42,10 @@ endif #------------------------------------------------------------------------------- -#=== Configure common compiler flags for C++ and CUDA +#=== Configure common compiler flags for C++ and CUDA/HIP INCFLAGS = -I. 
-OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here +OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here # Dependency on src directory MG5AMC_COMMONLIB = mg5amc_common @@ -121,24 +121,46 @@ endif #------------------------------------------------------------------------------- -#=== Configure the CUDA compiler +#=== Configure the GPU compiler (CUDA or HIP) -# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) -# This is because it is impossible to pass this to "CUFLAGS += -ccbin " below +# FIXME! (AV 24.01.2024) +# In the current implementation (without separate builds for C++ and CUDA/HIP), we first check for nvcc and hipcc in CUDA_HOME and HIP_HOME. +# If CUDA_HOME or HIP_HOME are not set, try to determine them from the path to nvcc and hipcc. +# While convoluted, this is currently necessary to allow disabling CUDA/HIP builds by setting CUDA_HOME or HIP_HOME to invalid paths. +# This will (probably?) be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775). + +# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA and HIP builds (issue #505) +# This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside - $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") + $(warning CUDA and HIP builds are not supported for multi-word CXX "$(CXX)") override CUDA_HOME=disabled + override HIP_HOME=disabled endif -# If CUDA_HOME is not set, try to set it from the location of nvcc +# If CUDA_HOME is not set, try to set it from the path to nvcc ifndef CUDA_HOME CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null)) $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") endif -# Set NVCC as $(CUDA_HOME)/bin/nvcc if it exists +# If HIP_HOME is not set, try to set it from the path to hipcc ifndef HIP_HOME + HIP_HOME = $(patsubst %bin/hipcc,%,$(HIP_COMPILER_PATH)) + $(warning HIP_HOME was not set: using "$(HIP_HOME)") +endif + +# FIXME! (AV 24.01.2024) +# In the current implementation (without separate builds for C++ and CUDA/HIP), +# builds are performed for HIP only if CUDA is not found in the path. +# If both CUDA and HIP are installed, HIP builds can be triggered by unsetting CUDA_HOME. +# This will be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775). + +#--- Option 1: CUDA exists -> use CUDA + +# Set GPUCC as $(CUDA_HOME)/bin/nvcc if it exists ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) - NVCC = $(CUDA_HOME)/bin/nvcc + + GPUCC = $(CUDA_HOME)/bin/nvcc USE_NVTX ?=-DUSE_NVTX # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ @@ -158,41 +180,78 @@ ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here!
endif CUOPTFLAGS = -lineinfo - CUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math - ###CUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow - ###NVCC_VERSION = $(shell $(NVCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) - CUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h + ###GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + GPUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + ###GPUCC_VERSION = $(shell $(GPUCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) + GPUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + CUBUILDRULEFLAGS = -Xcompiler -fPIC -c + CCBUILDRULEFLAGS = -Xcompiler -fPIC -c -x cu + CUDATESTFLAGS = -lcuda + + # Set the host C++ compiler for GPUCC via "-ccbin " + # (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) + GPUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) + + # Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) + ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) + GPUFLAGS += -allow-unsupported-compiler + endif + else ifneq ($(origin REQUIRE_CUDA),undefined) + # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) - $(error No cuda installation found (set CUDA_HOME or make nvcc visible in PATH)) + $(error No cuda installation found (set CUDA_HOME or make GPUCC visible in PATH)) + +#--- Option 2: CUDA does not exist, HIP exists -> use HIP + +# Set GPUCC as $(HIP_HOME)/bin/hipcc if it exists +else ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),) + + GPUCC = $(HIP_HOME)/bin/hipcc + #USE_NVTX ?=-DUSE_NVTX # should maybe find something equivalent to this in HIP? 
+ HIPARCHFLAGS = -target x86_64-linux-gnu --offload-arch=gfx90a + HIPINC = -I$(HIP_HOME)/include/ + # Note: -DHIP_FAST_MATH is equivalent to -use_fast_math in HIP + # (but only for single precision line 208: https://rocm-developer-tools.github.io/HIP/hcc__detail_2math__functions_8h_source.html) + GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -DHIP_FAST_MATH -DHIP_PLATFORM=amd -fPIC + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + GPUFLAGS += -std=c++17 + ###GPUFLAGS+= --maxrregcount 255 # (AV: is this option valid on HIP and meaningful on AMD GPUs?) + CUBUILDRULEFLAGS = -fPIC -c + CCBUILDRULEFLAGS = -fPIC -c + +else ifneq ($(origin REQUIRE_HIP),undefined) + + # If REQUIRE_HIP is set but no HIP is found, stop here (e.g. for CI tests on GPU #443) + $(error No hip installation found (set HIP_HOME or make GPUCC visible in PATH)) + +#--- Option 3: CUDA does not exist, HIP does not exist -> switch off both CUDA and HIP + else - # No cuda. Switch cuda compilation off and go to common random numbers in C++ + + # No cudacc and no hipcc: switch CUDA and HIP compilation off and go to common random numbers in C++ $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda) - override NVCC= + $(warning HIP_HOME is not set or is invalid: export HIP_HOME to compile with hip) + override GPUCC= override USE_NVTX= override CUINC= override CURANDLIBFLAGS= -endif -export NVCC -export CUFLAGS - -# Set the host C++ compiler for nvcc via "-ccbin " -# (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) -CUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) -# Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) -ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) -CUFLAGS += -allow-unsupported-compiler endif +# Export GPUCC (so that it can also be used in cudacpp_src.mk?) +export GPUCC +export GPUFLAGS + #------------------------------------------------------------------------------- -#=== Configure ccache for C++ and CUDA builds +#=== Configure ccache for C++ and CUDA/HIP builds # Enable ccache if USECCACHE=1 ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) @@ -201,15 +260,15 @@ endif #ifeq ($(USECCACHE)$(shell echo $(AR) | grep ccache),1) # override AR:=ccache $(AR) #endif -ifneq ($(NVCC),) - ifeq ($(USECCACHE)$(shell echo $(NVCC) | grep ccache),1) - override NVCC:=ccache $(NVCC) +ifneq ($(GPUCC),) + ifeq ($(USECCACHE)$(shell echo $(GPUCC) | grep ccache),1) + override GPUCC:=ccache $(GPUCC) endif endif #------------------------------------------------------------------------------- -#=== Configure PowerPC-specific compiler flags for C++ and CUDA +#=== Configure PowerPC-specific compiler flags for C++ and CUDA/HIP # PowerPC-specific CXX compiler flags (being reviewed) ifeq ($(UNAME_P),ppc64le) @@ -225,9 +284,9 @@ else ######CXXFLAGS+= -fno-semantic-interposition # no benefit (neither alone, nor combined with -flto) endif -# PowerPC-specific CUDA compiler flags (to be reviewed!) +# PowerPC-specific CUDA/HIP compiler flags (to be reviewed!) 
ifeq ($(UNAME_P),ppc64le) - CUFLAGS+= -Xcompiler -mno-float128 + GPUFLAGS+= -Xcompiler -mno-float128 endif #------------------------------------------------------------------------------- @@ -237,7 +296,7 @@ endif # Set the default OMPFLAGS choice ifneq ($(shell $(CXX) --version | egrep '^Intel'),) override OMPFLAGS = -fopenmp -###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without nvcc but not ok with nvcc before #578) +###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without GPUCC but not ok with GPUCC before #578) else ifneq ($(shell $(CXX) --version | egrep '^(clang)'),) override OMPFLAGS = -fopenmp ###override OMPFLAGS = # disable OpenMP MT on clang (was not ok without or with nvcc before #578) @@ -293,7 +352,10 @@ endif # Set the default RNDGEN (random number generator) choice ifeq ($(RNDGEN),) - ifeq ($(NVCC),) + ifeq ($(GPUCC),) + override RNDGEN = hasNoCurand + # Edgecase for HIP compilation + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) override RNDGEN = hasNoCurand else ifeq ($(RNDGEN),) override RNDGEN = hasCurand @@ -310,7 +372,7 @@ export OMPFLAGS #------------------------------------------------------------------------------- -#=== Set the CUDA/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN +#=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN # Set the build flags appropriate to OMPFLAGS $(info OMPFLAGS=$(OMPFLAGS)) @@ -368,13 +430,13 @@ CXXFLAGS+= $(AVXFLAGS) $(info FPTYPE=$(FPTYPE)) ifeq ($(FPTYPE),d) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE - CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE else ifeq ($(FPTYPE),f) CXXFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT - CUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT else ifeq ($(FPTYPE),m) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT - CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT else $(error Unknown FPTYPE='$(FPTYPE)': only 'd', 'f' and 'm' are supported) endif @@ -383,7 +445,7 @@ endif $(info HELINL=$(HELINL)) ifeq ($(HELINL),1) CXXFLAGS += -DMGONGPU_INLINE_HELAMPS - CUFLAGS += -DMGONGPU_INLINE_HELAMPS + GPUFLAGS += -DMGONGPU_INLINE_HELAMPS else ifneq ($(HELINL),0) $(error Unknown HELINL='$(HELINL)': only '0' and '1' are supported) endif @@ -392,7 +454,7 @@ endif $(info HRDCOD=$(HRDCOD)) ifeq ($(HRDCOD),1) CXXFLAGS += -DMGONGPU_HARDCODE_PARAM - CUFLAGS += -DMGONGPU_HARDCODE_PARAM + GPUFLAGS += -DMGONGPU_HARDCODE_PARAM else ifneq ($(HRDCOD),0) $(error Unknown HRDCOD='$(HRDCOD)': only '0' and '1' are supported) endif @@ -444,11 +506,11 @@ ifeq ($(UNAME_S),Darwin) override CULIBFLAGSRPATH2 = else # RPATH to cuda/cpp libs when linking executables - override CXXLIBFLAGSRPATH = -Wl,-rpath,$(LIBDIRRPATH) - override CULIBFLAGSRPATH = -Xlinker -rpath,$(LIBDIRRPATH) + override CXXLIBFLAGSRPATH = -Wl,-rpath=$(LIBDIRRPATH) + override CULIBFLAGSRPATH = -Xlinker -rpath=$(LIBDIRRPATH) # RPATH to common lib when linking cuda/cpp libs - override CXXLIBFLAGSRPATH2 = -Wl,-rpath,'$$ORIGIN' - override CULIBFLAGSRPATH2 = -Xlinker -rpath,'$$ORIGIN' + override CXXLIBFLAGSRPATH2 = -Wl,-rpath='$$ORIGIN' + override CULIBFLAGSRPATH2 = -Xlinker -rpath='$$ORIGIN' endif # Setting LD_LIBRARY_PATH or DYLD_LIBRARY_PATH in the RUNTIME is no longer necessary 
(neither on Linux nor on Mac)
@@ -461,7 +523,7 @@ override RUNTIME =

cxx_main=$(BUILDDIR)/check.exe
fcxx_main=$(BUILDDIR)/fcheck.exe

-ifneq ($(NVCC),)
+ifneq ($(GPUCC),)
cu_main=$(BUILDDIR)/gcheck.exe
fcu_main=$(BUILDDIR)/fgcheck.exe
else
@@ -492,15 +554,16 @@ $(BUILDDIR)/.build.$(TAG):
	@touch $(BUILDDIR)/.build.$(TAG)

# Generic target and build rules: objects from CUDA compilation
-ifneq ($(NVCC),)
+ifneq ($(GPUCC),)
$(BUILDDIR)/%.o : %.cu *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c $< -o $@
+	$(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CUBUILDRULEFLAGS) $< -o $@

$(BUILDDIR)/%_cu.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@
+	$(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CCBUILDRULEFLAGS) $< -o $@
endif
+# (the -x cu flag, needed to compile .cc files as CUDA, is now part of CCBUILDRULEFLAGS above)

# Generic target and build rules: objects from C++ compilation
# (NB do not include CUINC here! add it only for NVTX or curand #679)
@@ -509,11 +572,14 @@ $(BUILDDIR)/%.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
	$(CXX) $(CPPFLAGS) $(CXXFLAGS) -fPIC -c $< -o $@

# Apply special build flags only to CrossSectionKernel.cc and gCrossSectionKernel.cu (no fast math, see #117 and #516)
+# Added edge case for HIP compilation
ifeq ($(shell $(CXX) --version | grep ^nvc++),)
$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS := $(filter-out -ffast-math,$(CXXFLAGS))
$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math
-ifneq ($(NVCC),)
-$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -fno-fast-math
+ifeq ($(findstring nvcc,$(GPUCC)),nvcc)
+  $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -fno-fast-math
+else
+  $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -fno-fast-math
endif
endif

@@ -530,10 +596,10 @@ ifeq ($(RNDGEN),hasCurand)
$(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CUINC)
endif

-# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in nvcc with icx2023 (#592)
+# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in GPUCC with icx2023 (#592)
ifneq ($(shell $(CXX) --version | egrep '^(Intel)'),)
-ifneq ($(NVCC),)
-CUFLAGS += -Xcompiler -Wno-deprecated-builtins
+ifneq ($(GPUCC),)
+GPUFLAGS += -Wno-deprecated-builtins
endif
endif

@@ -541,8 +607,8 @@ endif
# This patch does remove the warning, but I prefer to keep it disabled for the moment...
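A hedged illustration (not code from this patch) of why CrossSectionKernels is singled out for -fno-fast-math: compensated summation of per-event weights depends on floating-point identities that fast math is allowed to optimise away.

    // Kahan-style compensated sum, sketch only (see #117 and #516 for the real motivation).
    // Under -ffast-math the compiler may re-associate the arithmetic and fold the
    // correction term c to zero; strict FP semantics keep the compensation meaningful.
    double sumWeights( const double* w, int n )
    {
      double sum = 0., c = 0.;
      for( int i = 0; i < n; i++ )
      {
        const double y = w[i] - c;
        const double t = sum + y;
        c = ( t - sum ) - y; // algebraically zero, numerically the rounding error
        sum = t;
      }
      return sum;
    }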
###ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang|Intel)'),) ###$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -Wno-overriding-t-option -###ifneq ($(NVCC),) -###$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -Wno-overriding-t-option +###ifneq ($(GPUCC),) +###$(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -Wno-overriding-t-option ###endif ###endif @@ -569,7 +635,7 @@ MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp cxx_objects_lib=$(BUILDDIR)/CPPProcess.o $(BUILDDIR)/MatrixElementKernels.o $(BUILDDIR)/BridgeKernels.o $(BUILDDIR)/CrossSectionKernels.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel.o $(BUILDDIR)/RamboSamplingKernels.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) MG5AMC_CULIB = mg5amc_$(processid_short)_cuda cu_objects_lib=$(BUILDDIR)/gCPPProcess.o $(BUILDDIR)/gMatrixElementKernels.o $(BUILDDIR)/gBridgeKernels.o $(BUILDDIR)/gCrossSectionKernels.o cu_objects_exe=$(BUILDDIR)/gCommonRandomNumberKernel.o $(BUILDDIR)/gRamboSamplingKernels.o @@ -581,11 +647,11 @@ $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: cxx_objects_lib += $(BUILDDIR)/fbridge.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(CXX) -shared -o $@ $(cxx_objects_lib) $(CXXLIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: cu_objects_lib += $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cu_objects_lib) - $(NVCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) endif #------------------------------------------------------------------------------- @@ -602,16 +668,16 @@ $(cxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PAT $(cxx_main): $(BUILDDIR)/check_sa.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CXX) -o $@ $(BUILDDIR)/check_sa.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CURANDLIBFLAGS) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(cu_main): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(cu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(cu_main): $(BUILDDIR)/gcheck_sa.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o - $(NVCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) + $(GPUCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) endif 
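The split above between cxx_objects_lib and the g-prefixed cu_objects_lib works because each source file is compiled twice from the same text, and the namespace switch used throughout this patch keeps the two symbol sets apart. A minimal sketch (exampleEntryPoint is hypothetical):

    #include "mgOnGpuConfig.h"
    // One translation unit, two objects: $(CXX) produces e.g. CPPProcess.o in
    // mg5amcCpu, while $(GPUCC) with -x cu produces gCPPProcess.o in mg5amcGpu,
    // so both libraries can later be linked into the same test executable.
    #ifdef MGONGPUCPP_GPUIMPL
    namespace mg5amcGpu
    #else
    namespace mg5amcCpu
    #endif
    {
      void exampleEntryPoint() {} // becomes mg5amcGpu::... or mg5amcCpu::...
    }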
#------------------------------------------------------------------------------- @@ -637,17 +703,17 @@ $(fcxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PA $(fcxx_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') endif ifeq ($(UNAME_S),Darwin) $(fcu_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(fcu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(fcu_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) - $(NVCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) + $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) endif #------------------------------------------------------------------------------- @@ -659,7 +725,7 @@ $(BUILDDIR)/testxxx.o: testxxx_cc_ref.txt $(testmain): $(BUILDDIR)/testxxx.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testxxx.o # Comment out this line to skip the C++ test of xxx functions -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testxxx_cu.o: $(GTESTLIBS) $(BUILDDIR)/testxxx_cu.o: INCFLAGS += $(GTESTINC) $(BUILDDIR)/testxxx_cu.o: testxxx_cc_ref.txt @@ -672,7 +738,7 @@ $(BUILDDIR)/testmisc.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testmisc.o # Comment out this line to skip the C++ miscellaneous tests -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testmisc_cu.o: $(GTESTLIBS) $(BUILDDIR)/testmisc_cu.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc_cu.o @@ -684,12 +750,12 @@ $(BUILDDIR)/runTest.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/runTest.o $(testmain): cxx_objects_exe += $(BUILDDIR)/runTest.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/runTest_cu.o: $(GTESTLIBS) $(BUILDDIR)/runTest_cu.o: INCFLAGS += $(GTESTINC) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(testmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif @@ -713,14 +779,14 @@ $(testmain): LIBFLAGS += -lgomp endif endif -ifeq 
($(NVCC),) # link only runTest.o
+ifeq ($(GPUCC),) # link only runTest.o
$(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH
$(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS)
	$(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS)
else # link both runTest.o and runTest_cu.o
$(testmain): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH
$(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) $(GTESTLIBS)
-	$(NVCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) -lcuda
+	$(GPUCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS)
endif

# Use target gtestlibs to build only googletest
@@ -829,9 +895,9 @@ ifeq ($(USECCACHE),1)
	ccache --version | head -1
endif
	@echo ""
-	@echo NVCC=$(NVCC)
-ifneq ($(NVCC),)
-	$(NVCC) --version
+	@echo GPUCC=$(GPUCC)
+ifneq ($(GPUCC),)
+	$(GPUCC) --version
endif
	@echo ""
	@echo CXX=$(CXX)
@@ -850,7 +916,7 @@ endif
# Target: check (run the C++ test executable)
# [NB THIS IS WHAT IS USED IN THE GITHUB CI!]
-ifneq ($(NVCC),)
+ifneq ($(GPUCC),)
check: runTest cmpFcheck cmpFGcheck
else
check: runTest cmpFcheck
diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/fbridge.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/fbridge.cc
index 2d2b36d560..22ce3f5115 100644
--- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/fbridge.cc
+++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/fbridge.cc
@@ -1,11 +1,11 @@
// Copyright (C) 2020-2023 CERN and UCLouvain.
// Licensed under the GNU Lesser General Public License (version 3 or later).
// Created by: S. Roiser (Oct 2021) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.

#include "Bridge.h"
#include "CPPProcess.h"
-#include "CudaRuntime.h"
+#include "GpuRuntime.h"

extern "C"
{
@@ -22,7 +22,7 @@ extern "C"
 * Using the same Fortran MadEvent code, linking to the heterogeneous library would allow access to both CPU and GPU implementations.
 * The specific heterogeneous configuration (how many GPUs, how many threads on each CPU, etc) could be loaded in CUDA/C++ from a data file.
*/ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -46,8 +46,8 @@ extern "C" */ void fbridgecreate_( CppObjectInFortran** ppbridge, const int* pnevtF, const int* pnparF, const int* pnp4F ) { -#ifdef __CUDACC__ - CudaRuntime::setUp(); +#ifdef MGONGPUCPP_GPUIMPL + GpuRuntime::setUp(); #endif // (NB: CPPProcess::initProc no longer needs to be executed here because it is called in the Bridge constructor) // FIXME: disable OMP in Bridge when called from Fortran @@ -65,8 +65,8 @@ extern "C" Bridge* pbridge = dynamic_cast*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgedelete_: invalid Bridge address" ); delete pbridge; -#ifdef __CUDACC__ - CudaRuntime::tearDown(); +#ifdef MGONGPUCPP_GPUIMPL + GpuRuntime::tearDown(); #endif } @@ -96,7 +96,7 @@ extern "C" { Bridge* pbridge = dynamic_cast*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgesequence_: invalid Bridge address" ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Use the device/GPU implementation in the CUDA library // (there is also a host implementation in this library) pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, *pchannelId, mes, selhel, selcol ); diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/fsampler.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/fsampler.cc index 2fb445372d..3743934f41 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/fsampler.cc +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/fsampler.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Feb 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "mgOnGpuConfig.h" @@ -13,7 +13,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -40,7 +40,7 @@ namespace mg5amcCpu private: const int m_nevt; // The number of events in each iteration int m_iiter; // The iteration counter (for random number seeding) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta m_hstRndmom; // Memory buffers for random numbers HostBufferMomenta m_hstMomenta; // Memory buffers for momenta HostBufferWeights m_hstWeights; // Memory buffers for sampling weights @@ -105,7 +105,7 @@ namespace mg5amcCpu extern "C" { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/runTest.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/runTest.cc index d4a760a71b..de327f2321 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/runTest.cc +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/runTest.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
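A note on the fbridge.cc hunks above before the test files: the create/delete pair gives Fortran an opaque handle to a C++ object and brackets its lifetime with the GPU runtime setup and teardown. A hedged sketch with hypothetical names (the real functions manage a Bridge<FORTRANFPTYPE> and take event and particle counts):

    #include "GpuRuntime.h" // as included by fbridge.cc above
    extern "C"
    {
      void fdemocreate_( void** ppobj ) // Fortran side: CALL FDEMOCREATE( OBJ )
      {
    #ifdef MGONGPUCPP_GPUIMPL
        GpuRuntime::setUp(); // as in fbridgecreate_
    #endif
        *ppobj = new int( 0 ); // stand-in for the real Bridge object
      }
      void fdemodelete_( void** ppobj ) // Fortran side: CALL FDEMODELETE( OBJ )
      {
        delete static_cast<int*>( *ppobj );
    #ifdef MGONGPUCPP_GPUIMPL
        GpuRuntime::tearDown(); // as in fbridgedelete_
    #endif
      }
    }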
//---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -18,7 +18,7 @@ #include "RandomNumberKernels.h" #include "epoch_process_id.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -35,7 +35,7 @@ struct CUDA_CPU_TestBase : public TestDriverBase : TestDriverBase( npar, refFileName ) {} }; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL struct CPUTest : public CUDA_CPU_TestBase { // Struct data members (process, and memory structures for random numbers, momenta, matrix elements and weights on host and device) @@ -119,7 +119,7 @@ struct CPUTest : public CUDA_CPU_TestBase }; #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL struct CUDATest : public CUDA_CPU_TestBase { // Reset the device when our test goes out of scope. Note that this should happen after @@ -128,7 +128,7 @@ struct CUDATest : public CUDA_CPU_TestBase { ~DeviceReset() { - checkCuda( cudaDeviceReset() ); // this is needed by cuda-memcheck --leak-check full + checkGpu( gpuDeviceReset() ); // this is needed by cuda-memcheck --leak-check full } } deviceResetter; @@ -256,7 +256,7 @@ INSTANTIATE_TEST_SUITE_P( prefix, \ test_suite_name, \ testing::Values( new CUDATest( MG_EPOCH_REFERENCE_FILE_NAME ) ) ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL MG_INSTANTIATE_TEST_SUITE_GPU( XTESTID_GPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); #else MG_INSTANTIATE_TEST_SUITE_CPU( XTESTID_CPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/testmisc.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/testmisc.cc index 895d6eeb56..ba9e59a8a3 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/testmisc.cc +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/testmisc.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*misc to run only testmisc.cc tests //---------------------------------------------------------------------------- @@ -17,7 +17,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_MISC #else #define TESTID( s ) s##_CPU_MISC @@ -26,7 +26,7 @@ #define XTESTID( s ) TESTID( s ) // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -59,7 +59,7 @@ namespace mg5amcCpu TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testmisc ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/testxxx.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/testxxx.cc index 3361fe5aa9..e5167de00c 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/testxxx.cc +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/testxxx.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. 
// Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Apr 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -24,7 +24,7 @@ #include #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_XXX #else #define TESTID( s ) s##_CPU_XXX @@ -32,7 +32,7 @@ #define XTESTID( s ) TESTID( s ) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -42,7 +42,7 @@ namespace mg5amcCpu int FPEhandlerIevt = -1; inline void FPEhandler( int sig ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL std::cerr << "Floating Point Exception (GPU): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; #else std::cerr << "Floating Point Exception (CPU neppV=" << neppV << "): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; @@ -53,7 +53,7 @@ namespace mg5amcCpu TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -77,7 +77,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) assert( nevt % neppM == 0 ); // nevt must be a multiple of neppM assert( nevt % neppV == 0 ); // nevt must be a multiple of neppV // Fill in the input momenta -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL mg5amcGpu::PinnedHostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] #else mg5amcCpu::HostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] @@ -322,7 +322,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { for( int ievt = 0; ievt < nevt; ievt++ ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_ttggg.mad/src/HelAmps_sm.h b/epochX/cudacpp/gg_ttggg.mad/src/HelAmps_sm.h index 8df465ad6d..8b4ad719be 100644 --- a/epochX/cudacpp/gg_ttggg.mad/src/HelAmps_sm.h +++ b/epochX/cudacpp/gg_ttggg.mad/src/HelAmps_sm.h @@ -5,7 +5,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -28,7 +28,7 @@ //#include //#include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttggg.mad/src/Parameters_sm.cc b/epochX/cudacpp/gg_ttggg.mad/src/Parameters_sm.cc index 64fc3fea62..067445b198 100644 --- a/epochX/cudacpp/gg_ttggg.mad/src/Parameters_sm.cc +++ b/epochX/cudacpp/gg_ttggg.mad/src/Parameters_sm.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. 
-// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -17,7 +17,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_ttggg.mad/src/Parameters_sm.h b/epochX/cudacpp/gg_ttggg.mad/src/Parameters_sm.h index b6568d3761..9581d66e0e 100644 --- a/epochX/cudacpp/gg_ttggg.mad/src/Parameters_sm.h +++ b/epochX/cudacpp/gg_ttggg.mad/src/Parameters_sm.h @@ -27,7 +27,7 @@ #include "read_slha.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu #include // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -218,7 +218,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -239,7 +239,7 @@ namespace mg5amcCpu #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wunused-variable" // e.g. <> #pragma GCC diagnostic ignored "-Wunused-parameter" // e.g. <> -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma nv_diagnostic push #pragma nv_diag_suppress 177 // e.g. <> #endif @@ -267,7 +267,7 @@ namespace mg5amcCpu // End SM implementation - no special handling of vectors of floats as in EFT (#439) return out; } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma GCC diagnostic pop #pragma nv_diagnostic pop #endif diff --git a/epochX/cudacpp/gg_ttggg.mad/src/cudacpp_src.mk b/epochX/cudacpp/gg_ttggg.mad/src/cudacpp_src.mk index d4cc628aec..159e19a46d 100644 --- a/epochX/cudacpp/gg_ttggg.mad/src/cudacpp_src.mk +++ b/epochX/cudacpp/gg_ttggg.mad/src/cudacpp_src.mk @@ -19,7 +19,7 @@ SHELL := /bin/bash #=== Configure common compiler flags for CUDA and C++ INCFLAGS = -I. 
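Returning briefly to the Parameters_sm.h hunks above: the diagnostic pragmas follow a push/suppress/pop discipline, with the nvcc-specific nv_diag pragmas kept behind the GPU guard (177 is nvcc's "declared but never referenced" diagnostic). A condensed sketch, with each push paired to a pop at the same nesting level:

    #pragma GCC diagnostic push
    #pragma GCC diagnostic ignored "-Wunused-variable"
    #pragma GCC diagnostic ignored "-Wunused-parameter"
    #ifdef MGONGPUCPP_GPUIMPL
    #pragma nv_diagnostic push
    #pragma nv_diag_suppress 177
    #endif
    // ... hardcoded parameter definitions that may trigger these warnings ...
    #ifdef MGONGPUCPP_GPUIMPL
    #pragma nv_diagnostic pop
    #endif
    #pragma GCC diagnostic pop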
-OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here
+OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here

#-------------------------------------------------------------------------------

@@ -45,13 +45,13 @@ endif

#-------------------------------------------------------------------------------

-#=== Configure the CUDA compiler (note: NVCC is already exported including ccache)
+#=== Configure the CUDA compiler (note: GPUCC is already exported including ccache)

-###$(info NVCC=$(NVCC))
+###$(info GPUCC=$(GPUCC))

#-------------------------------------------------------------------------------

-#=== Configure ccache for C++ builds (note: NVCC is already exported including ccache)
+#=== Configure ccache for C++ builds (note: GPUCC is already exported including ccache)

# Enable ccache if USECCACHE=1
ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1)
@@ -92,6 +92,13 @@ endif
###$(info OMPFLAGS=$(OMPFLAGS))
CXXFLAGS += $(OMPFLAGS)

+# Select the build-rule flags for the GPU compiler (nvcc needs -x cu to compile .cc files as CUDA; hipcc does not)
+ifeq ($(findstring nvcc,$(GPUCC)),nvcc)
+  GPUFLAGS += -Xcompiler -fPIC -c -x cu
+else ifeq ($(findstring hipcc,$(GPUCC)),hipcc)
+  GPUFLAGS += -fPIC -c
+endif
+
# Set the build flags appropriate to each AVX choice (example: "make AVX=none")
# [NB MGONGPU_PVW512 is needed because "-mprefer-vector-width=256" is not exposed in a macro]
# [See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=96476]
@@ -253,20 +260,20 @@ $(BUILDDIR)/%.o : %.cc *.h $(BUILDDIR)/.build.$(TAG)

# Generic target and build rules: objects from CUDA compilation
$(BUILDDIR)/%_cu.o : %.cc *.h $(BUILDDIR)/.build.$(TAG)
	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@
+	$(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $< -o $@

#-------------------------------------------------------------------------------

cxx_objects=$(addprefix $(BUILDDIR)/, Parameters_sm.o read_slha.o)
-ifneq ($(NVCC),)
+ifneq ($(GPUCC),)
cu_objects=$(addprefix $(BUILDDIR)/, Parameters_sm_cu.o)
endif

# Target (and build rules): common (src) library
-ifneq ($(NVCC),)
+ifneq ($(GPUCC),)
$(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) $(cu_objects)
	@if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi
-	$(NVCC) -shared -o $@ $(cxx_objects) $(cu_objects) $(LDFLAGS)
+	$(GPUCC) -shared -o $@ $(cxx_objects) $(cu_objects) $(LDFLAGS)
else
$(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects)
	@if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi
diff --git a/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuConfig.h
index 80032e528b..55d03f1252 100644
--- a/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuConfig.h
+++ b/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuConfig.h
@@ -1,7 +1,7 @@
// Copyright (C) 2020-2023 CERN and UCLouvain.
// Licensed under the GNU Lesser General Public License (version 3 or later).
// Created by: A. Valassi (Jul 2020) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
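The FPTYPE and AVX switches handled by these makefiles all funnel into a handful of type definitions, shown for real in the mgOnGpuConfig.h and mgOnGpuVectors.h hunks that follow. A hedged condensation (not verbatim):

    // fptype follows FPTYPE=d/f/m; in C++ builds fptype_v packs neppV events into
    // one SIMD register via the gcc/clang vector extension; CUDA/HIP stay scalar.
    #ifdef MGONGPU_FPTYPE_DOUBLE
    typedef double fptype; // FPTYPE=d or FPTYPE=m
    #else
    typedef float fptype; // FPTYPE=f
    #endif
    #ifdef MGONGPU_CPPSIMD
    const int neppV = MGONGPU_CPPSIMD; // e.g. 8 doubles for AVX=512z (512 bits / 64 bits each)
    typedef fptype fptype_v __attribute__( ( vector_size( neppV * sizeof( fptype ) ) ) );
    #else
    const int neppV = 1; // CUDA, HIP and non-SIMD C++ builds
    #endif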
#ifndef MGONGPUCONFIG_H #define MGONGPUCONFIG_H 1 @@ -10,12 +10,25 @@ // There are two different code bases for standalone_cudacpp (without multichannel) and madevent+cudacpp (with multichannel) #define MGONGPU_SUPPORTS_MULTICHANNEL 1 +// Is this a GPU (CUDA, HIP) or CPU implementation? +#ifdef __CUDACC__ +#define MGONGPUCPP_GPUIMPL cuda +#elif defined __HIPCC__ +#define MGONGPUCPP_GPUIMPL hip +#else +#undef MGONGPUCPP_GPUIMPL +#endif + // ** NB1 Throughputs (e.g. 6.8E8) are events/sec for "./gcheck.exe -p 65536 128 12" // ** NB2 Baseline on b7g47n0004 fluctuates (probably depends on load on other VMs) // Choose if curand is supported for generating random numbers +// For HIP, by default, do not use curand (common random numbers will be used instead) // For both CUDA and C++, by default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_CURAND -// (there exist CUDA installations, e.g. using the HPC package, which do not include curand - see PR #784) +// (there exist CUDA installations, e.g. using the HPC package, which do not include curand - see PR #784 and #785) +#if defined __HIPCC__ +#define MGONGPU_HAS_NO_CURAND 1 +#else //#ifdef __CUDACC__ //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 @@ -23,6 +36,7 @@ //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 //#endif +#endif // Choose floating point precision (for everything but color algebra #537) // If one of these macros has been set from outside with e.g. -DMGONGPU_FPTYPE_FLOAT, nothing happens (issue #167) @@ -54,23 +68,28 @@ //#undef MGONGPU_HARDCODE_PARAM // default ////#define MGONGPU_HARDCODE_PARAM 1 -// Complex type in c++: std::complex or cxsmpl (CHOOSE ONLY ONE) -#ifndef __CUDACC__ -//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) -#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) -#endif - -// Complex type in cuda: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) +// Complex type in CUDA: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) #ifdef __CUDACC__ #define MGONGPU_CUCXTYPE_THRUST 1 // default (~1.15E9/double, ~3.2E9/float) //#define MGONGPU_CUCXTYPE_CUCOMPLEX 1 // ~10 percent slower (1.03E9/double, ~2.8E9/float) //#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) + +// Complex type in HIP: cxsmpl (ONLY ONE OPTION POSSIBLE) +#elif defined __HIPCC__ +#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) + +// Complex type in C++: std::complex or cxsmpl (CHOOSE ONLY ONE) +#else +//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) +#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif -// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ -#undef MGONGPU_NSIGHT_DEBUG // default +#undef MGONGPU_NSIGHT_DEBUG // default in CUDA //#define MGONGPU_NSIGHT_DEBUG 1 +#else +#undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif // SANITY CHECKS (floating point precision for everything but color algebra #537) @@ -86,17 +105,21 @@ #error You cannot use double precision for color algebra and single precision elsewhere #endif -// SANITY CHECKS (c++ complex number implementation) -#ifndef __CUDACC__ -#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and 
defined MGONGPU_CPPCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL +// SANITY CHECKS (CUDA complex number implementation) +#ifdef __CUDACC__ +#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX for CUDA +#elif defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CXSMPL for CUDA +#elif defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE OF MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL for CUDA #endif #endif -// SANITY CHECKS (cuda complex number implementation) -#ifdef __CUDACC__ -#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL +// SANITY CHECKS (C++ complex number implementation) +#ifndef MGONGPUCPP_GPUIMPL +#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL for C++ #endif #endif @@ -134,7 +157,7 @@ namespace mgOnGpu // Alignment requirement for using reinterpret_cast with SIMD vectorized code // (using reinterpret_cast with non aligned memory may lead to segmentation faults!) // Only needed for C++ code but can be enforced also in NVCC builds of C++ code using CUDA>=11.2 and C++17 (#318, #319, #333) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL constexpr int cppAlign = 64; // alignment requirement for SIMD vectorization (64-byte i.e. 
512-bit)
#endif

@@ -145,7 +168,7 @@ using mgOnGpu::fptype;
using mgOnGpu::fptype2;

// C++ SIMD vectorization width (this will be used to set neppV)
-#ifdef __CUDACC__ // CUDA implementation has no SIMD
+#ifdef MGONGPUCPP_GPUIMPL // CUDA and HIP implementations have no SIMD
#undef MGONGPU_CPPSIMD
#elif defined __AVX512VL__ && defined MGONGPU_PVW512 // C++ "512z" AVX512 with 512 width (512-bit ie 64-byte): 8 (DOUBLE) or 16 (FLOAT)
#ifdef MGONGPU_FPTYPE_DOUBLE
@@ -175,9 +198,9 @@ using mgOnGpu::fptype2;
#undef MGONGPU_CPPSIMD
#endif

-// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation
+// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation
// Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end)
#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */
#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX];
#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; }
#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; }
@@ -189,8 +212,8 @@ using mgOnGpu::fptype2;
#define mgDebugFinalise() { /*noop*/ }
#endif /* clang-format on */

-// Define empty CUDA declaration specifiers for C++
-#ifndef __CUDACC__
+// Define empty CUDA/HIP declaration specifiers for C++
+#ifndef MGONGPUCPP_GPUIMPL
#define __global__
#define __host__
#define __device__
diff --git a/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuCxtypes.h b/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuCxtypes.h
index ca9a9f00c0..5532e22fa1 100644
--- a/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuCxtypes.h
+++ b/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuCxtypes.h
@@ -1,7 +1,7 @@
// Copyright (C) 2020-2023 CERN and UCLouvain.
// Licensed under the GNU Lesser General Public License (version 3 or later).
// Created by: A. Valassi (Jan 2022, based on earlier work by D. Smith) for the MG5aMC CUDACPP plugin.
-// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
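A hedged condensation of the complex-type ladder in the mgOnGpuCxtypes.h hunks that follow (the cuComplex branch, which needs separate double and float typedefs, is omitted here; the thrust and standard headers are assumed included as in the real file):

    #ifdef MGONGPUCPP_GPUIMPL // CUDA or HIP build
    #if defined MGONGPU_CUCXTYPE_THRUST
    typedef thrust::complex<fptype> cxtype; // the CUDA default
    #else
    typedef mgOnGpu::cxsmpl<fptype> cxtype; // the only option on HIP
    #endif
    #else // C++ build
    #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX
    typedef std::complex<fptype> cxtype;
    #else
    typedef mgOnGpu::cxsmpl<fptype> cxtype; // the current C++ default
    #endif
    #endif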
#ifndef MGONGPUCXTYPES_H #define MGONGPUCXTYPES_H 1 @@ -19,7 +19,7 @@ #include // Complex type in cuda: thrust or cucomplex or cxsmpl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #if defined MGONGPU_CUCXTYPE_THRUST #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wtautological-compare" // for icpx2021/clang13 (https://stackoverflow.com/a/15864661) @@ -82,7 +82,7 @@ namespace mgOnGpu /* clang-format off */ using mgOnGpu::cxsmpl; // Printout to stream for user defined types -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -92,7 +92,7 @@ namespace mg5amcCpu inline __host__ std::ostream& operator<<( std::ostream& out, const cxsmpl& c ) { - out << std::complex( c.real(), c.imag() ); + out << std::complex( c.real(), c.imag() ); return out; } @@ -215,14 +215,14 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu #endif { // --- Type definitions (complex type: cxtype) -#ifdef __CUDACC__ // cuda +#ifdef MGONGPUCPP_GPUIMPL // cuda #if defined MGONGPU_CUCXTYPE_THRUST typedef thrust::complex cxtype; #elif defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -255,7 +255,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -307,7 +307,7 @@ namespace mg5amcCpu //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust +#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust //------------------------------ // CUDA - using thrust::complex @@ -343,11 +343,11 @@ namespace mg5amcCpu return c; } -#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST +#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex +#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex //------------------------------ // CUDA - using cuComplex @@ -562,11 +562,11 @@ namespace mg5amcCpu return cxmake( c.real(), c.imag() ); } -#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX //========================================================================== -#if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex +#if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex //------------------------------ // C++ - using std::complex @@ -610,7 +610,7 @@ namespace mg5amcCpu } #endif -#endif // #if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX +#endif // #if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX //========================================================================== @@ -633,7 +633,7 @@ namespace mg5amcCpu //========================================================================== // 
NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuFptypes.h b/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuFptypes.h index 905c97d700..fa3a02664b 100644 --- a/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuFptypes.h +++ b/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuFptypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUFPTYPES_H #define MGONGPUFPTYPES_H 1 @@ -12,7 +12,7 @@ #include // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // cuda namespace mg5amcGpu #else namespace mg5amcCpu @@ -20,7 +20,7 @@ namespace mg5amcCpu { //========================================================================== -#ifdef __CUDACC__ // cuda +#ifdef MGONGPUCPP_GPUIMPL // cuda //------------------------------ // Floating point types - Cuda @@ -64,11 +64,11 @@ namespace mg5amcCpu #endif } -#endif // #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //------------------------------ // Floating point types - C++ @@ -92,7 +92,7 @@ namespace mg5amcCpu return std::sqrt( f ); } -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== diff --git a/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuVectors.h b/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuVectors.h index e1299ba81e..cdae04326b 100644 --- a/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuVectors.h @@ -32,7 +32,7 @@ #endif // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -131,7 +131,7 @@ namespace mg5amcCpu #endif #endif -#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef __CUDACC__) +#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef MGONGPUCPP_GPUIMPL) const int neppV = 1; @@ -153,13 +153,13 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu #endif { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Printout to stream for user defined types @@ -805,11 +805,11 @@ namespace mg5amcCpu #endif // #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //------------------------------ // Vector types - CUDA @@ -853,12 +853,12 @@ namespace mg5amcCpu 
return mask; } -#endif // #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== // Scalar-or-vector types: scalar in CUDA, vector or scalar in C++ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL typedef bool bool_sv; typedef fptype fptype_sv; typedef fptype2 fptype2_sv; @@ -879,7 +879,7 @@ namespace mg5amcCpu #endif // Scalar-or-vector zeros: scalar in CUDA, vector or scalar in C++ -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ inline __host__ __device__ cxtype cxzero_sv(){ return cxtype( 0, 0 ); } #elif defined MGONGPU_CPPSIMD inline cxtype_v cxzero_sv() { return cxtype_v(); } // RRRR=0000 IIII=0000 diff --git a/epochX/cudacpp/gg_ttggg.mad/src/rambo.h b/epochX/cudacpp/gg_ttggg.mad/src/rambo.h index e02ea52496..cd7e1008ea 100644 --- a/epochX/cudacpp/gg_ttggg.mad/src/rambo.h +++ b/epochX/cudacpp/gg_ttggg.mad/src/rambo.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -18,7 +18,7 @@ #include // Simplified rambo version for 2 to N (with N>=2) processes with massless particles -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +83,7 @@ namespace mg5amcCpu static bool first = true; if( first ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if constexpr( M_ACCESS::isOnDevice() ) // avoid { const int ievt0 = 0; @@ -166,7 +166,7 @@ namespace mg5amcCpu wt = po2log; if( nparf != 2 ) wt = ( 2. * nparf - 4. ) * log( energy ) + z[nparf - 1]; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // issue warnings if weight is too small or too large static int iwarn[5] = { 0, 0, 0, 0, 0 }; if( wt < -180. ) diff --git a/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt b/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt index 0970bf8b4c..2720870321 100644 --- a/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt +++ b/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt @@ -52,7 +52,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt -No valid web browser found. Please set in ./input/mg5_configuration.txt +Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg.mg The import format was not given, so we guess it as command set stdout_level DEBUG @@ -62,7 +62,7 @@ generate g g > t t~ g g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005753755569458008  +DEBUG: model prefixing takes 0.005664825439453125  INFO: Restrict model sm with file models/sm/restrict_default.dat . 
DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,7 +155,7 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=5: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Process has 1240 diagrams -1 processes with 1240 diagrams generated in 1.912 s +1 processes with 1240 diagrams generated in 1.872 s Total: 1 processes with 1240 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gg_ttggg Load PLUGIN.CUDACPP_OUTPUT @@ -175,7 +175,7 @@ INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TM FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/./CPPProcess.h FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/./CPPProcess.cc INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/. -Generated helas calls for 1 subprocesses (1240 diagrams) in 6.716 s +Generated helas calls for 1 subprocesses (1240 diagrams) in 6.609 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines @@ -183,7 +183,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.352 s +ALOHA: aloha creates 5 routines in 0.345 s VVV1 VVV1 FFV1 @@ -206,7 +206,7 @@ INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/. quit -real 0m13.290s -user 0m13.123s -sys 0m0.115s +real 0m12.978s +user 0m12.813s +sys 0m0.111s Code generation completed in 13 seconds diff --git a/epochX/cudacpp/gg_ttggg.sa/COPYRIGHT b/epochX/cudacpp/gg_ttggg.sa/COPYRIGHT index a134b5fef9..84a883fbb0 100644 --- a/epochX/cudacpp/gg_ttggg.sa/COPYRIGHT +++ b/epochX/cudacpp/gg_ttggg.sa/COPYRIGHT @@ -15,6 +15,7 @@ The full development team currently includes the following authors : Stephan Hageboeck (CERN) Olivier Mattelaer (Universite Catholique de Louvain, original author) Stefan Roiser (CERN, original author) + Joergen Teig (CERN) Andrea Valassi (CERN, original author) Zenny Wettersten (CERN) See https://github.com/madgraph5/madgraph4gpu for more details. For the full diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/Bridge.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/Bridge.h index bf8b5e024d..89437b4c42 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/Bridge.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#ifndef BRIDGE_H #define BRIDGE_H 1 @@ -23,7 +23,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +83,7 @@ namespace mg5amcCpu Bridge& operator=( const Bridge& ) = delete; Bridge& operator=( Bridge&& ) = delete; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL /** * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != gpublocks*gputhreads * (this is needed for BridgeKernel tests rather than for actual production use in Fortran) @@ -150,7 +150,7 @@ namespace mg5amcCpu unsigned int m_nevt; // number of events int m_nGoodHel; // the number of good helicities (-1 initially when they have not yet been calculated) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL int m_gputhreads; // number of gpu threads (default set from number of events, can be modified) int m_gpublocks; // number of gpu blocks (default set from number of events, can be modified) DeviceBuffer m_devMomentaF; @@ -187,12 +187,12 @@ namespace mg5amcCpu // Forward declare transposition methods // -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL template void hst_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); @@ -209,7 +209,7 @@ namespace mg5amcCpu Bridge::Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ) : m_nevt( nevtF ) , m_nGoodHel( -1 ) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL , m_gputhreads( 256 ) // default number of gpu threads , m_gpublocks( m_nevt / m_gputhreads ) // this ensures m_nevt <= m_gpublocks*m_gputhreads , m_devMomentaF( m_nevt ) @@ -233,7 +233,7 @@ namespace mg5amcCpu { if( nparF != CPPProcess::npar ) throw std::runtime_error( "Bridge constructor: npar mismatch" ); if( np4F != CPPProcess::np4 ) throw std::runtime_error( "Bridge constructor: np4 mismatch" ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( ( m_nevt < s_gputhreadsmin ) || ( m_nevt % s_gputhreadsmin != 0 ) ) throw std::runtime_error( "Bridge constructor: nevt should be a multiple of " + std::to_string( s_gputhreadsmin ) ); while( m_nevt != m_gpublocks * m_gputhreads ) @@ -249,7 +249,7 @@ namespace mg5amcCpu #else std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate is called from several Fortran threads? 
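Bridge.h, like every header in this patch, now keys the enclosing namespace on MGONGPUCPP_GPUIMPL rather than __CUDACC__, so identical source can be compiled once for the GPU and once for the CPU and both objects linked into a single executable without symbol clashes. The idiom in isolation (sketch):

// One class body, two namespaces, selected at compile time.
#ifdef MGONGPUCPP_GPUIMPL
namespace mg5amcGpu
#else
namespace mg5amcCpu
#endif
{
  struct Example
  {
    int nevt; // becomes mg5amcGpu::Example or mg5amcCpu::Example
  };
}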
@@ -262,7 +262,7 @@ namespace mg5amcCpu process.initProc( paramCard ); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void Bridge::set_gpugrid( const int gpublocks, const int gputhreads ) { @@ -276,7 +276,7 @@ namespace mg5amcCpu } #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void Bridge::gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -291,14 +291,14 @@ namespace mg5amcCpu constexpr int neppM = MemoryAccessMomenta::neppM; if constexpr( neppM == 1 && std::is_same_v ) { - checkCuda( cudaMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), gpuMemcpyHostToDevice ); } else { - checkCuda( cudaMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), gpuMemcpyHostToDevice ); const int thrPerEvt = CPPProcess::npar * CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 event per thread) //const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... this seems slower - dev_transposeMomentaF2C<<>>( m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); + gpuLaunchKernel( dev_transposeMomentaF2C, m_gpublocks * thrPerEvt, m_gputhreads, m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); } if constexpr( std::is_same_v ) { @@ -341,7 +341,7 @@ namespace mg5amcCpu } #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL template void Bridge::cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -396,7 +396,7 @@ namespace mg5amcCpu // - C++ array: momenta[npagM][npar][np4][neppM] with nevt=npagM*neppM (AOSOA) // -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ) { diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/BridgeKernels.cc b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/BridgeKernels.cc index d58066c9c1..eaf4037a24 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/BridgeKernels.cc +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/BridgeKernels.cc @@ -1,17 +1,18 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "BridgeKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMomenta.h" #include //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -45,7 +46,7 @@ namespace mg5amcCpu //============================================================================ -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu { @@ -96,7 +97,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/BridgeKernels.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/BridgeKernels.h index 15eb4bff4d..3efef8ce97 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/BridgeKernels.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/BridgeKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. 
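The gpu_sequence hunk above is the template for every call-site change in this patch: a CUDA-specific memcpy wrapped in checkCuda and a triple-chevron launch each collapse into one backend-neutral macro call. Side by side, with placeholder variable names:

// Before (CUDA only):
//   checkCuda( cudaMemcpy( dst, src, bytes, cudaMemcpyHostToDevice ) );
//   dev_transposeMomentaF2C<<<blocks, threads>>>( in, out, nevt );
// After (CUDA or HIP; the error check moves inside the gpuMemcpy macro):
//   gpuMemcpy( dst, src, bytes, gpuMemcpyHostToDevice );
//   gpuLaunchKernel( dev_transposeMomentaF2C, blocks, threads, in, out, nevt );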
// Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef BRIDGEKERNELS_H #define BRIDGEKERNELS_H 1 @@ -12,7 +12,7 @@ #include "MatrixElementKernels.h" #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -49,7 +49,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a CPU host class BridgeKernelHost final : public BridgeKernelBase { @@ -89,7 +89,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a GPU device class BridgeKernelDevice : public BridgeKernelBase { diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/CommonRandomNumberKernel.cc b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/CommonRandomNumberKernel.cc index 985b39f576..010bc4cbd0 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/CommonRandomNumberKernel.cc +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/CommonRandomNumberKernel.cc @@ -1,15 +1,16 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "CommonRandomNumbers.h" +#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/CrossSectionKernels.cc b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/CrossSectionKernels.cc index 0b355a3c8d..c15b39844d 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/CrossSectionKernels.cc +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/CrossSectionKernels.cc @@ -1,10 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
#include "CrossSectionKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessWeights.h" #include "MemoryBuffers.h" @@ -77,7 +78,7 @@ debug_me_is_abnormal( const fptype& me, size_t ievtALL ) //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -185,7 +186,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/CrossSectionKernels.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/CrossSectionKernels.h index 7933ca4bbf..4d9659e04e 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/CrossSectionKernels.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/CrossSectionKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef CROSSSECTIONKERNELS_H #define CROSSSECTIONKERNELS_H 1 @@ -13,7 +13,7 @@ //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -96,7 +96,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating the calculation of event statistics on a GPU device class CrossSectionKernelDevice : public CrossSectionKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/CudaRuntime.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/CudaRuntime.h deleted file mode 100644 index 64ce52f4b3..0000000000 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/CudaRuntime.h +++ /dev/null @@ -1,85 +0,0 @@ -// Copyright (C) 2020-2023 CERN and UCLouvain. -// Licensed under the GNU Lesser General Public License (version 3 or later). -// Created by: S. Roiser (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. - -#ifndef MG5AMC_CUDARUNTIME_H -#define MG5AMC_CUDARUNTIME_H 1 - -// MG5AMC on GPU uses the CUDA runtime API, not the lower level CUDA driver API -// See https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html#driver-vs-runtime-api - -#include -#include - -//-------------------------------------------------------------------------- - -// See https://stackoverflow.com/a/14038590 -#ifdef __CUDACC__ /* clang-format off */ -#define checkCuda( code ) { assertCuda( code, __FILE__, __LINE__ ); } -inline void assertCuda( cudaError_t code, const char* file, int line, bool abort = true ) -{ - if( code != cudaSuccess ) - { - printf( "ERROR! 
assertCuda: '%s' (%d) in %s:%d\n", cudaGetErrorString( code ), code, file, line ); - if( abort ) assert( code == cudaSuccess ); - } -} -#endif /* clang-format on */ - -//-------------------------------------------------------------------------- - -#ifdef __CUDACC__ -namespace mg5amcGpu -{ - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - // *** FIXME! This will all need to be designed differently when going to multi-GPU nodes! *** - struct CudaRuntime final - { - CudaRuntime( const bool debug = true ) - : m_debug( debug ) { setUp( m_debug ); } - ~CudaRuntime() { tearDown( m_debug ); } - CudaRuntime( const CudaRuntime& ) = delete; - CudaRuntime( CudaRuntime&& ) = delete; - CudaRuntime& operator=( const CudaRuntime& ) = delete; - CudaRuntime& operator=( CudaRuntime&& ) = delete; - bool m_debug; - - // Set up CUDA application - // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** - // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization - static void setUp( const bool debug = true ) - { - // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization - // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! - /* - // [We initially added cudaFree(0) to "ease profile analysis" only because it shows up as a big recognizable block!] - // No explicit initialization is needed: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#initialization - // It is not clear what cudaFree(0) does at all: https://stackoverflow.com/questions/69967813/ - if ( debug ) std::cout << "__CudaRuntime: calling cudaFree(0)" << std::endl; - checkCuda( cudaFree( 0 ) ); // SLOW! - */ - // Replace cudaFree(0) by cudaSetDevice(0), even if it is not really needed either - // (but see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs) - if( debug ) std::cout << "__CudaRuntime: calling cudaSetDevice(0)" << std::endl; - checkCuda( cudaSetDevice( 0 ) ); // SLOW! - } - - // Tear down CUDA application (call cudaDeviceReset) - // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** - // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck - // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking - static void tearDown( const bool debug = true ) - { - if( debug ) std::cout << "__CudaRuntime: calling cudaDeviceReset()" << std::endl; - checkCuda( cudaDeviceReset() ); - } - }; - -} -#endif - -//-------------------------------------------------------------------------- - -#endif // MG5AMC_CUDARUNTIME_H diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/CurandRandomNumberKernel.cc b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/CurandRandomNumberKernel.cc index eb56333b03..08a16f6f2c 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/CurandRandomNumberKernel.cc +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/CurandRandomNumberKernel.cc @@ -1,9 +1,9 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. 
Valassi (2021-2023) for the MG5aMC CUDACPP plugin. -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" @@ -22,7 +22,7 @@ inline void assertCurand( curandStatus_t code, const char *file, int line, bool } #endif /* clang-format on */ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -36,7 +36,7 @@ namespace mg5amcCpu { if( m_isOnDevice ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !m_rnarray.isOnDevice() ) throw std::runtime_error( "CurandRandomNumberKernel on device with a host random number array" ); #else @@ -114,7 +114,7 @@ namespace mg5amcCpu /* printf( "\nCurandRandomNumberKernel::generateRnarray size = %d\n", (int)m_rnarray.size() ); fptype* data = m_rnarray.data(); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( m_rnarray.isOnDevice() ) { data = new fptype[m_rnarray.size()](); @@ -123,7 +123,7 @@ namespace mg5amcCpu #endif for( int i = 0; i < ( (int)m_rnarray.size() / 4 ); i++ ) printf( "[%4d] %f %f %f %f\n", i * 4, data[i * 4], data[i * 4 + 2], data[i * 4 + 2], data[i * 4 + 3] ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( m_rnarray.isOnDevice() ) delete[] data; #endif */ diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/EventStatistics.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/EventStatistics.h index 48b51e0a49..b425a5bade 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/EventStatistics.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/EventStatistics.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef EventStatistics_H #define EventStatistics_H 1 @@ -16,7 +16,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/GpuAbstraction.h new file mode 100644 index 0000000000..6a7d9c05c0 --- /dev/null +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/GpuAbstraction.h @@ -0,0 +1,71 @@ +// Copyright (C) 2020-2023 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: J. Teig (Jul 2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
+ +#ifndef MG5AMC_GPUABSTRACTION_H +#define MG5AMC_GPUABSTRACTION_H 1 + +#include + +//-------------------------------------------------------------------------- + +#ifdef __CUDACC__ + +#define gpuError_t cudaError_t +#define gpuPeekAtLastError cudaPeekAtLastError +#define gpuGetErrorString cudaGetErrorString +#define gpuSuccess cudaSuccess + +#define gpuMallocHost( ptr, size ) checkGpu( cudaMallocHost( ptr, size ) ) +#define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) ) + +#define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( cudaMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemcpyHostToDevice cudaMemcpyHostToDevice +#define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost +#define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( cudaMemcpyToSymbol( type1, type2, size ) ) + +#define gpuFree( ptr ) checkGpu( cudaFree( ptr ) ) +#define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) ) + +#define gpuSetDevice cudaSetDevice +#define gpuDeviceSynchronize cudaDeviceSynchronize +#define gpuDeviceReset cudaDeviceReset + +#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) +#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_ARGS__ ) + +//-------------------------------------------------------------------------- + +#elif defined __HIPCC__ + +#include "hip/hip_runtime.h" + +#define gpuError_t hipError_t +#define gpuPeekAtLastError hipPeekAtLastError +#define gpuGetErrorString hipGetErrorString +#define gpuSuccess hipSuccess + +#define gpuMallocHost( ptr, size ) checkGpu( hipHostMalloc( ptr, size ) ) // HostMalloc better +#define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) ) + +#define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( hipMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemcpyHostToDevice hipMemcpyHostToDevice +#define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost +#define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( hipMemcpyToSymbol( type1, type2, size ) ) + +#define gpuFree( ptr ) checkGpu( hipFree( ptr ) ) +#define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) ) + +#define gpuSetDevice hipSetDevice +#define gpuDeviceSynchronize hipDeviceSynchronize +#define gpuDeviceReset hipDeviceReset + +#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) +#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_ARGS__ ) + +//-------------------------------------------------------------------------- + +#endif + +#endif // MG5AMC_GPUABSTRACTION_H diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/GpuRuntime.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/GpuRuntime.h new file mode 100644 index 0000000000..93579ef08b --- /dev/null +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/GpuRuntime.h @@ -0,0 +1,85 @@ +// Copyright (C) 2020-2023 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: J. Teig (Jun 2023, based on earlier work by S. Roiser) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
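GpuAbstraction.h is the core of this patch: one macro vocabulary that resolves to the CUDA runtime under __CUDACC__ and to HIP under __HIPCC__, with error checking folded into the allocation, copy and free macros. A hedged end-to-end usage sketch (hypothetical helper, GPU builds only):

#include "GpuAbstraction.h"
#include "GpuRuntime.h" // declares checkGpu, which the macros below expand to

// Copy host data through a device buffer and back (illustrative round trip).
void roundTrip( const double* h_in, double* h_out, const int nevt )
{
  double* d_buf = nullptr;
  gpuMalloc( &d_buf, nevt * sizeof( double ) ); // cudaMalloc or hipMalloc
  gpuMemcpy( d_buf, h_in, nevt * sizeof( double ), gpuMemcpyHostToDevice );
  gpuMemcpy( h_out, d_buf, nevt * sizeof( double ), gpuMemcpyDeviceToHost );
  gpuFree( d_buf ); // cudaFree or hipFree
}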
+ +#ifndef MG5AMC_GPURUNTIME_H +#define MG5AMC_GPURUNTIME_H 1 + +// MG5AMC on GPU uses the CUDA runtime API, not the lower level CUDA driver API +// See https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html#driver-vs-runtime-api + +#include "GpuAbstraction.h" + +#include + +//-------------------------------------------------------------------------- + +// See https://stackoverflow.com/a/14038590 +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#define checkGpu( code ) { assertGpu( code, __FILE__, __LINE__ ); } +inline void assertGpu( gpuError_t code, const char* file, int line, bool abort = true ) +{ + if( code != gpuSuccess ) + { + printf( "ERROR! assertGpu: '%s' (%d) in %s:%d\n", gpuGetErrorString( code ), code, file, line ); + if( abort ) assert( code == gpuSuccess ); + } +} +#endif /* clang-format on */ + +//-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +{ + // Instantiate a GpuRuntime at the beginnining of the application's main to + // invoke gpuSetDevice(0) in the constructor and book a gpuDeviceReset() call in the destructor + // *** FIXME! This will all need to be designed differently when going to multi-GPU nodes! *** + struct GpuRuntime final + { + GpuRuntime( const bool debug = true ) + : m_debug( debug ) { setUp( m_debug ); } + ~GpuRuntime() { tearDown( m_debug ); } + GpuRuntime( const GpuRuntime& ) = delete; + GpuRuntime( GpuRuntime&& ) = delete; + GpuRuntime& operator=( const GpuRuntime& ) = delete; + GpuRuntime& operator=( GpuRuntime&& ) = delete; + bool m_debug; + + // Set up CUDA application + // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** + // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization + static void setUp( const bool debug = true ) + { + // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization + // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! + /* + // [We initially added cudaFree(0) to "ease profile analysis" only because it shows up as a big recognizable block!] + // No explicit initialization is needed: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#initialization + // It is not clear what cudaFree(0) does at all: https://stackoverflow.com/questions/69967813/ + if ( debug ) std::cout << "__CudaRuntime: calling cudaFree(0)" << std::endl; + checkCuda( cudaFree( 0 ) ); // SLOW! + */ + // Replace cudaFree(0) by cudaSetDevice(0), even if it is not really needed either + // (but see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs) + if( debug ) std::cout << "__GpuRuntime: calling GpuSetDevice(0)" << std::endl; + checkGpu( gpuSetDevice( 0 ) ); // SLOW! 
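As its header comment says, GpuRuntime is meant to be instantiated once at the top of the application's main(), tying device setup and reset to a C++ scope. A sketch of that intent (GPU builds only; the application body is elided):

#ifdef MGONGPUCPP_GPUIMPL
int main( int argc, char** argv )
{
  mg5amcGpu::GpuRuntime gpuRuntime( /*debug=*/true ); // ctor calls gpuSetDevice(0), dtor calls gpuDeviceReset()
  // ... allocate buffers, launch kernels ...
  return 0; // the device is reset here, when gpuRuntime leaves scope
}
#endif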
+ } + + // Tear down CUDA application (call cudaDeviceReset) + // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** + // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck + // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking + static void tearDown( const bool debug = true ) + { + if( debug ) std::cout << "__GpuRuntime: calling GpuDeviceReset()" << std::endl; + checkGpu( gpuDeviceReset() ); + } + }; +} +#endif + +//-------------------------------------------------------------------------- + +#endif // MG5AMC_GPURUNTIME_H diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MadgraphTest.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MadgraphTest.h index ef40624c88..a64c05c26a 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MadgraphTest.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MadgraphTest.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Dec 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #ifndef MADGRAPHTEST_H_ #define MADGRAPHTEST_H_ 1 @@ -22,7 +22,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; @@ -201,7 +201,7 @@ class MadgraphTest : public testing::TestWithParam // Since we link both the CPU-only and GPU tests into the same executable, we prevent // a multiply defined symbol by only compiling this in the non-CUDA phase: -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL /// Compare momenta and matrix elements. /// This uses an implementation of TestDriverBase to run a madgraph workflow, @@ -307,6 +307,6 @@ TEST_P( MadgraphTest, CompareMomentaAndME ) } } -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL #endif /* MADGRAPHTEST_H_ */ diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MatrixElementKernels.cc index 74b5239ebf..81699dfea9 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MatrixElementKernels.cc @@ -1,12 +1,12 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "MatrixElementKernels.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" // Includes the abstraction for Nvidia/AMD compilation #include "MemoryAccessMomenta.h" #include "MemoryBuffers.h" @@ -14,7 +14,7 @@ //============================================================================ -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu { @@ -150,7 +150,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { @@ -209,13 +209,13 @@ namespace mg5amcGpu PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); DeviceBufferHelicityMask devIsGoodHel( ncomb ); // ... 0d1. 
Compute good helicity mask on the device - computeDependentCouplings<<>>( m_gs.data(), m_couplings.data() ); + gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - sigmaKin_getGoodHel<<>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); + gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); #else - sigmaKin_getGoodHel<<>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); + gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); #endif - checkCuda( cudaPeekAtLastError() ); + checkGpu( gpuPeekAtLastError() ); // ... 0d2. Copy back good helicity mask to the host copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); // ... 0d3. Copy back good helicity list to constant memory on the device @@ -226,19 +226,19 @@ namespace mg5amcGpu void MatrixElementKernelDevice::computeMatrixElements( const unsigned int channelId ) { - computeDependentCouplings<<>>( m_gs.data(), m_couplings.data() ); + gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); #ifndef MGONGPU_NSIGHT_DEBUG constexpr unsigned int sharedMemSize = 0; #else constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - sigmaKin<<>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); + gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); #else - sigmaKin<<>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); + gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); #endif - checkCuda( cudaPeekAtLastError() ); - checkCuda( cudaDeviceSynchronize() ); + checkGpu( gpuPeekAtLastError() ); + checkGpu( gpuDeviceSynchronize() ); } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MatrixElementKernels.h index 23e84757a2..72bd8f195b 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MatrixElementKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
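Kernel launches return no status of their own, which is why computeMatrixElements above pairs every launch with two checks. The pattern in isolation, quoting the non-multichannel call from the hunk:

gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() );
checkGpu( gpuPeekAtLastError() );   // synchronous: catches bad launch configurations immediately
checkGpu( gpuDeviceSynchronize() ); // blocks until the kernel finishes, surfacing execution errors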
#ifndef MATRIXELEMENTKERNELS_H #define MATRIXELEMENTKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -81,7 +81,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a CPU host class MatrixElementKernelHost final : public MatrixElementKernelBase, public NumberOfEvents { @@ -130,7 +130,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a GPU device class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessAmplitudes.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessAmplitudes.h index 573b3bbbc9..ffb76e93de 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessAmplitudes.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessAmplitudes.h @@ -15,7 +15,7 @@ #define MGONGPU_TRIVIAL_AMPLITUDES 1 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessCouplings.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessCouplings.h index 35a3af42e0..3afdf3e554 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessCouplings.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessCouplings.h @@ -15,7 +15,7 @@ #include "MemoryBuffers.h" // for HostBufferCouplings::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessCouplingsFixed.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessCouplingsFixed.h index dc0d93afff..ffcdf4dbef 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessCouplingsFixed.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessCouplingsFixed.h @@ -14,7 +14,7 @@ //#include "MemoryAccessHelpers.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessDenominators.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessDenominators.h index 3bce635718..66f2d32a6b 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessDenominators.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessDenominators.h @@ -10,7 +10,7 @@ #include "MemoryAccessGs.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessGs.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessGs.h index 31311aa375..4c726b30f3 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessGs.h +++ 
b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessGs.h @@ -13,7 +13,7 @@ #include "MemoryBuffers.h" // for HostBufferMatrixElements::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessHelpers.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessHelpers.h index c82a6c7635..db73e4e064 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessHelpers.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessHelpers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessHelpers_H #define MemoryAccessHelpers_H 1 @@ -105,7 +105,7 @@ class KernelAccessHelper : public MemoryAccessHelper } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid //printf( "kernelAccessRecord: ievt=%d threadId=%d\n", ievt, threadIdx.x ); return T::ieventAccessRecord( buffer, ievt ); // NB fptype and fptype_sv coincide for CUDA diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessMatrixElements.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessMatrixElements.h index f32e6fea5b..3741011971 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessMatrixElements.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessMatrixElements.h @@ -13,7 +13,7 @@ #include "MemoryBuffers.h" // for HostBufferMatrixElements::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessMomenta.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessMomenta.h index 29266de32c..3be229d392 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessMomenta.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessMomenta.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
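The MemoryAccessHelpers.h hunk above is where the two execution models meet: on the GPU the event index is implicit in the grid position, on the CPU the caller supplies it while looping over events. Reduced to its essentials (sketch; the real code forwards to T::ieventAccessRecord):

#ifdef MGONGPUCPP_GPUIMPL
__device__ inline fptype* eventRecord( fptype* buffer )
{
  const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // one GPU thread, one event
  return &buffer[ievt];
}
#else
inline fptype* eventRecord( fptype* buffer, const int ievt ) // CPU: explicit event index
{
  return &buffer[ievt];
}
#endif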
#ifndef MemoryAccessMomenta_H #define MemoryAccessMomenta_H 1 @@ -13,7 +13,7 @@ #include "MemoryAccessVectors.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -30,7 +30,7 @@ namespace mg5amcCpu // Number of Events Per Page in the momenta AOSOA memory buffer layout // (these are all best kept as a compile-time constants: see issue #23) -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ // ----------------------------------------------------------------------------------------------- // --- GPUs: neppM is best set to a power of 2 times the number of fptype's in a 32-byte cacheline // --- This is relevant to ensure coalesced access to momenta in global memory diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessNumerators.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessNumerators.h index b152183b28..18991f4fa6 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessNumerators.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessNumerators.h @@ -10,7 +10,7 @@ #include "MemoryAccessGs.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessRandomNumbers.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessRandomNumbers.h index e2988d39f3..40cb089135 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessRandomNumbers.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessRandomNumbers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessRandomNumbers_H #define MemoryAccessRandomNumbers_H 1 @@ -11,7 +11,7 @@ #include "CPPProcess.h" #include "MemoryAccessHelpers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessVectors.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessVectors.h index e9b197368e..08faccff0f 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessVectors.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessVectors.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
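The neppM tuning above only matters because momenta are stored as an AOSOA, momenta[npagM][npar][np4][neppM] with nevt = npagM*neppM (see the Bridge.h comment earlier in this patch): neppM consecutive events sit contiguously, so GPU threads in a warp read coalesced memory. The index arithmetic written out (illustrative helper, not the patch's accessor):

// Flat offset of component ip4 of particle ipar for event ievt in the AOSOA.
inline size_t momentumIndex( size_t ievt, size_t ipar, size_t ip4, size_t npar, size_t np4, size_t neppM )
{
  const size_t ipagM = ievt / neppM; // which page of neppM events
  const size_t ieppM = ievt % neppM; // position of this event inside the page
  return ( ( ipagM * npar + ipar ) * np4 + ip4 ) * neppM + ieppM;
}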
#ifndef MemoryAccessVectors_H #define MemoryAccessVectors_H 1 @@ -10,7 +10,7 @@ #include "mgOnGpuVectors.h" -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu // this is only needed for CPU SIMD vectorization { diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessWavefunctions.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessWavefunctions.h index 5428aaf933..33bef4559e 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessWavefunctions.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessWavefunctions.h @@ -15,7 +15,7 @@ #define MGONGPU_TRIVIAL_WAVEFUNCTIONS 1 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryBuffers.h index 3093e6ed18..7756a71621 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryBuffers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021, based on earlier work by S. Hageboeck) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryBuffers_H #define MemoryBuffers_H 1 @@ -11,12 +11,12 @@ #include "mgOnGpuCxtypes.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "Parameters_sm.h" #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -87,7 +87,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL constexpr bool HostBufferALIGNED = false; // ismisaligned=false constexpr bool HostBufferMISALIGNED = true; // ismisaligned=true @@ -119,7 +119,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer template class PinnedHostBufferBase : public BufferBase @@ -128,18 +128,18 @@ namespace mg5amcCpu PinnedHostBufferBase( const size_t size ) : BufferBase( size, false ) { - checkCuda( cudaMallocHost( &( this->m_data ), this->bytes() ) ); + gpuMallocHost( &( this->m_data ), this->bytes() ); } virtual ~PinnedHostBufferBase() { - checkCuda( cudaFreeHost( this->m_data ) ); + gpuFreeHost( this->m_data ); } }; #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer template class DeviceBufferBase : public BufferBase @@ -148,18 +148,18 @@ namespace mg5amcCpu DeviceBufferBase( const size_t size ) : BufferBase( size, true ) { - checkCuda( cudaMalloc( &( this->m_data ), this->bytes() ) ); + gpuMalloc( &( this->m_data ), this->bytes() ); } virtual ~DeviceBufferBase() { - checkCuda( cudaFree( this->m_data ) ); + gpuFree( this->m_data ); } }; #endif //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for a given number of events template 
class HostBuffer : public HostBufferBase, virtual private NumberOfEvents @@ -175,7 +175,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer for a given number of events template class PinnedHostBuffer : public PinnedHostBufferBase, virtual private NumberOfEvents @@ -191,7 +191,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer for a given number of events template class DeviceBuffer : public DeviceBufferBase, virtual private NumberOfEvents @@ -213,7 +213,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta random numbers constexpr size_t sizePerEventRndNumMomenta = MemoryBuffers::np4 * MemoryBuffers::nparf; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta random numbers typedef HostBuffer HostBufferRndNumMomenta; #else @@ -232,7 +232,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer with ONE fptype per event constexpr size_t sizePerEventOneFp = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer with ONE fptype per event typedef HostBuffer HostBufferOneFp; #else @@ -257,7 +257,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for Gs constexpr size_t sizePerEventGs = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferGs; #else @@ -276,7 +276,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for numerators constexpr size_t sizePerEventNumerators = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferNumerators; #else @@ -296,7 +296,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for denominators constexpr size_t sizePerEventDenominators = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferDenominators; #else @@ -315,7 +315,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for random numbers constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferCouplings; #else @@ -333,7 +333,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta constexpr size_t sizePerEventMomenta = MemoryBuffers::np4 * MemoryBuffers::npar; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta typedef HostBuffer HostBufferMomenta; //typedef HostBuffer HostBufferMomenta; // TEST MISALIGNMENT! 
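PinnedHostBufferBase and DeviceBufferBase above tie GPU allocation to object lifetime, which is what lets the long list of per-quantity buffer typedefs that follows stay one line each. The RAII core, condensed (sketch without the BufferBase hierarchy):

template<typename T>
class SimpleDeviceBuffer
{
public:
  explicit SimpleDeviceBuffer( const size_t size ) : m_size( size ), m_data( nullptr )
  {
    gpuMalloc( &m_data, bytes() ); // cudaMalloc or hipMalloc, wrapped in checkGpu
  }
  ~SimpleDeviceBuffer() { gpuFree( m_data ); } // freed automatically on scope exit
  T* data() { return m_data; }
  size_t bytes() const { return m_size * sizeof( T ); }
private:
  const size_t m_size;
  T* m_data;
};

Buffers built this way are then moved across with the copyDeviceFromHost/copyHostFromDevice helpers further down, which compare sizes before delegating to gpuMemcpy.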
@@ -352,7 +352,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for sampling weights constexpr size_t sizePerEventWeights = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for sampling weights typedef HostBuffer HostBufferWeights; #else @@ -370,7 +370,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for matrix elements constexpr size_t sizePerEventMatrixElements = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for matrix elements typedef HostBuffer HostBufferMatrixElements; #else @@ -385,7 +385,7 @@ namespace mg5amcCpu // A base class encapsulating a memory buffer for the helicity mask typedef BufferBase BufferHelicityMask; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for the helicity mask typedef HostBufferBase HostBufferHelicityMask; #else @@ -403,7 +403,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for wavefunctions constexpr size_t sizePerEventWavefunctions = MemoryBuffers::nw6 * MemoryBuffers::nx2; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for wavefunctions typedef HostBuffer HostBufferWavefunctions; #else @@ -421,7 +421,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity random numbers constexpr size_t sizePerEventRndNumHelicity = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity random numbers typedef HostBuffer HostBufferRndNumHelicity; #else @@ -439,7 +439,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color random numbers constexpr size_t sizePerEventRndNumColor = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color random numbers typedef HostBuffer HostBufferRndNumColor; #else @@ -457,7 +457,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity selection constexpr size_t sizePerEventSelectedHelicity = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity selection typedef HostBuffer HostBufferSelectedHelicity; #else @@ -475,7 +475,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color selection constexpr size_t sizePerEventSelectedColor = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color selection typedef HostBuffer HostBufferSelectedColor; #else @@ -487,7 +487,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy { @@ -504,13 +504,13 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyHostToDevice ); } #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void copyHostFromDevice( Tdst& dst, const Tsrc& src ) // keep the same order 
of arguments as in memcpy { @@ -527,7 +527,7 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyDeviceToHost ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyDeviceToHost ); } #endif diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.cc b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.cc index c2f8607428..fa23301c50 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.cc +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -46,7 +45,7 @@ // Class member functions for calculating the matrix elements for // Process: g g > t t~ g g g WEIGHTED<=5 @1 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -80,7 +79,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -90,7 +89,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -118,13 +117,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -151,7 +150,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -187,7 +186,7 @@ namespace 
mg5amcCpu #endif for( int iParity = 0; iParity < nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings @@ -200,8 +199,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -31908,7 +31909,7 @@ namespace mg5amcCpu { -116, 442, 442, -134, -134, 505, -44, -134, 28, -224, -62, 496, -53, 19, -62, 496, 10, -80, -62, -71, 10, -80, 1, -8, 136, -116, -116, -44, -44, 514, -116, 442, -44, 28, -53, -62, -44, -53, 514, -62, 100, 10, 28, -62, -62, 10, 10, 1, 514, 505, -62, 496, -71, 568, -44, -134, -53, -62, 19, -71, 10, -80, 100, 10, 640, -80, 1, -8, 10, 1, 64, -8, -62, -71, 10, -80, 1, -8, 28, -62, -62, 10, 10, 1, 1, -8, 10, 1, 64, -8, -8, 64, -80, -8, -512, 64, 496, 568, -80, 640, -8, 64, -224, 496, 496, -80, -80, -8, -8, 64, -80, -8, -512, 64, 64, -512, 640, 64, 4096, -512 }, { 136, -116, -116, -44, -44, 514, -116, 442, -44, 28, -53, -62, -44, -53, 514, -62, 100, 10, 28, -62, -62, 10, 10, 1, -116, 442, 442, -134, -134, 505, -44, -134, 28, -224, -62, 496, -53, 19, -62, 496, 10, -80, -62, -71, 10, -80, 1, -8, -44, -134, -53, -62, 19, -71, 514, 505, -62, 496, -71, 568, 100, 10, 10, -80, -80, 640, 10, 1, 1, -8, -8, 64, 28, -62, -62, 10, 10, 1, -62, -71, 10, -80, 1, -8, 10, 1, 1, -8, -8, 64, -80, -8, -8, 64, 64, -512, -224, 496, 496, -80, -80, -8, 496, 568, -80, 640, -8, 64, -80, -8, -8, 64, 64, -512, 640, 64, 64, -512, -512, 4096 } }; // 2-D array[120][120] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -31965,7 +31966,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -32024,7 +32025,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -32183,8 +32184,8 @@ namespace mg5amcCpu { 1, 1, 1, -1, 1, -1, 1 }, { 1, 1, 1, -1, 1, 1, -1 }, { 1, 1, 1, -1, 1, 1, 1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -32227,9 +32228,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... 
}; // nicoup=0
-#ifdef __CUDACC__
-    checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) );
-    //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0
+#ifdef MGONGPUCPP_GPUIMPL
+    gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) );
+    //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0
 #else
     memcpy( cIPD, tIPD, 2 * sizeof( fptype ) );
     //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0
@@ -32268,7 +32269,7 @@ namespace mg5amcCpu
   {
     std::stringstream out;
     // CUDA version (NVCC)
-    // [Use __NVCC__ instead of __CUDACC__ here!]
+    // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!]
     // [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file]
     // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712]
 #ifdef __NVCC__
@@ -32333,12 +32334,12 @@ namespace mg5amcCpu
   __global__ void /* clang-format off */
   computeDependentCouplings( const fptype* allgs, // input: Gs[nevt]
                              fptype* allcouplings // output: couplings[nevt*ndcoup*2]
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
                              , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
 #endif
                              ) /* clang-format on */
   {
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
     using namespace mg5amcGpu;
     using G_ACCESS = DeviceAccessGs;
     using C_ACCESS = DeviceAccessCouplings;
@@ -32359,7 +32360,7 @@ namespace mg5amcCpu

   //--------------------------------------------------------------------------

-#ifdef __CUDACC__ /* clang-format off */
+#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */
   __global__ void
   sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4]
                        const fptype* allcouplings, // input: couplings[nevt*ndcoup*2]
@@ -32485,9 +32486,9 @@ namespace mg5amcCpu
         nGoodHel++;
       }
     }
-#ifdef __CUDACC__
-    checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) );
-    checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) );
+#ifdef MGONGPUCPP_GPUIMPL
+    gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) );
+    gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) );
 #else
     cNGoodHel = nGoodHel;
     for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel];
@@ -32511,7 +32512,7 @@ namespace mg5amcCpu
 #endif
             int* allselhel, // output: helicity selection[nevt]
             int* allselcol // output: color selection[nevt]
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
             , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
 #endif
             ) /* clang-format on */
@@ -32531,7 +32532,7 @@
     // Denominators: spins, colors and identical particles
     constexpr int helcolDenominators[1] = { 1536 }; // assume nprocesses == 1 (#272 and #343)

-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
     // Remember: in CUDA this is a kernel for one event, in c++ this processes n events
     const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid
 #else
@@ -32545,9 +32546,12 @@ namespace mg5amcCpu
 #endif
     // Start sigmaKin_lines
+
+#include "GpuAbstraction.h"
+
     // === PART 0 - INITIALISATION (before calculate_wavefunctions) ===
     // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
     allMEs[ievt] = 0;
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
     allNumerators[ievt] = 0;
@@ -32575,7 +32579,7 @@ namespace mg5amcCpu

     // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS ===
     // (in both CUDA and C++, using precomputed good helicities)
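     // Both branches below perform the same reduction over the precomputed good helicities; a minimal
     // sketch of the shared pattern (illustrative only, with the full argument list elided) is:
     //   for( int ighel = 0; ighel < cNGoodHel; ighel++ )
     //   {
     //     const int ihel = cGoodHel[ighel]; // use only the helicities kept by sigmaKin_getGoodHel
     //     calculate_wavefunctions( ihel, ... ); // accumulate the running sum of |M|^2 into allMEs
     //   }
     // In CUDA this runs once per event (one GPU thread per event); in C++ it runs over pages of
     // neppV events, vectorized with SIMD.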
-#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -32785,7 +32789,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.h index 2565923dde..fff95b66e2 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -107,7 +107,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -120,7 +120,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -150,7 +150,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CudaRuntime.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/GpuAbstraction.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/GpuAbstraction.h new file mode 120000 index 
0000000000..72054e19ba --- /dev/null +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/GpuAbstraction.h @@ -0,0 +1 @@ +../GpuAbstraction.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/GpuRuntime.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/GpuRuntime.h new file mode 120000 index 0000000000..3920e83be4 --- /dev/null +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/GpuRuntime.h @@ -0,0 +1 @@ +../GpuRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/check_sa.cc b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/check_sa.cc index 3fbf0ffbee..7cac5ab47b 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/check_sa.cc +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -65,7 +66,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -96,7 +97,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) 
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
   using namespace mg5amcGpu;
 #else
   using namespace mg5amcCpu;
 #endif
@@ -134,9 +135,11 @@ main( int argc, char** argv )
     CurandDevice = 2
   };
 #ifdef MGONGPU_HAS_NO_CURAND
-  RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784)
+  RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785)
+#elif defined __HIPCC__
+#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random
 #elif defined __CUDACC__
-  RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU if build has curand
+  RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand
 #else
   RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand
 #endif
@@ -146,10 +149,10 @@ main( int argc, char** argv )
     RamboHost = 1,
     RamboDevice = 2
   };
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
   RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU
 #else
-  RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU
+  RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost;   // default on CPU
 #endif
   // Bridge emulation mode (NB Bridge implies RamboHost!)
   bool bridge = false;
@@ -177,7 +180,7 @@ main( int argc, char** argv )
     else if( arg == "--curdev" )
     {
 #ifndef __CUDACC__
-      throw std::runtime_error( "CurandDevice is not supported on CPUs" );
+      throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" );
 #elif defined MGONGPU_HAS_NO_CURAND
       throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" );
 #else
@@ -198,7 +201,7 @@ main( int argc, char** argv )
     }
     else if( arg == "--rmbdev" )
     {
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
       rmbsmp = RamboSamplingMode::RamboDevice;
 #else
       throw std::runtime_error( "RamboDevice is not supported on CPUs" );
@@ -272,13 +275,13 @@ main( int argc, char** argv )
     return usage( argv[0] );
   }

-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
 #ifdef _OPENMP
   ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1)
 #endif
 #endif

-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
   // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation
   // Note: this prevents a crash on pmpe04 but not on some github CI nodes?
   // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!]
@@ -296,14 +299,14 @@ main( int argc, char** argv )

   // === STEP 0 - INITIALISE

-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL

-  // --- 00. Initialise cuda
-  // Instantiate a CudaRuntime at the beginnining of the application's main to
-  // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor
-  const std::string cdinKey = "00 CudaInit";
+  // --- 00. Initialise GPU
+  // Instantiate a GpuRuntime at the beginning of the application's main.
+  // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor.
+  const std::string cdinKey = "00 GpuInit";
   timermap.start( cdinKey );
-  CudaRuntime cudaRuntime( debug );
+  GpuRuntime gpuRuntime( debug );
 #endif

   // --- 0a.
Initialise physics process @@ -325,7 +328,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -333,7 +336,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -341,7 +344,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -349,7 +352,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -366,7 +369,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -375,7 +378,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -384,7 +387,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -392,7 +395,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -400,7 +403,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -438,7 +441,7 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! 
CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } @@ -450,7 +453,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -461,7 +464,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -469,7 +472,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -511,7 +514,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -543,7 +546,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -588,7 +591,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -617,7 +620,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -760,18 +763,22 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; -#endif +#endif /* clang-format off */ // -- DOUBLE or FLOAT? #if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) @@ -781,7 +788,7 @@ main( int argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif +#endif /* clang-format on */ // -- CUCOMPLEX or THRUST or STD complex numbers? 
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -793,6 +800,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_CUCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -818,7 +831,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -874,7 +887,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -895,6 +908,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -921,21 +936,21 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? " == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -966,7 +981,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1062,14 +1077,14 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1077,7 +1092,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? 
" == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/RamboSamplingKernels.cc b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/RamboSamplingKernels.cc index da68aa9255..79abbcc4f8 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/RamboSamplingKernels.cc +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/RamboSamplingKernels.cc @@ -1,11 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "RamboSamplingKernels.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessRandomNumbers.h" #include "MemoryAccessWeights.h" @@ -14,7 +14,7 @@ #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -92,7 +92,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingKernelDevice::RamboSamplingKernelDevice( const fptype energy, // input: energy const BufferRndNumMomenta& rndmom, // input: random numbers in [0,1] BufferMomenta& momenta, // output: momenta @@ -135,7 +135,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaInitialDevice( const fptype energy, fptype* momenta ) @@ -147,17 +147,17 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaInitial() { - getMomentaInitialDevice<<>>( m_energy, m_momenta.data() ); + gpuLaunchKernel( getMomentaInitialDevice, m_gpublocks, m_gputhreads, m_energy, m_momenta.data() ); } #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaFinalDevice( const fptype energy, const fptype* rndmom, @@ -171,11 +171,11 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaFinal() { - getMomentaFinalDevice<<>>( m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); + gpuLaunchKernel( getMomentaFinalDevice, m_gpublocks, m_gputhreads, m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); } #endif diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/RamboSamplingKernels.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/RamboSamplingKernels.h index 184089efd7..7c214cd74b 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/RamboSamplingKernels.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/RamboSamplingKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#ifndef RAMBOSAMPLINGKERNELS_H #define RAMBOSAMPLINGKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating RAMBO phase space sampling on a GPU device class RamboSamplingKernelDevice final : public SamplingKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/RandomNumberKernels.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/RandomNumberKernels.h index 188a72c2c9..21d63beeac 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/RandomNumberKernels.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/RandomNumberKernels.h @@ -1,14 +1,14 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef RANDOMNUMBERKERNELS_H #define RANDOMNUMBERKERNELS_H 1 #include "mgOnGpuConfig.h" -// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when __CUDACC__ is not defined +// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when MGONGPUCPP_GPUIMPL is not defined #ifndef MGONGPU_HAS_NO_CURAND //#include "curand.h" struct curandGenerator_st; // forward definition from curand.h @@ -16,7 +16,7 @@ struct curandGenerator_st; // forward definition from curand.h #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp.mk index 509307506b..f2cfa349da 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ # Copyright (C) 2020-2023 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +# Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts @@ -42,10 +42,10 @@ endif #------------------------------------------------------------------------------- -#=== Configure common compiler flags for C++ and CUDA +#=== Configure common compiler flags for C++ and CUDA/HIP INCFLAGS = -I. 
-OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here
+OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here

 # Dependency on src directory
 MG5AMC_COMMONLIB = mg5amc_common
@@ -121,24 +121,46 @@ endif

 #-------------------------------------------------------------------------------

-#=== Configure the CUDA compiler
+#=== Configure the GPU compiler (CUDA or HIP)

-# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505)
-# This is because it is impossible to pass this to "CUFLAGS += -ccbin <host-compiler>" below
+# FIXME! (AV 24.01.2024)
+# In the current implementation (without separate builds for C++ and CUDA/HIP), we first check for nvcc and hipcc in CUDA_HOME and HIP_HOME.
+# If CUDA_HOME or HIP_HOME are not set, try to determine them from the path to nvcc and hipcc.
+# While convoluted, this is currently necessary to allow disabling CUDA/HIP builds by setting CUDA_HOME or HIP_HOME to invalid paths.
+# This will (probably?) be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775).
+
+# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA and HIP builds (issue #505)
+# This is because it is impossible to pass this to "GPUFLAGS += -ccbin <host-compiler>" below
 ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache <host-compiler>" from outside
-  $(warning CUDA builds are not supported for multi-word CXX "$(CXX)")
+  $(warning CUDA and HIP builds are not supported for multi-word CXX "$(CXX)")
   override CUDA_HOME=disabled
+  override HIP_HOME=disabled
 endif

-# If CUDA_HOME is not set, try to set it from the location of nvcc
+# If CUDA_HOME is not set, try to set it from the path to nvcc
 ifndef CUDA_HOME
   CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null))
   $(warning CUDA_HOME was not set: using "$(CUDA_HOME)")
 endif

-# Set NVCC as $(CUDA_HOME)/bin/nvcc if it exists
+# If HIP_HOME is not set, try to set it from the path to hipcc
+ifndef HIP_HOME
+  HIP_HOME = $(patsubst %bin/hipcc,%,$(HIP_COMPILER_PATH))
+  $(warning HIP_HOME was not set: using "$(HIP_HOME)")
+endif
+
+# FIXME! (AV 24.01.2024)
+# In the current implementation (without separate builds for C++ and CUDA/HIP),
+# builds are performed for HIP only if CUDA is not found in the path.
+# If both CUDA and HIP are installed, HIP builds can be triggered by unsetting CUDA_HOME.
+# This will be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775).
+
+#--- Option 1: CUDA exists -> use CUDA
+
+# Set GPUCC as $(CUDA_HOME)/bin/nvcc if it exists
 ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),)
-  NVCC = $(CUDA_HOME)/bin/nvcc
+
+  GPUCC = $(CUDA_HOME)/bin/nvcc
   USE_NVTX ?=-DUSE_NVTX
   # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html
   # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/
@@ -158,41 +180,78 @@ ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),)
   CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here!
endif CUOPTFLAGS = -lineinfo - CUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math - ###CUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow - ###NVCC_VERSION = $(shell $(NVCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) - CUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h + ###GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + GPUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + ###GPUCC_VERSION = $(shell $(GPUCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) + GPUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + CUBUILDRULEFLAGS = -Xcompiler -fPIC -c + CCBUILDRULEFLAGS = -Xcompiler -fPIC -c -x cu + CUDATESTFLAGS = -lcuda + + # Set the host C++ compiler for GPUCC via "-ccbin " + # (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) + GPUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) + + # Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) + ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) + GPUFLAGS += -allow-unsupported-compiler + endif + else ifneq ($(origin REQUIRE_CUDA),undefined) + # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) - $(error No cuda installation found (set CUDA_HOME or make nvcc visible in PATH)) + $(error No cuda installation found (set CUDA_HOME or make GPUCC visible in PATH)) + +#--- Option 2: CUDA does not exist, HIP exists -> use HIP + +# Set GPUCC as $(HIP_HOME)/bin/hipcc if it exists +else ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),) + + GPUCC = $(HIP_HOME)/bin/hipcc + #USE_NVTX ?=-DUSE_NVTX # should maybe find something equivalent to this in HIP? 
+  HIPARCHFLAGS = -target x86_64-linux-gnu --offload-arch=gfx90a
+  HIPINC = -I$(HIP_HOME)/include/
+  # Note: -DHIP_FAST_MATH is equivalent to -use_fast_math in HIP
+  # (but only for single precision line 208: https://rocm-developer-tools.github.io/HIP/hcc__detail_2math__functions_8h_source.html)
+  GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -DHIP_FAST_MATH -DHIP_PLATFORM=amd -fPIC
+  ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow
+  GPUFLAGS += -std=c++17
+  ###GPUFLAGS+= --maxrregcount 255 # (AV: is this option valid on HIP and meaningful on AMD GPUs?)
+  CUBUILDRULEFLAGS = -fPIC -c
+  CCBUILDRULEFLAGS = -fPIC -c
+
+else ifneq ($(origin REQUIRE_HIP),undefined)
+
+  # If REQUIRE_HIP is set but no HIP is found, stop here (e.g. for CI tests on GPU #443)
+  $(error No hip installation found (set HIP_HOME or make GPUCC visible in PATH))
+
+#--- Option 3: CUDA does not exist, HIP does not exist -> switch off both CUDA and HIP
+
 else
-  # No cuda. Switch cuda compilation off and go to common random numbers in C++
+
+  # No nvcc and no hipcc: switch CUDA and HIP compilation off and go to common random numbers in C++
   $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda)
+  $(warning HIP_HOME is not set or is invalid: export HIP_HOME to compile with hip)
-  override NVCC=
+  override GPUCC=
   override USE_NVTX=
   override CUINC=
   override CURANDLIBFLAGS=
-endif
-export NVCC
-export CUFLAGS
-
-# Set the host C++ compiler for nvcc via "-ccbin <host-compiler>"
-# (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported)
-CUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX)))
-# Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504)
-ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined)
-CUFLAGS += -allow-unsupported-compiler
 endif

+# Export GPUCC (so that it can also be used in cudacpp_src.mk?)
+export GPUCC
+export GPUFLAGS
+
 #-------------------------------------------------------------------------------

-#=== Configure ccache for C++ and CUDA builds
+#=== Configure ccache for C++ and CUDA/HIP builds

 # Enable ccache if USECCACHE=1
 ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1)
@@ -201,15 +260,15 @@ endif
 #ifeq ($(USECCACHE)$(shell echo $(AR) | grep ccache),1)
 #  override AR:=ccache $(AR)
 #endif
-ifneq ($(NVCC),)
-  ifeq ($(USECCACHE)$(shell echo $(NVCC) | grep ccache),1)
-    override NVCC:=ccache $(NVCC)
+ifneq ($(GPUCC),)
+  ifeq ($(USECCACHE)$(shell echo $(GPUCC) | grep ccache),1)
+    override GPUCC:=ccache $(GPUCC)
   endif
 endif

 #-------------------------------------------------------------------------------

-#=== Configure PowerPC-specific compiler flags for C++ and CUDA
+#=== Configure PowerPC-specific compiler flags for C++ and CUDA/HIP

 # PowerPC-specific CXX compiler flags (being reviewed)
 ifeq ($(UNAME_P),ppc64le)
@@ -225,9 +284,9 @@ else
   ######CXXFLAGS+= -fno-semantic-interposition # no benefit (neither alone, nor combined with -flto)
 endif

-# PowerPC-specific CUDA compiler flags (to be reviewed!)
+# PowerPC-specific CUDA/HIP compiler flags (to be reviewed!)
ifeq ($(UNAME_P),ppc64le) - CUFLAGS+= -Xcompiler -mno-float128 + GPUFLAGS+= -Xcompiler -mno-float128 endif #------------------------------------------------------------------------------- @@ -237,7 +296,7 @@ endif # Set the default OMPFLAGS choice ifneq ($(shell $(CXX) --version | egrep '^Intel'),) override OMPFLAGS = -fopenmp -###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without nvcc but not ok with nvcc before #578) +###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without GPUCC but not ok with GPUCC before #578) else ifneq ($(shell $(CXX) --version | egrep '^(clang)'),) override OMPFLAGS = -fopenmp ###override OMPFLAGS = # disable OpenMP MT on clang (was not ok without or with nvcc before #578) @@ -293,7 +352,10 @@ endif # Set the default RNDGEN (random number generator) choice ifeq ($(RNDGEN),) - ifeq ($(NVCC),) + ifeq ($(GPUCC),) + override RNDGEN = hasNoCurand + # Edgecase for HIP compilation + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) override RNDGEN = hasNoCurand else ifeq ($(RNDGEN),) override RNDGEN = hasCurand @@ -310,7 +372,7 @@ export OMPFLAGS #------------------------------------------------------------------------------- -#=== Set the CUDA/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN +#=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN # Set the build flags appropriate to OMPFLAGS $(info OMPFLAGS=$(OMPFLAGS)) @@ -368,13 +430,13 @@ CXXFLAGS+= $(AVXFLAGS) $(info FPTYPE=$(FPTYPE)) ifeq ($(FPTYPE),d) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE - CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE else ifeq ($(FPTYPE),f) CXXFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT - CUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT else ifeq ($(FPTYPE),m) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT - CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT else $(error Unknown FPTYPE='$(FPTYPE)': only 'd', 'f' and 'm' are supported) endif @@ -383,7 +445,7 @@ endif $(info HELINL=$(HELINL)) ifeq ($(HELINL),1) CXXFLAGS += -DMGONGPU_INLINE_HELAMPS - CUFLAGS += -DMGONGPU_INLINE_HELAMPS + GPUFLAGS += -DMGONGPU_INLINE_HELAMPS else ifneq ($(HELINL),0) $(error Unknown HELINL='$(HELINL)': only '0' and '1' are supported) endif @@ -392,7 +454,7 @@ endif $(info HRDCOD=$(HRDCOD)) ifeq ($(HRDCOD),1) CXXFLAGS += -DMGONGPU_HARDCODE_PARAM - CUFLAGS += -DMGONGPU_HARDCODE_PARAM + GPUFLAGS += -DMGONGPU_HARDCODE_PARAM else ifneq ($(HRDCOD),0) $(error Unknown HRDCOD='$(HRDCOD)': only '0' and '1' are supported) endif @@ -444,11 +506,11 @@ ifeq ($(UNAME_S),Darwin) override CULIBFLAGSRPATH2 = else # RPATH to cuda/cpp libs when linking executables - override CXXLIBFLAGSRPATH = -Wl,-rpath,$(LIBDIRRPATH) - override CULIBFLAGSRPATH = -Xlinker -rpath,$(LIBDIRRPATH) + override CXXLIBFLAGSRPATH = -Wl,-rpath=$(LIBDIRRPATH) + override CULIBFLAGSRPATH = -Xlinker -rpath=$(LIBDIRRPATH) # RPATH to common lib when linking cuda/cpp libs - override CXXLIBFLAGSRPATH2 = -Wl,-rpath,'$$ORIGIN' - override CULIBFLAGSRPATH2 = -Xlinker -rpath,'$$ORIGIN' + override CXXLIBFLAGSRPATH2 = -Wl,-rpath='$$ORIGIN' + override CULIBFLAGSRPATH2 = -Xlinker -rpath='$$ORIGIN' endif # Setting LD_LIBRARY_PATH or DYLD_LIBRARY_PATH in the RUNTIME is no longer necessary 
(neither on Linux nor on Mac) @@ -461,7 +523,7 @@ override RUNTIME = cxx_main=$(BUILDDIR)/check.exe fcxx_main=$(BUILDDIR)/fcheck.exe -ifneq ($(NVCC),) +ifneq ($(GPUCC),) cu_main=$(BUILDDIR)/gcheck.exe fcu_main=$(BUILDDIR)/fgcheck.exe else @@ -492,15 +554,16 @@ $(BUILDDIR)/.build.$(TAG): @touch $(BUILDDIR)/.build.$(TAG) # Generic target and build rules: objects from CUDA compilation -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/%.o : %.cu *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CUBUILDRULEFLAGS) $< -o $@ $(BUILDDIR)/%_cu.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CCBUILDRULEFLAGS) $< -o $@ endif +# -x cu in line above # Generic target and build rules: objects from C++ compilation # (NB do not include CUINC here! add it only for NVTX or curand #679) @@ -509,11 +572,14 @@ $(BUILDDIR)/%.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) $(CXX) $(CPPFLAGS) $(CXXFLAGS) -fPIC -c $< -o $@ # Apply special build flags only to CrossSectionKernel.cc and gCrossSectionKernel.cu (no fast math, see #117 and #516) +# Added edgecase for HIP compilation ifeq ($(shell $(CXX) --version | grep ^nvc++),) $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS := $(filter-out -ffast-math,$(CXXFLAGS)) $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math -ifneq ($(NVCC),) -$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -fno-fast-math +ifeq ($(findstring nvcc,$(GPUCC)),nvcc) + $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -fno-fast-math +else + $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -fno-fast-math endif endif @@ -530,10 +596,10 @@ ifeq ($(RNDGEN),hasCurand) $(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CUINC) endif -# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in nvcc with icx2023 (#592) +# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in GPUCC with icx2023 (#592) ifneq ($(shell $(CXX) --version | egrep '^(Intel)'),) -ifneq ($(NVCC),) -CUFLAGS += -Xcompiler -Wno-deprecated-builtins +ifneq ($(GPUCC),) +GPUFLAGS += -Wno-deprecated-builtins endif endif @@ -541,8 +607,8 @@ endif # This patch does remove the warning, but I prefer to keep it disabled for the moment... 
###ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang|Intel)'),) ###$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -Wno-overriding-t-option -###ifneq ($(NVCC),) -###$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -Wno-overriding-t-option +###ifneq ($(GPUCC),) +###$(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -Wno-overriding-t-option ###endif ###endif @@ -569,7 +635,7 @@ MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp cxx_objects_lib=$(BUILDDIR)/CPPProcess.o $(BUILDDIR)/MatrixElementKernels.o $(BUILDDIR)/BridgeKernels.o $(BUILDDIR)/CrossSectionKernels.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel.o $(BUILDDIR)/RamboSamplingKernels.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) MG5AMC_CULIB = mg5amc_$(processid_short)_cuda cu_objects_lib=$(BUILDDIR)/gCPPProcess.o $(BUILDDIR)/gMatrixElementKernels.o $(BUILDDIR)/gBridgeKernels.o $(BUILDDIR)/gCrossSectionKernels.o cu_objects_exe=$(BUILDDIR)/gCommonRandomNumberKernel.o $(BUILDDIR)/gRamboSamplingKernels.o @@ -581,11 +647,11 @@ $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: cxx_objects_lib += $(BUILDDIR)/fbridge.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(CXX) -shared -o $@ $(cxx_objects_lib) $(CXXLIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: cu_objects_lib += $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cu_objects_lib) - $(NVCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) endif #------------------------------------------------------------------------------- @@ -602,16 +668,16 @@ $(cxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PAT $(cxx_main): $(BUILDDIR)/check_sa.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CXX) -o $@ $(BUILDDIR)/check_sa.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CURANDLIBFLAGS) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(cu_main): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(cu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(cu_main): $(BUILDDIR)/gcheck_sa.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o - $(NVCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) + $(GPUCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) endif 
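# A usage sketch for the GPUCC selection earlier in this makefile (assuming both toolkits are
# installed): CUDA is chosen by default; a HIP build can be forced by pointing CUDA_HOME at an
# invalid path, e.g.
#   CUDA_HOME=disabled make
# while setting REQUIRE_CUDA or REQUIRE_HIP turns a missing toolkit into a hard error (CI tests on GPU #443).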
#------------------------------------------------------------------------------- @@ -637,17 +703,17 @@ $(fcxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PA $(fcxx_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') endif ifeq ($(UNAME_S),Darwin) $(fcu_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(fcu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(fcu_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) - $(NVCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) + $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) endif #------------------------------------------------------------------------------- @@ -659,7 +725,7 @@ $(BUILDDIR)/testxxx.o: testxxx_cc_ref.txt $(testmain): $(BUILDDIR)/testxxx.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testxxx.o # Comment out this line to skip the C++ test of xxx functions -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testxxx_cu.o: $(GTESTLIBS) $(BUILDDIR)/testxxx_cu.o: INCFLAGS += $(GTESTINC) $(BUILDDIR)/testxxx_cu.o: testxxx_cc_ref.txt @@ -672,7 +738,7 @@ $(BUILDDIR)/testmisc.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testmisc.o # Comment out this line to skip the C++ miscellaneous tests -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testmisc_cu.o: $(GTESTLIBS) $(BUILDDIR)/testmisc_cu.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc_cu.o @@ -684,12 +750,12 @@ $(BUILDDIR)/runTest.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/runTest.o $(testmain): cxx_objects_exe += $(BUILDDIR)/runTest.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/runTest_cu.o: $(GTESTLIBS) $(BUILDDIR)/runTest_cu.o: INCFLAGS += $(GTESTINC) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(testmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif @@ -713,14 +779,14 @@ $(testmain): LIBFLAGS += -lgomp endif endif -ifeq 
($(NVCC),) # link only runTest.o
+ifeq ($(GPUCC),) # link only runTest.o
 $(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH
 $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS)
 	$(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS)
 else # link both runTest.o and runTest_cu.o
 $(testmain): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH
 $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) $(GTESTLIBS)
-	$(NVCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) -lcuda
+	$(GPUCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS)
 endif

 # Use target gtestlibs to build only googletest
@@ -829,9 +895,9 @@ ifeq ($(USECCACHE),1)
 	ccache --version | head -1
 endif
 	@echo ""
-	@echo NVCC=$(NVCC)
-ifneq ($(NVCC),)
-	$(NVCC) --version
+	@echo GPUCC=$(GPUCC)
+ifneq ($(GPUCC),)
+	$(GPUCC) --version
 endif
 	@echo ""
 	@echo CXX=$(CXX)
@@ -850,7 +916,7 @@ endif

 # Target: check (run the C++ test executable)
 # [NB THIS IS WHAT IS USED IN THE GITHUB CI!]
-ifneq ($(NVCC),)
+ifneq ($(GPUCC),)
 check: runTest cmpFcheck cmpFGcheck
 else
 check: runTest cmpFcheck
diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/fbridge.cc b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/fbridge.cc
index 2d2b36d560..22ce3f5115 100644
--- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/fbridge.cc
+++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/fbridge.cc
@@ -1,11 +1,11 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: S. Roiser (Oct 2021) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.

 #include "Bridge.h"
 #include "CPPProcess.h"
-#include "CudaRuntime.h"
+#include "GpuRuntime.h"

 extern "C"
 {
@@ -22,7 +22,7 @@ extern "C"
   * Using the same Fortran MadEvent code, linking to the heterogeneous library would allow access to both CPU and GPU implementations.
   * The specific heterogeneous configuration (how many GPUs, how many threads on each CPU, etc) could be loaded in CUDA/C++ from a data file.
*/ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -46,8 +46,8 @@ extern "C" */ void fbridgecreate_( CppObjectInFortran** ppbridge, const int* pnevtF, const int* pnparF, const int* pnp4F ) { -#ifdef __CUDACC__ - CudaRuntime::setUp(); +#ifdef MGONGPUCPP_GPUIMPL + GpuRuntime::setUp(); #endif // (NB: CPPProcess::initProc no longer needs to be executed here because it is called in the Bridge constructor) // FIXME: disable OMP in Bridge when called from Fortran @@ -65,8 +65,8 @@ extern "C" Bridge* pbridge = dynamic_cast*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgedelete_: invalid Bridge address" ); delete pbridge; -#ifdef __CUDACC__ - CudaRuntime::tearDown(); +#ifdef MGONGPUCPP_GPUIMPL + GpuRuntime::tearDown(); #endif } @@ -96,7 +96,7 @@ extern "C" { Bridge* pbridge = dynamic_cast*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgesequence_: invalid Bridge address" ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Use the device/GPU implementation in the CUDA library // (there is also a host implementation in this library) pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, *pchannelId, mes, selhel, selcol ); diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/fsampler.cc b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/fsampler.cc index 2fb445372d..3743934f41 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/fsampler.cc +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/fsampler.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Feb 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "mgOnGpuConfig.h" @@ -13,7 +13,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -40,7 +40,7 @@ namespace mg5amcCpu private: const int m_nevt; // The number of events in each iteration int m_iiter; // The iteration counter (for random number seeding) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta m_hstRndmom; // Memory buffers for random numbers HostBufferMomenta m_hstMomenta; // Memory buffers for momenta HostBufferWeights m_hstWeights; // Memory buffers for sampling weights @@ -105,7 +105,7 @@ namespace mg5amcCpu extern "C" { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/runTest.cc b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/runTest.cc index d4a760a71b..de327f2321 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/runTest.cc +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/runTest.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
//---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -18,7 +18,7 @@ #include "RandomNumberKernels.h" #include "epoch_process_id.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -35,7 +35,7 @@ struct CUDA_CPU_TestBase : public TestDriverBase : TestDriverBase( npar, refFileName ) {} }; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL struct CPUTest : public CUDA_CPU_TestBase { // Struct data members (process, and memory structures for random numbers, momenta, matrix elements and weights on host and device) @@ -119,7 +119,7 @@ struct CPUTest : public CUDA_CPU_TestBase }; #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL struct CUDATest : public CUDA_CPU_TestBase { // Reset the device when our test goes out of scope. Note that this should happen after @@ -128,7 +128,7 @@ struct CUDATest : public CUDA_CPU_TestBase { ~DeviceReset() { - checkCuda( cudaDeviceReset() ); // this is needed by cuda-memcheck --leak-check full + checkGpu( gpuDeviceReset() ); // this is needed by cuda-memcheck --leak-check full } } deviceResetter; @@ -256,7 +256,7 @@ INSTANTIATE_TEST_SUITE_P( prefix, \ test_suite_name, \ testing::Values( new CUDATest( MG_EPOCH_REFERENCE_FILE_NAME ) ) ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL MG_INSTANTIATE_TEST_SUITE_GPU( XTESTID_GPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); #else MG_INSTANTIATE_TEST_SUITE_CPU( XTESTID_CPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/testmisc.cc b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/testmisc.cc index 895d6eeb56..ba9e59a8a3 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/testmisc.cc +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/testmisc.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*misc to run only testmisc.cc tests //---------------------------------------------------------------------------- @@ -17,7 +17,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_MISC #else #define TESTID( s ) s##_CPU_MISC @@ -26,7 +26,7 @@ #define XTESTID( s ) TESTID( s ) // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -59,7 +59,7 @@ namespace mg5amcCpu TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testmisc ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/testxxx.cc b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/testxxx.cc index 3361fe5aa9..e5167de00c 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/testxxx.cc +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/testxxx.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). 
// Created by: A. Valassi (Apr 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -24,7 +24,7 @@ #include #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_XXX #else #define TESTID( s ) s##_CPU_XXX @@ -32,7 +32,7 @@ #define XTESTID( s ) TESTID( s ) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -42,7 +42,7 @@ namespace mg5amcCpu int FPEhandlerIevt = -1; inline void FPEhandler( int sig ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL std::cerr << "Floating Point Exception (GPU): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; #else std::cerr << "Floating Point Exception (CPU neppV=" << neppV << "): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; @@ -53,7 +53,7 @@ namespace mg5amcCpu TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -77,7 +77,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) assert( nevt % neppM == 0 ); // nevt must be a multiple of neppM assert( nevt % neppV == 0 ); // nevt must be a multiple of neppV // Fill in the input momenta -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL mg5amcGpu::PinnedHostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] #else mg5amcCpu::HostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] @@ -322,7 +322,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { for( int ievt = 0; ievt < nevt; ievt++ ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_ttggg.sa/src/HelAmps_sm.h b/epochX/cudacpp/gg_ttggg.sa/src/HelAmps_sm.h index 8df465ad6d..8b4ad719be 100644 --- a/epochX/cudacpp/gg_ttggg.sa/src/HelAmps_sm.h +++ b/epochX/cudacpp/gg_ttggg.sa/src/HelAmps_sm.h @@ -5,7 +5,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -28,7 +28,7 @@ //#include //#include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttggg.sa/src/Parameters_sm.cc b/epochX/cudacpp/gg_ttggg.sa/src/Parameters_sm.cc index 64fc3fea62..067445b198 100644 --- a/epochX/cudacpp/gg_ttggg.sa/src/Parameters_sm.cc +++ b/epochX/cudacpp/gg_ttggg.sa/src/Parameters_sm.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
+// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -17,7 +17,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_ttggg.sa/src/Parameters_sm.h b/epochX/cudacpp/gg_ttggg.sa/src/Parameters_sm.h index b6568d3761..9581d66e0e 100644 --- a/epochX/cudacpp/gg_ttggg.sa/src/Parameters_sm.h +++ b/epochX/cudacpp/gg_ttggg.sa/src/Parameters_sm.h @@ -27,7 +27,7 @@ #include "read_slha.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu #include // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -218,7 +218,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -239,7 +239,7 @@ namespace mg5amcCpu #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wunused-variable" // e.g. <> #pragma GCC diagnostic ignored "-Wunused-parameter" // e.g. <> -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma nv_diagnostic push #pragma nv_diag_suppress 177 // e.g. <> #endif @@ -267,7 +267,7 @@ namespace mg5amcCpu // End SM implementation - no special handling of vectors of floats as in EFT (#439) return out; } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma GCC diagnostic pop #pragma nv_diagnostic pop #endif diff --git a/epochX/cudacpp/gg_ttggg.sa/src/cudacpp_src.mk b/epochX/cudacpp/gg_ttggg.sa/src/cudacpp_src.mk index d4cc628aec..159e19a46d 100644 --- a/epochX/cudacpp/gg_ttggg.sa/src/cudacpp_src.mk +++ b/epochX/cudacpp/gg_ttggg.sa/src/cudacpp_src.mk @@ -19,7 +19,7 @@ SHELL := /bin/bash #=== Configure common compiler flags for CUDA and C++ INCFLAGS = -I. 
-OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here
+OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here

#-------------------------------------------------------------------------------

@@ -45,13 +45,13 @@ endif

#-------------------------------------------------------------------------------

-#=== Configure the CUDA compiler (note: NVCC is already exported including ccache)
+#=== Configure the CUDA compiler (note: GPUCC is already exported including ccache)

-###$(info NVCC=$(NVCC))
+###$(info GPUCC=$(GPUCC))

#-------------------------------------------------------------------------------

-#=== Configure ccache for C++ builds (note: NVCC is already exported including ccache)
+#=== Configure ccache for C++ builds (note: GPUCC is already exported including ccache)

# Enable ccache if USECCACHE=1
ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1)
@@ -92,6 +92,13 @@ endif
###$(info OMPFLAGS=$(OMPFLAGS))
CXXFLAGS += $(OMPFLAGS)

+# Add the compiler-specific GPU flags when compiling for CUDA (nvcc) or HIP (hipcc)
+ifeq ($(findstring nvcc,$(GPUCC)),nvcc)
+  GPUFLAGS += -Xcompiler -fPIC -c -x cu
+else ifeq ($(findstring hipcc,$(GPUCC)),hipcc)
+  GPUFLAGS += -fPIC -c
+endif
+
# Set the build flags appropriate to each AVX choice (example: "make AVX=none")
# [NB MGONGPU_PVW512 is needed because "-mprefer-vector-width=256" is not exposed in a macro]
# [See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=96476]
@@ -253,20 +260,20 @@ $(BUILDDIR)/%.o : %.cc *.h $(BUILDDIR)/.build.$(TAG)
# Generic target and build rules: objects from CUDA compilation
$(BUILDDIR)/%_cu.o : %.cc *.h $(BUILDDIR)/.build.$(TAG)
	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@
+	$(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $< -o $@

#-------------------------------------------------------------------------------

cxx_objects=$(addprefix $(BUILDDIR)/, Parameters_sm.o read_slha.o)
-ifneq ($(NVCC),)
+ifneq ($(GPUCC),)
cu_objects=$(addprefix $(BUILDDIR)/, Parameters_sm_cu.o)
endif

# Target (and build rules): common (src) library
-ifneq ($(NVCC),)
+ifneq ($(GPUCC),)
$(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) $(cu_objects)
	@if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi
-	$(NVCC) -shared -o $@ $(cxx_objects) $(cu_objects) $(LDFLAGS)
+	$(GPUCC) -shared -o $@ $(cxx_objects) $(cu_objects) $(LDFLAGS)
else
$(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects)
	@if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi
diff --git a/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuConfig.h
index b247654dcf..da4ba36ad8 100644
--- a/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuConfig.h
+++ b/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuConfig.h
@@ -1,7 +1,7 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Jul 2020) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
#ifndef MGONGPUCONFIG_H #define MGONGPUCONFIG_H 1 @@ -10,12 +10,25 @@ // There are two different code bases for standalone_cudacpp (without multichannel) and madevent+cudacpp (with multichannel) #undef MGONGPU_SUPPORTS_MULTICHANNEL +// Is this a GPU (CUDA, HIP) or CPU implementation? +#ifdef __CUDACC__ +#define MGONGPUCPP_GPUIMPL cuda +#elif defined __HIPCC__ +#define MGONGPUCPP_GPUIMPL hip +#else +#undef MGONGPUCPP_GPUIMPL +#endif + // ** NB1 Throughputs (e.g. 6.8E8) are events/sec for "./gcheck.exe -p 65536 128 12" // ** NB2 Baseline on b7g47n0004 fluctuates (probably depends on load on other VMs) // Choose if curand is supported for generating random numbers +// For HIP, by default, do not use curand (common random numbers will be used instead) // For both CUDA and C++, by default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_CURAND -// (there exist CUDA installations, e.g. using the HPC package, which do not include curand - see PR #784) +// (there exist CUDA installations, e.g. using the HPC package, which do not include curand - see PR #784 and #785) +#if defined __HIPCC__ +#define MGONGPU_HAS_NO_CURAND 1 +#else //#ifdef __CUDACC__ //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 @@ -23,6 +36,7 @@ //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 //#endif +#endif // Choose floating point precision (for everything but color algebra #537) // If one of these macros has been set from outside with e.g. -DMGONGPU_FPTYPE_FLOAT, nothing happens (issue #167) @@ -54,23 +68,28 @@ //#undef MGONGPU_HARDCODE_PARAM // default ////#define MGONGPU_HARDCODE_PARAM 1 -// Complex type in c++: std::complex or cxsmpl (CHOOSE ONLY ONE) -#ifndef __CUDACC__ -//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) -#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) -#endif - -// Complex type in cuda: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) +// Complex type in CUDA: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) #ifdef __CUDACC__ #define MGONGPU_CUCXTYPE_THRUST 1 // default (~1.15E9/double, ~3.2E9/float) //#define MGONGPU_CUCXTYPE_CUCOMPLEX 1 // ~10 percent slower (1.03E9/double, ~2.8E9/float) //#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) + +// Complex type in HIP: cxsmpl (ONLY ONE OPTION POSSIBLE) +#elif defined __HIPCC__ +#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) + +// Complex type in C++: std::complex or cxsmpl (CHOOSE ONLY ONE) +#else +//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) +#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif -// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ -#undef MGONGPU_NSIGHT_DEBUG // default +#undef MGONGPU_NSIGHT_DEBUG // default in CUDA //#define MGONGPU_NSIGHT_DEBUG 1 +#else +#undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif // SANITY CHECKS (floating point precision for everything but color algebra #537) @@ -86,17 +105,21 @@ #error You cannot use double precision for color algebra and single precision elsewhere #endif -// SANITY CHECKS (c++ complex number implementation) -#ifndef __CUDACC__ -#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and 
defined MGONGPU_CPPCXTYPE_CXSMPL
-#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL
+// SANITY CHECKS (CUDA complex number implementation)
+#ifdef __CUDACC__
+#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX
+#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX for CUDA
+#elif defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CXSMPL
+#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CXSMPL for CUDA
+#elif defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL
+#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL for CUDA
 #endif
 #endif

-// SANITY CHECKS (cuda complex number implementation)
-#ifdef __CUDACC__
-#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL
-#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL
+// SANITY CHECKS (C++ complex number implementation)
+#ifndef MGONGPUCPP_GPUIMPL
+#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL
+#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL for C++
 #endif
 #endif

@@ -134,7 +157,7 @@ namespace mgOnGpu
 // Alignment requirement for using reinterpret_cast with SIMD vectorized code
 // (using reinterpret_cast with non aligned memory may lead to segmentation faults!)
 // Only needed for C++ code but can be enforced also in NVCC builds of C++ code using CUDA>=11.2 and C++17 (#318, #319, #333)
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
 constexpr int cppAlign = 64; // alignment requirement for SIMD vectorization (64-byte i.e. 512-bit)
 #endif

@@ -145,7 +168,7 @@ using mgOnGpu::fptype;
 using mgOnGpu::fptype2;

 // C++ SIMD vectorization width (this will be used to set neppV)
-#ifdef __CUDACC__ // CUDA implementation has no SIMD
+#ifdef MGONGPUCPP_GPUIMPL // CUDA and HIP implementations have no SIMD
 #undef MGONGPU_CPPSIMD
 #elif defined __AVX512VL__ && defined MGONGPU_PVW512 // C++ "512z" AVX512 with 512 width (512-bit ie 64-byte): 8 (DOUBLE) or 16 (FLOAT)
 #ifdef MGONGPU_FPTYPE_DOUBLE
@@ -175,9 +198,9 @@ using mgOnGpu::fptype2;
 #undef MGONGPU_CPPSIMD
 #endif

-// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation
+// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation
 // Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end)
 #if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */
 #define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX];
 #define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; }
 #define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; }
@@ -189,8 +212,8 @@ using mgOnGpu::fptype2;
 #define mgDebugFinalise() { /*noop*/ }
 #endif /* clang-format on */

-// Define empty CUDA declaration specifiers for C++
-#ifndef __CUDACC__
+// Define empty CUDA/HIP declaration specifiers for C++
+#ifndef MGONGPUCPP_GPUIMPL
 #define __global__
 #define __host__
 #define __device__
diff --git a/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuCxtypes.h b/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuCxtypes.h
index ca9a9f00c0..5532e22fa1 100644
--- a/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuCxtypes.h
+++ b/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuCxtypes.h
@@ -1,7 +1,7 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Jan 2022, based on earlier work by D. Smith) for the MG5aMC CUDACPP plugin.
-// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
#ifndef MGONGPUCXTYPES_H #define MGONGPUCXTYPES_H 1 @@ -19,7 +19,7 @@ #include // Complex type in cuda: thrust or cucomplex or cxsmpl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #if defined MGONGPU_CUCXTYPE_THRUST #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wtautological-compare" // for icpx2021/clang13 (https://stackoverflow.com/a/15864661) @@ -82,7 +82,7 @@ namespace mgOnGpu /* clang-format off */ using mgOnGpu::cxsmpl; // Printout to stream for user defined types -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -92,7 +92,7 @@ namespace mg5amcCpu inline __host__ std::ostream& operator<<( std::ostream& out, const cxsmpl& c ) { - out << std::complex( c.real(), c.imag() ); + out << std::complex( c.real(), c.imag() ); return out; } @@ -215,14 +215,14 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu #endif { // --- Type definitions (complex type: cxtype) -#ifdef __CUDACC__ // cuda +#ifdef MGONGPUCPP_GPUIMPL // cuda #if defined MGONGPU_CUCXTYPE_THRUST typedef thrust::complex cxtype; #elif defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -255,7 +255,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -307,7 +307,7 @@ namespace mg5amcCpu //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust +#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust //------------------------------ // CUDA - using thrust::complex @@ -343,11 +343,11 @@ namespace mg5amcCpu return c; } -#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST +#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex +#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex //------------------------------ // CUDA - using cuComplex @@ -562,11 +562,11 @@ namespace mg5amcCpu return cxmake( c.real(), c.imag() ); } -#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX //========================================================================== -#if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex +#if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex //------------------------------ // C++ - using std::complex @@ -610,7 +610,7 @@ namespace mg5amcCpu } #endif -#endif // #if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX +#endif // #if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX //========================================================================== @@ -633,7 +633,7 @@ namespace mg5amcCpu //========================================================================== // 
NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuFptypes.h b/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuFptypes.h index 905c97d700..fa3a02664b 100644 --- a/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuFptypes.h +++ b/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuFptypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUFPTYPES_H #define MGONGPUFPTYPES_H 1 @@ -12,7 +12,7 @@ #include // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // cuda namespace mg5amcGpu #else namespace mg5amcCpu @@ -20,7 +20,7 @@ namespace mg5amcCpu { //========================================================================== -#ifdef __CUDACC__ // cuda +#ifdef MGONGPUCPP_GPUIMPL // cuda //------------------------------ // Floating point types - Cuda @@ -64,11 +64,11 @@ namespace mg5amcCpu #endif } -#endif // #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //------------------------------ // Floating point types - C++ @@ -92,7 +92,7 @@ namespace mg5amcCpu return std::sqrt( f ); } -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== diff --git a/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuVectors.h b/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuVectors.h index e1299ba81e..cdae04326b 100644 --- a/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuVectors.h @@ -32,7 +32,7 @@ #endif // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -131,7 +131,7 @@ namespace mg5amcCpu #endif #endif -#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef __CUDACC__) +#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef MGONGPUCPP_GPUIMPL) const int neppV = 1; @@ -153,13 +153,13 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu #endif { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Printout to stream for user defined types @@ -805,11 +805,11 @@ namespace mg5amcCpu #endif // #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //------------------------------ // Vector types - CUDA @@ -853,12 +853,12 @@ namespace mg5amcCpu return 
mask; } -#endif // #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== // Scalar-or-vector types: scalar in CUDA, vector or scalar in C++ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL typedef bool bool_sv; typedef fptype fptype_sv; typedef fptype2 fptype2_sv; @@ -879,7 +879,7 @@ namespace mg5amcCpu #endif // Scalar-or-vector zeros: scalar in CUDA, vector or scalar in C++ -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ inline __host__ __device__ cxtype cxzero_sv(){ return cxtype( 0, 0 ); } #elif defined MGONGPU_CPPSIMD inline cxtype_v cxzero_sv() { return cxtype_v(); } // RRRR=0000 IIII=0000 diff --git a/epochX/cudacpp/gg_ttggg.sa/src/rambo.h b/epochX/cudacpp/gg_ttggg.sa/src/rambo.h index e02ea52496..cd7e1008ea 100644 --- a/epochX/cudacpp/gg_ttggg.sa/src/rambo.h +++ b/epochX/cudacpp/gg_ttggg.sa/src/rambo.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -18,7 +18,7 @@ #include // Simplified rambo version for 2 to N (with N>=2) processes with massless particles -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +83,7 @@ namespace mg5amcCpu static bool first = true; if( first ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if constexpr( M_ACCESS::isOnDevice() ) // avoid { const int ievt0 = 0; @@ -166,7 +166,7 @@ namespace mg5amcCpu wt = po2log; if( nparf != 2 ) wt = ( 2. * nparf - 4. ) * log( energy ) + z[nparf - 1]; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // issue warnings if weight is too small or too large static int iwarn[5] = { 0, 0, 0, 0, 0 }; if( wt < -180. ) diff --git a/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt b/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt index 2c0e77fafd..bb803498ee 100644 --- a/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt +++ b/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt @@ -52,7 +52,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt -No valid web browser found. Please set in ./input/mg5_configuration.txt +Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq.mg The import format was not given, so we guess it as command set stdout_level DEBUG @@ -61,7 +61,7 @@ set zerowidth_tchannel F define q = u c d s u~ c~ d~ s~ INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005677223205566406  +DEBUG: model prefixing takes 0.005455732345581055  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -170,7 +170,7 @@ INFO: Crossed process found for g u~ > t t~ u~, reuse diagrams. 
INFO: Crossed process found for g c~ > t t~ c~, reuse diagrams. INFO: Crossed process found for g d~ > t t~ d~, reuse diagrams. INFO: Crossed process found for g s~ > t t~ s~, reuse diagrams. -8 processes with 40 diagrams generated in 0.080 s +8 processes with 40 diagrams generated in 0.077 s Total: 8 processes with 40 diagrams output madevent ../TMPOUT/CODEGEN_mad_gq_ttq --hel_recycling=False --vector_size=32 --me_exporter=standalone_cudacpp Load PLUGIN.CUDACPP_OUTPUT @@ -198,7 +198,7 @@ INFO: Combined process g d~ > t t~ d~ WEIGHTED<=3 @1 with process g u~ > t t~ u~ INFO: Combined process g s~ > t t~ s~ WEIGHTED<=3 @1 with process g u~ > t t~ u~ WEIGHTED<=3 @1 INFO: Creating files in directory P1_gu_ttxu DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -215,7 +215,7 @@ INFO: Generating Feynman diagrams for Process: g u > t t~ u WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gu_ttxu INFO: Creating files in directory P1_gux_ttxux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -230,17 +230,17 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: g u~ > t t~ u~ WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gux_ttxux -Generated helas calls for 2 subprocesses (10 diagrams) in 0.032 s -Wrote files for 32 helas calls in 0.231 s +Generated helas calls for 2 subprocesses (10 diagrams) in 0.030 s +Wrote files for 32 helas calls in 0.216 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVV1 routines -ALOHA: aloha creates 2 routines in 0.364 s +ALOHA: aloha creates 2 routines in 0.143 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVV1 routines -ALOHA: aloha creates 4 routines in 0.137 s +ALOHA: aloha creates 4 routines in 0.130 s FFV1 FFV1 FFV1 @@ -294,10 +294,10 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. quit -real 0m2.934s -user 0m1.748s -sys 0m0.220s -Code generation completed in 3 seconds +real 0m1.916s +user 0m1.672s +sys 0m0.240s +Code generation completed in 2 seconds ************************************************************ * * * W E L C O M E to * @@ -323,7 +323,7 @@ INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amc INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards/me5_configuration.txt Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt -No valid web browser found. Please set in ./input/mg5_configuration.txt +Using default web browser "firefox". 
Set another one in ./input/mg5_configuration.txt treatcards run quit INFO: @@ -353,7 +353,7 @@ INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amc INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards/me5_configuration.txt Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt -No valid web browser found. Please set in ./input/mg5_configuration.txt +Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards param quit INFO: diff --git a/epochX/cudacpp/gq_ttq.mad/COPYRIGHT b/epochX/cudacpp/gq_ttq.mad/COPYRIGHT index a134b5fef9..84a883fbb0 100644 --- a/epochX/cudacpp/gq_ttq.mad/COPYRIGHT +++ b/epochX/cudacpp/gq_ttq.mad/COPYRIGHT @@ -15,6 +15,7 @@ The full development team currently includes the following authors : Stephan Hageboeck (CERN) Olivier Mattelaer (Universite Catholique de Louvain, original author) Stefan Roiser (CERN, original author) + Joergen Teig (CERN) Andrea Valassi (CERN, original author) Zenny Wettersten (CERN) See https://github.com/madgraph5/madgraph4gpu for more details. For the full diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/Bridge.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/Bridge.h index bf8b5e024d..89437b4c42 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/Bridge.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#ifndef BRIDGE_H #define BRIDGE_H 1 @@ -23,7 +23,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +83,7 @@ namespace mg5amcCpu Bridge& operator=( const Bridge& ) = delete; Bridge& operator=( Bridge&& ) = delete; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL /** * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != gpublocks*gputhreads * (this is needed for BridgeKernel tests rather than for actual production use in Fortran) @@ -150,7 +150,7 @@ namespace mg5amcCpu unsigned int m_nevt; // number of events int m_nGoodHel; // the number of good helicities (-1 initially when they have not yet been calculated) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL int m_gputhreads; // number of gpu threads (default set from number of events, can be modified) int m_gpublocks; // number of gpu blocks (default set from number of events, can be modified) DeviceBuffer m_devMomentaF; @@ -187,12 +187,12 @@ namespace mg5amcCpu // Forward declare transposition methods // -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL template void hst_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); @@ -209,7 +209,7 @@ namespace mg5amcCpu Bridge::Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ) : m_nevt( nevtF ) , m_nGoodHel( -1 ) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL , m_gputhreads( 256 ) // default number of gpu threads , m_gpublocks( m_nevt / m_gputhreads ) // this ensures m_nevt <= m_gpublocks*m_gputhreads , m_devMomentaF( m_nevt ) @@ -233,7 +233,7 @@ namespace mg5amcCpu { if( nparF != CPPProcess::npar ) throw std::runtime_error( "Bridge constructor: npar mismatch" ); if( np4F != CPPProcess::np4 ) throw std::runtime_error( "Bridge constructor: np4 mismatch" ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( ( m_nevt < s_gputhreadsmin ) || ( m_nevt % s_gputhreadsmin != 0 ) ) throw std::runtime_error( "Bridge constructor: nevt should be a multiple of " + std::to_string( s_gputhreadsmin ) ); while( m_nevt != m_gpublocks * m_gputhreads ) @@ -249,7 +249,7 @@ namespace mg5amcCpu #else std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate is called from several Fortran threads? 
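NB (editorial illustration, not part of the patch): the Bridge constructor hunks above show how the GPU grid is chosen, but the body of the adjustment loop lies outside the hunk. The standalone C++ sketch below condenses the visible logic; the halving strategy inside the loop and the value of s_gputhreadsmin (taken as 32 here) are assumptions, not code from this commit.

  #include <stdexcept>
  #include <string>

  constexpr unsigned int s_gputhreadsmin = 32; // assumed minimum number of threads per block

  void chooseGpuGrid( const unsigned int nevt, unsigned int& gpublocks, unsigned int& gputhreads )
  {
    gputhreads = 256;              // default number of gpu threads (as in the Bridge constructor)
    gpublocks = nevt / gputhreads; // integer division: may not yet cover nevt exactly
    if( ( nevt < s_gputhreadsmin ) || ( nevt % s_gputhreadsmin != 0 ) )
      throw std::runtime_error( "Bridge constructor: nevt should be a multiple of " + std::to_string( s_gputhreadsmin ) );
    while( nevt != gpublocks * gputhreads ) // assumed loop body: halve the block size until the grid covers nevt exactly
    {
      gputhreads /= 2;
      gpublocks = nevt / gputhreads;
    }
  }

For any nevt that is a multiple of s_gputhreadsmin the loop terminates at the latest when gputhreads reaches s_gputhreadsmin, since at that point gpublocks * gputhreads == nevt by construction.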
@@ -262,7 +262,7 @@ namespace mg5amcCpu
     process.initProc( paramCard );
   }

-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
   template
   void Bridge::set_gpugrid( const int gpublocks, const int gputhreads )
   {
@@ -276,7 +276,7 @@ namespace mg5amcCpu
   }
 #endif

-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
   template
   void Bridge::gpu_sequence( const FORTRANFPTYPE* momenta,
                              const FORTRANFPTYPE* gs,
@@ -291,14 +291,14 @@ namespace mg5amcCpu
     constexpr int neppM = MemoryAccessMomenta::neppM;
     if constexpr( neppM == 1 && std::is_same_v )
     {
-      checkCuda( cudaMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), cudaMemcpyHostToDevice ) );
+      gpuMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), gpuMemcpyHostToDevice );
     }
     else
     {
-      checkCuda( cudaMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), cudaMemcpyHostToDevice ) );
+      gpuMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), gpuMemcpyHostToDevice );
       const int thrPerEvt = CPPProcess::npar * CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 event per thread)
       //const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... this seems slower
-      dev_transposeMomentaF2C<<<m_gpublocks * thrPerEvt, m_gputhreads>>>( m_devMomentaF.data(), m_devMomentaC.data(), m_nevt );
+      gpuLaunchKernel( dev_transposeMomentaF2C, m_gpublocks * thrPerEvt, m_gputhreads, m_devMomentaF.data(), m_devMomentaC.data(), m_nevt );
     }
     if constexpr( std::is_same_v )
     {
@@ -341,7 +341,7 @@ namespace mg5amcCpu
   }
 #endif

-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
   template
   void Bridge::cpu_sequence( const FORTRANFPTYPE* momenta,
                              const FORTRANFPTYPE* gs,
@@ -396,7 +396,7 @@ namespace mg5amcCpu
   // - C++ array: momenta[npagM][npar][np4][neppM] with nevt=npagM*neppM (AOSOA)
   //

-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
   template
   __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt )
   {
diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/BridgeKernels.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/BridgeKernels.cc
index d58066c9c1..eaf4037a24 100644
--- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/BridgeKernels.cc
+++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/BridgeKernels.cc
@@ -1,17 +1,18 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.

 #include "BridgeKernels.h"

+#include "GpuAbstraction.h"
 #include "MemoryAccessMomenta.h"

 #include

 //============================================================================

-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 namespace mg5amcGpu
 #else
 namespace mg5amcCpu
@@ -45,7 +46,7 @@ namespace mg5amcCpu

 //============================================================================

-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
 namespace mg5amcCpu
 {

@@ -96,7 +97,7 @@ namespace mg5amcCpu

 //============================================================================

-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 namespace mg5amcGpu
 {

diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/BridgeKernels.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/BridgeKernels.h
index 15eb4bff4d..3efef8ce97 100644
--- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/BridgeKernels.h
+++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/BridgeKernels.h
@@ -1,7 +1,7 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
// Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef BRIDGEKERNELS_H #define BRIDGEKERNELS_H 1 @@ -12,7 +12,7 @@ #include "MatrixElementKernels.h" #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -49,7 +49,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a CPU host class BridgeKernelHost final : public BridgeKernelBase { @@ -89,7 +89,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a GPU device class BridgeKernelDevice : public BridgeKernelBase { diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/CommonRandomNumberKernel.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/CommonRandomNumberKernel.cc index 985b39f576..010bc4cbd0 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/CommonRandomNumberKernel.cc +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/CommonRandomNumberKernel.cc @@ -1,15 +1,16 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "CommonRandomNumbers.h" +#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/CrossSectionKernels.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/CrossSectionKernels.cc index 0b355a3c8d..c15b39844d 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/CrossSectionKernels.cc +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/CrossSectionKernels.cc @@ -1,10 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
#include "CrossSectionKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessWeights.h" #include "MemoryBuffers.h" @@ -77,7 +78,7 @@ debug_me_is_abnormal( const fptype& me, size_t ievtALL ) //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -185,7 +186,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/CrossSectionKernels.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/CrossSectionKernels.h index 7933ca4bbf..4d9659e04e 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/CrossSectionKernels.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/CrossSectionKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef CROSSSECTIONKERNELS_H #define CROSSSECTIONKERNELS_H 1 @@ -13,7 +13,7 @@ //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -96,7 +96,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating the calculation of event statistics on a GPU device class CrossSectionKernelDevice : public CrossSectionKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/CudaRuntime.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/CudaRuntime.h deleted file mode 100644 index 64ce52f4b3..0000000000 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/CudaRuntime.h +++ /dev/null @@ -1,85 +0,0 @@ -// Copyright (C) 2020-2023 CERN and UCLouvain. -// Licensed under the GNU Lesser General Public License (version 3 or later). -// Created by: S. Roiser (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. - -#ifndef MG5AMC_CUDARUNTIME_H -#define MG5AMC_CUDARUNTIME_H 1 - -// MG5AMC on GPU uses the CUDA runtime API, not the lower level CUDA driver API -// See https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html#driver-vs-runtime-api - -#include -#include - -//-------------------------------------------------------------------------- - -// See https://stackoverflow.com/a/14038590 -#ifdef __CUDACC__ /* clang-format off */ -#define checkCuda( code ) { assertCuda( code, __FILE__, __LINE__ ); } -inline void assertCuda( cudaError_t code, const char* file, int line, bool abort = true ) -{ - if( code != cudaSuccess ) - { - printf( "ERROR! 
assertCuda: '%s' (%d) in %s:%d\n", cudaGetErrorString( code ), code, file, line ); - if( abort ) assert( code == cudaSuccess ); - } -} -#endif /* clang-format on */ - -//-------------------------------------------------------------------------- - -#ifdef __CUDACC__ -namespace mg5amcGpu -{ - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - // *** FIXME! This will all need to be designed differently when going to multi-GPU nodes! *** - struct CudaRuntime final - { - CudaRuntime( const bool debug = true ) - : m_debug( debug ) { setUp( m_debug ); } - ~CudaRuntime() { tearDown( m_debug ); } - CudaRuntime( const CudaRuntime& ) = delete; - CudaRuntime( CudaRuntime&& ) = delete; - CudaRuntime& operator=( const CudaRuntime& ) = delete; - CudaRuntime& operator=( CudaRuntime&& ) = delete; - bool m_debug; - - // Set up CUDA application - // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** - // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization - static void setUp( const bool debug = true ) - { - // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization - // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! - /* - // [We initially added cudaFree(0) to "ease profile analysis" only because it shows up as a big recognizable block!] - // No explicit initialization is needed: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#initialization - // It is not clear what cudaFree(0) does at all: https://stackoverflow.com/questions/69967813/ - if ( debug ) std::cout << "__CudaRuntime: calling cudaFree(0)" << std::endl; - checkCuda( cudaFree( 0 ) ); // SLOW! - */ - // Replace cudaFree(0) by cudaSetDevice(0), even if it is not really needed either - // (but see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs) - if( debug ) std::cout << "__CudaRuntime: calling cudaSetDevice(0)" << std::endl; - checkCuda( cudaSetDevice( 0 ) ); // SLOW! - } - - // Tear down CUDA application (call cudaDeviceReset) - // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** - // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck - // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking - static void tearDown( const bool debug = true ) - { - if( debug ) std::cout << "__CudaRuntime: calling cudaDeviceReset()" << std::endl; - checkCuda( cudaDeviceReset() ); - } - }; - -} -#endif - -//-------------------------------------------------------------------------- - -#endif // MG5AMC_CUDARUNTIME_H diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/CurandRandomNumberKernel.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/CurandRandomNumberKernel.cc index eb56333b03..08a16f6f2c 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/CurandRandomNumberKernel.cc +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/CurandRandomNumberKernel.cc @@ -1,9 +1,9 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. 
Valassi (2021-2023) for the MG5aMC CUDACPP plugin. -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" @@ -22,7 +22,7 @@ inline void assertCurand( curandStatus_t code, const char *file, int line, bool } #endif /* clang-format on */ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -36,7 +36,7 @@ namespace mg5amcCpu { if( m_isOnDevice ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !m_rnarray.isOnDevice() ) throw std::runtime_error( "CurandRandomNumberKernel on device with a host random number array" ); #else @@ -114,7 +114,7 @@ namespace mg5amcCpu /* printf( "\nCurandRandomNumberKernel::generateRnarray size = %d\n", (int)m_rnarray.size() ); fptype* data = m_rnarray.data(); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( m_rnarray.isOnDevice() ) { data = new fptype[m_rnarray.size()](); @@ -123,7 +123,7 @@ namespace mg5amcCpu #endif for( int i = 0; i < ( (int)m_rnarray.size() / 4 ); i++ ) printf( "[%4d] %f %f %f %f\n", i * 4, data[i * 4], data[i * 4 + 2], data[i * 4 + 2], data[i * 4 + 3] ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( m_rnarray.isOnDevice() ) delete[] data; #endif */ diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/EventStatistics.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/EventStatistics.h index 48b51e0a49..b425a5bade 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/EventStatistics.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/EventStatistics.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef EventStatistics_H #define EventStatistics_H 1 @@ -16,7 +16,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/GpuAbstraction.h new file mode 100644 index 0000000000..6a7d9c05c0 --- /dev/null +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/GpuAbstraction.h @@ -0,0 +1,71 @@ +// Copyright (C) 2020-2023 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: J. Teig (Jul 2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
+
+#ifndef MG5AMC_GPUABSTRACTION_H
+#define MG5AMC_GPUABSTRACTION_H 1
+
+#include
+
+//--------------------------------------------------------------------------
+
+#ifdef __CUDACC__
+
+#define gpuError_t cudaError_t
+#define gpuPeekAtLastError cudaPeekAtLastError
+#define gpuGetErrorString cudaGetErrorString
+#define gpuSuccess cudaSuccess
+
+#define gpuMallocHost( ptr, size ) checkGpu( cudaMallocHost( ptr, size ) )
+#define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) )
+
+#define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( cudaMemcpy( dstData, srcData, srcBytes, func ) )
+#define gpuMemcpyHostToDevice cudaMemcpyHostToDevice
+#define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost
+#define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( cudaMemcpyToSymbol( type1, type2, size ) )
+
+#define gpuFree( ptr ) checkGpu( cudaFree( ptr ) )
+#define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) )
+
+#define gpuSetDevice cudaSetDevice
+#define gpuDeviceSynchronize cudaDeviceSynchronize
+#define gpuDeviceReset cudaDeviceReset
+
+#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ )
+#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ )
+
+//--------------------------------------------------------------------------
+
+#elif defined __HIPCC__
+
+#include "hip/hip_runtime.h"
+
+#define gpuError_t hipError_t
+#define gpuPeekAtLastError hipPeekAtLastError
+#define gpuGetErrorString hipGetErrorString
+#define gpuSuccess hipSuccess
+
+#define gpuMallocHost( ptr, size ) checkGpu( hipHostMalloc( ptr, size ) ) // NB: hipHostMalloc, as hipMallocHost is deprecated
+#define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) )
+
+#define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( hipMemcpy( dstData, srcData, srcBytes, func ) )
+#define gpuMemcpyHostToDevice hipMemcpyHostToDevice
+#define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost
+#define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( hipMemcpyToSymbol( type1, type2, size ) )
+
+#define gpuFree( ptr ) checkGpu( hipFree( ptr ) )
+#define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) )
+
+#define gpuSetDevice hipSetDevice
+#define gpuDeviceSynchronize hipDeviceSynchronize
+#define gpuDeviceReset hipDeviceReset
+
+#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ )
+#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ )
+
+//--------------------------------------------------------------------------
+
+#endif
+
+#endif // MG5AMC_GPUABSTRACTION_H
diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/GpuRuntime.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/GpuRuntime.h
new file mode 100644
index 0000000000..93579ef08b
--- /dev/null
+++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/GpuRuntime.h
@@ -0,0 +1,85 @@
+// Copyright (C) 2020-2023 CERN and UCLouvain.
+// Licensed under the GNU Lesser General Public License (version 3 or later).
+// Created by: J. Teig (Jun 2023, based on earlier work by S. Roiser) for the MG5aMC CUDACPP plugin.
+// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
+
+#ifndef MG5AMC_GPURUNTIME_H
+#define MG5AMC_GPURUNTIME_H 1
+
+// MG5AMC on GPU uses the CUDA runtime API, not the lower level CUDA driver API
+// See https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html#driver-vs-runtime-api
+
+#include "GpuAbstraction.h"
+
+#include
+
+//--------------------------------------------------------------------------
+
+// See https://stackoverflow.com/a/14038590
+#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */
+#define checkGpu( code ) { assertGpu( code, __FILE__, __LINE__ ); }
+inline void assertGpu( gpuError_t code, const char* file, int line, bool abort = true )
+{
+  if( code != gpuSuccess )
+  {
+    printf( "ERROR! assertGpu: '%s' (%d) in %s:%d\n", gpuGetErrorString( code ), code, file, line );
+    if( abort ) assert( code == gpuSuccess );
+  }
+}
+#endif /* clang-format on */
+
+//--------------------------------------------------------------------------
+
+#ifdef MGONGPUCPP_GPUIMPL
+namespace mg5amcGpu
+{
+  // Instantiate a GpuRuntime at the beginning of the application's main to
+  // invoke gpuSetDevice(0) in the constructor and book a gpuDeviceReset() call in the destructor
+  // *** FIXME! This will all need to be designed differently when going to multi-GPU nodes! ***
+  struct GpuRuntime final
+  {
+    GpuRuntime( const bool debug = true )
+      : m_debug( debug ) { setUp( m_debug ); }
+    ~GpuRuntime() { tearDown( m_debug ); }
+    GpuRuntime( const GpuRuntime& ) = delete;
+    GpuRuntime( GpuRuntime&& ) = delete;
+    GpuRuntime& operator=( const GpuRuntime& ) = delete;
+    GpuRuntime& operator=( GpuRuntime&& ) = delete;
+    bool m_debug;
+
+    // Set up CUDA application
+    // ** NB: strictly speaking this is not needed when using the CUDA runtime API **
+    // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization
+    static void setUp( const bool debug = true )
+    {
+      // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization
+      // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer!
+      /*
+      // [We initially added cudaFree(0) to "ease profile analysis" only because it shows up as a big recognizable block!]
+      // No explicit initialization is needed: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#initialization
+      // It is not clear what cudaFree(0) does at all: https://stackoverflow.com/questions/69967813/
+      if ( debug ) std::cout << "__CudaRuntime: calling cudaFree(0)" << std::endl;
+      checkCuda( cudaFree( 0 ) ); // SLOW!
+      */
+      // Replace cudaFree(0) by cudaSetDevice(0), even if it is not really needed either
+      // (but see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs)
+      if( debug ) std::cout << "__GpuRuntime: calling gpuSetDevice(0)" << std::endl;
+      checkGpu( gpuSetDevice( 0 ) ); // SLOW!
+ } + + // Tear down CUDA application (call cudaDeviceReset) + // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** + // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck + // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking + static void tearDown( const bool debug = true ) + { + if( debug ) std::cout << "__GpuRuntime: calling GpuDeviceReset()" << std::endl; + checkGpu( gpuDeviceReset() ); + } + }; +} +#endif + +//-------------------------------------------------------------------------- + +#endif // MG5AMC_GPURUNTIME_H diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MadgraphTest.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MadgraphTest.h index ef40624c88..a64c05c26a 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MadgraphTest.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MadgraphTest.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Dec 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #ifndef MADGRAPHTEST_H_ #define MADGRAPHTEST_H_ 1 @@ -22,7 +22,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; @@ -201,7 +201,7 @@ class MadgraphTest : public testing::TestWithParam // Since we link both the CPU-only and GPU tests into the same executable, we prevent // a multiply defined symbol by only compiling this in the non-CUDA phase: -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL /// Compare momenta and matrix elements. /// This uses an implementation of TestDriverBase to run a madgraph workflow, @@ -307,6 +307,6 @@ TEST_P( MadgraphTest, CompareMomentaAndME ) } } -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL #endif /* MADGRAPHTEST_H_ */ diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MatrixElementKernels.cc index 74b5239ebf..81699dfea9 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MatrixElementKernels.cc @@ -1,12 +1,12 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "MatrixElementKernels.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" // Includes the abstraction for Nvidia/AMD compilation #include "MemoryAccessMomenta.h" #include "MemoryBuffers.h" @@ -14,7 +14,7 @@ //============================================================================ -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu { @@ -150,7 +150,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { @@ -209,13 +209,13 @@ namespace mg5amcGpu PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); DeviceBufferHelicityMask devIsGoodHel( ncomb ); // ... 0d1. 
Compute good helicity mask on the device
-    computeDependentCouplings<<<m_gpublocks, m_gputhreads>>>( m_gs.data(), m_couplings.data() );
+    gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    sigmaKin_getGoodHel<<<m_gpublocks, m_gputhreads>>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() );
+    gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() );
 #else
-    sigmaKin_getGoodHel<<<m_gpublocks, m_gputhreads>>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() );
+    gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() );
 #endif
-    checkCuda( cudaPeekAtLastError() );
+    checkGpu( gpuPeekAtLastError() );
     // ... 0d2. Copy back good helicity mask to the host
     copyHostFromDevice( hstIsGoodHel, devIsGoodHel );
     // ... 0d3. Copy back good helicity list to constant memory on the device
@@ -226,19 +226,19 @@
   void MatrixElementKernelDevice::computeMatrixElements( const unsigned int channelId )
   {
-    computeDependentCouplings<<<m_gpublocks, m_gputhreads>>>( m_gs.data(), m_couplings.data() );
+    gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() );
 #ifndef MGONGPU_NSIGHT_DEBUG
     constexpr unsigned int sharedMemSize = 0;
 #else
     constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float );
 #endif
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    sigmaKin<<<m_gpublocks, m_gputhreads, sharedMemSize>>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() );
+    gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() );
 #else
-    sigmaKin<<<m_gpublocks, m_gputhreads, sharedMemSize>>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() );
+    gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() );
 #endif
-    checkCuda( cudaPeekAtLastError() );
-    checkCuda( cudaDeviceSynchronize() );
+    checkGpu( gpuPeekAtLastError() );
+    checkGpu( gpuDeviceSynchronize() );
   }
 
   //--------------------------------------------------------------------------
diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MatrixElementKernels.h
index 23e84757a2..72bd8f195b 100644
--- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MatrixElementKernels.h
+++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MatrixElementKernels.h
@@ -1,7 +1,7 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
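The conversion pattern applied in MatrixElementKernels.cc above, shown schematically (a sketch only; myKernel, bufA, bufB and sharedBytes are placeholder names): each CUDA-specific triple-chevron launch becomes a variadic macro call whose leading arguments carry the grid configuration, so the same call site also builds for HIP.

// before: CUDA-only launch syntax
myKernel<<<blocks, threads>>>( bufA, bufB );
myKernel<<<blocks, threads, sharedBytes>>>( bufA );
// after: portable macros from GpuAbstraction.h (expanding back to the lines above under nvcc)
gpuLaunchKernel( myKernel, blocks, threads, bufA, bufB );
gpuLaunchKernelSharedMem( myKernel, blocks, threads, sharedBytes, bufA );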
#ifndef MATRIXELEMENTKERNELS_H #define MATRIXELEMENTKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -81,7 +81,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a CPU host class MatrixElementKernelHost final : public MatrixElementKernelBase, public NumberOfEvents { @@ -130,7 +130,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a GPU device class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessAmplitudes.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessAmplitudes.h index 573b3bbbc9..ffb76e93de 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessAmplitudes.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessAmplitudes.h @@ -15,7 +15,7 @@ #define MGONGPU_TRIVIAL_AMPLITUDES 1 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessCouplings.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessCouplings.h index 35a3af42e0..3afdf3e554 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessCouplings.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessCouplings.h @@ -15,7 +15,7 @@ #include "MemoryBuffers.h" // for HostBufferCouplings::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessCouplingsFixed.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessCouplingsFixed.h index dc0d93afff..ffcdf4dbef 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessCouplingsFixed.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessCouplingsFixed.h @@ -14,7 +14,7 @@ //#include "MemoryAccessHelpers.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessDenominators.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessDenominators.h index 3bce635718..66f2d32a6b 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessDenominators.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessDenominators.h @@ -10,7 +10,7 @@ #include "MemoryAccessGs.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessGs.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessGs.h index 31311aa375..4c726b30f3 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessGs.h +++ 
b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessGs.h @@ -13,7 +13,7 @@ #include "MemoryBuffers.h" // for HostBufferMatrixElements::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessHelpers.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessHelpers.h index c82a6c7635..db73e4e064 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessHelpers.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessHelpers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessHelpers_H #define MemoryAccessHelpers_H 1 @@ -105,7 +105,7 @@ class KernelAccessHelper : public MemoryAccessHelper } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid //printf( "kernelAccessRecord: ievt=%d threadId=%d\n", ievt, threadIdx.x ); return T::ieventAccessRecord( buffer, ievt ); // NB fptype and fptype_sv coincide for CUDA diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessMatrixElements.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessMatrixElements.h index f32e6fea5b..3741011971 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessMatrixElements.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessMatrixElements.h @@ -13,7 +13,7 @@ #include "MemoryBuffers.h" // for HostBufferMatrixElements::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessMomenta.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessMomenta.h index 29266de32c..3be229d392 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessMomenta.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessMomenta.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#ifndef MemoryAccessMomenta_H
 #define MemoryAccessMomenta_H 1
@@ -13,7 +13,7 @@
 #include "MemoryAccessVectors.h"
 
 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725)
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 namespace mg5amcGpu
 #else
 namespace mg5amcCpu
@@ -30,7 +30,7 @@ namespace mg5amcCpu
 
   // Number of Events Per Page in the momenta AOSOA memory buffer layout
   // (these are all best kept as compile-time constants: see issue #23)
-#ifdef __CUDACC__ /* clang-format off */
+#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */
   // -----------------------------------------------------------------------------------------------
   // --- GPUs: neppM is best set to a power of 2 times the number of fptype's in a 32-byte cacheline
   // --- This is relevant to ensure coalesced access to momenta in global memory
diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessNumerators.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessNumerators.h
index b152183b28..18991f4fa6 100644
--- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessNumerators.h
+++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessNumerators.h
@@ -10,7 +10,7 @@
 #include "MemoryAccessGs.h"
 
 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725)
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 namespace mg5amcGpu
 #else
 namespace mg5amcCpu
diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessRandomNumbers.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessRandomNumbers.h
index e2988d39f3..40cb089135 100644
--- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessRandomNumbers.h
+++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessRandomNumbers.h
@@ -1,7 +1,7 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin.
-// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.
 
 #ifndef MemoryAccessRandomNumbers_H
 #define MemoryAccessRandomNumbers_H 1
@@ -11,7 +11,7 @@
 #include "CPPProcess.h"
 #include "MemoryAccessHelpers.h"
 
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 using mg5amcGpu::CPPProcess;
 #else
 using mg5amcCpu::CPPProcess;
diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessVectors.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessVectors.h
index e9b197368e..08faccff0f 100644
--- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessVectors.h
+++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessVectors.h
@@ -1,7 +1,7 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
#ifndef MemoryAccessVectors_H
 #define MemoryAccessVectors_H 1
@@ -10,7 +10,7 @@
 #include "mgOnGpuVectors.h"
 
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
 namespace mg5amcCpu // this is only needed for CPU SIMD vectorization
 {
diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessWavefunctions.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessWavefunctions.h
index 5428aaf933..33bef4559e 100644
--- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessWavefunctions.h
+++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessWavefunctions.h
@@ -15,7 +15,7 @@
 #define MGONGPU_TRIVIAL_WAVEFUNCTIONS 1
 
 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725)
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 namespace mg5amcGpu
 #else
 namespace mg5amcCpu
diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryBuffers.h
index 3093e6ed18..7756a71621 100644
--- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryBuffers.h
+++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryBuffers.h
@@ -1,7 +1,7 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Dec 2021, based on earlier work by S. Hageboeck) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.
 
 #ifndef MemoryBuffers_H
 #define MemoryBuffers_H 1
@@ -11,12 +11,12 @@
 #include "mgOnGpuCxtypes.h"
 
 #include "CPPProcess.h"
-#include "CudaRuntime.h"
+#include "GpuRuntime.h"
 #include "Parameters_sm.h"
 
 #include <sstream>
 
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 namespace mg5amcGpu
 #else
 namespace mg5amcCpu
@@ -87,7 +87,7 @@ namespace mg5amcCpu
 
   //--------------------------------------------------------------------------
 
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
   constexpr bool HostBufferALIGNED = false;   // ismisaligned=false
   constexpr bool HostBufferMISALIGNED = true; // ismisaligned=true
@@ -119,7 +119,7 @@
 
   //--------------------------------------------------------------------------
 
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
   // A class encapsulating a CUDA pinned host buffer
  template<typename T>
  class PinnedHostBufferBase : public BufferBase<T>
@@ -128,18 +128,18 @@ namespace mg5amcCpu
     PinnedHostBufferBase( const size_t size )
       : BufferBase<T>( size, false )
     {
-      checkCuda( cudaMallocHost( &( this->m_data ), this->bytes() ) );
+      gpuMallocHost( &( this->m_data ), this->bytes() );
     }
     virtual ~PinnedHostBufferBase()
     {
-      checkCuda( cudaFreeHost( this->m_data ) );
+      gpuFreeHost( this->m_data );
     }
   };
 #endif
 
   //--------------------------------------------------------------------------
 
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
   // A class encapsulating a CUDA device buffer
   template<typename T>
   class DeviceBufferBase : public BufferBase<T>
@@ -148,18 +148,18 @@ namespace mg5amcCpu
     DeviceBufferBase( const size_t size )
       : BufferBase<T>( size, true )
     {
-      checkCuda( cudaMalloc( &( this->m_data ), this->bytes() ) );
+      gpuMalloc( &( this->m_data ), this->bytes() );
     }
     virtual ~DeviceBufferBase()
     {
-      checkCuda( cudaFree( this->m_data ) );
+      gpuFree( this->m_data );
     }
   };
 #endif
 
   //--------------------------------------------------------------------------
 
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
   // A class encapsulating a C++ host buffer for a given number of events
   template<typename T, size_t sizePerEvent, bool ismisaligned>
  class HostBuffer : public HostBufferBase<T, ismisaligned>, virtual private NumberOfEvents
@@ -175,7 +175,7 @@ namespace mg5amcCpu
 
   //--------------------------------------------------------------------------
 
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
   // A class encapsulating a CUDA pinned host buffer for a given number of events
   template<typename T, size_t sizePerEvent>
   class PinnedHostBuffer : public PinnedHostBufferBase<T>, virtual private NumberOfEvents
@@ -191,7 +191,7 @@ namespace mg5amcCpu
 
   //--------------------------------------------------------------------------
 
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
   // A class encapsulating a CUDA device buffer for a given number of events
   template<typename T, size_t sizePerEvent>
   class DeviceBuffer : public DeviceBufferBase<T>, virtual private NumberOfEvents
@@ -213,7 +213,7 @@ namespace mg5amcCpu
   // The size (number of elements) per event in a memory buffer for momenta random numbers
   constexpr size_t sizePerEventRndNumMomenta = MemoryBuffers::np4 * MemoryBuffers::nparf;
 
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
   // A class encapsulating a C++ host buffer for momenta random numbers
   typedef HostBuffer<fptype, sizePerEventRndNumMomenta, HostBufferALIGNED> HostBufferRndNumMomenta;
 #else
@@ -232,7 +232,7 @@
   // The size (number of elements) per event in a memory buffer with ONE fptype per event
   constexpr size_t sizePerEventOneFp = 1;
 
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
   // A class encapsulating a C++ host buffer with ONE fptype per event
   typedef HostBuffer<fptype, sizePerEventOneFp, HostBufferALIGNED> HostBufferOneFp;
 #else
@@ -257,7 +257,7 @@
   // The size (number of elements) per event in a memory buffer for Gs
   constexpr size_t sizePerEventGs = 1;
 
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
   // A class encapsulating a C++ host buffer for gs
   typedef HostBuffer<fptype, sizePerEventGs, HostBufferALIGNED> HostBufferGs;
 #else
@@ -276,7 +276,7 @@
   // The size (number of elements) per event in a memory buffer for numerators
   constexpr size_t sizePerEventNumerators = 1;
 
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
   // A class encapsulating a C++ host buffer for numerators
   typedef HostBuffer<fptype, sizePerEventNumerators, HostBufferALIGNED> HostBufferNumerators;
 #else
@@ -296,7 +296,7 @@
   // The size (number of elements) per event in a memory buffer for denominators
   constexpr size_t sizePerEventDenominators = 1;
 
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
   // A class encapsulating a C++ host buffer for denominators
   typedef HostBuffer<fptype, sizePerEventDenominators, HostBufferALIGNED> HostBufferDenominators;
 #else
@@ -315,7 +315,7 @@
   // The size (number of elements) per event in a memory buffer for couplings
   constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2;
 
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
   // A class encapsulating a C++ host buffer for couplings
   typedef HostBuffer<fptype, sizePerEventCouplings, HostBufferALIGNED> HostBufferCouplings;
 #else
@@ -333,7 +333,7 @@
   // The size (number of elements) per event in a memory buffer for momenta
   constexpr size_t sizePerEventMomenta = MemoryBuffers::np4 * MemoryBuffers::npar;
 
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
   // A class encapsulating a C++ host buffer for momenta
   typedef HostBuffer<fptype, sizePerEventMomenta, HostBufferALIGNED> HostBufferMomenta;
   //typedef HostBuffer<fptype, sizePerEventMomenta, HostBufferMISALIGNED> HostBufferMomenta; // TEST MISALIGNMENT!
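Downstream code (e.g. check_sa.cc later in this patch) consumes these typedef triads as in the following sketch (illustration only; nevt is a placeholder event count): the same buffer name resolves to a pageable host buffer in C++-only builds, and to a pinned host buffer plus a device buffer in GPU builds, with the copy helpers defined below moving data between the latter two.

#ifndef MGONGPUCPP_GPUIMPL
  HostBufferMomenta hstMomenta( nevt );         // aligned allocation in host memory
#else
  PinnedHostBufferMomenta hstMomenta( nevt );   // gpuMallocHost: pinned host memory for fast DMA transfers
  DeviceBufferMomenta devMomenta( nevt );       // gpuMalloc: device global memory
  copyDeviceFromHost( devMomenta, hstMomenta ); // gpuMemcpy( ..., gpuMemcpyHostToDevice )
#endif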
@@ -352,7 +352,7 @@ namespace mg5amcCpu
   // The size (number of elements) per event in a memory buffer for sampling weights
   constexpr size_t sizePerEventWeights = 1;
 
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
   // A class encapsulating a C++ host buffer for sampling weights
   typedef HostBuffer<fptype, sizePerEventWeights, HostBufferALIGNED> HostBufferWeights;
 #else
@@ -370,7 +370,7 @@
   // The size (number of elements) per event in a memory buffer for matrix elements
   constexpr size_t sizePerEventMatrixElements = 1;
 
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
   // A class encapsulating a C++ host buffer for matrix elements
   typedef HostBuffer<fptype, sizePerEventMatrixElements, HostBufferALIGNED> HostBufferMatrixElements;
 #else
@@ -385,7 +385,7 @@
   // A base class encapsulating a memory buffer for the helicity mask
   typedef BufferBase<bool> BufferHelicityMask;
 
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
   // A class encapsulating a C++ host buffer for the helicity mask
   typedef HostBufferBase<bool, HostBufferALIGNED> HostBufferHelicityMask;
 #else
@@ -403,7 +403,7 @@
   // The size (number of elements) per event in a memory buffer for wavefunctions
   constexpr size_t sizePerEventWavefunctions = MemoryBuffers::nw6 * MemoryBuffers::nx2;
 
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
   // A class encapsulating a C++ host buffer for wavefunctions
   typedef HostBuffer<fptype, sizePerEventWavefunctions, HostBufferALIGNED> HostBufferWavefunctions;
 #else
@@ -421,7 +421,7 @@
   // The size (number of elements) per event in a memory buffer for helicity random numbers
   constexpr size_t sizePerEventRndNumHelicity = 1;
 
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
   // A class encapsulating a C++ host buffer for helicity random numbers
   typedef HostBuffer<fptype, sizePerEventRndNumHelicity, HostBufferALIGNED> HostBufferRndNumHelicity;
 #else
@@ -439,7 +439,7 @@
   // The size (number of elements) per event in a memory buffer for color random numbers
   constexpr size_t sizePerEventRndNumColor = 1;
 
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
   // A class encapsulating a C++ host buffer for color random numbers
   typedef HostBuffer<fptype, sizePerEventRndNumColor, HostBufferALIGNED> HostBufferRndNumColor;
 #else
@@ -457,7 +457,7 @@
   // The size (number of elements) per event in a memory buffer for helicity selection
   constexpr size_t sizePerEventSelectedHelicity = 1;
 
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
   // A class encapsulating a C++ host buffer for helicity selection
   typedef HostBuffer<int, sizePerEventSelectedHelicity, HostBufferALIGNED> HostBufferSelectedHelicity;
 #else
@@ -475,7 +475,7 @@
   // The size (number of elements) per event in a memory buffer for color selection
   constexpr size_t sizePerEventSelectedColor = 1;
 
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
   // A class encapsulating a C++ host buffer for color selection
   typedef HostBuffer<int, sizePerEventSelectedColor, HostBufferALIGNED> HostBufferSelectedColor;
 #else
@@ -487,7 +487,7 @@
 
   //--------------------------------------------------------------------------
 
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
   template<class Tdst, class Tsrc>
   void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy
   {
@@ -504,13 +504,13 @@
       throw std::runtime_error( sstr.str() );
     }
     // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if the host array is not a pinned host array
-    checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyHostToDevice ) );
+    gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyHostToDevice );
   }
 #endif
 
   //--------------------------------------------------------------------------
 
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
   template<class Tdst, class Tsrc>
   void copyHostFromDevice( Tdst& dst, const Tsrc& src ) // keep the same order
of arguments as in memcpy { @@ -527,7 +527,7 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyDeviceToHost ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyDeviceToHost ); } #endif diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CPPProcess.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CPPProcess.cc index 6242b019fa..a376b0c455 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CPPProcess.cc +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -49,7 +48,7 @@ // Process: g d > t t~ d WEIGHTED<=3 @1 // Process: g s > t t~ s WEIGHTED<=3 @1 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +82,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -93,7 +92,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -121,13 +120,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -154,7 +153,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -190,7 +189,7 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity < nParity; ++iParity ) { // START 
LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings @@ -203,8 +202,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -341,7 +342,7 @@ namespace mg5amcCpu { 4, 0, 12, 4 }, { 0, 4, 4, 12 } }; // 2-D array[4][4] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -398,7 +399,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -457,7 +458,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -520,8 +521,8 @@ namespace mg5amcCpu { 1, -1, 1, 1, 1 }, { 1, -1, 1, -1, -1 }, { 1, -1, 1, -1, 1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -562,9 +563,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -601,7 +602,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] 
// [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -666,12 +667,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -692,7 +693,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -818,9 +819,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -844,7 +845,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -864,7 +865,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 96 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -878,9 +879,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -908,7 +912,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -1118,7 +1122,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CPPProcess.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CPPProcess.h index bf037c6c28..ce22572055 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CPPProcess.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -110,7 +110,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -123,7 +123,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -153,7 +153,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CudaRuntime.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/GpuAbstraction.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/GpuAbstraction.h new file mode 120000 index 0000000000..72054e19ba --- /dev/null +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/GpuAbstraction.h @@ -0,0 +1 @@ +../GpuAbstraction.h \ No newline at end of file diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/GpuRuntime.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/GpuRuntime.h new file mode 120000 index 0000000000..3920e83be4 --- /dev/null +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/GpuRuntime.h @@ -0,0 +1 @@ +../GpuRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/check_sa.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/check_sa.cc index 3fbf0ffbee..7cac5ab47b 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/check_sa.cc +++ 
b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -65,7 +66,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -96,7 +97,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -134,9 +135,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784) + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) +#elif defined __HIPCC__ +#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random #elif defined __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU if build has curand + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #endif @@ -146,10 +149,10 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) 
bool bridge = false; @@ -177,7 +180,7 @@ main( int argc, char** argv ) else if( arg == "--curdev" ) { #ifndef __CUDACC__ - throw std::runtime_error( "CurandDevice is not supported on CPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" ); #elif defined MGONGPU_HAS_NO_CURAND throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" ); #else @@ -198,7 +201,7 @@ main( int argc, char** argv ) } else if( arg == "--rmbdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -272,13 +275,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -296,14 +299,14 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL - // --- 00. Initialise cuda - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - const std::string cdinKey = "00 CudaInit"; + // --- 00. Initialise GPU + // Instantiate a GpuRuntime at the beginnining of the application's main. + // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. + const std::string cdinKey = "00 GpuInit"; timermap.start( cdinKey ); - CudaRuntime cudaRuntime( debug ); + GpuRuntime GpuRuntime( debug ); #endif // --- 0a. 
Initialise physics process @@ -325,7 +328,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -333,7 +336,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -341,7 +344,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -349,7 +352,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -366,7 +369,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -375,7 +378,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -384,7 +387,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -392,7 +395,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -400,7 +403,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -438,7 +441,7 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! 
CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement)
 #endif
   }
@@ -450,7 +453,7 @@
   }
   else
   {
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
     prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) );
 #else
     throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement)
@@ -461,7 +464,7 @@
   std::unique_ptr<MatrixElementKernelBase> pmek;
   if( !bridge )
   {
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
     pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) );
 #else
     pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) );
@@ -469,7 +472,7 @@
   }
   else
   {
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
     pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) );
 #else
     pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) );
@@ -511,7 +514,7 @@
     prnk->generateRnarray();
     //std::cout << "Got random numbers" << std::endl;
 
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
     if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice )
     {
       // --- 1c. Copy rndmom from host to device
@@ -543,7 +546,7 @@
     prsk->getMomentaFinal();
     //std::cout << "Got final momenta" << std::endl;
 
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
     if( rmbsmp == RamboSamplingMode::RamboDevice )
     {
       // --- 2c. CopyDToH Weights
@@ -588,7 +591,7 @@
       dynamic_cast<BridgeKernelBase*>( pmek.get() )->transposeInputMomentaC2F();
     }
 
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
     // --- 2d. CopyHToD Momenta
     const std::string gKey = "0.. CpHTDg";
     rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER!
@@ -617,7 +620,7 @@
     wv3atime += timermap.stop(); // calc only
     wavetime += wv3atime;        // calc plus copy
 
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
     if( !bridge )
     {
       // --- 3b. CopyDToH MEs
@@ -760,18 +763,22 @@
     rndgentxt = "CURAND DEVICE";
 #ifdef __CUDACC__
   rndgentxt += " (CUDA code)";
+#elif defined __HIPCC__
+  rndgentxt += " (HIP code)";
 #else
   rndgentxt += " (C++ code)";
 #endif
 
   // Workflow description summary
   std::string wrkflwtxt;
-  // -- CUDA or C++?
+  // -- CUDA or HIP or C++?
 #ifdef __CUDACC__
   wrkflwtxt += "CUD:";
+#elif defined __HIPCC__
+  wrkflwtxt += "HIP:";
 #else
   wrkflwtxt += "CPP:";
-#endif
+#endif /* clang-format off */
   // -- DOUBLE or FLOAT?
 #if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
   wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537)
@@ -781,7 +788,7 @@
   wrkflwtxt += "FLT+";
 #else
   wrkflwtxt += "???+"; // no path to this statement
-#endif
+#endif /* clang-format on */
   // -- CUCOMPLEX or THRUST or STD complex numbers?
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -793,6 +800,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_CUCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -818,7 +831,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -874,7 +887,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -895,6 +908,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -921,21 +936,21 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? " == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -966,7 +981,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1062,14 +1077,14 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1077,7 +1092,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? 
" == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CPPProcess.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CPPProcess.cc index 90788b2c75..41f17b9fb0 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CPPProcess.cc +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -49,7 +48,7 @@ // Process: g d~ > t t~ d~ WEIGHTED<=3 @1 // Process: g s~ > t t~ s~ WEIGHTED<=3 @1 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +82,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -93,7 +92,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -121,13 +120,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -154,7 +153,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -190,7 +189,7 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity < nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings @@ 
-203,8 +202,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -341,7 +342,7 @@ namespace mg5amcCpu { 4, 0, 12, 4 }, { 0, 4, 4, 12 } }; // 2-D array[4][4] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -398,7 +399,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -457,7 +458,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -520,8 +521,8 @@ namespace mg5amcCpu { 1, 1, 1, 1, -1 }, { 1, 1, 1, -1, 1 }, { 1, 1, 1, -1, -1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -562,9 +563,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -601,7 +602,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] 
// [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -666,12 +667,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -692,7 +693,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -818,9 +819,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -844,7 +845,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: color selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -864,7 +865,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 96 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -878,9 +879,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -908,7 +912,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -1118,7 +1122,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef
MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CPPProcess.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CPPProcess.h index 0f49f5247b..46c4347506 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CPPProcess.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -110,7 +110,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -123,7 +123,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -153,7 +153,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CudaRuntime.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/GpuAbstraction.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/GpuAbstraction.h new file mode 120000 index 0000000000..72054e19ba --- /dev/null +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/GpuAbstraction.h @@ -0,0 +1 @@ +../GpuAbstraction.h \ No newline at end of file diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/GpuRuntime.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/GpuRuntime.h new file mode 120000 index 0000000000..3920e83be4 --- /dev/null +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/GpuRuntime.h @@ -0,0 +1 @@ +../GpuRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/check_sa.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/check_sa.cc index 3fbf0ffbee..7cac5ab47b 100644 --- 
a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/check_sa.cc +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -65,7 +66,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -96,7 +97,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -134,9 +135,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784) + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) +#elif defined __HIPCC__ +#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random #elif defined __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU if build has curand + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #endif @@ -146,10 +149,10 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) 
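(A note on the "00 GpuInit" hunk just below: GpuRuntime.h is created by this patch but is not quoted in this excerpt. A minimal sketch of the RAII pattern it implements, assuming that the checkGpu, gpuSetDevice and gpuDeviceReset wrappers come from the new GPU abstraction layer, is:

    #include <iostream>
    // Minimal sketch only, not the actual GpuRuntime.h from this patch
    struct GpuRuntime final
    {
      GpuRuntime( const bool debug = true ) : m_debug( debug ) { setUp( m_debug ); }
      ~GpuRuntime() { tearDown( m_debug ); }
      // On CUDA this selects device 0, as per the "invokes cudaSetDevice(0)" comment below
      static void setUp( const bool debug = true )
      {
        checkGpu( gpuSetDevice( 0 ) );
        if( debug ) std::cout << "GpuRuntime::setUp: GPU device 0 selected" << std::endl;
      }
      // On CUDA this performs the cudaDeviceReset() call mentioned in the comment below
      static void tearDown( const bool debug = true )
      {
        if( debug ) std::cout << "GpuRuntime::tearDown: resetting the GPU device" << std::endl;
        checkGpu( gpuDeviceReset() );
      }
      const bool m_debug;
    };

The same static setUp/tearDown pair is what the fbridge.cc hunks further down call around the lifetime of a Bridge.)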
bool bridge = false; @@ -177,7 +180,7 @@ main( int argc, char** argv ) else if( arg == "--curdev" ) { #ifndef __CUDACC__ - throw std::runtime_error( "CurandDevice is not supported on CPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" ); #elif defined MGONGPU_HAS_NO_CURAND throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" ); #else @@ -198,7 +201,7 @@ main( int argc, char** argv ) } else if( arg == "--rmbdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -272,13 +275,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -296,14 +299,14 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL - // --- 00. Initialise cuda - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - const std::string cdinKey = "00 CudaInit"; + // --- 00. Initialise GPU + // Instantiate a GpuRuntime at the beginning of the application's main. + // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. + const std::string cdinKey = "00 GpuInit"; timermap.start( cdinKey ); - CudaRuntime cudaRuntime( debug ); + GpuRuntime gpuRuntime( debug ); #endif // --- 0a.
Initialise physics process @@ -325,7 +328,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -333,7 +336,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -341,7 +344,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -349,7 +352,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -366,7 +369,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -375,7 +378,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -384,7 +387,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -392,7 +395,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -400,7 +403,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -438,7 +441,7 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! 
CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } @@ -450,7 +453,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -461,7 +464,7 @@ std::unique_ptr<MatrixElementKernelBase> pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -469,7 +472,7 @@ } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -511,7 +514,7 @@ prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -543,7 +546,7 @@ prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -588,7 +591,7 @@ dynamic_cast<BridgeKernelBase*>( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -617,7 +620,7 @@ wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -760,18 +763,22 @@ rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; -#endif +#endif /* clang-format off */ // -- DOUBLE or FLOAT? #if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) @@ -781,7 +788,7 @@ wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif +#endif /* clang-format on */ // -- CUCOMPLEX or THRUST or STD complex numbers?
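(Most hunks in this patch key on MGONGPUCPP_GPUIMPL, which covers either GPU backend, whereas the workflow-text chain below still distinguishes __CUDACC__ from __HIPCC__ because it reports which backend was actually compiled. The macro itself is defined in mgOnGpuConfig.h, which this patch modifies but which is not quoted in this excerpt; a plausible minimal definition, as a sketch only, is:

    // Sketch only: the actual definition lives in mgOnGpuConfig.h
    #if defined __CUDACC__ || defined __HIPCC__
    #define MGONGPUCPP_GPUIMPL 1 // a GPU backend (CUDA or HIP) is being compiled
    #endif

)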
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -793,6 +800,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_CUCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -818,7 +831,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -874,7 +887,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -895,6 +908,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -921,21 +936,21 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? " == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -966,7 +981,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1062,14 +1077,14 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1077,7 +1092,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? 
" == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/RamboSamplingKernels.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/RamboSamplingKernels.cc index da68aa9255..79abbcc4f8 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/RamboSamplingKernels.cc +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/RamboSamplingKernels.cc @@ -1,11 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "RamboSamplingKernels.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessRandomNumbers.h" #include "MemoryAccessWeights.h" @@ -14,7 +14,7 @@ #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -92,7 +92,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingKernelDevice::RamboSamplingKernelDevice( const fptype energy, // input: energy const BufferRndNumMomenta& rndmom, // input: random numbers in [0,1] BufferMomenta& momenta, // output: momenta @@ -135,7 +135,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaInitialDevice( const fptype energy, fptype* momenta ) @@ -147,17 +147,17 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaInitial() { - getMomentaInitialDevice<<>>( m_energy, m_momenta.data() ); + gpuLaunchKernel( getMomentaInitialDevice, m_gpublocks, m_gputhreads, m_energy, m_momenta.data() ); } #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaFinalDevice( const fptype energy, const fptype* rndmom, @@ -171,11 +171,11 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaFinal() { - getMomentaFinalDevice<<>>( m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); + gpuLaunchKernel( getMomentaFinalDevice, m_gpublocks, m_gputhreads, m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); } #endif diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/RamboSamplingKernels.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/RamboSamplingKernels.h index 184089efd7..7c214cd74b 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/RamboSamplingKernels.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/RamboSamplingKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#ifndef RAMBOSAMPLINGKERNELS_H #define RAMBOSAMPLINGKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating RAMBO phase space sampling on a GPU device class RamboSamplingKernelDevice final : public SamplingKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/RandomNumberKernels.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/RandomNumberKernels.h index 188a72c2c9..21d63beeac 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/RandomNumberKernels.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/RandomNumberKernels.h @@ -1,14 +1,14 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef RANDOMNUMBERKERNELS_H #define RANDOMNUMBERKERNELS_H 1 #include "mgOnGpuConfig.h" -// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when __CUDACC__ is not defined +// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when MGONGPUCPP_GPUIMPL is not defined #ifndef MGONGPU_HAS_NO_CURAND //#include "curand.h" struct curandGenerator_st; // forward definition from curand.h @@ -16,7 +16,7 @@ struct curandGenerator_st; // forward definition from curand.h #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gq_ttq.mad/SubProcesses/cudacpp.mk index 509307506b..f2cfa349da 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ # Copyright (C) 2020-2023 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +# Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts @@ -42,10 +42,10 @@ endif #------------------------------------------------------------------------------- -#=== Configure common compiler flags for C++ and CUDA +#=== Configure common compiler flags for C++ and CUDA/HIP INCFLAGS = -I. 
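(The RandomNumberKernels.h comment above refers to the C++ fallback in mgOnGpuConfig.h that compiles the CUDA keywords away in CPU-only builds; roughly, as a sketch of the mechanism described there:

    // Sketch only: C++-build fallback along the lines described above
    #ifndef MGONGPUCPP_GPUIMPL
    #define __host__
    #define __device__
    #define __global__
    #endif

so that __global__ kernels and similar declarations compile unchanged as plain C++ functions.)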
-OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here +OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here # Dependency on src directory MG5AMC_COMMONLIB = mg5amc_common @@ -121,24 +121,46 @@ endif #------------------------------------------------------------------------------- -#=== Configure the CUDA compiler +#=== Configure the GPU compiler (CUDA or HIP) -# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) -# This is because it is impossible to pass this to "CUFLAGS += -ccbin <host-compiler>" below +# FIXME! (AV 24.01.2024) +# In the current implementation (without separate builds for C++ and CUDA/HIP), we first check for nvcc and hipcc in CUDA_HOME and HIP_HOME. +# If CUDA_HOME or HIP_HOME are not set, try to determine them from the paths to nvcc and hipcc. +# While convoluted, this is currently necessary to allow disabling CUDA/HIP builds by setting CUDA_HOME or HIP_HOME to invalid paths. +# This will (probably?) be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775). + +# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA and HIP builds (issue #505) +# This is because it is impossible to pass this to "GPUFLAGS += -ccbin <host-compiler>" below ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside - $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") + $(warning CUDA and HIP builds are not supported for multi-word CXX "$(CXX)") override CUDA_HOME=disabled + override HIP_HOME=disabled endif -# If CUDA_HOME is not set, try to set it from the location of nvcc +# If CUDA_HOME is not set, try to set it from the path to nvcc ifndef CUDA_HOME CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null)) $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") endif -# Set NVCC as $(CUDA_HOME)/bin/nvcc if it exists +# If HIP_HOME is not set, try to set it from the path to hipcc +ifndef HIP_HOME + HIP_HOME = $(patsubst %bin/hipcc,%,$(HIP_COMPILER_PATH)) + $(warning HIP_HOME was not set: using "$(HIP_HOME)") +endif + +# FIXME! (AV 24.01.2024) +# In the current implementation (without separate builds for C++ and CUDA/HIP), +# builds are performed for HIP only if CUDA is not found in the path. +# If both CUDA and HIP are installed, HIP builds can be triggered by unsetting CUDA_HOME. +# This will be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775). + +#--- Option 1: CUDA exists -> use CUDA + +# Set GPUCC as $(CUDA_HOME)/bin/nvcc if it exists ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) - NVCC = $(CUDA_HOME)/bin/nvcc + + GPUCC = $(CUDA_HOME)/bin/nvcc USE_NVTX ?=-DUSE_NVTX # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ @@ -158,41 +180,78 @@ ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here!
endif CUOPTFLAGS = -lineinfo - CUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math - ###CUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow - ###NVCC_VERSION = $(shell $(NVCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) - CUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h + ###GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + GPUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + ###GPUCC_VERSION = $(shell $(GPUCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) + GPUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + CUBUILDRULEFLAGS = -Xcompiler -fPIC -c + CCBUILDRULEFLAGS = -Xcompiler -fPIC -c -x cu + CUDATESTFLAGS = -lcuda + + # Set the host C++ compiler for GPUCC via "-ccbin <host-compiler>" + # (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) + GPUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) + + # Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) + ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) + GPUFLAGS += -allow-unsupported-compiler + endif + else ifneq ($(origin REQUIRE_CUDA),undefined) + # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) - $(error No cuda installation found (set CUDA_HOME or make nvcc visible in PATH)) + $(error No cuda installation found (set CUDA_HOME or make GPUCC visible in PATH)) + +#--- Option 2: CUDA does not exist, HIP exists -> use HIP + +# Set GPUCC as $(HIP_HOME)/bin/hipcc if it exists +else ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),) + + GPUCC = $(HIP_HOME)/bin/hipcc + #USE_NVTX ?=-DUSE_NVTX # should maybe find something equivalent to this in HIP?
+ HIPARCHFLAGS = -target x86_64-linux-gnu --offload-arch=gfx90a + HIPINC = -I$(HIP_HOME)/include/ + # Note: -DHIP_FAST_MATH is equivalent to -use_fast_math in HIP + # (but only for single precision; see line 208 of https://rocm-developer-tools.github.io/HIP/hcc__detail_2math__functions_8h_source.html) + GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -DHIP_FAST_MATH -DHIP_PLATFORM=amd -fPIC + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + GPUFLAGS += -std=c++17 + ###GPUFLAGS+= --maxrregcount 255 # (AV: is this option valid on HIP and meaningful on AMD GPUs?) + CUBUILDRULEFLAGS = -fPIC -c + CCBUILDRULEFLAGS = -fPIC -c + +else ifneq ($(origin REQUIRE_HIP),undefined) + + # If REQUIRE_HIP is set but no HIP is found, stop here (e.g. for CI tests on GPU #443) + $(error No hip installation found (set HIP_HOME or make hipcc visible in PATH)) + +#--- Option 3: CUDA does not exist, HIP does not exist -> switch off both CUDA and HIP + else - # No cuda. Switch cuda compilation off and go to common random numbers in C++ + + # No nvcc and no hipcc: switch CUDA and HIP compilation off and go to common random numbers in C++ $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda) + $(warning HIP_HOME is not set or is invalid: export HIP_HOME to compile with hip) + override GPUCC= override USE_NVTX= override CUINC= override CURANDLIBFLAGS= -endif -export NVCC -export CUFLAGS - -# Set the host C++ compiler for nvcc via "-ccbin <host-compiler>" -# (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) -CUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) -# Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) -ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) -CUFLAGS += -allow-unsupported-compiler endif +# Export GPUCC (so that it can also be used in cudacpp_src.mk?) +export GPUCC +export GPUFLAGS + #------------------------------------------------------------------------------- -#=== Configure ccache for C++ and CUDA builds +#=== Configure ccache for C++ and CUDA/HIP builds # Enable ccache if USECCACHE=1 ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) @@ -201,15 +260,15 @@ endif #ifeq ($(USECCACHE)$(shell echo $(AR) | grep ccache),1) # override AR:=ccache $(AR) #endif -ifneq ($(NVCC),) - ifeq ($(USECCACHE)$(shell echo $(NVCC) | grep ccache),1) - override NVCC:=ccache $(NVCC) +ifneq ($(GPUCC),) + ifeq ($(USECCACHE)$(shell echo $(GPUCC) | grep ccache),1) + override GPUCC:=ccache $(GPUCC) endif endif #------------------------------------------------------------------------------- -#=== Configure PowerPC-specific compiler flags for C++ and CUDA +#=== Configure PowerPC-specific compiler flags for C++ and CUDA/HIP # PowerPC-specific CXX compiler flags (being reviewed) ifeq ($(UNAME_P),ppc64le) @@ -225,9 +284,9 @@ else ######CXXFLAGS+= -fno-semantic-interposition # no benefit (neither alone, nor combined with -flto) endif -# PowerPC-specific CUDA compiler flags (to be reviewed!) +# PowerPC-specific CUDA/HIP compiler flags (to be reviewed!)
ifeq ($(UNAME_P),ppc64le) - CUFLAGS+= -Xcompiler -mno-float128 + GPUFLAGS+= -Xcompiler -mno-float128 endif #------------------------------------------------------------------------------- @@ -237,7 +296,7 @@ endif # Set the default OMPFLAGS choice ifneq ($(shell $(CXX) --version | egrep '^Intel'),) override OMPFLAGS = -fopenmp -###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without nvcc but not ok with nvcc before #578) +###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without GPUCC but not ok with GPUCC before #578) else ifneq ($(shell $(CXX) --version | egrep '^(clang)'),) override OMPFLAGS = -fopenmp ###override OMPFLAGS = # disable OpenMP MT on clang (was not ok without or with nvcc before #578) @@ -293,7 +352,10 @@ endif # Set the default RNDGEN (random number generator) choice ifeq ($(RNDGEN),) - ifeq ($(NVCC),) + ifeq ($(GPUCC),) + override RNDGEN = hasNoCurand + # Edgecase for HIP compilation + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) override RNDGEN = hasNoCurand else ifeq ($(RNDGEN),) override RNDGEN = hasCurand @@ -310,7 +372,7 @@ export OMPFLAGS #------------------------------------------------------------------------------- -#=== Set the CUDA/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN +#=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN # Set the build flags appropriate to OMPFLAGS $(info OMPFLAGS=$(OMPFLAGS)) @@ -368,13 +430,13 @@ CXXFLAGS+= $(AVXFLAGS) $(info FPTYPE=$(FPTYPE)) ifeq ($(FPTYPE),d) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE - CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE else ifeq ($(FPTYPE),f) CXXFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT - CUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT else ifeq ($(FPTYPE),m) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT - CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT else $(error Unknown FPTYPE='$(FPTYPE)': only 'd', 'f' and 'm' are supported) endif @@ -383,7 +445,7 @@ endif $(info HELINL=$(HELINL)) ifeq ($(HELINL),1) CXXFLAGS += -DMGONGPU_INLINE_HELAMPS - CUFLAGS += -DMGONGPU_INLINE_HELAMPS + GPUFLAGS += -DMGONGPU_INLINE_HELAMPS else ifneq ($(HELINL),0) $(error Unknown HELINL='$(HELINL)': only '0' and '1' are supported) endif @@ -392,7 +454,7 @@ endif $(info HRDCOD=$(HRDCOD)) ifeq ($(HRDCOD),1) CXXFLAGS += -DMGONGPU_HARDCODE_PARAM - CUFLAGS += -DMGONGPU_HARDCODE_PARAM + GPUFLAGS += -DMGONGPU_HARDCODE_PARAM else ifneq ($(HRDCOD),0) $(error Unknown HRDCOD='$(HRDCOD)': only '0' and '1' are supported) endif @@ -444,11 +506,11 @@ ifeq ($(UNAME_S),Darwin) override CULIBFLAGSRPATH2 = else # RPATH to cuda/cpp libs when linking executables - override CXXLIBFLAGSRPATH = -Wl,-rpath,$(LIBDIRRPATH) - override CULIBFLAGSRPATH = -Xlinker -rpath,$(LIBDIRRPATH) + override CXXLIBFLAGSRPATH = -Wl,-rpath=$(LIBDIRRPATH) + override CULIBFLAGSRPATH = -Xlinker -rpath=$(LIBDIRRPATH) # RPATH to common lib when linking cuda/cpp libs - override CXXLIBFLAGSRPATH2 = -Wl,-rpath,'$$ORIGIN' - override CULIBFLAGSRPATH2 = -Xlinker -rpath,'$$ORIGIN' + override CXXLIBFLAGSRPATH2 = -Wl,-rpath='$$ORIGIN' + override CULIBFLAGSRPATH2 = -Xlinker -rpath='$$ORIGIN' endif # Setting LD_LIBRARY_PATH or DYLD_LIBRARY_PATH in the RUNTIME is no longer necessary 
(neither on Linux nor on Mac) @@ -461,7 +523,7 @@ override RUNTIME = cxx_main=$(BUILDDIR)/check.exe fcxx_main=$(BUILDDIR)/fcheck.exe -ifneq ($(NVCC),) +ifneq ($(GPUCC),) cu_main=$(BUILDDIR)/gcheck.exe fcu_main=$(BUILDDIR)/fgcheck.exe else @@ -492,15 +554,16 @@ $(BUILDDIR)/.build.$(TAG): @touch $(BUILDDIR)/.build.$(TAG) # Generic target and build rules: objects from CUDA compilation -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/%.o : %.cu *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CUBUILDRULEFLAGS) $< -o $@ $(BUILDDIR)/%_cu.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CCBUILDRULEFLAGS) $< -o $@ endif +# NB: for CUDA the '-x cu' flag from the rule above now comes in via CCBUILDRULEFLAGS (.cc files compiled as CUDA) # Generic target and build rules: objects from C++ compilation # (NB do not include CUINC here! add it only for NVTX or curand #679) @@ -509,11 +572,14 @@ $(BUILDDIR)/%.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) $(CXX) $(CPPFLAGS) $(CXXFLAGS) -fPIC -c $< -o $@ # Apply special build flags only to CrossSectionKernel.cc and gCrossSectionKernel.cu (no fast math, see #117 and #516) +# Added edgecase for HIP compilation ifeq ($(shell $(CXX) --version | grep ^nvc++),) $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS := $(filter-out -ffast-math,$(CXXFLAGS)) $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math -ifneq ($(NVCC),) -$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -fno-fast-math +ifeq ($(findstring nvcc,$(GPUCC)),nvcc) + $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -fno-fast-math +else + $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -fno-fast-math endif endif @@ -530,10 +596,10 @@ ifeq ($(RNDGEN),hasCurand) $(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CUINC) endif -# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in nvcc with icx2023 (#592) +# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in GPUCC with icx2023 (#592) ifneq ($(shell $(CXX) --version | egrep '^(Intel)'),) -ifneq ($(NVCC),) -CUFLAGS += -Xcompiler -Wno-deprecated-builtins +ifneq ($(GPUCC),) +GPUFLAGS += -Wno-deprecated-builtins endif endif @@ -541,8 +607,8 @@ endif # This patch does remove the warning, but I prefer to keep it disabled for the moment...
###ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang|Intel)'),) ###$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -Wno-overriding-t-option -###ifneq ($(NVCC),) -###$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -Wno-overriding-t-option +###ifneq ($(GPUCC),) +###$(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -Wno-overriding-t-option ###endif ###endif @@ -569,7 +635,7 @@ MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp cxx_objects_lib=$(BUILDDIR)/CPPProcess.o $(BUILDDIR)/MatrixElementKernels.o $(BUILDDIR)/BridgeKernels.o $(BUILDDIR)/CrossSectionKernels.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel.o $(BUILDDIR)/RamboSamplingKernels.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) MG5AMC_CULIB = mg5amc_$(processid_short)_cuda cu_objects_lib=$(BUILDDIR)/gCPPProcess.o $(BUILDDIR)/gMatrixElementKernels.o $(BUILDDIR)/gBridgeKernels.o $(BUILDDIR)/gCrossSectionKernels.o cu_objects_exe=$(BUILDDIR)/gCommonRandomNumberKernel.o $(BUILDDIR)/gRamboSamplingKernels.o @@ -581,11 +647,11 @@ $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: cxx_objects_lib += $(BUILDDIR)/fbridge.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(CXX) -shared -o $@ $(cxx_objects_lib) $(CXXLIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: cu_objects_lib += $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cu_objects_lib) - $(NVCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) endif #------------------------------------------------------------------------------- @@ -602,16 +668,16 @@ $(cxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PAT $(cxx_main): $(BUILDDIR)/check_sa.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CXX) -o $@ $(BUILDDIR)/check_sa.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CURANDLIBFLAGS) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(cu_main): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(cu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(cu_main): $(BUILDDIR)/gcheck_sa.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o - $(NVCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) + $(GPUCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) endif 
#------------------------------------------------------------------------------- @@ -637,17 +703,17 @@ $(fcxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PA $(fcxx_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') endif ifeq ($(UNAME_S),Darwin) $(fcu_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(fcu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(fcu_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) - $(NVCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) + $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) endif #------------------------------------------------------------------------------- @@ -659,7 +725,7 @@ $(BUILDDIR)/testxxx.o: testxxx_cc_ref.txt $(testmain): $(BUILDDIR)/testxxx.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testxxx.o # Comment out this line to skip the C++ test of xxx functions -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testxxx_cu.o: $(GTESTLIBS) $(BUILDDIR)/testxxx_cu.o: INCFLAGS += $(GTESTINC) $(BUILDDIR)/testxxx_cu.o: testxxx_cc_ref.txt @@ -672,7 +738,7 @@ $(BUILDDIR)/testmisc.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testmisc.o # Comment out this line to skip the C++ miscellaneous tests -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testmisc_cu.o: $(GTESTLIBS) $(BUILDDIR)/testmisc_cu.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc_cu.o @@ -684,12 +750,12 @@ $(BUILDDIR)/runTest.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/runTest.o $(testmain): cxx_objects_exe += $(BUILDDIR)/runTest.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/runTest_cu.o: $(GTESTLIBS) $(BUILDDIR)/runTest_cu.o: INCFLAGS += $(GTESTINC) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(testmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif @@ -713,14 +779,14 @@ $(testmain): LIBFLAGS += -lgomp endif endif -ifeq 
($(NVCC),) # link only runTest.o +ifeq ($(GPUCC),) # link only runTest.o $(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS) $(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS) else # link both runTest.o and runTest_cu.o $(testmain): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) $(GTESTLIBS) - $(NVCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) -lcuda + $(GPUCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS) endif # Use target gtestlibs to build only googletest @@ -829,9 +895,9 @@ ifeq ($(USECCACHE),1) ccache --version | head -1 endif @echo "" - @echo NVCC=$(NVCC) -ifneq ($(NVCC),) - $(NVCC) --version + @echo GPUCC=$(GPUCC) +ifneq ($(GPUCC),) + $(GPUCC) --version endif @echo "" @echo CXX=$(CXX) @@ -850,7 +916,7 @@ endif # Target: check (run the C++ test executable) # [NB THIS IS WHAT IS USED IN THE GITHUB CI!] -ifneq ($(NVCC),) +ifneq ($(GPUCC),) check: runTest cmpFcheck cmpFGcheck else check: runTest cmpFcheck diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/fbridge.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/fbridge.cc index 2d2b36d560..22ce3f5115 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/fbridge.cc +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/fbridge.cc @@ -1,11 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Oct 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "Bridge.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" extern "C" { @@ -22,7 +22,7 @@ extern "C" * Using the same Fortran MadEvent code, linking to the heterogeneous library would allow access to both CPU and GPU implementations. * The specific heterogeneous configuration (how many GPUs, how many threads on each CPU, etc) could be loaded in CUDA/C++ from a data file.
*/ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -46,8 +46,8 @@ */ void fbridgecreate_( CppObjectInFortran** ppbridge, const int* pnevtF, const int* pnparF, const int* pnp4F ) { -#ifdef __CUDACC__ - CudaRuntime::setUp(); +#ifdef MGONGPUCPP_GPUIMPL + GpuRuntime::setUp(); #endif // (NB: CPPProcess::initProc no longer needs to be executed here because it is called in the Bridge constructor) // FIXME: disable OMP in Bridge when called from Fortran @@ -65,8 +65,8 @@ extern "C" Bridge<FORTRANFPTYPE>* pbridge = dynamic_cast<Bridge<FORTRANFPTYPE>*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgedelete_: invalid Bridge address" ); delete pbridge; -#ifdef __CUDACC__ - CudaRuntime::tearDown(); +#ifdef MGONGPUCPP_GPUIMPL + GpuRuntime::tearDown(); #endif } @@ -96,7 +96,7 @@ extern "C" { Bridge<FORTRANFPTYPE>* pbridge = dynamic_cast<Bridge<FORTRANFPTYPE>*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgesequence_: invalid Bridge address" ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Use the device/GPU implementation in the CUDA library // (there is also a host implementation in this library) pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, *pchannelId, mes, selhel, selcol ); diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/fsampler.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/fsampler.cc index 2fb445372d..3743934f41 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/fsampler.cc +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/fsampler.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Feb 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "mgOnGpuConfig.h" @@ -13,7 +13,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -40,7 +40,7 @@ namespace mg5amcCpu private: const int m_nevt; // The number of events in each iteration int m_iiter; // The iteration counter (for random number seeding) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta m_hstRndmom; // Memory buffers for random numbers HostBufferMomenta m_hstMomenta; // Memory buffers for momenta HostBufferWeights m_hstWeights; // Memory buffers for sampling weights @@ -105,7 +105,7 @@ namespace mg5amcCpu extern "C" { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/runTest.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/runTest.cc index d4a760a71b..de327f2321 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/runTest.cc +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/runTest.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
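(Taken together, the fbridge.cc hunks above define the Fortran-facing lifecycle: create the bridge, which also sets up the GPU runtime on GPU builds, run one or more sequences, then delete it. An illustrative C++ rendering of a caller, with the sizes being placeholders rather than values quoted from this patch:

    // Illustrative only: driving the extern "C" bridge API from C++
    extern "C"
    {
      struct CppObjectInFortran;
      void fbridgecreate_( CppObjectInFortran** ppbridge, const int* pnevtF, const int* pnparF, const int* pnp4F );
      void fbridgedelete_( CppObjectInFortran** ppbridge );
    }
    void runOneBatch()
    {
      CppObjectInFortran* pbridge = nullptr;
      const int nevt = 16, npar = 5, np4 = 4; // placeholder sizes only
      fbridgecreate_( &pbridge, &nevt, &npar, &np4 ); // calls GpuRuntime::setUp() on GPU builds
      // ... fill momenta/gs/rndhel/rndcol buffers and call fbridgesequence_ here ...
      fbridgedelete_( &pbridge ); // calls GpuRuntime::tearDown() on GPU builds
    }

)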
//---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -18,7 +18,7 @@ #include "RandomNumberKernels.h" #include "epoch_process_id.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -35,7 +35,7 @@ struct CUDA_CPU_TestBase : public TestDriverBase : TestDriverBase( npar, refFileName ) {} }; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL struct CPUTest : public CUDA_CPU_TestBase { // Struct data members (process, and memory structures for random numbers, momenta, matrix elements and weights on host and device) @@ -119,7 +119,7 @@ struct CPUTest : public CUDA_CPU_TestBase }; #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL struct CUDATest : public CUDA_CPU_TestBase { // Reset the device when our test goes out of scope. Note that this should happen after @@ -128,7 +128,7 @@ struct CUDATest : public CUDA_CPU_TestBase { ~DeviceReset() { - checkCuda( cudaDeviceReset() ); // this is needed by cuda-memcheck --leak-check full + checkGpu( gpuDeviceReset() ); // this is needed by cuda-memcheck --leak-check full } } deviceResetter; @@ -256,7 +256,7 @@ INSTANTIATE_TEST_SUITE_P( prefix, \ test_suite_name, \ testing::Values( new CUDATest( MG_EPOCH_REFERENCE_FILE_NAME ) ) ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL MG_INSTANTIATE_TEST_SUITE_GPU( XTESTID_GPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); #else MG_INSTANTIATE_TEST_SUITE_CPU( XTESTID_CPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/testmisc.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/testmisc.cc index 895d6eeb56..ba9e59a8a3 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/testmisc.cc +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/testmisc.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*misc to run only testmisc.cc tests //---------------------------------------------------------------------------- @@ -17,7 +17,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_MISC #else #define TESTID( s ) s##_CPU_MISC @@ -26,7 +26,7 @@ #define XTESTID( s ) TESTID( s ) // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -59,7 +59,7 @@ namespace mg5amcCpu TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testmisc ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/testxxx.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/testxxx.cc index 3361fe5aa9..e5167de00c 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/testxxx.cc +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/testxxx.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). 
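Both testmisc.cc above and testxxx.cc below rely on the same TESTID token-pasting to give the CPU and GPU variants of each googletest suite distinct names within a single executable. A minimal sketch of the expansion, using a hypothetical process id (illustrative only, not part of the patch):

#include <iostream>
#define TESTID( s ) s##_CPU_MISC
#define XTESTID( s ) TESTID( s )
#define MG_STR2( x ) #x
#define MG_STR( x ) MG_STR2( x )
#define MG_EPOCH_PROCESS_ID SIGMA_SM_GUX_TTXUX // hypothetical process id
int main()
{
  // the macro argument is expanded before pasting, then the result is stringified
  std::cout << MG_STR( XTESTID( MG_EPOCH_PROCESS_ID ) ) << std::endl; // prints SIGMA_SM_GUX_TTXUX_CPU_MISC
  return 0;
}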
// Created by: A. Valassi (Apr 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -24,7 +24,7 @@ #include #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_XXX #else #define TESTID( s ) s##_CPU_XXX @@ -32,7 +32,7 @@ #define XTESTID( s ) TESTID( s ) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -42,7 +42,7 @@ namespace mg5amcCpu int FPEhandlerIevt = -1; inline void FPEhandler( int sig ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL std::cerr << "Floating Point Exception (GPU): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; #else std::cerr << "Floating Point Exception (CPU neppV=" << neppV << "): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; @@ -53,7 +53,7 @@ namespace mg5amcCpu TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -77,7 +77,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) assert( nevt % neppM == 0 ); // nevt must be a multiple of neppM assert( nevt % neppV == 0 ); // nevt must be a multiple of neppV // Fill in the input momenta -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL mg5amcGpu::PinnedHostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] #else mg5amcCpu::HostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] @@ -322,7 +322,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { for( int ievt = 0; ievt < nevt; ievt++ ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gq_ttq.mad/src/HelAmps_sm.h b/epochX/cudacpp/gq_ttq.mad/src/HelAmps_sm.h index cd4e6de668..45000c7246 100644 --- a/epochX/cudacpp/gq_ttq.mad/src/HelAmps_sm.h +++ b/epochX/cudacpp/gq_ttq.mad/src/HelAmps_sm.h @@ -5,7 +5,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -28,7 +28,7 @@ //#include //#include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gq_ttq.mad/src/Parameters_sm.cc b/epochX/cudacpp/gq_ttq.mad/src/Parameters_sm.cc index c06dcbb252..8b92ea0bd6 100644 --- a/epochX/cudacpp/gq_ttq.mad/src/Parameters_sm.cc +++ b/epochX/cudacpp/gq_ttq.mad/src/Parameters_sm.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. 
Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -17,7 +17,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gq_ttq.mad/src/Parameters_sm.h b/epochX/cudacpp/gq_ttq.mad/src/Parameters_sm.h index a6eb185434..a3615ec77a 100644 --- a/epochX/cudacpp/gq_ttq.mad/src/Parameters_sm.h +++ b/epochX/cudacpp/gq_ttq.mad/src/Parameters_sm.h @@ -27,7 +27,7 @@ #include "read_slha.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu #include // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -217,7 +217,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -236,7 +236,7 @@ namespace mg5amcCpu #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wunused-variable" // e.g. <> #pragma GCC diagnostic ignored "-Wunused-parameter" // e.g. <> -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma nv_diagnostic push #pragma nv_diag_suppress 177 // e.g. <> #endif @@ -263,7 +263,7 @@ namespace mg5amcCpu // End SM implementation - no special handling of vectors of floats as in EFT (#439) return out; } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma GCC diagnostic pop #pragma nv_diagnostic pop #endif diff --git a/epochX/cudacpp/gq_ttq.mad/src/cudacpp_src.mk b/epochX/cudacpp/gq_ttq.mad/src/cudacpp_src.mk index d4cc628aec..159e19a46d 100644 --- a/epochX/cudacpp/gq_ttq.mad/src/cudacpp_src.mk +++ b/epochX/cudacpp/gq_ttq.mad/src/cudacpp_src.mk @@ -19,7 +19,7 @@ SHELL := /bin/bash #=== Configure common compiler flags for CUDA and C++ INCFLAGS = -I. 
-OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here +OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here #------------------------------------------------------------------------------- @@ -45,13 +45,13 @@ endif #------------------------------------------------------------------------------- -#=== Configure the CUDA compiler (note: NVCC is already exported including ccache) +#=== Configure the CUDA compiler (note: GPUCC is already exported including ccache) -###$(info NVCC=$(NVCC)) +###$(info GPUCC=$(GPUCC)) #------------------------------------------------------------------------------- -#=== Configure ccache for C++ builds (note: NVCC is already exported including ccache) +#=== Configure ccache for C++ builds (note: GPUCC is already exported including ccache) # Enable ccache if USECCACHE=1 ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) @@ -92,6 +92,13 @@ endif ###$(info OMPFLAGS=$(OMPFLAGS)) CXXFLAGS += $(OMPFLAGS) +# Set the GPU compile flags appropriate to the GPU compiler in use (nvcc for CUDA, hipcc for HIP) +ifeq ($(findstring nvcc,$(GPUCC)),nvcc) + GPUFLAGS += -Xcompiler -fPIC -c -x cu +else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) + GPUFLAGS += -fPIC -c +endif + # Set the build flags appropriate to each AVX choice (example: "make AVX=none") # [NB MGONGPU_PVW512 is needed because "-mprefer-vector-width=256" is not exposed in a macro] # [See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=96476] @@ -253,20 +260,20 @@ $(BUILDDIR)/%.o : %.cc *.h $(BUILDDIR)/.build.$(TAG) # Generic target and build rules: objects from CUDA compilation $(BUILDDIR)/%_cu.o : %.cc *.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $< -o $@ #------------------------------------------------------------------------------- cxx_objects=$(addprefix $(BUILDDIR)/, Parameters_sm.o read_slha.o) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) cu_objects=$(addprefix $(BUILDDIR)/, Parameters_sm_cu.o) endif # Target (and build rules): common (src) library -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) $(cu_objects) @if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi - $(NVCC) -shared -o $@ $(cxx_objects) $(cu_objects) $(LDFLAGS) + $(GPUCC) -shared -o $@ $(cxx_objects) $(cu_objects) $(LDFLAGS) else $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) @if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi diff --git a/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuConfig.h index 80032e528b..55d03f1252 100644 --- a/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuConfig.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
#ifndef MGONGPUCONFIG_H #define MGONGPUCONFIG_H 1 @@ -10,12 +10,25 @@ // There are two different code bases for standalone_cudacpp (without multichannel) and madevent+cudacpp (with multichannel) #define MGONGPU_SUPPORTS_MULTICHANNEL 1 +// Is this a GPU (CUDA, HIP) or CPU implementation? +#ifdef __CUDACC__ +#define MGONGPUCPP_GPUIMPL cuda +#elif defined __HIPCC__ +#define MGONGPUCPP_GPUIMPL hip +#else +#undef MGONGPUCPP_GPUIMPL +#endif + // ** NB1 Throughputs (e.g. 6.8E8) are events/sec for "./gcheck.exe -p 65536 128 12" // ** NB2 Baseline on b7g47n0004 fluctuates (probably depends on load on other VMs) // Choose if curand is supported for generating random numbers +// For HIP, by default, do not use curand (common random numbers will be used instead) // For both CUDA and C++, by default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_CURAND -// (there exist CUDA installations, e.g. using the HPC package, which do not include curand - see PR #784) +// (there exist CUDA installations, e.g. using the HPC package, which do not include curand - see PR #784 and #785) +#if defined __HIPCC__ +#define MGONGPU_HAS_NO_CURAND 1 +#else //#ifdef __CUDACC__ //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 @@ -23,6 +36,7 @@ //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 //#endif +#endif // Choose floating point precision (for everything but color algebra #537) // If one of these macros has been set from outside with e.g. -DMGONGPU_FPTYPE_FLOAT, nothing happens (issue #167) @@ -54,23 +68,28 @@ //#undef MGONGPU_HARDCODE_PARAM // default ////#define MGONGPU_HARDCODE_PARAM 1 -// Complex type in c++: std::complex or cxsmpl (CHOOSE ONLY ONE) -#ifndef __CUDACC__ -//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) -#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) -#endif - -// Complex type in cuda: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) +// Complex type in CUDA: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) #ifdef __CUDACC__ #define MGONGPU_CUCXTYPE_THRUST 1 // default (~1.15E9/double, ~3.2E9/float) //#define MGONGPU_CUCXTYPE_CUCOMPLEX 1 // ~10 percent slower (1.03E9/double, ~2.8E9/float) //#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) + +// Complex type in HIP: cxsmpl (ONLY ONE OPTION POSSIBLE) +#elif defined __HIPCC__ +#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) + +// Complex type in C++: std::complex or cxsmpl (CHOOSE ONLY ONE) +#else +//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) +#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif -// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ -#undef MGONGPU_NSIGHT_DEBUG // default +#undef MGONGPU_NSIGHT_DEBUG // default in CUDA //#define MGONGPU_NSIGHT_DEBUG 1 +#else +#undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif // SANITY CHECKS (floating point precision for everything but color algebra #537) @@ -86,17 +105,21 @@ #error You cannot use double precision for color algebra and single precision elsewhere #endif -// SANITY CHECKS (c++ complex number implementation) -#ifndef __CUDACC__ -#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and 
defined MGONGPU_CPPCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL +// SANITY CHECKS (CUDA complex number implementation) +#ifdef __CUDACC__ +#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX for CUDA +#elif defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CXSMPL for CUDA +#elif defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL for CUDA #endif #endif -// SANITY CHECKS (cuda complex number implementation) -#ifdef __CUDACC__ -#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL +// SANITY CHECKS (C++ complex number implementation) +#ifndef MGONGPUCPP_GPUIMPL +#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL for C++ #endif #endif @@ -134,7 +157,7 @@ namespace mgOnGpu // Alignment requirement for using reinterpret_cast with SIMD vectorized code // (using reinterpret_cast with non aligned memory may lead to segmentation faults!) // Only needed for C++ code but can be enforced also in NVCC builds of C++ code using CUDA>=11.2 and C++17 (#318, #319, #333) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL constexpr int cppAlign = 64; // alignment requirement for SIMD vectorization (64-byte i.e.
512-bit) #endif @@ -145,7 +168,7 @@ using mgOnGpu::fptype; using mgOnGpu::fptype2; // C++ SIMD vectorization width (this will be used to set neppV) -#ifdef __CUDACC__ // CUDA implementation has no SIMD +#ifdef MGONGPUCPP_GPUIMPL // CUDA and HIP implementations have no SIMD #undef MGONGPU_CPPSIMD #elif defined __AVX512VL__ && defined MGONGPU_PVW512 // C++ "512z" AVX512 with 512 width (512-bit ie 64-byte): 8 (DOUBLE) or 16 (FLOAT) #ifdef MGONGPU_FPTYPE_DOUBLE @@ -175,9 +198,9 @@ using mgOnGpu::fptype2; #undef MGONGPU_CPPSIMD #endif -// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation // Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) -#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */ +#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */ #define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; #define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } #define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } @@ -189,8 +212,8 @@ using mgOnGpu::fptype2; #define mgDebugFinalise() { /*noop*/ } #endif /* clang-format on */ -// Define empty CUDA declaration specifiers for C++ -#ifndef __CUDACC__ +// Define empty CUDA/HIP declaration specifiers for C++ +#ifndef MGONGPUCPP_GPUIMPL #define __global__ #define __host__ #define __device__ diff --git a/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuCxtypes.h b/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuCxtypes.h index ca9a9f00c0..5532e22fa1 100644 --- a/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuCxtypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022, based on earlier work by D. Smith) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
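Taken together, the MGONGPUCPP_GPUIMPL switch introduced above and the empty __global__/__host__/__device__ definitions for C++-only builds let a single translation unit compile under nvcc, hipcc or a plain C++ compiler. A minimal illustrative sketch of that pattern (not part of the patch):

// Compile with nvcc/hipcc for the GPU namespace, or with g++ for the CPU one (illustrative)
#include "mgOnGpuConfig.h"
#include <cstdio>
#ifdef MGONGPUCPP_GPUIMPL
namespace mg5amcGpu
#else
namespace mg5amcCpu
#endif
{
  // __host__ and __device__ expand to nothing in C++-only builds (see above),
  // so the same definition serves both backends
  __host__ __device__ inline double square( const double x ) { return x * x; }
}
int main()
{
#ifdef MGONGPUCPP_GPUIMPL
  printf( "square(3.)=%f (GPU build)\n", mg5amcGpu::square( 3. ) );
#else
  printf( "square(3.)=%f (CPU build)\n", mg5amcCpu::square( 3. ) );
#endif
  return 0;
}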
#ifndef MGONGPUCXTYPES_H #define MGONGPUCXTYPES_H 1 @@ -19,7 +19,7 @@ #include // Complex type in cuda: thrust or cucomplex or cxsmpl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #if defined MGONGPU_CUCXTYPE_THRUST #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wtautological-compare" // for icpx2021/clang13 (https://stackoverflow.com/a/15864661) @@ -82,7 +82,7 @@ namespace mgOnGpu /* clang-format off */ using mgOnGpu::cxsmpl; // Printout to stream for user defined types -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -92,7 +92,7 @@ namespace mg5amcCpu inline __host__ std::ostream& operator<<( std::ostream& out, const cxsmpl& c ) { - out << std::complex( c.real(), c.imag() ); + out << std::complex( c.real(), c.imag() ); return out; } @@ -215,14 +215,14 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu #endif { // --- Type definitions (complex type: cxtype) -#ifdef __CUDACC__ // cuda +#ifdef MGONGPUCPP_GPUIMPL // cuda #if defined MGONGPU_CUCXTYPE_THRUST typedef thrust::complex cxtype; #elif defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -255,7 +255,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -307,7 +307,7 @@ namespace mg5amcCpu //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust +#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust //------------------------------ // CUDA - using thrust::complex @@ -343,11 +343,11 @@ namespace mg5amcCpu return c; } -#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST +#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex +#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex //------------------------------ // CUDA - using cuComplex @@ -562,11 +562,11 @@ namespace mg5amcCpu return cxmake( c.real(), c.imag() ); } -#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX //========================================================================== -#if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex +#if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex //------------------------------ // C++ - using std::complex @@ -610,7 +610,7 @@ namespace mg5amcCpu } #endif -#endif // #if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX +#endif // #if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX //========================================================================== @@ -633,7 +633,7 @@ namespace mg5amcCpu //========================================================================== // 
NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuFptypes.h b/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuFptypes.h index 905c97d700..fa3a02664b 100644 --- a/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuFptypes.h +++ b/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuFptypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUFPTYPES_H #define MGONGPUFPTYPES_H 1 @@ -12,7 +12,7 @@ #include // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // cuda namespace mg5amcGpu #else namespace mg5amcCpu @@ -20,7 +20,7 @@ namespace mg5amcCpu { //========================================================================== -#ifdef __CUDACC__ // cuda +#ifdef MGONGPUCPP_GPUIMPL // cuda //------------------------------ // Floating point types - Cuda @@ -64,11 +64,11 @@ namespace mg5amcCpu #endif } -#endif // #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //------------------------------ // Floating point types - C++ @@ -92,7 +92,7 @@ namespace mg5amcCpu return std::sqrt( f ); } -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== diff --git a/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuVectors.h b/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuVectors.h index e1299ba81e..cdae04326b 100644 --- a/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuVectors.h @@ -32,7 +32,7 @@ #endif // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -131,7 +131,7 @@ namespace mg5amcCpu #endif #endif -#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef __CUDACC__) +#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef MGONGPUCPP_GPUIMPL) const int neppV = 1; @@ -153,13 +153,13 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu #endif { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Printout to stream for user defined types @@ -805,11 +805,11 @@ namespace mg5amcCpu #endif // #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //------------------------------ // Vector types - CUDA @@ -853,12 +853,12 @@ namespace mg5amcCpu return mask; } 
-#endif // #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== // Scalar-or-vector types: scalar in CUDA, vector or scalar in C++ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL typedef bool bool_sv; typedef fptype fptype_sv; typedef fptype2 fptype2_sv; @@ -879,7 +879,7 @@ namespace mg5amcCpu #endif // Scalar-or-vector zeros: scalar in CUDA, vector or scalar in C++ -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ inline __host__ __device__ cxtype cxzero_sv(){ return cxtype( 0, 0 ); } #elif defined MGONGPU_CPPSIMD inline cxtype_v cxzero_sv() { return cxtype_v(); } // RRRR=0000 IIII=0000 diff --git a/epochX/cudacpp/gq_ttq.mad/src/rambo.h b/epochX/cudacpp/gq_ttq.mad/src/rambo.h index e02ea52496..cd7e1008ea 100644 --- a/epochX/cudacpp/gq_ttq.mad/src/rambo.h +++ b/epochX/cudacpp/gq_ttq.mad/src/rambo.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -18,7 +18,7 @@ #include // Simplified rambo version for 2 to N (with N>=2) processes with massless particles -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +83,7 @@ namespace mg5amcCpu static bool first = true; if( first ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if constexpr( M_ACCESS::isOnDevice() ) // avoid { const int ievt0 = 0; @@ -166,7 +166,7 @@ namespace mg5amcCpu wt = po2log; if( nparf != 2 ) wt = ( 2. * nparf - 4. ) * log( energy ) + z[nparf - 1]; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // issue warnings if weight is too small or too large static int iwarn[5] = { 0, 0, 0, 0, 0 }; if( wt < -180. ) diff --git a/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt b/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt index f659f6bb8d..5a07808142 100644 --- a/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt +++ b/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt @@ -52,7 +52,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt -No valid web browser found. Please set in ./input/mg5_configuration.txt +Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq.mg The import format was not given, so we guess it as command set stdout_level DEBUG @@ -61,7 +61,7 @@ set zerowidth_tchannel F define q = u c d s u~ c~ d~ s~ INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.0054836273193359375  +DEBUG: model prefixing takes 0.005926370620727539  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -170,7 +170,7 @@ INFO: Crossed process found for g u~ > t t~ u~, reuse diagrams. 
INFO: Crossed process found for g c~ > t t~ c~, reuse diagrams. INFO: Crossed process found for g d~ > t t~ d~, reuse diagrams. INFO: Crossed process found for g s~ > t t~ s~, reuse diagrams. -8 processes with 40 diagrams generated in 0.080 s +8 processes with 40 diagrams generated in 0.082 s Total: 8 processes with 40 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gq_ttq Load PLUGIN.CUDACPP_OUTPUT @@ -211,7 +211,7 @@ Generated helas calls for 2 subprocesses (10 diagrams) in 0.031 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVV1 routines -ALOHA: aloha creates 2 routines in 0.146 s +ALOHA: aloha creates 2 routines in 0.179 s FFV1 FFV1 FFV1 @@ -227,7 +227,7 @@ INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/. quit -real 0m0.709s -user 0m0.586s -sys 0m0.064s -Code generation completed in 0 seconds +real 0m1.076s +user 0m0.601s +sys 0m0.061s +Code generation completed in 1 seconds diff --git a/epochX/cudacpp/gq_ttq.sa/COPYRIGHT b/epochX/cudacpp/gq_ttq.sa/COPYRIGHT index a134b5fef9..84a883fbb0 100644 --- a/epochX/cudacpp/gq_ttq.sa/COPYRIGHT +++ b/epochX/cudacpp/gq_ttq.sa/COPYRIGHT @@ -15,6 +15,7 @@ The full development team currently includes the following authors : Stephan Hageboeck (CERN) Olivier Mattelaer (Universite Catholique de Louvain, original author) Stefan Roiser (CERN, original author) + Joergen Teig (CERN) Andrea Valassi (CERN, original author) Zenny Wettersten (CERN) See https://github.com/madgraph5/madgraph4gpu for more details. For the full diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/Bridge.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/Bridge.h index bf8b5e024d..89437b4c42 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/Bridge.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#ifndef BRIDGE_H #define BRIDGE_H 1 @@ -23,7 +23,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +83,7 @@ namespace mg5amcCpu Bridge& operator=( const Bridge& ) = delete; Bridge& operator=( Bridge&& ) = delete; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL /** * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != gpublocks*gputhreads * (this is needed for BridgeKernel tests rather than for actual production use in Fortran) @@ -150,7 +150,7 @@ namespace mg5amcCpu unsigned int m_nevt; // number of events int m_nGoodHel; // the number of good helicities (-1 initially when they have not yet been calculated) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL int m_gputhreads; // number of gpu threads (default set from number of events, can be modified) int m_gpublocks; // number of gpu blocks (default set from number of events, can be modified) DeviceBuffer m_devMomentaF; @@ -187,12 +187,12 @@ namespace mg5amcCpu // Forward declare transposition methods // -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL template void hst_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); @@ -209,7 +209,7 @@ namespace mg5amcCpu Bridge::Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ) : m_nevt( nevtF ) , m_nGoodHel( -1 ) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL , m_gputhreads( 256 ) // default number of gpu threads , m_gpublocks( m_nevt / m_gputhreads ) // this ensures m_nevt <= m_gpublocks*m_gputhreads , m_devMomentaF( m_nevt ) @@ -233,7 +233,7 @@ namespace mg5amcCpu { if( nparF != CPPProcess::npar ) throw std::runtime_error( "Bridge constructor: npar mismatch" ); if( np4F != CPPProcess::np4 ) throw std::runtime_error( "Bridge constructor: np4 mismatch" ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( ( m_nevt < s_gputhreadsmin ) || ( m_nevt % s_gputhreadsmin != 0 ) ) throw std::runtime_error( "Bridge constructor: nevt should be a multiple of " + std::to_string( s_gputhreadsmin ) ); while( m_nevt != m_gpublocks * m_gputhreads ) @@ -249,7 +249,7 @@ namespace mg5amcCpu #else std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate is called from several Fortran threads? 
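The Bridge constructor logic shown above enforces that nevt is a multiple of s_gputhreadsmin and then adjusts the default grid until gpublocks*gputhreads == nevt. A standalone sketch of that sizing rule follows; the halving of gputhreads inside the while loop is an assumption (the loop body is elided by the hunk above), and the s_gputhreadsmin value used here is hypothetical.

#include <iostream>
#include <stdexcept>
#include <string>
void setGpuGrid( const unsigned int nevt, int& gpublocks, int& gputhreads )
{
  constexpr int s_gputhreadsmin = 16; // hypothetical minimum; the real constant lives in Bridge.h
  gputhreads = 256;                   // default number of gpu threads, as in the Bridge constructor
  if( ( nevt < (unsigned int)s_gputhreadsmin ) || ( nevt % s_gputhreadsmin != 0 ) )
    throw std::runtime_error( "nevt should be a multiple of " + std::to_string( s_gputhreadsmin ) );
  while( nevt % gputhreads != 0 ) gputhreads /= 2; // assumed: halve until gputhreads divides nevt
  gpublocks = nevt / gputhreads;                   // now gpublocks * gputhreads == nevt exactly
}
int main()
{
  int blocks, threads;
  setGpuGrid( 1040, blocks, threads ); // 1040 = 65 * 16: threads halve from 256 down to 16
  std::cout << "gpublocks=" << blocks << " gputhreads=" << threads << std::endl;
  return 0;
}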
@@ -262,7 +262,7 @@ namespace mg5amcCpu process.initProc( paramCard ); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void Bridge::set_gpugrid( const int gpublocks, const int gputhreads ) { @@ -276,7 +276,7 @@ namespace mg5amcCpu } #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void Bridge::gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -291,14 +291,14 @@ namespace mg5amcCpu constexpr int neppM = MemoryAccessMomenta::neppM; if constexpr( neppM == 1 && std::is_same_v ) { - checkCuda( cudaMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), gpuMemcpyHostToDevice ); } else { - checkCuda( cudaMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), gpuMemcpyHostToDevice ); const int thrPerEvt = CPPProcess::npar * CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 event per thread) //const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... this seems slower - dev_transposeMomentaF2C<<<m_gpublocks * thrPerEvt, m_gputhreads>>>( m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); + gpuLaunchKernel( dev_transposeMomentaF2C, m_gpublocks * thrPerEvt, m_gputhreads, m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); } if constexpr( std::is_same_v ) { @@ -341,7 +341,7 @@ namespace mg5amcCpu } #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL template void Bridge::cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -396,7 +396,7 @@ namespace mg5amcCpu // - C++ array: momenta[npagM][npar][np4][neppM] with nevt=npagM*neppM (AOSOA) // -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ) { diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/BridgeKernels.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/BridgeKernels.cc index d58066c9c1..eaf4037a24 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/BridgeKernels.cc +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/BridgeKernels.cc @@ -1,17 +1,18 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "BridgeKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMomenta.h" #include //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -45,7 +46,7 @@ namespace mg5amcCpu
// Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef BRIDGEKERNELS_H #define BRIDGEKERNELS_H 1 @@ -12,7 +12,7 @@ #include "MatrixElementKernels.h" #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -49,7 +49,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a CPU host class BridgeKernelHost final : public BridgeKernelBase { @@ -89,7 +89,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a GPU device class BridgeKernelDevice : public BridgeKernelBase { diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/CommonRandomNumberKernel.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/CommonRandomNumberKernel.cc index 985b39f576..010bc4cbd0 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/CommonRandomNumberKernel.cc +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/CommonRandomNumberKernel.cc @@ -1,15 +1,16 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "CommonRandomNumbers.h" +#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/CrossSectionKernels.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/CrossSectionKernels.cc index 0b355a3c8d..c15b39844d 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/CrossSectionKernels.cc +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/CrossSectionKernels.cc @@ -1,10 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
#include "CrossSectionKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessWeights.h" #include "MemoryBuffers.h" @@ -77,7 +78,7 @@ debug_me_is_abnormal( const fptype& me, size_t ievtALL ) //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -185,7 +186,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/CrossSectionKernels.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/CrossSectionKernels.h index 7933ca4bbf..4d9659e04e 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/CrossSectionKernels.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/CrossSectionKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef CROSSSECTIONKERNELS_H #define CROSSSECTIONKERNELS_H 1 @@ -13,7 +13,7 @@ //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -96,7 +96,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating the calculation of event statistics on a GPU device class CrossSectionKernelDevice : public CrossSectionKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/CudaRuntime.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/CudaRuntime.h deleted file mode 100644 index 64ce52f4b3..0000000000 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/CudaRuntime.h +++ /dev/null @@ -1,85 +0,0 @@ -// Copyright (C) 2020-2023 CERN and UCLouvain. -// Licensed under the GNU Lesser General Public License (version 3 or later). -// Created by: S. Roiser (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. - -#ifndef MG5AMC_CUDARUNTIME_H -#define MG5AMC_CUDARUNTIME_H 1 - -// MG5AMC on GPU uses the CUDA runtime API, not the lower level CUDA driver API -// See https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html#driver-vs-runtime-api - -#include -#include - -//-------------------------------------------------------------------------- - -// See https://stackoverflow.com/a/14038590 -#ifdef __CUDACC__ /* clang-format off */ -#define checkCuda( code ) { assertCuda( code, __FILE__, __LINE__ ); } -inline void assertCuda( cudaError_t code, const char* file, int line, bool abort = true ) -{ - if( code != cudaSuccess ) - { - printf( "ERROR! 
assertCuda: '%s' (%d) in %s:%d\n", cudaGetErrorString( code ), code, file, line ); - if( abort ) assert( code == cudaSuccess ); - } -} -#endif /* clang-format on */ - -//-------------------------------------------------------------------------- - -#ifdef __CUDACC__ -namespace mg5amcGpu -{ - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - // *** FIXME! This will all need to be designed differently when going to multi-GPU nodes! *** - struct CudaRuntime final - { - CudaRuntime( const bool debug = true ) - : m_debug( debug ) { setUp( m_debug ); } - ~CudaRuntime() { tearDown( m_debug ); } - CudaRuntime( const CudaRuntime& ) = delete; - CudaRuntime( CudaRuntime&& ) = delete; - CudaRuntime& operator=( const CudaRuntime& ) = delete; - CudaRuntime& operator=( CudaRuntime&& ) = delete; - bool m_debug; - - // Set up CUDA application - // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** - // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization - static void setUp( const bool debug = true ) - { - // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization - // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! - /* - // [We initially added cudaFree(0) to "ease profile analysis" only because it shows up as a big recognizable block!] - // No explicit initialization is needed: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#initialization - // It is not clear what cudaFree(0) does at all: https://stackoverflow.com/questions/69967813/ - if ( debug ) std::cout << "__CudaRuntime: calling cudaFree(0)" << std::endl; - checkCuda( cudaFree( 0 ) ); // SLOW! - */ - // Replace cudaFree(0) by cudaSetDevice(0), even if it is not really needed either - // (but see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs) - if( debug ) std::cout << "__CudaRuntime: calling cudaSetDevice(0)" << std::endl; - checkCuda( cudaSetDevice( 0 ) ); // SLOW! - } - - // Tear down CUDA application (call cudaDeviceReset) - // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** - // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck - // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking - static void tearDown( const bool debug = true ) - { - if( debug ) std::cout << "__CudaRuntime: calling cudaDeviceReset()" << std::endl; - checkCuda( cudaDeviceReset() ); - } - }; - -} -#endif - -//-------------------------------------------------------------------------- - -#endif // MG5AMC_CUDARUNTIME_H diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/CurandRandomNumberKernel.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/CurandRandomNumberKernel.cc index eb56333b03..08a16f6f2c 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/CurandRandomNumberKernel.cc +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/CurandRandomNumberKernel.cc @@ -1,9 +1,9 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. 
Valassi (2021-2023) for the MG5aMC CUDACPP plugin. -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" @@ -22,7 +22,7 @@ inline void assertCurand( curandStatus_t code, const char *file, int line, bool } #endif /* clang-format on */ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -36,7 +36,7 @@ namespace mg5amcCpu { if( m_isOnDevice ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !m_rnarray.isOnDevice() ) throw std::runtime_error( "CurandRandomNumberKernel on device with a host random number array" ); #else @@ -114,7 +114,7 @@ namespace mg5amcCpu /* printf( "\nCurandRandomNumberKernel::generateRnarray size = %d\n", (int)m_rnarray.size() ); fptype* data = m_rnarray.data(); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( m_rnarray.isOnDevice() ) { data = new fptype[m_rnarray.size()](); @@ -123,7 +123,7 @@ namespace mg5amcCpu #endif for( int i = 0; i < ( (int)m_rnarray.size() / 4 ); i++ ) printf( "[%4d] %f %f %f %f\n", i * 4, data[i * 4], data[i * 4 + 2], data[i * 4 + 2], data[i * 4 + 3] ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( m_rnarray.isOnDevice() ) delete[] data; #endif */ diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/EventStatistics.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/EventStatistics.h index 48b51e0a49..b425a5bade 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/EventStatistics.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/EventStatistics.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef EventStatistics_H #define EventStatistics_H 1 @@ -16,7 +16,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/GpuAbstraction.h new file mode 100644 index 0000000000..6a7d9c05c0 --- /dev/null +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/GpuAbstraction.h @@ -0,0 +1,71 @@ +// Copyright (C) 2020-2023 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: J. Teig (Jul 2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
+ +#ifndef MG5AMC_GPUABSTRACTION_H +#define MG5AMC_GPUABSTRACTION_H 1 + +#include + +//-------------------------------------------------------------------------- + +#ifdef __CUDACC__ + +#define gpuError_t cudaError_t +#define gpuPeekAtLastError cudaPeekAtLastError +#define gpuGetErrorString cudaGetErrorString +#define gpuSuccess cudaSuccess + +#define gpuMallocHost( ptr, size ) checkGpu( cudaMallocHost( ptr, size ) ) +#define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) ) + +#define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( cudaMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemcpyHostToDevice cudaMemcpyHostToDevice +#define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost +#define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( cudaMemcpyToSymbol( type1, type2, size ) ) + +#define gpuFree( ptr ) checkGpu( cudaFree( ptr ) ) +#define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) ) + +#define gpuSetDevice cudaSetDevice +#define gpuDeviceSynchronize cudaDeviceSynchronize +#define gpuDeviceReset cudaDeviceReset + +#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ ) +#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ ) + +//-------------------------------------------------------------------------- + +#elif defined __HIPCC__ + +#include "hip/hip_runtime.h" + +#define gpuError_t hipError_t +#define gpuPeekAtLastError hipPeekAtLastError +#define gpuGetErrorString hipGetErrorString +#define gpuSuccess hipSuccess + +#define gpuMallocHost( ptr, size ) checkGpu( hipHostMalloc( ptr, size ) ) // NB: hipHostMalloc replaces the deprecated hipMallocHost +#define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) ) + +#define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( hipMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemcpyHostToDevice hipMemcpyHostToDevice +#define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost +#define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( hipMemcpyToSymbol( type1, type2, size ) ) + +#define gpuFree( ptr ) checkGpu( hipFree( ptr ) ) +#define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) ) + +#define gpuSetDevice hipSetDevice +#define gpuDeviceSynchronize hipDeviceSynchronize +#define gpuDeviceReset hipDeviceReset + +#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ ) +#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ ) + +//-------------------------------------------------------------------------- + +#endif + +#endif // MG5AMC_GPUABSTRACTION_H diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/GpuRuntime.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/GpuRuntime.h new file mode 100644 index 0000000000..93579ef08b --- /dev/null +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/GpuRuntime.h @@ -0,0 +1,85 @@ +// Copyright (C) 2020-2023 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: J. Teig (Jun 2023, based on earlier work by S. Roiser) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
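A minimal usage sketch for the GpuAbstraction.h macros above (illustrative, not part of the patch): the same source compiles with nvcc or hipcc, assuming mgOnGpuConfig.h defines MGONGPUCPP_GPUIMPL and GpuRuntime.h (below) provides the checkGpu helper. The 'scale' kernel is hypothetical.

#include "mgOnGpuConfig.h"   // defines MGONGPUCPP_GPUIMPL under nvcc/hipcc
#include "GpuAbstraction.h"
#include "GpuRuntime.h"      // provides checkGpu/assertGpu (see below)

__global__ void scale( double* data, const double factor ) // hypothetical kernel
{
  const int i = blockIdx.x * blockDim.x + threadIdx.x;
  data[i] *= factor;
}

int main()
{
  const int nblocks = 4, nthreads = 256, n = nblocks * nthreads;
  double* hst = nullptr;
  double* dev = nullptr;
  gpuMallocHost( (void**)&hst, n * sizeof( double ) ); // cudaMallocHost or hipHostMalloc
  gpuMalloc( (void**)&dev, n * sizeof( double ) );     // cudaMalloc or hipMalloc
  for( int i = 0; i < n; i++ ) hst[i] = 1.;
  gpuMemcpy( dev, hst, n * sizeof( double ), gpuMemcpyHostToDevice ); // the macro already wraps checkGpu
  gpuLaunchKernel( scale, nblocks, nthreads, dev, 2. ); // expands to scale<<<nblocks, nthreads>>>( dev, 2. )
  checkGpu( gpuPeekAtLastError() );
  checkGpu( gpuDeviceSynchronize() );
  gpuMemcpy( hst, dev, n * sizeof( double ), gpuMemcpyDeviceToHost );
  gpuFree( dev );
  gpuFreeHost( hst );
  return 0;
}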
+ +#ifndef MG5AMC_GPURUNTIME_H +#define MG5AMC_GPURUNTIME_H 1 + +// MG5AMC on GPU uses the CUDA runtime API, not the lower level CUDA driver API +// See https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html#driver-vs-runtime-api + +#include "GpuAbstraction.h" + +#include + +//-------------------------------------------------------------------------- + +// See https://stackoverflow.com/a/14038590 +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#define checkGpu( code ) { assertGpu( code, __FILE__, __LINE__ ); } +inline void assertGpu( gpuError_t code, const char* file, int line, bool abort = true ) +{ + if( code != gpuSuccess ) + { + printf( "ERROR! assertGpu: '%s' (%d) in %s:%d\n", gpuGetErrorString( code ), code, file, line ); + if( abort ) assert( code == gpuSuccess ); + } +} +#endif /* clang-format on */ + +//-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +{ + // Instantiate a GpuRuntime at the beginning of the application's main to + // invoke gpuSetDevice(0) in the constructor and book a gpuDeviceReset() call in the destructor + // *** FIXME! This will all need to be designed differently when going to multi-GPU nodes! *** + struct GpuRuntime final + { + GpuRuntime( const bool debug = true ) + : m_debug( debug ) { setUp( m_debug ); } + ~GpuRuntime() { tearDown( m_debug ); } + GpuRuntime( const GpuRuntime& ) = delete; + GpuRuntime( GpuRuntime&& ) = delete; + GpuRuntime& operator=( const GpuRuntime& ) = delete; + GpuRuntime& operator=( GpuRuntime&& ) = delete; + bool m_debug; + + // Set up CUDA application + // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** + // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization + static void setUp( const bool debug = true ) + { + // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization + // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! + /* + // [We initially added cudaFree(0) to "ease profile analysis" only because it shows up as a big recognizable block!] + // No explicit initialization is needed: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#initialization + // It is not clear what cudaFree(0) does at all: https://stackoverflow.com/questions/69967813/ + if ( debug ) std::cout << "__CudaRuntime: calling cudaFree(0)" << std::endl; + checkCuda( cudaFree( 0 ) ); // SLOW! + */ + // Replace cudaFree(0) by cudaSetDevice(0), even if it is not really needed either + // (but see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs) + if( debug ) std::cout << "__GpuRuntime: calling gpuSetDevice(0)" << std::endl; + checkGpu( gpuSetDevice( 0 ) ); // SLOW!
+ } + + // Tear down CUDA application (call cudaDeviceReset) + // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** + // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck + // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking + static void tearDown( const bool debug = true ) + { + if( debug ) std::cout << "__GpuRuntime: calling GpuDeviceReset()" << std::endl; + checkGpu( gpuDeviceReset() ); + } + }; +} +#endif + +//-------------------------------------------------------------------------- + +#endif // MG5AMC_GPURUNTIME_H diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MadgraphTest.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MadgraphTest.h index ef40624c88..a64c05c26a 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MadgraphTest.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MadgraphTest.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Dec 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #ifndef MADGRAPHTEST_H_ #define MADGRAPHTEST_H_ 1 @@ -22,7 +22,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; @@ -201,7 +201,7 @@ class MadgraphTest : public testing::TestWithParam // Since we link both the CPU-only and GPU tests into the same executable, we prevent // a multiply defined symbol by only compiling this in the non-CUDA phase: -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL /// Compare momenta and matrix elements. /// This uses an implementation of TestDriverBase to run a madgraph workflow, @@ -307,6 +307,6 @@ TEST_P( MadgraphTest, CompareMomentaAndME ) } } -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL #endif /* MADGRAPHTEST_H_ */ diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MatrixElementKernels.cc index 74b5239ebf..81699dfea9 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MatrixElementKernels.cc @@ -1,12 +1,12 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "MatrixElementKernels.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" // Includes the abstraction for Nvidia/AMD compilation #include "MemoryAccessMomenta.h" #include "MemoryBuffers.h" @@ -14,7 +14,7 @@ //============================================================================ -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu { @@ -150,7 +150,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { @@ -209,13 +209,13 @@ namespace mg5amcGpu PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); DeviceBufferHelicityMask devIsGoodHel( ncomb ); // ... 0d1. 
Compute good helicity mask on the device
-    computeDependentCouplings<<<m_gpublocks, m_gputhreads>>>( m_gs.data(), m_couplings.data() );
+    gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    sigmaKin_getGoodHel<<<m_gpublocks, m_gputhreads>>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() );
+    gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() );
 #else
-    sigmaKin_getGoodHel<<<m_gpublocks, m_gputhreads>>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() );
+    gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() );
 #endif
-    checkCuda( cudaPeekAtLastError() );
+    checkGpu( gpuPeekAtLastError() );
     // ... 0d2. Copy back good helicity mask to the host
     copyHostFromDevice( hstIsGoodHel, devIsGoodHel );
     // ... 0d3. Copy back good helicity list to constant memory on the device
@@ -226,19 +226,19 @@
   void MatrixElementKernelDevice::computeMatrixElements( const unsigned int channelId )
   {
-    computeDependentCouplings<<<m_gpublocks, m_gputhreads>>>( m_gs.data(), m_couplings.data() );
+    gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() );
 #ifndef MGONGPU_NSIGHT_DEBUG
     constexpr unsigned int sharedMemSize = 0;
 #else
     constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float );
 #endif
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    sigmaKin<<<m_gpublocks, m_gputhreads, sharedMemSize>>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() );
+    gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() );
 #else
-    sigmaKin<<<m_gpublocks, m_gputhreads, sharedMemSize>>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() );
+    gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() );
 #endif
-    checkCuda( cudaPeekAtLastError() );
-    checkCuda( cudaDeviceSynchronize() );
+    checkGpu( gpuPeekAtLastError() );
+    checkGpu( gpuDeviceSynchronize() );
   }
 
   //--------------------------------------------------------------------------
diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MatrixElementKernels.h
index 23e84757a2..72bd8f195b 100644
--- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MatrixElementKernels.h
+++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MatrixElementKernels.h
@@ -1,7 +1,7 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
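[Aside, not part of the patch: every launch-site change in MatrixElementKernels.cc above follows one mechanical pattern, so the kernels themselves stay untouched. Sketch with a hypothetical kernel k and arguments a1, a2:]

    k<<<blocks, threads>>>( a1, a2 );                                       // CUDA-only syntax (before)
    gpuLaunchKernel( k, blocks, threads, a1, a2 );                          // portable macro (after)

    k<<<blocks, threads, sharedMemBytes>>>( a1, a2 );                       // CUDA-only, with dynamic shared memory (before)
    gpuLaunchKernelSharedMem( k, blocks, threads, sharedMemBytes, a1, a2 ); // portable macro (after)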
#ifndef MATRIXELEMENTKERNELS_H #define MATRIXELEMENTKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -81,7 +81,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a CPU host class MatrixElementKernelHost final : public MatrixElementKernelBase, public NumberOfEvents { @@ -130,7 +130,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a GPU device class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessAmplitudes.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessAmplitudes.h index 573b3bbbc9..ffb76e93de 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessAmplitudes.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessAmplitudes.h @@ -15,7 +15,7 @@ #define MGONGPU_TRIVIAL_AMPLITUDES 1 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessCouplings.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessCouplings.h index 35a3af42e0..3afdf3e554 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessCouplings.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessCouplings.h @@ -15,7 +15,7 @@ #include "MemoryBuffers.h" // for HostBufferCouplings::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessCouplingsFixed.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessCouplingsFixed.h index dc0d93afff..ffcdf4dbef 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessCouplingsFixed.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessCouplingsFixed.h @@ -14,7 +14,7 @@ //#include "MemoryAccessHelpers.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessDenominators.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessDenominators.h index 3bce635718..66f2d32a6b 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessDenominators.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessDenominators.h @@ -10,7 +10,7 @@ #include "MemoryAccessGs.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessGs.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessGs.h index 31311aa375..4c726b30f3 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessGs.h +++ 
b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessGs.h @@ -13,7 +13,7 @@ #include "MemoryBuffers.h" // for HostBufferMatrixElements::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessHelpers.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessHelpers.h index c82a6c7635..db73e4e064 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessHelpers.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessHelpers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessHelpers_H #define MemoryAccessHelpers_H 1 @@ -105,7 +105,7 @@ class KernelAccessHelper : public MemoryAccessHelper } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid //printf( "kernelAccessRecord: ievt=%d threadId=%d\n", ievt, threadIdx.x ); return T::ieventAccessRecord( buffer, ievt ); // NB fptype and fptype_sv coincide for CUDA diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessMatrixElements.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessMatrixElements.h index f32e6fea5b..3741011971 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessMatrixElements.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessMatrixElements.h @@ -13,7 +13,7 @@ #include "MemoryBuffers.h" // for HostBufferMatrixElements::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessMomenta.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessMomenta.h index 29266de32c..3be229d392 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessMomenta.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessMomenta.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
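[Aside, not part of the patch: the guard renamed throughout these MemoryAccess* headers is the one that routes each header's types into the GPU or CPU namespace; schematically:]

    #ifdef MGONGPUCPP_GPUIMPL // set for both CUDA (__CUDACC__) and HIP (__HIPCC__) builds
    namespace mg5amcGpu
    #else
    namespace mg5amcCpu // C++ build, possibly with SIMD vectorization
    #endif
    {
      // ... types defined in different ways for CPU and GPU builds (see #318 and #725) ...
    }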
#ifndef MemoryAccessMomenta_H #define MemoryAccessMomenta_H 1 @@ -13,7 +13,7 @@ #include "MemoryAccessVectors.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -30,7 +30,7 @@ namespace mg5amcCpu // Number of Events Per Page in the momenta AOSOA memory buffer layout // (these are all best kept as a compile-time constants: see issue #23) -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ // ----------------------------------------------------------------------------------------------- // --- GPUs: neppM is best set to a power of 2 times the number of fptype's in a 32-byte cacheline // --- This is relevant to ensure coalesced access to momenta in global memory diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessNumerators.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessNumerators.h index b152183b28..18991f4fa6 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessNumerators.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessNumerators.h @@ -10,7 +10,7 @@ #include "MemoryAccessGs.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessRandomNumbers.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessRandomNumbers.h index e2988d39f3..40cb089135 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessRandomNumbers.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessRandomNumbers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessRandomNumbers_H #define MemoryAccessRandomNumbers_H 1 @@ -11,7 +11,7 @@ #include "CPPProcess.h" #include "MemoryAccessHelpers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessVectors.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessVectors.h index e9b197368e..08faccff0f 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessVectors.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessVectors.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
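[Aside, not part of the patch: a sketch of the AOSOA indexing behind the neppM discussion in MemoryAccessMomenta.h above, with momenta stored as AOSOA[npagM][npar][np4][neppM]; the index names and formula are illustrative of the scheme, not copied from the file.]

    const int ipagM = ievt / neppM; // AOSOA page index
    const int ieppM = ievt % neppM; // event index within the page
    // ip4-th component of the ipar-th particle's momentum for event ievt:
    const fptype& p = momenta[ipagM * npar * np4 * neppM + ipar * np4 * neppM + ip4 * neppM + ieppM];
    // on GPUs, consecutive ievt values (consecutive threads) then read consecutive words: coalesced access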
#ifndef MemoryAccessVectors_H
#define MemoryAccessVectors_H 1
@@ -10,7 +10,7 @@
 #include "mgOnGpuVectors.h"
 
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
 namespace mg5amcCpu // this is only needed for CPU SIMD vectorization
 {
diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessWavefunctions.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessWavefunctions.h
index 5428aaf933..33bef4559e 100644
--- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessWavefunctions.h
+++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessWavefunctions.h
@@ -15,7 +15,7 @@
 #define MGONGPU_TRIVIAL_WAVEFUNCTIONS 1
 
 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725)
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 namespace mg5amcGpu
 #else
 namespace mg5amcCpu
diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryBuffers.h
index 3093e6ed18..7756a71621 100644
--- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryBuffers.h
+++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryBuffers.h
@@ -1,7 +1,7 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Dec 2021, based on earlier work by S. Hageboeck) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.
 
 #ifndef MemoryBuffers_H
 #define MemoryBuffers_H 1
@@ -11,12 +11,12 @@
 #include "mgOnGpuCxtypes.h"
 
 #include "CPPProcess.h"
-#include "CudaRuntime.h"
+#include "GpuRuntime.h"
 #include "Parameters_sm.h"
 
 #include <sstream>
 
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 namespace mg5amcGpu
 #else
 namespace mg5amcCpu
@@ -87,7 +87,7 @@ namespace mg5amcCpu
 
   //--------------------------------------------------------------------------
 
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
   constexpr bool HostBufferALIGNED = false;   // ismisaligned=false
   constexpr bool HostBufferMISALIGNED = true; // ismisaligned=true
 
@@ -119,7 +119,7 @@
 
   //--------------------------------------------------------------------------
 
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
   // A class encapsulating a CUDA pinned host buffer
  template<typename T>
  class PinnedHostBufferBase : public BufferBase<T>
@@ -128,18 +128,18 @@ namespace mg5amcCpu
     PinnedHostBufferBase( const size_t size )
       : BufferBase<T>( size, false )
     {
-      checkCuda( cudaMallocHost( &( this->m_data ), this->bytes() ) );
+      gpuMallocHost( &( this->m_data ), this->bytes() );
     }
     virtual ~PinnedHostBufferBase()
     {
-      checkCuda( cudaFreeHost( this->m_data ) );
+      gpuFreeHost( this->m_data );
     }
   };
 #endif
 
   //--------------------------------------------------------------------------
 
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
   // A class encapsulating a CUDA device buffer
  template<typename T>
  class DeviceBufferBase : public BufferBase<T>
@@ -148,18 +148,18 @@ namespace mg5amcCpu
     DeviceBufferBase( const size_t size )
       : BufferBase<T>( size, true )
     {
-      checkCuda( cudaMalloc( &( this->m_data ), this->bytes() ) );
+      gpuMalloc( &( this->m_data ), this->bytes() );
     }
     virtual ~DeviceBufferBase()
     {
-      checkCuda( cudaFree( this->m_data ) );
+      gpuFree( this->m_data );
     }
   };
 #endif
 
   //--------------------------------------------------------------------------
 
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
   // A class encapsulating a C++ host buffer for a given number of events
  template<typename T, size_t sizePerEvent, bool ismisaligned>
  class HostBuffer
: public HostBufferBase<T, ismisaligned>, virtual private NumberOfEvents
@@ -175,7 +175,7 @@ namespace mg5amcCpu
 
   //--------------------------------------------------------------------------
 
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
   // A class encapsulating a CUDA pinned host buffer for a given number of events
  template<typename T, size_t sizePerEvent>
  class PinnedHostBuffer : public PinnedHostBufferBase<T>, virtual private NumberOfEvents
@@ -191,7 +191,7 @@ namespace mg5amcCpu
 
   //--------------------------------------------------------------------------
 
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
   // A class encapsulating a CUDA device buffer for a given number of events
  template<typename T, size_t sizePerEvent>
  class DeviceBuffer : public DeviceBufferBase<T>, virtual private NumberOfEvents
@@ -213,7 +213,7 @@ namespace mg5amcCpu
   // The size (number of elements) per event in a memory buffer for momenta random numbers
   constexpr size_t sizePerEventRndNumMomenta = MemoryBuffers::np4 * MemoryBuffers::nparf;
 
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
   // A class encapsulating a C++ host buffer for momenta random numbers
   typedef HostBuffer<fptype, sizePerEventRndNumMomenta, HostBufferALIGNED> HostBufferRndNumMomenta;
 #else
@@ -232,7 +232,7 @@ namespace mg5amcCpu
   // The size (number of elements) per event in a memory buffer with ONE fptype per event
   constexpr size_t sizePerEventOneFp = 1;
 
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
   // A class encapsulating a C++ host buffer with ONE fptype per event
   typedef HostBuffer<fptype, sizePerEventOneFp, HostBufferALIGNED> HostBufferOneFp;
 #else
@@ -257,7 +257,7 @@ namespace mg5amcCpu
   // The size (number of elements) per event in a memory buffer for Gs
   constexpr size_t sizePerEventGs = 1;
 
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
   // A class encapsulating a C++ host buffer for gs
   typedef HostBuffer<fptype, sizePerEventGs, HostBufferALIGNED> HostBufferGs;
 #else
@@ -276,7 +276,7 @@ namespace mg5amcCpu
   // The size (number of elements) per event in a memory buffer for numerators
   constexpr size_t sizePerEventNumerators = 1;
 
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
   // A class encapsulating a C++ host buffer for gs
   typedef HostBuffer<fptype, sizePerEventNumerators, HostBufferALIGNED> HostBufferNumerators;
 #else
@@ -296,7 +296,7 @@ namespace mg5amcCpu
   // The size (number of elements) per event in a memory buffer for denominators
   constexpr size_t sizePerEventDenominators = 1;
 
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
   // A class encapsulating a C++ host buffer for gs
   typedef HostBuffer<fptype, sizePerEventDenominators, HostBufferALIGNED> HostBufferDenominators;
 #else
@@ -315,7 +315,7 @@ namespace mg5amcCpu
   // The size (number of elements) per event in a memory buffer for random numbers
   constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2;
 
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
   // A class encapsulating a C++ host buffer for gs
   typedef HostBuffer<fptype, sizePerEventCouplings, HostBufferALIGNED> HostBufferCouplings;
 #else
@@ -333,7 +333,7 @@ namespace mg5amcCpu
   // The size (number of elements) per event in a memory buffer for momenta
   constexpr size_t sizePerEventMomenta = MemoryBuffers::np4 * MemoryBuffers::npar;
 
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
   // A class encapsulating a C++ host buffer for momenta
   typedef HostBuffer<fptype, sizePerEventMomenta, HostBufferALIGNED> HostBufferMomenta;
   //typedef HostBuffer<fptype, sizePerEventMomenta, HostBufferMISALIGNED> HostBufferMomenta; // TEST MISALIGNMENT!
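[Aside, not part of the patch: all buffer classes in MemoryBuffers.h follow the same RAII shape, acquiring memory in the constructor through the new gpu* macros and releasing it in the destructor. A condensed, self-contained sketch of that pattern; it is an illustration, not the real class hierarchy.]

    template<typename T>
    class DeviceBufferSketch // illustration only; the real classes are DeviceBufferBase/DeviceBuffer
    {
    public:
      DeviceBufferSketch( const size_t size )
        : m_size( size ) { gpuMalloc( &m_data, size * sizeof( T ) ); } // cudaMalloc or hipMalloc
      ~DeviceBufferSketch() { gpuFree( m_data ); }                     // freed automatically at end of scope
      T* data() { return m_data; }
      size_t size() const { return m_size; }
    private:
      T* m_data = nullptr;
      size_t m_size;
    };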
@@ -352,7 +352,7 @@ namespace mg5amcCpu
   // The size (number of elements) per event in a memory buffer for sampling weights
   constexpr size_t sizePerEventWeights = 1;
 
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
   // A class encapsulating a C++ host buffer for sampling weights
   typedef HostBuffer<fptype, sizePerEventWeights, HostBufferALIGNED> HostBufferWeights;
 #else
@@ -370,7 +370,7 @@ namespace mg5amcCpu
   // The size (number of elements) per event in a memory buffer for matrix elements
   constexpr size_t sizePerEventMatrixElements = 1;
 
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
   // A class encapsulating a C++ host buffer for matrix elements
   typedef HostBuffer<fptype, sizePerEventMatrixElements, HostBufferALIGNED> HostBufferMatrixElements;
 #else
@@ -385,7 +385,7 @@ namespace mg5amcCpu
   // A base class encapsulating a memory buffer for the helicity mask
   typedef BufferBase<bool> BufferHelicityMask;
 
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
   // A class encapsulating a C++ host buffer for the helicity mask
   typedef HostBufferBase<bool, HostBufferALIGNED> HostBufferHelicityMask;
 #else
@@ -403,7 +403,7 @@ namespace mg5amcCpu
   // The size (number of elements) per event in a memory buffer for wavefunctions
   constexpr size_t sizePerEventWavefunctions = MemoryBuffers::nw6 * MemoryBuffers::nx2;
 
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
   // A class encapsulating a C++ host buffer for wavefunctions
   typedef HostBuffer<fptype, sizePerEventWavefunctions, HostBufferALIGNED> HostBufferWavefunctions;
 #else
@@ -421,7 +421,7 @@ namespace mg5amcCpu
   // The size (number of elements) per event in a memory buffer for helicity random numbers
   constexpr size_t sizePerEventRndNumHelicity = 1;
 
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
   // A class encapsulating a C++ host buffer for helicity random numbers
   typedef HostBuffer<fptype, sizePerEventRndNumHelicity, HostBufferALIGNED> HostBufferRndNumHelicity;
 #else
@@ -439,7 +439,7 @@ namespace mg5amcCpu
   // The size (number of elements) per event in a memory buffer for color random numbers
   constexpr size_t sizePerEventRndNumColor = 1;
 
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
   // A class encapsulating a C++ host buffer for color random numbers
   typedef HostBuffer<fptype, sizePerEventRndNumColor, HostBufferALIGNED> HostBufferRndNumColor;
 #else
@@ -457,7 +457,7 @@ namespace mg5amcCpu
   // The size (number of elements) per event in a memory buffer for helicity selection
   constexpr size_t sizePerEventSelectedHelicity = 1;
 
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
   // A class encapsulating a C++ host buffer for helicity selection
   typedef HostBuffer<int, sizePerEventSelectedHelicity, HostBufferALIGNED> HostBufferSelectedHelicity;
 #else
@@ -475,7 +475,7 @@ namespace mg5amcCpu
   // The size (number of elements) per event in a memory buffer for color selection
   constexpr size_t sizePerEventSelectedColor = 1;
 
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
   // A class encapsulating a C++ host buffer for color selection
   typedef HostBuffer<int, sizePerEventSelectedColor, HostBufferALIGNED> HostBufferSelectedColor;
 #else
@@ -487,7 +487,7 @@ namespace mg5amcCpu
 
   //--------------------------------------------------------------------------
 
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
  template<class Tdst, class Tsrc>
  void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy
  {
@@ -504,13 +504,13 @@ namespace mg5amcCpu
       throw std::runtime_error( sstr.str() );
     }
     // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array
-    checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyHostToDevice ) );
+    gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyHostToDevice );
   }
 #endif
 
   //--------------------------------------------------------------------------
 
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
  template<class Tdst, class Tsrc>
  void copyHostFromDevice( Tdst& dst, const Tsrc& src ) // keep the same order
of arguments as in memcpy { @@ -527,7 +527,7 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyDeviceToHost ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyDeviceToHost ); } #endif diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CPPProcess.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CPPProcess.cc index 90e90b3aa9..c1543791ca 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CPPProcess.cc +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -49,7 +48,7 @@ // Process: g d > t t~ d WEIGHTED<=3 @1 // Process: g s > t t~ s WEIGHTED<=3 @1 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +82,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -93,7 +92,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -121,13 +120,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -154,7 +153,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -190,7 +189,7 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity < 
nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings @@ -203,8 +202,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -336,7 +337,7 @@ namespace mg5amcCpu { 4, 0, 12, 4 }, { 0, 4, 4, 12 } }; // 2-D array[4][4] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -393,7 +394,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -452,7 +453,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -515,8 +516,8 @@ namespace mg5amcCpu { 1, -1, 1, 1, 1 }, { 1, -1, 1, -1, -1 }, { 1, -1, 1, -1, 1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -557,9 +558,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -596,7 +597,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] 
// [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -661,12 +662,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -687,7 +688,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -813,9 +814,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -839,7 +840,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -859,7 +860,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 96 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -873,9 +874,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -903,7 +907,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -1113,7 +1117,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CPPProcess.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CPPProcess.h index bf037c6c28..ce22572055 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CPPProcess.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -110,7 +110,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -123,7 +123,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -153,7 +153,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CudaRuntime.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/GpuAbstraction.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/GpuAbstraction.h new file mode 120000 index 0000000000..72054e19ba --- /dev/null +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/GpuAbstraction.h @@ -0,0 +1 @@ +../GpuAbstraction.h \ No newline at end of file diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/GpuRuntime.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/GpuRuntime.h new file mode 120000 index 0000000000..3920e83be4 --- /dev/null +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/GpuRuntime.h @@ -0,0 +1 @@ +../GpuRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/check_sa.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/check_sa.cc index 
3fbf0ffbee..7cac5ab47b 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/check_sa.cc +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -65,7 +66,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -96,7 +97,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -134,9 +135,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784) + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) +#elif defined __HIPCC__ +#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random #elif defined __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU if build has curand + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #endif @@ -146,10 +149,10 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) 
bool bridge = false;
@@ -177,7 +180,7 @@ main( int argc, char** argv )
     else if( arg == "--curdev" )
     {
 #ifndef __CUDACC__
-      throw std::runtime_error( "CurandDevice is not supported on CPUs" );
+      throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" );
 #elif defined MGONGPU_HAS_NO_CURAND
       throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" );
 #else
@@ -198,7 +201,7 @@ main( int argc, char** argv )
     }
     else if( arg == "--rmbdev" )
     {
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
       rmbsmp = RamboSamplingMode::RamboDevice;
 #else
       throw std::runtime_error( "RamboDevice is not supported on CPUs" );
@@ -272,13 +275,13 @@ main( int argc, char** argv )
     return usage( argv[0] );
   }
 
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
 #ifdef _OPENMP
   ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1)
 #endif
 #endif
 
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
   // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation
   // Note: this prevents a crash on pmpe04 but not on some github CI nodes?
   // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!]
@@ -296,14 +299,14 @@ main( int argc, char** argv )
 
   // === STEP 0 - INITIALISE
 
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 
-  // --- 00. Initialise cuda
-  // Instantiate a CudaRuntime at the beginnining of the application's main to
-  // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor
-  const std::string cdinKey = "00 CudaInit";
+  // --- 00. Initialise GPU
+  // Instantiate a GpuRuntime at the beginning of the application's main.
+  // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor.
+  const std::string cdinKey = "00 GpuInit";
   timermap.start( cdinKey );
-  CudaRuntime cudaRuntime( debug );
+  GpuRuntime GpuRuntime( debug );
 #endif
 
   // --- 0a.
Initialise physics process @@ -325,7 +328,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -333,7 +336,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -341,7 +344,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -349,7 +352,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -366,7 +369,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -375,7 +378,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -384,7 +387,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -392,7 +395,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -400,7 +403,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -438,7 +441,7 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! 
CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement)
 #endif
   }
@@ -450,7 +453,7 @@ main( int argc, char** argv )
     }
     else
     {
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
       prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) );
 #else
       throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement)
@@ -461,7 +464,7 @@ main( int argc, char** argv )
   std::unique_ptr<MatrixElementKernelBase> pmek;
   if( !bridge )
   {
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
     pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) );
 #else
     pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) );
@@ -469,7 +472,7 @@ main( int argc, char** argv )
   }
   else
   {
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
     pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) );
 #else
     pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) );
@@ -511,7 +514,7 @@ main( int argc, char** argv )
     prnk->generateRnarray();
     //std::cout << "Got random numbers" << std::endl;
 
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
     if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice )
     {
       // --- 1c. Copy rndmom from host to device
@@ -543,7 +546,7 @@ main( int argc, char** argv )
     prsk->getMomentaFinal();
     //std::cout << "Got final momenta" << std::endl;
 
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
     if( rmbsmp == RamboSamplingMode::RamboDevice )
     {
       // --- 2c. CopyDToH Weights
@@ -588,7 +591,7 @@ main( int argc, char** argv )
       dynamic_cast<BridgeKernelBase*>( pmek.get() )->transposeInputMomentaC2F();
     }
 
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
     // --- 2d. CopyHToD Momenta
     const std::string gKey = "0.. CpHTDg";
     rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER!
@@ -617,7 +620,7 @@ main( int argc, char** argv )
     wv3atime += timermap.stop(); // calc only
     wavetime += wv3atime;        // calc plus copy
 
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
     if( !bridge )
     {
       // --- 3b. CopyDToH MEs
@@ -760,18 +763,22 @@ main( int argc, char** argv )
     rndgentxt = "CURAND DEVICE";
 #ifdef __CUDACC__
   rndgentxt += " (CUDA code)";
+#elif defined __HIPCC__
+  rndgentxt += " (HIP code)";
 #else
   rndgentxt += " (C++ code)";
 #endif
 
   // Workflow description summary
   std::string wrkflwtxt;
-  // -- CUDA or C++?
+  // -- CUDA or HIP or C++?
 #ifdef __CUDACC__
   wrkflwtxt += "CUD:";
+#elif defined __HIPCC__
+  wrkflwtxt += "HIP:";
 #else
   wrkflwtxt += "CPP:";
-#endif
+#endif /* clang-format off */
   // -- DOUBLE or FLOAT?
 #if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
   wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537)
@@ -781,7 +788,7 @@ main( int argc, char** argv )
   wrkflwtxt += "FLT+";
 #else
   wrkflwtxt += "???+"; // no path to this statement
-#endif
+#endif /* clang-format on */
   // -- CUCOMPLEX or THRUST or STD complex numbers?
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -793,6 +800,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_CUCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -818,7 +831,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -874,7 +887,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -895,6 +908,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -921,21 +936,21 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? " == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -966,7 +981,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1062,14 +1077,14 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1077,7 +1092,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? 
" == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CPPProcess.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CPPProcess.cc index 76c9403933..a9294d1fea 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CPPProcess.cc +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -49,7 +48,7 @@ // Process: g d~ > t t~ d~ WEIGHTED<=3 @1 // Process: g s~ > t t~ s~ WEIGHTED<=3 @1 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +82,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -93,7 +92,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -121,13 +120,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -154,7 +153,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -190,7 +189,7 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity < nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both 
dependent and independent couplings @@ -203,8 +202,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -336,7 +337,7 @@ namespace mg5amcCpu { 4, 0, 12, 4 }, { 0, 4, 4, 12 } }; // 2-D array[4][4] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -393,7 +394,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -452,7 +453,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -515,8 +516,8 @@ namespace mg5amcCpu { 1, 1, 1, 1, -1 }, { 1, 1, 1, -1, 1 }, { 1, 1, 1, -1, -1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -557,9 +558,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -596,7 +597,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] 
// [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -661,12 +662,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -687,7 +688,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -813,9 +814,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -839,7 +840,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -859,7 +860,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 96 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -873,9 +874,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -903,7 +907,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -1113,7 +1117,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CPPProcess.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CPPProcess.h index 0f49f5247b..46c4347506 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CPPProcess.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -110,7 +110,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -123,7 +123,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -153,7 +153,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CudaRuntime.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/GpuAbstraction.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/GpuAbstraction.h new file mode 120000 index 0000000000..72054e19ba --- /dev/null +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/GpuAbstraction.h @@ -0,0 +1 @@ +../GpuAbstraction.h \ No newline at end of file diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/GpuRuntime.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/GpuRuntime.h new file mode 120000 index 0000000000..3920e83be4 --- /dev/null +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/GpuRuntime.h @@ -0,0 +1 @@ +../GpuRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/check_sa.cc 
b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/check_sa.cc index 3fbf0ffbee..7cac5ab47b 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/check_sa.cc +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -65,7 +66,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -96,7 +97,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -134,9 +135,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784) + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) +#elif defined __HIPCC__ +#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random #elif defined __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU if build has curand + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #endif @@ -146,10 +149,10 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) 
bool bridge = false; @@ -177,7 +180,7 @@ main( int argc, char** argv ) else if( arg == "--curdev" ) { #ifndef __CUDACC__ - throw std::runtime_error( "CurandDevice is not supported on CPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" ); #elif defined MGONGPU_HAS_NO_CURAND throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" ); #else @@ -198,7 +201,7 @@ main( int argc, char** argv ) } else if( arg == "--rmbdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -272,13 +275,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -296,14 +299,14 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL - // --- 00. Initialise cuda - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - const std::string cdinKey = "00 CudaInit"; + // --- 00. Initialise GPU + // Instantiate a GpuRuntime at the beginning of the application's main. + // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. + const std::string cdinKey = "00 GpuInit"; timermap.start( cdinKey ); - CudaRuntime cudaRuntime( debug ); + GpuRuntime gpuRuntime( debug ); #endif // --- 0a.
Initialise physics process @@ -325,7 +328,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -333,7 +336,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -341,7 +344,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -349,7 +352,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -366,7 +369,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -375,7 +378,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -384,7 +387,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -392,7 +395,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -400,7 +403,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -438,7 +441,7 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! 
CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } @@ -450,7 +453,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -461,7 +464,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -469,7 +472,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -511,7 +514,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -543,7 +546,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -588,7 +591,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -617,7 +620,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -760,18 +763,22 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; -#endif +#endif /* clang-format off */ // -- DOUBLE or FLOAT? #if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) @@ -781,7 +788,7 @@ main( int argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif +#endif /* clang-format on */ // -- CUCOMPLEX or THRUST or STD complex numbers? 
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -793,6 +800,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_CUCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -818,7 +831,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -874,7 +887,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -895,6 +908,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -921,21 +936,21 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? " == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -966,7 +981,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1062,14 +1077,14 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1077,7 +1092,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? 
" == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/RamboSamplingKernels.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/RamboSamplingKernels.cc index da68aa9255..79abbcc4f8 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/RamboSamplingKernels.cc +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/RamboSamplingKernels.cc @@ -1,11 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "RamboSamplingKernels.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessRandomNumbers.h" #include "MemoryAccessWeights.h" @@ -14,7 +14,7 @@ #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -92,7 +92,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingKernelDevice::RamboSamplingKernelDevice( const fptype energy, // input: energy const BufferRndNumMomenta& rndmom, // input: random numbers in [0,1] BufferMomenta& momenta, // output: momenta @@ -135,7 +135,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaInitialDevice( const fptype energy, fptype* momenta ) @@ -147,17 +147,17 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaInitial() { - getMomentaInitialDevice<<>>( m_energy, m_momenta.data() ); + gpuLaunchKernel( getMomentaInitialDevice, m_gpublocks, m_gputhreads, m_energy, m_momenta.data() ); } #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaFinalDevice( const fptype energy, const fptype* rndmom, @@ -171,11 +171,11 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaFinal() { - getMomentaFinalDevice<<>>( m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); + gpuLaunchKernel( getMomentaFinalDevice, m_gpublocks, m_gputhreads, m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); } #endif diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/RamboSamplingKernels.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/RamboSamplingKernels.h index 184089efd7..7c214cd74b 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/RamboSamplingKernels.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/RamboSamplingKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#ifndef RAMBOSAMPLINGKERNELS_H #define RAMBOSAMPLINGKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating RAMBO phase space sampling on a GPU device class RamboSamplingKernelDevice final : public SamplingKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/RandomNumberKernels.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/RandomNumberKernels.h index 188a72c2c9..21d63beeac 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/RandomNumberKernels.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/RandomNumberKernels.h @@ -1,14 +1,14 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef RANDOMNUMBERKERNELS_H #define RANDOMNUMBERKERNELS_H 1 #include "mgOnGpuConfig.h" -// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when __CUDACC__ is not defined +// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when MGONGPUCPP_GPUIMPL is not defined #ifndef MGONGPU_HAS_NO_CURAND //#include "curand.h" struct curandGenerator_st; // forward definition from curand.h @@ -16,7 +16,7 @@ struct curandGenerator_st; // forward definition from curand.h #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gq_ttq.sa/SubProcesses/cudacpp.mk index 509307506b..f2cfa349da 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ # Copyright (C) 2020-2023 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +# Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts @@ -42,10 +42,10 @@ endif #------------------------------------------------------------------------------- -#=== Configure common compiler flags for C++ and CUDA +#=== Configure common compiler flags for C++ and CUDA/HIP INCFLAGS = -I. 
-OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here +OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here # Dependency on src directory MG5AMC_COMMONLIB = mg5amc_common @@ -121,24 +121,46 @@ endif #------------------------------------------------------------------------------- -#=== Configure the CUDA compiler +#=== Configure the GPU compiler (CUDA or HIP) -# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) -# This is because it is impossible to pass this to "CUFLAGS += -ccbin <host-compiler>" below +# FIXME! (AV 24.01.2024) +# In the current implementation (without separate builds for C++ and CUDA/HIP), we first check for nvcc and hipcc in CUDA_HOME and HIP_HOME. +# If CUDA_HOME or HIP_HOME are not set, try to determine them from the path to nvcc and hipcc. +# While convoluted, this is currently necessary to allow disabling CUDA/HIP builds by setting CUDA_HOME or HIP_HOME to invalid paths. +# This will (probably?) be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775). + +# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA and HIP builds (issue #505) +# This is because it is impossible to pass this to "GPUFLAGS += -ccbin <host-compiler>" below ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache <compiler>" from outside - $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") + $(warning CUDA and HIP builds are not supported for multi-word CXX "$(CXX)") override CUDA_HOME=disabled + override HIP_HOME=disabled endif -# If CUDA_HOME is not set, try to set it from the location of nvcc +# If CUDA_HOME is not set, try to set it from the path to nvcc ifndef CUDA_HOME CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null)) $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") endif -# Set NVCC as $(CUDA_HOME)/bin/nvcc if it exists +# If HIP_HOME is not set, try to set it from the path to hipcc +ifndef HIP_HOME + HIP_HOME = $(patsubst %bin/hipcc,%,$(shell which hipcc 2>/dev/null)) + $(warning HIP_HOME was not set: using "$(HIP_HOME)") +endif + +# FIXME! (AV 24.01.2024) +# In the current implementation (without separate builds for C++ and CUDA/HIP), +# builds are performed for HIP only if CUDA is not found in the path. +# If both CUDA and HIP are installed, HIP builds can be triggered by unsetting CUDA_HOME. +# This will be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775). + +#--- Option 1: CUDA exists -> use CUDA + +# Set GPUCC as $(CUDA_HOME)/bin/nvcc if it exists ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) - NVCC = $(CUDA_HOME)/bin/nvcc + + GPUCC = $(CUDA_HOME)/bin/nvcc USE_NVTX ?=-DUSE_NVTX # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ @@ -158,41 +180,78 @@ ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here!
endif CUOPTFLAGS = -lineinfo - CUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math - ###CUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow - ###NVCC_VERSION = $(shell $(NVCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) - CUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h + ###GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + GPUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + ###GPUCC_VERSION = $(shell $(GPUCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) + GPUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + CUBUILDRULEFLAGS = -Xcompiler -fPIC -c + CCBUILDRULEFLAGS = -Xcompiler -fPIC -c -x cu + CUDATESTFLAGS = -lcuda + + # Set the host C++ compiler for GPUCC via "-ccbin <host-compiler>" + # (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) + GPUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) + + # Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) + ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) + GPUFLAGS += -allow-unsupported-compiler + endif + else ifneq ($(origin REQUIRE_CUDA),undefined) + # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) - $(error No cuda installation found (set CUDA_HOME or make nvcc visible in PATH)) + $(error No cuda installation found (set CUDA_HOME or make GPUCC visible in PATH)) + +#--- Option 2: CUDA does not exist, HIP exists -> use HIP + +# Set GPUCC as $(HIP_HOME)/bin/hipcc if it exists +else ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),) + + GPUCC = $(HIP_HOME)/bin/hipcc + #USE_NVTX ?=-DUSE_NVTX # should maybe find something equivalent to this in HIP?
+ HIPARCHFLAGS = -target x86_64-linux-gnu --offload-arch=gfx90a + HIPINC = -I$(HIP_HOME)/include/ + # Note: -DHIP_FAST_MATH is equivalent to -use_fast_math in HIP + # (but only for single precision line 208: https://rocm-developer-tools.github.io/HIP/hcc__detail_2math__functions_8h_source.html) + GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -DHIP_FAST_MATH -DHIP_PLATFORM=amd -fPIC + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + GPUFLAGS += -std=c++17 + ###GPUFLAGS+= --maxrregcount 255 # (AV: is this option valid on HIP and meaningful on AMD GPUs?) + CUBUILDRULEFLAGS = -fPIC -c + CCBUILDRULEFLAGS = -fPIC -c + +else ifneq ($(origin REQUIRE_HIP),undefined) + + # If REQUIRE_HIP is set but no HIP is found, stop here (e.g. for CI tests on GPU #443) + $(error No hip installation found (set HIP_HOME or make GPUCC visible in PATH)) + +#--- Option 3: CUDA does not exist, HIP does not exist -> switch off both CUDA and HIP + else - # No cuda. Switch cuda compilation off and go to common random numbers in C++ + + # No nvcc and no hipcc: switch CUDA and HIP compilation off and go to common random numbers in C++ $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda) + $(warning HIP_HOME is not set or is invalid: export HIP_HOME to compile with hip) override GPUCC= override USE_NVTX= override CUINC= override CURANDLIBFLAGS= -endif -export NVCC -export CUFLAGS - -# Set the host C++ compiler for nvcc via "-ccbin <host-compiler>" -# (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) -CUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) -# Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) -CUFLAGS += -allow-unsupported-compiler endif +# Export GPUCC (so that it can also be used in cudacpp_src.mk?) export GPUCC export GPUFLAGS #------------------------------------------------------------------------------- -#=== Configure ccache for C++ and CUDA builds +#=== Configure ccache for C++ and CUDA/HIP builds # Enable ccache if USECCACHE=1 ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) @@ -201,15 +260,15 @@ endif #ifeq ($(USECCACHE)$(shell echo $(AR) | grep ccache),1) # override AR:=ccache $(AR) #endif -ifneq ($(NVCC),) - ifeq ($(USECCACHE)$(shell echo $(NVCC) | grep ccache),1) - override NVCC:=ccache $(NVCC) +ifneq ($(GPUCC),) + ifeq ($(USECCACHE)$(shell echo $(GPUCC) | grep ccache),1) + override GPUCC:=ccache $(GPUCC) endif endif #------------------------------------------------------------------------------- -#=== Configure PowerPC-specific compiler flags for C++ and CUDA +#=== Configure PowerPC-specific compiler flags for C++ and CUDA/HIP # PowerPC-specific CXX compiler flags (being reviewed) ifeq ($(UNAME_P),ppc64le) @@ -225,9 +284,9 @@ else ######CXXFLAGS+= -fno-semantic-interposition # no benefit (neither alone, nor combined with -flto) endif -# PowerPC-specific CUDA compiler flags (to be reviewed!) +# PowerPC-specific CUDA/HIP compiler flags (to be reviewed!)
ifeq ($(UNAME_P),ppc64le) - CUFLAGS+= -Xcompiler -mno-float128 + GPUFLAGS+= -Xcompiler -mno-float128 endif #------------------------------------------------------------------------------- @@ -237,7 +296,7 @@ endif # Set the default OMPFLAGS choice ifneq ($(shell $(CXX) --version | egrep '^Intel'),) override OMPFLAGS = -fopenmp -###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without nvcc but not ok with nvcc before #578) +###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without GPUCC but not ok with GPUCC before #578) else ifneq ($(shell $(CXX) --version | egrep '^(clang)'),) override OMPFLAGS = -fopenmp ###override OMPFLAGS = # disable OpenMP MT on clang (was not ok without or with nvcc before #578) @@ -293,7 +352,10 @@ endif # Set the default RNDGEN (random number generator) choice ifeq ($(RNDGEN),) - ifeq ($(NVCC),) + ifeq ($(GPUCC),) + override RNDGEN = hasNoCurand + # Edgecase for HIP compilation + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) override RNDGEN = hasNoCurand else ifeq ($(RNDGEN),) override RNDGEN = hasCurand @@ -310,7 +372,7 @@ export OMPFLAGS #------------------------------------------------------------------------------- -#=== Set the CUDA/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN +#=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN # Set the build flags appropriate to OMPFLAGS $(info OMPFLAGS=$(OMPFLAGS)) @@ -368,13 +430,13 @@ CXXFLAGS+= $(AVXFLAGS) $(info FPTYPE=$(FPTYPE)) ifeq ($(FPTYPE),d) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE - CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE else ifeq ($(FPTYPE),f) CXXFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT - CUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT else ifeq ($(FPTYPE),m) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT - CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT else $(error Unknown FPTYPE='$(FPTYPE)': only 'd', 'f' and 'm' are supported) endif @@ -383,7 +445,7 @@ endif $(info HELINL=$(HELINL)) ifeq ($(HELINL),1) CXXFLAGS += -DMGONGPU_INLINE_HELAMPS - CUFLAGS += -DMGONGPU_INLINE_HELAMPS + GPUFLAGS += -DMGONGPU_INLINE_HELAMPS else ifneq ($(HELINL),0) $(error Unknown HELINL='$(HELINL)': only '0' and '1' are supported) endif @@ -392,7 +454,7 @@ endif $(info HRDCOD=$(HRDCOD)) ifeq ($(HRDCOD),1) CXXFLAGS += -DMGONGPU_HARDCODE_PARAM - CUFLAGS += -DMGONGPU_HARDCODE_PARAM + GPUFLAGS += -DMGONGPU_HARDCODE_PARAM else ifneq ($(HRDCOD),0) $(error Unknown HRDCOD='$(HRDCOD)': only '0' and '1' are supported) endif @@ -444,11 +506,11 @@ ifeq ($(UNAME_S),Darwin) override CULIBFLAGSRPATH2 = else # RPATH to cuda/cpp libs when linking executables - override CXXLIBFLAGSRPATH = -Wl,-rpath,$(LIBDIRRPATH) - override CULIBFLAGSRPATH = -Xlinker -rpath,$(LIBDIRRPATH) + override CXXLIBFLAGSRPATH = -Wl,-rpath=$(LIBDIRRPATH) + override CULIBFLAGSRPATH = -Xlinker -rpath=$(LIBDIRRPATH) # RPATH to common lib when linking cuda/cpp libs - override CXXLIBFLAGSRPATH2 = -Wl,-rpath,'$$ORIGIN' - override CULIBFLAGSRPATH2 = -Xlinker -rpath,'$$ORIGIN' + override CXXLIBFLAGSRPATH2 = -Wl,-rpath='$$ORIGIN' + override CULIBFLAGSRPATH2 = -Xlinker -rpath='$$ORIGIN' endif # Setting LD_LIBRARY_PATH or DYLD_LIBRARY_PATH in the RUNTIME is no longer necessary 
(neither on Linux nor on Mac) @@ -461,7 +523,7 @@ override RUNTIME = cxx_main=$(BUILDDIR)/check.exe fcxx_main=$(BUILDDIR)/fcheck.exe -ifneq ($(NVCC),) +ifneq ($(GPUCC),) cu_main=$(BUILDDIR)/gcheck.exe fcu_main=$(BUILDDIR)/fgcheck.exe else @@ -492,15 +554,16 @@ $(BUILDDIR)/.build.$(TAG): @touch $(BUILDDIR)/.build.$(TAG) # Generic target and build rules: objects from CUDA compilation -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/%.o : %.cu *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CUBUILDRULEFLAGS) $< -o $@ $(BUILDDIR)/%_cu.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CCBUILDRULEFLAGS) $< -o $@ endif +# -x cu in line above # Generic target and build rules: objects from C++ compilation # (NB do not include CUINC here! add it only for NVTX or curand #679) @@ -509,11 +572,14 @@ $(BUILDDIR)/%.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) $(CXX) $(CPPFLAGS) $(CXXFLAGS) -fPIC -c $< -o $@ # Apply special build flags only to CrossSectionKernel.cc and gCrossSectionKernel.cu (no fast math, see #117 and #516) +# Added edgecase for HIP compilation ifeq ($(shell $(CXX) --version | grep ^nvc++),) $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS := $(filter-out -ffast-math,$(CXXFLAGS)) $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math -ifneq ($(NVCC),) -$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -fno-fast-math +ifeq ($(findstring nvcc,$(GPUCC)),nvcc) + $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -fno-fast-math +else + $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -fno-fast-math endif endif @@ -530,10 +596,10 @@ ifeq ($(RNDGEN),hasCurand) $(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CUINC) endif -# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in nvcc with icx2023 (#592) +# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in GPUCC with icx2023 (#592) ifneq ($(shell $(CXX) --version | egrep '^(Intel)'),) -ifneq ($(NVCC),) -CUFLAGS += -Xcompiler -Wno-deprecated-builtins +ifneq ($(GPUCC),) +GPUFLAGS += -Wno-deprecated-builtins endif endif @@ -541,8 +607,8 @@ endif # This patch does remove the warning, but I prefer to keep it disabled for the moment... 
###ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang|Intel)'),) ###$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -Wno-overriding-t-option -###ifneq ($(NVCC),) -###$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -Wno-overriding-t-option +###ifneq ($(GPUCC),) +###$(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -Wno-overriding-t-option ###endif ###endif @@ -569,7 +635,7 @@ MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp cxx_objects_lib=$(BUILDDIR)/CPPProcess.o $(BUILDDIR)/MatrixElementKernels.o $(BUILDDIR)/BridgeKernels.o $(BUILDDIR)/CrossSectionKernels.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel.o $(BUILDDIR)/RamboSamplingKernels.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) MG5AMC_CULIB = mg5amc_$(processid_short)_cuda cu_objects_lib=$(BUILDDIR)/gCPPProcess.o $(BUILDDIR)/gMatrixElementKernels.o $(BUILDDIR)/gBridgeKernels.o $(BUILDDIR)/gCrossSectionKernels.o cu_objects_exe=$(BUILDDIR)/gCommonRandomNumberKernel.o $(BUILDDIR)/gRamboSamplingKernels.o @@ -581,11 +647,11 @@ $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: cxx_objects_lib += $(BUILDDIR)/fbridge.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(CXX) -shared -o $@ $(cxx_objects_lib) $(CXXLIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: cu_objects_lib += $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cu_objects_lib) - $(NVCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) endif #------------------------------------------------------------------------------- @@ -602,16 +668,16 @@ $(cxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PAT $(cxx_main): $(BUILDDIR)/check_sa.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CXX) -o $@ $(BUILDDIR)/check_sa.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CURANDLIBFLAGS) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(cu_main): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(cu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(cu_main): $(BUILDDIR)/gcheck_sa.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o - $(NVCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) + $(GPUCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) endif 
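Given the three-way selection logic above, the build backend is steered entirely from the make command line; a sketch of the intended usage, based on the variables introduced in this makefile (behaviour may change when separate C++ and CUDA/HIP builds land in PR #775):

  make                      # auto-detect: use nvcc if found, else hipcc, else a C++-only build
  make REQUIRE_CUDA=1       # stop with an error if no CUDA installation is found (CI on GPU, #443)
  make REQUIRE_HIP=1        # stop with an error if no HIP installation is found
  CUDA_HOME=disabled make   # skip CUDA so that an installed hipcc is picked up instead
  CUDA_HOME=disabled HIP_HOME=disabled make   # switch off both backends (GPUCC left empty)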
#------------------------------------------------------------------------------- @@ -637,17 +703,17 @@ $(fcxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PA $(fcxx_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') endif ifeq ($(UNAME_S),Darwin) $(fcu_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(fcu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(fcu_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) - $(NVCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) + $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) endif #------------------------------------------------------------------------------- @@ -659,7 +725,7 @@ $(BUILDDIR)/testxxx.o: testxxx_cc_ref.txt $(testmain): $(BUILDDIR)/testxxx.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testxxx.o # Comment out this line to skip the C++ test of xxx functions -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testxxx_cu.o: $(GTESTLIBS) $(BUILDDIR)/testxxx_cu.o: INCFLAGS += $(GTESTINC) $(BUILDDIR)/testxxx_cu.o: testxxx_cc_ref.txt @@ -672,7 +738,7 @@ $(BUILDDIR)/testmisc.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testmisc.o # Comment out this line to skip the C++ miscellaneous tests -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testmisc_cu.o: $(GTESTLIBS) $(BUILDDIR)/testmisc_cu.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc_cu.o @@ -684,12 +750,12 @@ $(BUILDDIR)/runTest.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/runTest.o $(testmain): cxx_objects_exe += $(BUILDDIR)/runTest.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/runTest_cu.o: $(GTESTLIBS) $(BUILDDIR)/runTest_cu.o: INCFLAGS += $(GTESTINC) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(testmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif @@ -713,14 +779,14 @@ $(testmain): LIBFLAGS += -lgomp endif endif -ifeq 
($(NVCC),) # link only runTest.o +ifeq ($(GPUCC),) # link only runTest.o $(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS) $(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS) else # link both runTest.o and runTest_cu.o $(testmain): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) $(GTESTLIBS) - $(NVCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) -lcuda + $(GPUCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS) endif # Use target gtestlibs to build only googletest @@ -829,9 +895,9 @@ ifeq ($(USECCACHE),1) ccache --version | head -1 endif @echo "" - @echo NVCC=$(NVCC) -ifneq ($(NVCC),) - $(NVCC) --version + @echo GPUCC=$(GPUCC) +ifneq ($(GPUCC),) + $(GPUCC) --version endif @echo "" @echo CXX=$(CXX) @@ -850,7 +916,7 @@ endif # Target: check (run the C++ test executable) # [NB THIS IS WHAT IS USED IN THE GITHUB CI!] -ifneq ($(NVCC),) +ifneq ($(GPUCC),) check: runTest cmpFcheck cmpFGcheck else check: runTest cmpFcheck diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/fbridge.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/fbridge.cc index 2d2b36d560..22ce3f5115 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/fbridge.cc +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/fbridge.cc @@ -1,11 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Oct 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "Bridge.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" extern "C" { @@ -22,7 +22,7 @@ extern "C" * Using the same Fortran MadEvent code, linking to the hetrerogeneous library would allow access to both CPU and GPU implementations. * The specific heterogeneous configuration (how many GPUs, how many threads on each CPU, etc) could be loaded in CUDA/C++ from a data file. 
*/ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -46,8 +46,8 @@ extern "C" */ void fbridgecreate_( CppObjectInFortran** ppbridge, const int* pnevtF, const int* pnparF, const int* pnp4F ) { -#ifdef __CUDACC__ - CudaRuntime::setUp(); +#ifdef MGONGPUCPP_GPUIMPL + GpuRuntime::setUp(); #endif // (NB: CPPProcess::initProc no longer needs to be executed here because it is called in the Bridge constructor) // FIXME: disable OMP in Bridge when called from Fortran @@ -65,8 +65,8 @@ extern "C" Bridge* pbridge = dynamic_cast*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgedelete_: invalid Bridge address" ); delete pbridge; -#ifdef __CUDACC__ - CudaRuntime::tearDown(); +#ifdef MGONGPUCPP_GPUIMPL + GpuRuntime::tearDown(); #endif } @@ -96,7 +96,7 @@ extern "C" { Bridge* pbridge = dynamic_cast*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgesequence_: invalid Bridge address" ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Use the device/GPU implementation in the CUDA library // (there is also a host implementation in this library) pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, *pchannelId, mes, selhel, selcol ); diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/fsampler.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/fsampler.cc index 2fb445372d..3743934f41 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/fsampler.cc +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/fsampler.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Feb 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "mgOnGpuConfig.h" @@ -13,7 +13,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -40,7 +40,7 @@ namespace mg5amcCpu private: const int m_nevt; // The number of events in each iteration int m_iiter; // The iteration counter (for random number seeding) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta m_hstRndmom; // Memory buffers for random numbers HostBufferMomenta m_hstMomenta; // Memory buffers for momenta HostBufferWeights m_hstWeights; // Memory buffers for sampling weights @@ -105,7 +105,7 @@ namespace mg5amcCpu extern "C" { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/runTest.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/runTest.cc index d4a760a71b..de327f2321 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/runTest.cc +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/runTest.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
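The GpuRuntime.h that replaces CudaRuntime.h is likewise only present as a symlink in these hunks. Judging from its call sites (the constructor in check_sa.cc, and the static GpuRuntime::setUp() / GpuRuntime::tearDown() pair in fbridge.cc above), it is an RAII wrapper around GPU runtime initialisation; a minimal sketch, with the method bodies assumed:

  // Sketch only (NOT the actual generated header): bracket the GPU runtime
  // lifetime, as the former CudaRuntime class did for CUDA alone.
  struct GpuRuntime final
  {
    GpuRuntime( const bool debug = true ) : m_debug( debug ) { setUp( m_debug ); }
    ~GpuRuntime() { tearDown( m_debug ); }
    GpuRuntime( const GpuRuntime& ) = delete;
    GpuRuntime& operator=( const GpuRuntime& ) = delete;
    // In the CUDA implementation this would call cudaSetDevice( 0 ) up front...
    static void setUp( const bool /*debug*/ = true ) {}
    // ...and this would book the gpuDeviceReset() needed by cuda-memcheck (see runTest.cc below)
    static void tearDown( const bool /*debug*/ = true ) {}
    const bool m_debug;
  };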
//---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -18,7 +18,7 @@ #include "RandomNumberKernels.h" #include "epoch_process_id.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -35,7 +35,7 @@ struct CUDA_CPU_TestBase : public TestDriverBase : TestDriverBase( npar, refFileName ) {} }; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL struct CPUTest : public CUDA_CPU_TestBase { // Struct data members (process, and memory structures for random numbers, momenta, matrix elements and weights on host and device) @@ -119,7 +119,7 @@ struct CPUTest : public CUDA_CPU_TestBase }; #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL struct CUDATest : public CUDA_CPU_TestBase { // Reset the device when our test goes out of scope. Note that this should happen after @@ -128,7 +128,7 @@ struct CUDATest : public CUDA_CPU_TestBase { ~DeviceReset() { - checkCuda( cudaDeviceReset() ); // this is needed by cuda-memcheck --leak-check full + checkGpu( gpuDeviceReset() ); // this is needed by cuda-memcheck --leak-check full } } deviceResetter; @@ -256,7 +256,7 @@ INSTANTIATE_TEST_SUITE_P( prefix, \ test_suite_name, \ testing::Values( new CUDATest( MG_EPOCH_REFERENCE_FILE_NAME ) ) ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL MG_INSTANTIATE_TEST_SUITE_GPU( XTESTID_GPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); #else MG_INSTANTIATE_TEST_SUITE_CPU( XTESTID_CPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/testmisc.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/testmisc.cc index 895d6eeb56..ba9e59a8a3 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/testmisc.cc +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/testmisc.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*misc to run only testmisc.cc tests //---------------------------------------------------------------------------- @@ -17,7 +17,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_MISC #else #define TESTID( s ) s##_CPU_MISC @@ -26,7 +26,7 @@ #define XTESTID( s ) TESTID( s ) // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -59,7 +59,7 @@ namespace mg5amcCpu TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testmisc ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/testxxx.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/testxxx.cc index 3361fe5aa9..e5167de00c 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/testxxx.cc +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/testxxx.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. 
Valassi (Apr 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -24,7 +24,7 @@ #include #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_XXX #else #define TESTID( s ) s##_CPU_XXX @@ -32,7 +32,7 @@ #define XTESTID( s ) TESTID( s ) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -42,7 +42,7 @@ namespace mg5amcCpu int FPEhandlerIevt = -1; inline void FPEhandler( int sig ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL std::cerr << "Floating Point Exception (GPU): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; #else std::cerr << "Floating Point Exception (CPU neppV=" << neppV << "): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; @@ -53,7 +53,7 @@ namespace mg5amcCpu TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -77,7 +77,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) assert( nevt % neppM == 0 ); // nevt must be a multiple of neppM assert( nevt % neppV == 0 ); // nevt must be a multiple of neppV // Fill in the input momenta -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL mg5amcGpu::PinnedHostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] #else mg5amcCpu::HostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] @@ -322,7 +322,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { for( int ievt = 0; ievt < nevt; ievt++ ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gq_ttq.sa/src/HelAmps_sm.h b/epochX/cudacpp/gq_ttq.sa/src/HelAmps_sm.h index cd4e6de668..45000c7246 100644 --- a/epochX/cudacpp/gq_ttq.sa/src/HelAmps_sm.h +++ b/epochX/cudacpp/gq_ttq.sa/src/HelAmps_sm.h @@ -5,7 +5,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -28,7 +28,7 @@ //#include //#include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gq_ttq.sa/src/Parameters_sm.cc b/epochX/cudacpp/gq_ttq.sa/src/Parameters_sm.cc index c06dcbb252..8b92ea0bd6 100644 --- a/epochX/cudacpp/gq_ttq.sa/src/Parameters_sm.cc +++ b/epochX/cudacpp/gq_ttq.sa/src/Parameters_sm.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. 
Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -17,7 +17,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gq_ttq.sa/src/Parameters_sm.h b/epochX/cudacpp/gq_ttq.sa/src/Parameters_sm.h index a6eb185434..a3615ec77a 100644 --- a/epochX/cudacpp/gq_ttq.sa/src/Parameters_sm.h +++ b/epochX/cudacpp/gq_ttq.sa/src/Parameters_sm.h @@ -27,7 +27,7 @@ #include "read_slha.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu #include // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -217,7 +217,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -236,7 +236,7 @@ namespace mg5amcCpu #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wunused-variable" // e.g. <> #pragma GCC diagnostic ignored "-Wunused-parameter" // e.g. <> -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma nv_diagnostic push #pragma nv_diag_suppress 177 // e.g. <> #endif @@ -263,7 +263,7 @@ namespace mg5amcCpu // End SM implementation - no special handling of vectors of floats as in EFT (#439) return out; } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma GCC diagnostic pop #pragma nv_diagnostic pop #endif diff --git a/epochX/cudacpp/gq_ttq.sa/src/cudacpp_src.mk b/epochX/cudacpp/gq_ttq.sa/src/cudacpp_src.mk index d4cc628aec..159e19a46d 100644 --- a/epochX/cudacpp/gq_ttq.sa/src/cudacpp_src.mk +++ b/epochX/cudacpp/gq_ttq.sa/src/cudacpp_src.mk @@ -19,7 +19,7 @@ SHELL := /bin/bash #=== Configure common compiler flags for CUDA and C++ INCFLAGS = -I. 
-OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here
+OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here
#-------------------------------------------------------------------------------
@@ -45,13 +45,13 @@ endif
#-------------------------------------------------------------------------------
-#=== Configure the CUDA compiler (note: NVCC is already exported including ccache)
+#=== Configure the GPU compiler (note: GPUCC is already exported including ccache)
-###$(info NVCC=$(NVCC))
+###$(info GPUCC=$(GPUCC))
#-------------------------------------------------------------------------------
-#=== Configure ccache for C++ builds (note: NVCC is already exported including ccache)
+#=== Configure ccache for C++ builds (note: GPUCC is already exported including ccache)
# Enable ccache if USECCACHE=1
ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1)
@@ -92,6 +92,13 @@ endif
###$(info OMPFLAGS=$(OMPFLAGS))
CXXFLAGS += $(OMPFLAGS)
+# Add the compiler-specific GPU compilation flags (nvcc for CUDA, hipcc for HIP)
+ifeq ($(findstring nvcc,$(GPUCC)),nvcc)
+ GPUFLAGS += -Xcompiler -fPIC -c -x cu
+else ifeq ($(findstring hipcc,$(GPUCC)),hipcc)
+ GPUFLAGS += -fPIC -c
+endif
+
# Set the build flags appropriate to each AVX choice (example: "make AVX=none")
# [NB MGONGPU_PVW512 is needed because "-mprefer-vector-width=256" is not exposed in a macro]
# [See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=96476]
@@ -253,20 +260,20 @@ $(BUILDDIR)/%.o : %.cc *.h $(BUILDDIR)/.build.$(TAG)
# Generic target and build rules: objects from CUDA compilation
$(BUILDDIR)/%_cu.o : %.cc *.h $(BUILDDIR)/.build.$(TAG)
@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
- $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@
+ $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $< -o $@
#-------------------------------------------------------------------------------
cxx_objects=$(addprefix $(BUILDDIR)/, Parameters_sm.o read_slha.o)
-ifneq ($(NVCC),)
+ifneq ($(GPUCC),)
cu_objects=$(addprefix $(BUILDDIR)/, Parameters_sm_cu.o)
endif
# Target (and build rules): common (src) library
-ifneq ($(NVCC),)
+ifneq ($(GPUCC),)
$(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) $(cu_objects)
@if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi
- $(NVCC) -shared -o $@ $(cxx_objects) $(cu_objects) $(LDFLAGS)
+ $(GPUCC) -shared -o $@ $(cxx_objects) $(cu_objects) $(LDFLAGS)
else
$(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects)
@if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi
diff --git a/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuConfig.h b/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuConfig.h
index b247654dcf..da4ba36ad8 100644
--- a/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuConfig.h
+++ b/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuConfig.h
@@ -1,7 +1,7 @@
// Copyright (C) 2020-2023 CERN and UCLouvain.
// Licensed under the GNU Lesser General Public License (version 3 or later).
// Created by: A. Valassi (Jul 2020) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
#ifndef MGONGPUCONFIG_H #define MGONGPUCONFIG_H 1 @@ -10,12 +10,25 @@ // There are two different code bases for standalone_cudacpp (without multichannel) and madevent+cudacpp (with multichannel) #undef MGONGPU_SUPPORTS_MULTICHANNEL +// Is this a GPU (CUDA, HIP) or CPU implementation? +#ifdef __CUDACC__ +#define MGONGPUCPP_GPUIMPL cuda +#elif defined __HIPCC__ +#define MGONGPUCPP_GPUIMPL hip +#else +#undef MGONGPUCPP_GPUIMPL +#endif + // ** NB1 Throughputs (e.g. 6.8E8) are events/sec for "./gcheck.exe -p 65536 128 12" // ** NB2 Baseline on b7g47n0004 fluctuates (probably depends on load on other VMs) // Choose if curand is supported for generating random numbers +// For HIP, by default, do not use curand (common random numbers will be used instead) // For both CUDA and C++, by default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_CURAND -// (there exist CUDA installations, e.g. using the HPC package, which do not include curand - see PR #784) +// (there exist CUDA installations, e.g. using the HPC package, which do not include curand - see PR #784 and #785) +#if defined __HIPCC__ +#define MGONGPU_HAS_NO_CURAND 1 +#else //#ifdef __CUDACC__ //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 @@ -23,6 +36,7 @@ //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 //#endif +#endif // Choose floating point precision (for everything but color algebra #537) // If one of these macros has been set from outside with e.g. -DMGONGPU_FPTYPE_FLOAT, nothing happens (issue #167) @@ -54,23 +68,28 @@ //#undef MGONGPU_HARDCODE_PARAM // default ////#define MGONGPU_HARDCODE_PARAM 1 -// Complex type in c++: std::complex or cxsmpl (CHOOSE ONLY ONE) -#ifndef __CUDACC__ -//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) -#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) -#endif - -// Complex type in cuda: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) +// Complex type in CUDA: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) #ifdef __CUDACC__ #define MGONGPU_CUCXTYPE_THRUST 1 // default (~1.15E9/double, ~3.2E9/float) //#define MGONGPU_CUCXTYPE_CUCOMPLEX 1 // ~10 percent slower (1.03E9/double, ~2.8E9/float) //#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) + +// Complex type in HIP: cxsmpl (ONLY ONE OPTION POSSIBLE) +#elif defined __HIPCC__ +#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) + +// Complex type in C++: std::complex or cxsmpl (CHOOSE ONLY ONE) +#else +//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) +#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif -// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ -#undef MGONGPU_NSIGHT_DEBUG // default +#undef MGONGPU_NSIGHT_DEBUG // default in CUDA //#define MGONGPU_NSIGHT_DEBUG 1 +#else +#undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif // SANITY CHECKS (floating point precision for everything but color algebra #537) @@ -86,17 +105,21 @@ #error You cannot use double precision for color algebra and single precision elsewhere #endif -// SANITY CHECKS (c++ complex number implementation) -#ifndef __CUDACC__ -#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and 
defined MGONGPU_CPPCXTYPE_CXSMPL
-#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL
+// SANITY CHECKS (CUDA complex number implementation)
+#ifdef __CUDACC__
+#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX
+#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX for CUDA
+#elif defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CXSMPL
+#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CXSMPL for CUDA
+#elif defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL
+#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL for CUDA
#endif
#endif
-// SANITY CHECKS (cuda complex number implementation)
-#ifdef __CUDACC__
-#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL
-#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL
+// SANITY CHECKS (C++ complex number implementation)
+#ifndef MGONGPUCPP_GPUIMPL
+#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL
+#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL for C++
#endif
#endif
@@ -134,7 +157,7 @@ namespace mgOnGpu
// Alignment requirement for using reinterpret_cast with SIMD vectorized code
// (using reinterpret_cast with non aligned memory may lead to segmentation faults!)
// Only needed for C++ code but can be enforced also in NVCC builds of C++ code using CUDA>=11.2 and C++17 (#318, #319, #333)
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
constexpr int cppAlign = 64; // alignment requirement for SIMD vectorization (64-byte i.e. 512-bit)
#endif
@@ -145,7 +168,7 @@ using mgOnGpu::fptype;
using mgOnGpu::fptype2;
// C++ SIMD vectorization width (this will be used to set neppV)
-#ifdef __CUDACC__ // CUDA implementation has no SIMD
+#ifdef MGONGPUCPP_GPUIMPL // CUDA and HIP implementations have no SIMD
#undef MGONGPU_CPPSIMD
#elif defined __AVX512VL__ && defined MGONGPU_PVW512 // C++ "512z" AVX512 with 512 width (512-bit ie 64-byte): 8 (DOUBLE) or 16 (FLOAT)
#ifdef MGONGPU_FPTYPE_DOUBLE
@@ -175,9 +198,9 @@ using mgOnGpu::fptype2;
#undef MGONGPU_CPPSIMD
#endif
-// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation
+// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation
// Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end)
#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */
#define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX];
#define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; }
#define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; }
@@ -189,8 +212,8 @@ using mgOnGpu::fptype2;
#define mgDebugFinalise() { /*noop*/ }
#endif /* clang-format on */
-// Define empty CUDA declaration specifiers for C++
-#ifndef __CUDACC__
+// Define empty CUDA/HIP declaration specifiers for C++
+#ifndef MGONGPUCPP_GPUIMPL
#define __global__
#define __host__
#define __device__
diff --git a/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuCxtypes.h b/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuCxtypes.h
index ca9a9f00c0..5532e22fa1 100644
--- a/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuCxtypes.h
+++ b/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuCxtypes.h
@@ -1,7 +1,7 @@
// Copyright (C) 2020-2023 CERN and UCLouvain.
// Licensed under the GNU Lesser General Public License (version 3 or later).
// Created by: A. Valassi (Jan 2022, based on earlier work by D. Smith) for the MG5aMC CUDACPP plugin.
-// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
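To illustrate the new dispatch macro defined in mgOnGpuConfig.h above (a sketch, not part of this patch): a single source file now selects the GPU or CPU code path from MGONGPUCPP_GPUIMPL alone, instead of testing __CUDACC__ directly, so the same #ifdef covers both nvcc and hipcc builds:

  #include "mgOnGpuConfig.h"
  #ifdef MGONGPUCPP_GPUIMPL // defined for both CUDA (__CUDACC__) and HIP (__HIPCC__) builds
  namespace mg5amcGpu
  #else
  namespace mg5amcCpu // C++ build (scalar or SIMD)
  #endif
  {
    // __global__ is the real kernel specifier in GPU builds and expands to nothing in C++ builds
    __global__ void dummyKernel() {}
  }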
#ifndef MGONGPUCXTYPES_H #define MGONGPUCXTYPES_H 1 @@ -19,7 +19,7 @@ #include // Complex type in cuda: thrust or cucomplex or cxsmpl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #if defined MGONGPU_CUCXTYPE_THRUST #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wtautological-compare" // for icpx2021/clang13 (https://stackoverflow.com/a/15864661) @@ -82,7 +82,7 @@ namespace mgOnGpu /* clang-format off */ using mgOnGpu::cxsmpl; // Printout to stream for user defined types -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -92,7 +92,7 @@ namespace mg5amcCpu inline __host__ std::ostream& operator<<( std::ostream& out, const cxsmpl& c ) { - out << std::complex( c.real(), c.imag() ); + out << std::complex( c.real(), c.imag() ); return out; } @@ -215,14 +215,14 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu #endif { // --- Type definitions (complex type: cxtype) -#ifdef __CUDACC__ // cuda +#ifdef MGONGPUCPP_GPUIMPL // cuda #if defined MGONGPU_CUCXTYPE_THRUST typedef thrust::complex cxtype; #elif defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -255,7 +255,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -307,7 +307,7 @@ namespace mg5amcCpu //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust +#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust //------------------------------ // CUDA - using thrust::complex @@ -343,11 +343,11 @@ namespace mg5amcCpu return c; } -#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST +#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex +#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex //------------------------------ // CUDA - using cuComplex @@ -562,11 +562,11 @@ namespace mg5amcCpu return cxmake( c.real(), c.imag() ); } -#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX //========================================================================== -#if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex +#if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex //------------------------------ // C++ - using std::complex @@ -610,7 +610,7 @@ namespace mg5amcCpu } #endif -#endif // #if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX +#endif // #if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX //========================================================================== @@ -633,7 +633,7 @@ namespace mg5amcCpu //========================================================================== // 
NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuFptypes.h b/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuFptypes.h index 905c97d700..fa3a02664b 100644 --- a/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuFptypes.h +++ b/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuFptypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUFPTYPES_H #define MGONGPUFPTYPES_H 1 @@ -12,7 +12,7 @@ #include // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // cuda namespace mg5amcGpu #else namespace mg5amcCpu @@ -20,7 +20,7 @@ namespace mg5amcCpu { //========================================================================== -#ifdef __CUDACC__ // cuda +#ifdef MGONGPUCPP_GPUIMPL // cuda //------------------------------ // Floating point types - Cuda @@ -64,11 +64,11 @@ namespace mg5amcCpu #endif } -#endif // #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //------------------------------ // Floating point types - C++ @@ -92,7 +92,7 @@ namespace mg5amcCpu return std::sqrt( f ); } -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== diff --git a/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuVectors.h b/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuVectors.h index e1299ba81e..cdae04326b 100644 --- a/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuVectors.h @@ -32,7 +32,7 @@ #endif // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -131,7 +131,7 @@ namespace mg5amcCpu #endif #endif -#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef __CUDACC__) +#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef MGONGPUCPP_GPUIMPL) const int neppV = 1; @@ -153,13 +153,13 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu #endif { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Printout to stream for user defined types @@ -805,11 +805,11 @@ namespace mg5amcCpu #endif // #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //------------------------------ // Vector types - CUDA @@ -853,12 +853,12 @@ namespace mg5amcCpu return mask; } -#endif 
// #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== // Scalar-or-vector types: scalar in CUDA, vector or scalar in C++ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL typedef bool bool_sv; typedef fptype fptype_sv; typedef fptype2 fptype2_sv; @@ -879,7 +879,7 @@ namespace mg5amcCpu #endif // Scalar-or-vector zeros: scalar in CUDA, vector or scalar in C++ -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ inline __host__ __device__ cxtype cxzero_sv(){ return cxtype( 0, 0 ); } #elif defined MGONGPU_CPPSIMD inline cxtype_v cxzero_sv() { return cxtype_v(); } // RRRR=0000 IIII=0000 diff --git a/epochX/cudacpp/gq_ttq.sa/src/rambo.h b/epochX/cudacpp/gq_ttq.sa/src/rambo.h index e02ea52496..cd7e1008ea 100644 --- a/epochX/cudacpp/gq_ttq.sa/src/rambo.h +++ b/epochX/cudacpp/gq_ttq.sa/src/rambo.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -18,7 +18,7 @@ #include // Simplified rambo version for 2 to N (with N>=2) processes with massless particles -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +83,7 @@ namespace mg5amcCpu static bool first = true; if( first ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if constexpr( M_ACCESS::isOnDevice() ) // avoid { const int ievt0 = 0; @@ -166,7 +166,7 @@ namespace mg5amcCpu wt = po2log; if( nparf != 2 ) wt = ( 2. * nparf - 4. ) * log( energy ) + z[nparf - 1]; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // issue warnings if weight is too small or too large static int iwarn[5] = { 0, 0, 0, 0, 0 }; if( wt < -180. ) diff --git a/epochX/cudacpp/heft_gg_h.sa/CODEGEN_cudacpp_heft_gg_h_log.txt b/epochX/cudacpp/heft_gg_h.sa/CODEGEN_cudacpp_heft_gg_h_log.txt index 800492306f..9bac4b3aae 100644 --- a/epochX/cudacpp/heft_gg_h.sa/CODEGEN_cudacpp_heft_gg_h_log.txt +++ b/epochX/cudacpp/heft_gg_h.sa/CODEGEN_cudacpp_heft_gg_h_log.txt @@ -52,7 +52,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt -No valid web browser found. Please set in ./input/mg5_configuration.txt +Using default web browser "firefox". 
Set another one in ./input/mg5_configuration.txt import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_h.mg The import format was not given, so we guess it as command set stdout_level DEBUG @@ -153,7 +153,7 @@ Generated helas calls for 1 subprocesses (1 diagrams) in 0.002 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVS3 routines -ALOHA: aloha creates 1 routines in 0.062 s +ALOHA: aloha creates 1 routines in 0.060 s VVS3 FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_h/src/./HelAmps_heft.h INFO: Created file HelAmps_heft.h in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_h/src/. @@ -165,7 +165,7 @@ INFO: Created files Parameters_heft.h and Parameters_heft.cc in directory INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_h/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_h/src/. quit -real 0m0.471s -user 0m0.367s -sys 0m0.052s -Code generation completed in 0 seconds +real 0m0.414s +user 0m0.350s +sys 0m0.059s +Code generation completed in 1 seconds diff --git a/epochX/cudacpp/heft_gg_h.sa/COPYRIGHT b/epochX/cudacpp/heft_gg_h.sa/COPYRIGHT index a134b5fef9..84a883fbb0 100644 --- a/epochX/cudacpp/heft_gg_h.sa/COPYRIGHT +++ b/epochX/cudacpp/heft_gg_h.sa/COPYRIGHT @@ -15,6 +15,7 @@ The full development team currently includes the following authors : Stephan Hageboeck (CERN) Olivier Mattelaer (Universite Catholique de Louvain, original author) Stefan Roiser (CERN, original author) + Joergen Teig (CERN) Andrea Valassi (CERN, original author) Zenny Wettersten (CERN) See https://github.com/madgraph5/madgraph4gpu for more details. For the full diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/Bridge.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/Bridge.h index bf8b5e024d..89437b4c42 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/Bridge.h +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/Bridge.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#ifndef BRIDGE_H #define BRIDGE_H 1 @@ -23,7 +23,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +83,7 @@ namespace mg5amcCpu Bridge& operator=( const Bridge& ) = delete; Bridge& operator=( Bridge&& ) = delete; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL /** * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != gpublocks*gputhreads * (this is needed for BridgeKernel tests rather than for actual production use in Fortran) @@ -150,7 +150,7 @@ namespace mg5amcCpu unsigned int m_nevt; // number of events int m_nGoodHel; // the number of good helicities (-1 initially when they have not yet been calculated) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL int m_gputhreads; // number of gpu threads (default set from number of events, can be modified) int m_gpublocks; // number of gpu blocks (default set from number of events, can be modified) DeviceBuffer m_devMomentaF; @@ -187,12 +187,12 @@ namespace mg5amcCpu // Forward declare transposition methods // -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL template void hst_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); @@ -209,7 +209,7 @@ namespace mg5amcCpu Bridge::Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ) : m_nevt( nevtF ) , m_nGoodHel( -1 ) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL , m_gputhreads( 256 ) // default number of gpu threads , m_gpublocks( m_nevt / m_gputhreads ) // this ensures m_nevt <= m_gpublocks*m_gputhreads , m_devMomentaF( m_nevt ) @@ -233,7 +233,7 @@ namespace mg5amcCpu { if( nparF != CPPProcess::npar ) throw std::runtime_error( "Bridge constructor: npar mismatch" ); if( np4F != CPPProcess::np4 ) throw std::runtime_error( "Bridge constructor: np4 mismatch" ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( ( m_nevt < s_gputhreadsmin ) || ( m_nevt % s_gputhreadsmin != 0 ) ) throw std::runtime_error( "Bridge constructor: nevt should be a multiple of " + std::to_string( s_gputhreadsmin ) ); while( m_nevt != m_gpublocks * m_gputhreads ) @@ -249,7 +249,7 @@ namespace mg5amcCpu #else std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate is called from several Fortran threads? 
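The constructor logic above, 'while( m_nevt != m_gpublocks * m_gputhreads )', amounts to the following sketch (illustrative only; the halving strategy and the value of the minimum-threads floor are assumptions, not shown verbatim in this hunk):

  #include <stdexcept>
  // Split nevt events into a GPU grid such that nevt == gpublocks * gputhreads exactly,
  // starting from the default of 256 threads per block (gputhreadsmin=16 is a placeholder)
  inline void splitEventsIntoGrid( const int nevt, int& gpublocks, int& gputhreads, const int gputhreadsmin = 16 )
  {
    gputhreads = 256;
    gpublocks = nevt / gputhreads;
    while( nevt != gpublocks * gputhreads )
    {
      gputhreads /= 2; // halve the block size until the product matches nevt
      if( gputhreads < gputhreadsmin )
        throw std::logic_error( "nevt must be a multiple of the minimum number of gpu threads" );
      gpublocks = nevt / gputhreads;
    }
  }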
@@ -262,7 +262,7 @@ namespace mg5amcCpu process.initProc( paramCard ); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void Bridge::set_gpugrid( const int gpublocks, const int gputhreads ) { @@ -276,7 +276,7 @@ namespace mg5amcCpu } #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void Bridge::gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -291,14 +291,14 @@ namespace mg5amcCpu constexpr int neppM = MemoryAccessMomenta::neppM; if constexpr( neppM == 1 && std::is_same_v ) { - checkCuda( cudaMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), gpuMemcpyHostToDevice ); } else { - checkCuda( cudaMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), gpuMemcpyHostToDevice ); const int thrPerEvt = CPPProcess::npar * CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 event per thread) //const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... this seems slower - dev_transposeMomentaF2C<<>>( m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); + gpuLaunchKernel( dev_transposeMomentaF2C, m_gpublocks * thrPerEvt, m_gputhreads, m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); } if constexpr( std::is_same_v ) { @@ -341,7 +341,7 @@ namespace mg5amcCpu } #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL template void Bridge::cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -396,7 +396,7 @@ namespace mg5amcCpu // - C++ array: momenta[npagM][npar][np4][neppM] with nevt=npagM*neppM (AOSOA) // -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ) { diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/BridgeKernels.cc b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/BridgeKernels.cc index d58066c9c1..eaf4037a24 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/BridgeKernels.cc +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/BridgeKernels.cc @@ -1,17 +1,18 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "BridgeKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMomenta.h" #include //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -45,7 +46,7 @@ namespace mg5amcCpu //============================================================================ -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu { @@ -96,7 +97,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/BridgeKernels.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/BridgeKernels.h index 15eb4bff4d..3efef8ce97 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/BridgeKernels.h +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/BridgeKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. 
// Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef BRIDGEKERNELS_H #define BRIDGEKERNELS_H 1 @@ -12,7 +12,7 @@ #include "MatrixElementKernels.h" #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -49,7 +49,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a CPU host class BridgeKernelHost final : public BridgeKernelBase { @@ -89,7 +89,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a GPU device class BridgeKernelDevice : public BridgeKernelBase { diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/CommonRandomNumberKernel.cc b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/CommonRandomNumberKernel.cc index 985b39f576..010bc4cbd0 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/CommonRandomNumberKernel.cc +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/CommonRandomNumberKernel.cc @@ -1,15 +1,16 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "CommonRandomNumbers.h" +#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/CrossSectionKernels.cc b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/CrossSectionKernels.cc index 0b355a3c8d..c15b39844d 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/CrossSectionKernels.cc +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/CrossSectionKernels.cc @@ -1,10 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
#include "CrossSectionKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessWeights.h" #include "MemoryBuffers.h" @@ -77,7 +78,7 @@ debug_me_is_abnormal( const fptype& me, size_t ievtALL ) //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -185,7 +186,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/CrossSectionKernels.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/CrossSectionKernels.h index 7933ca4bbf..4d9659e04e 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/CrossSectionKernels.h +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/CrossSectionKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef CROSSSECTIONKERNELS_H #define CROSSSECTIONKERNELS_H 1 @@ -13,7 +13,7 @@ //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -96,7 +96,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating the calculation of event statistics on a GPU device class CrossSectionKernelDevice : public CrossSectionKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/CudaRuntime.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/CudaRuntime.h deleted file mode 100644 index 64ce52f4b3..0000000000 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/CudaRuntime.h +++ /dev/null @@ -1,85 +0,0 @@ -// Copyright (C) 2020-2023 CERN and UCLouvain. -// Licensed under the GNU Lesser General Public License (version 3 or later). -// Created by: S. Roiser (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. - -#ifndef MG5AMC_CUDARUNTIME_H -#define MG5AMC_CUDARUNTIME_H 1 - -// MG5AMC on GPU uses the CUDA runtime API, not the lower level CUDA driver API -// See https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html#driver-vs-runtime-api - -#include -#include - -//-------------------------------------------------------------------------- - -// See https://stackoverflow.com/a/14038590 -#ifdef __CUDACC__ /* clang-format off */ -#define checkCuda( code ) { assertCuda( code, __FILE__, __LINE__ ); } -inline void assertCuda( cudaError_t code, const char* file, int line, bool abort = true ) -{ - if( code != cudaSuccess ) - { - printf( "ERROR! 
assertCuda: '%s' (%d) in %s:%d\n", cudaGetErrorString( code ), code, file, line ); - if( abort ) assert( code == cudaSuccess ); - } -} -#endif /* clang-format on */ - -//-------------------------------------------------------------------------- - -#ifdef __CUDACC__ -namespace mg5amcGpu -{ - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - // *** FIXME! This will all need to be designed differently when going to multi-GPU nodes! *** - struct CudaRuntime final - { - CudaRuntime( const bool debug = true ) - : m_debug( debug ) { setUp( m_debug ); } - ~CudaRuntime() { tearDown( m_debug ); } - CudaRuntime( const CudaRuntime& ) = delete; - CudaRuntime( CudaRuntime&& ) = delete; - CudaRuntime& operator=( const CudaRuntime& ) = delete; - CudaRuntime& operator=( CudaRuntime&& ) = delete; - bool m_debug; - - // Set up CUDA application - // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** - // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization - static void setUp( const bool debug = true ) - { - // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization - // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! - /* - // [We initially added cudaFree(0) to "ease profile analysis" only because it shows up as a big recognizable block!] - // No explicit initialization is needed: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#initialization - // It is not clear what cudaFree(0) does at all: https://stackoverflow.com/questions/69967813/ - if ( debug ) std::cout << "__CudaRuntime: calling cudaFree(0)" << std::endl; - checkCuda( cudaFree( 0 ) ); // SLOW! - */ - // Replace cudaFree(0) by cudaSetDevice(0), even if it is not really needed either - // (but see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs) - if( debug ) std::cout << "__CudaRuntime: calling cudaSetDevice(0)" << std::endl; - checkCuda( cudaSetDevice( 0 ) ); // SLOW! - } - - // Tear down CUDA application (call cudaDeviceReset) - // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** - // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck - // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking - static void tearDown( const bool debug = true ) - { - if( debug ) std::cout << "__CudaRuntime: calling cudaDeviceReset()" << std::endl; - checkCuda( cudaDeviceReset() ); - } - }; - -} -#endif - -//-------------------------------------------------------------------------- - -#endif // MG5AMC_CUDARUNTIME_H diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/CurandRandomNumberKernel.cc b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/CurandRandomNumberKernel.cc index eb56333b03..08a16f6f2c 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/CurandRandomNumberKernel.cc +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/CurandRandomNumberKernel.cc @@ -1,9 +1,9 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. 
Valassi (2021-2023) for the MG5aMC CUDACPP plugin. -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" @@ -22,7 +22,7 @@ inline void assertCurand( curandStatus_t code, const char *file, int line, bool } #endif /* clang-format on */ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -36,7 +36,7 @@ namespace mg5amcCpu { if( m_isOnDevice ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !m_rnarray.isOnDevice() ) throw std::runtime_error( "CurandRandomNumberKernel on device with a host random number array" ); #else @@ -114,7 +114,7 @@ namespace mg5amcCpu /* printf( "\nCurandRandomNumberKernel::generateRnarray size = %d\n", (int)m_rnarray.size() ); fptype* data = m_rnarray.data(); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( m_rnarray.isOnDevice() ) { data = new fptype[m_rnarray.size()](); @@ -123,7 +123,7 @@ namespace mg5amcCpu #endif for( int i = 0; i < ( (int)m_rnarray.size() / 4 ); i++ ) printf( "[%4d] %f %f %f %f\n", i * 4, data[i * 4], data[i * 4 + 2], data[i * 4 + 2], data[i * 4 + 3] ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( m_rnarray.isOnDevice() ) delete[] data; #endif */ diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/EventStatistics.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/EventStatistics.h index 48b51e0a49..b425a5bade 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/EventStatistics.h +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/EventStatistics.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef EventStatistics_H #define EventStatistics_H 1 @@ -16,7 +16,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/GpuAbstraction.h new file mode 100644 index 0000000000..6a7d9c05c0 --- /dev/null +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/GpuAbstraction.h @@ -0,0 +1,71 @@ +// Copyright (C) 2020-2023 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: J. Teig (Jul 2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
+
+#ifndef MG5AMC_GPUABSTRACTION_H
+#define MG5AMC_GPUABSTRACTION_H 1
+
+#include
+
+//--------------------------------------------------------------------------
+
+#ifdef __CUDACC__
+
+#define gpuError_t cudaError_t
+#define gpuPeekAtLastError cudaPeekAtLastError
+#define gpuGetErrorString cudaGetErrorString
+#define gpuSuccess cudaSuccess
+
+#define gpuMallocHost( ptr, size ) checkGpu( cudaMallocHost( ptr, size ) )
+#define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) )
+
+#define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( cudaMemcpy( dstData, srcData, srcBytes, func ) )
+#define gpuMemcpyHostToDevice cudaMemcpyHostToDevice
+#define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost
+#define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( cudaMemcpyToSymbol( type1, type2, size ) )
+
+#define gpuFree( ptr ) checkGpu( cudaFree( ptr ) )
+#define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) )
+
+#define gpuSetDevice cudaSetDevice
+#define gpuDeviceSynchronize cudaDeviceSynchronize
+#define gpuDeviceReset cudaDeviceReset
+
+#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ )
+#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ )
+
+//--------------------------------------------------------------------------
+
+#elif defined __HIPCC__
+
+#include "hip/hip_runtime.h"
+
+#define gpuError_t hipError_t
+#define gpuPeekAtLastError hipPeekAtLastError
+#define gpuGetErrorString hipGetErrorString
+#define gpuSuccess hipSuccess
+
+#define gpuMallocHost( ptr, size ) checkGpu( hipHostMalloc( ptr, size ) ) // hipHostMalloc is preferred over the deprecated hipMallocHost
+#define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) )
+
+#define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( hipMemcpy( dstData, srcData, srcBytes, func ) )
+#define gpuMemcpyHostToDevice hipMemcpyHostToDevice
+#define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost
+#define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( hipMemcpyToSymbol( type1, type2, size ) )
+
+#define gpuFree( ptr ) checkGpu( hipFree( ptr ) )
+#define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) )
+
+#define gpuSetDevice hipSetDevice
+#define gpuDeviceSynchronize hipDeviceSynchronize
+#define gpuDeviceReset hipDeviceReset
+
+#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ )
+#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ )
+
+//--------------------------------------------------------------------------
+
+#endif
+
+#endif // MG5AMC_GPUABSTRACTION_H
diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/GpuRuntime.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/GpuRuntime.h
new file mode 100644
index 0000000000..93579ef08b
--- /dev/null
+++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/GpuRuntime.h
@@ -0,0 +1,85 @@
+// Copyright (C) 2020-2023 CERN and UCLouvain.
+// Licensed under the GNU Lesser General Public License (version 3 or later).
+// Created by: J. Teig (Jun 2023, based on earlier work by S. Roiser) for the MG5aMC CUDACPP plugin.
+// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
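A minimal usage sketch for the GpuAbstraction.h macros above (illustrative only, not part of this patch; GPU builds only, and it assumes n is a multiple of 256): the same call site compiles under nvcc, where the macros expand to cudaMalloc / cudaMemcpy / kernel<<<...>>>, and under hipcc, where they expand to the hip equivalents:

  #include "GpuAbstraction.h"
  #include "GpuRuntime.h" // for checkGpu, used inside the gpuMalloc/gpuMemcpy/gpuFree macros
  __global__ void scaleByTwo( double* d, const int n )
  {
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    if( i < n ) d[i] *= 2.;
  }
  void scaleOnDevice( const double* hstIn, double* hstOut, const int n ) // assume n % 256 == 0
  {
    double* dev = nullptr;
    gpuMalloc( (void**)&dev, n * sizeof( double ) );
    gpuMemcpy( dev, hstIn, n * sizeof( double ), gpuMemcpyHostToDevice );
    gpuLaunchKernel( scaleByTwo, n / 256, 256, dev, n ); // i.e. scaleByTwo<<<n / 256, 256>>>( dev, n )
    checkGpu( gpuPeekAtLastError() );
    gpuMemcpy( hstOut, dev, n * sizeof( double ), gpuMemcpyDeviceToHost );
    gpuFree( dev );
  }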
+
+#ifndef MG5AMC_GPURUNTIME_H
+#define MG5AMC_GPURUNTIME_H 1
+
+// MG5AMC on GPU uses the CUDA runtime API, not the lower level CUDA driver API
+// See https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html#driver-vs-runtime-api
+
+#include "GpuAbstraction.h"
+
+#include
+
+//--------------------------------------------------------------------------
+
+// See https://stackoverflow.com/a/14038590
+#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */
+#define checkGpu( code ) { assertGpu( code, __FILE__, __LINE__ ); }
+inline void assertGpu( gpuError_t code, const char* file, int line, bool abort = true )
+{
+  if( code != gpuSuccess )
+  {
+    printf( "ERROR! assertGpu: '%s' (%d) in %s:%d\n", gpuGetErrorString( code ), code, file, line );
+    if( abort ) assert( code == gpuSuccess );
+  }
+}
+#endif /* clang-format on */
+
+//--------------------------------------------------------------------------
+
+#ifdef MGONGPUCPP_GPUIMPL
+namespace mg5amcGpu
+{
+  // Instantiate a GpuRuntime at the beginning of the application's main to
+  // invoke gpuSetDevice(0) in the constructor and book a gpuDeviceReset() call in the destructor
+  // *** FIXME! This will all need to be designed differently when going to multi-GPU nodes! ***
+  struct GpuRuntime final
+  {
+    GpuRuntime( const bool debug = true )
+      : m_debug( debug ) { setUp( m_debug ); }
+    ~GpuRuntime() { tearDown( m_debug ); }
+    GpuRuntime( const GpuRuntime& ) = delete;
+    GpuRuntime( GpuRuntime&& ) = delete;
+    GpuRuntime& operator=( const GpuRuntime& ) = delete;
+    GpuRuntime& operator=( GpuRuntime&& ) = delete;
+    bool m_debug;
+
+    // Set up CUDA application
+    // ** NB: strictly speaking this is not needed when using the CUDA runtime API **
+    // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization
+    static void setUp( const bool debug = true )
+    {
+      // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization
+      // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer!
+      /*
+      // [We initially added cudaFree(0) to "ease profile analysis" only because it shows up as a big recognizable block!]
+      // No explicit initialization is needed: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#initialization
+      // It is not clear what cudaFree(0) does at all: https://stackoverflow.com/questions/69967813/
+      if ( debug ) std::cout << "__CudaRuntime: calling cudaFree(0)" << std::endl;
+      checkCuda( cudaFree( 0 ) ); // SLOW!
+      */
+      // Replace cudaFree(0) by cudaSetDevice(0), even if it is not really needed either
+      // (but see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs)
+      if( debug ) std::cout << "__GpuRuntime: calling GpuSetDevice(0)" << std::endl;
+      checkGpu( gpuSetDevice( 0 ) ); // SLOW!
+ } + + // Tear down CUDA application (call cudaDeviceReset) + // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** + // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck + // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking + static void tearDown( const bool debug = true ) + { + if( debug ) std::cout << "__GpuRuntime: calling GpuDeviceReset()" << std::endl; + checkGpu( gpuDeviceReset() ); + } + }; +} +#endif + +//-------------------------------------------------------------------------- + +#endif // MG5AMC_GPURUNTIME_H diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MadgraphTest.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MadgraphTest.h index ef40624c88..a64c05c26a 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MadgraphTest.h +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MadgraphTest.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Dec 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #ifndef MADGRAPHTEST_H_ #define MADGRAPHTEST_H_ 1 @@ -22,7 +22,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; @@ -201,7 +201,7 @@ class MadgraphTest : public testing::TestWithParam // Since we link both the CPU-only and GPU tests into the same executable, we prevent // a multiply defined symbol by only compiling this in the non-CUDA phase: -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL /// Compare momenta and matrix elements. /// This uses an implementation of TestDriverBase to run a madgraph workflow, @@ -307,6 +307,6 @@ TEST_P( MadgraphTest, CompareMomentaAndME ) } } -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL #endif /* MADGRAPHTEST_H_ */ diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MatrixElementKernels.cc index 74b5239ebf..81699dfea9 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MatrixElementKernels.cc @@ -1,12 +1,12 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "MatrixElementKernels.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" // Includes the abstraction for Nvidia/AMD compilation #include "MemoryAccessMomenta.h" #include "MemoryBuffers.h" @@ -14,7 +14,7 @@ //============================================================================ -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu { @@ -150,7 +150,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { @@ -209,13 +209,13 @@ namespace mg5amcGpu PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); DeviceBufferHelicityMask devIsGoodHel( ncomb ); // ... 0d1. 
Compute good helicity mask on the device
-    computeDependentCouplings<<<m_gpublocks, m_gputhreads>>>( m_gs.data(), m_couplings.data() );
+    gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() );
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    sigmaKin_getGoodHel<<<m_gpublocks, m_gputhreads>>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() );
+    gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() );
 #else
-    sigmaKin_getGoodHel<<<m_gpublocks, m_gputhreads>>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() );
+    gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() );
 #endif
-    checkCuda( cudaPeekAtLastError() );
+    checkGpu( gpuPeekAtLastError() );
     // ... 0d2. Copy back good helicity mask to the host
     copyHostFromDevice( hstIsGoodHel, devIsGoodHel );
     // ... 0d3. Copy back good helicity list to constant memory on the device
@@ -226,19 +226,19 @@
   void MatrixElementKernelDevice::computeMatrixElements( const unsigned int channelId )
   {
-    computeDependentCouplings<<<m_gpublocks, m_gputhreads>>>( m_gs.data(), m_couplings.data() );
+    gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() );
 #ifndef MGONGPU_NSIGHT_DEBUG
     constexpr unsigned int sharedMemSize = 0;
 #else
     constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float );
 #endif
 #ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-    sigmaKin<<<m_gpublocks, m_gputhreads, sharedMemSize>>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() );
+    gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() );
 #else
-    sigmaKin<<<m_gpublocks, m_gputhreads, sharedMemSize>>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() );
+    gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() );
 #endif
-    checkCuda( cudaPeekAtLastError() );
-    checkCuda( cudaDeviceSynchronize() );
+    checkGpu( gpuPeekAtLastError() );
+    checkGpu( gpuDeviceSynchronize() );
   }
 
   //--------------------------------------------------------------------------
 
diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MatrixElementKernels.h
index 23e84757a2..72bd8f195b 100644
--- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MatrixElementKernels.h
+++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MatrixElementKernels.h
@@ -1,7 +1,7 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
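
To spell out the mechanical rewrite applied throughout MatrixElementKernels.cc above: under nvcc the new macros expand back to the original triple-chevron launches, so the change is purely syntactic, and hipcc accepts the same expansion. A sketch with a hypothetical kernel and buffers (not part of the patch):

  __global__ void myKernel( const double* in, double* out ); // hypothetical

  void launchExample( int gpublocks, int gputhreads, const double* devIn, double* devOut )
  {
    // Before this patch (CUDA-only syntax):
    //   myKernel<<<gpublocks, gputhreads>>>( devIn, devOut );
    // After this patch (identical after preprocessing under nvcc):
    gpuLaunchKernel( myKernel, gpublocks, gputhreads, devIn, devOut );
    // Variant with dynamic shared memory (cf. the sigmaKin launch and MGONGPU_NSIGHT_DEBUG above):
    //   gpuLaunchKernelSharedMem( myKernel, gpublocks, gputhreads, sharedMemSize, devIn, devOut );
  }
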
#ifndef MATRIXELEMENTKERNELS_H #define MATRIXELEMENTKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -81,7 +81,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a CPU host class MatrixElementKernelHost final : public MatrixElementKernelBase, public NumberOfEvents { @@ -130,7 +130,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a GPU device class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessAmplitudes.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessAmplitudes.h index 573b3bbbc9..ffb76e93de 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessAmplitudes.h +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessAmplitudes.h @@ -15,7 +15,7 @@ #define MGONGPU_TRIVIAL_AMPLITUDES 1 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessCouplings.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessCouplings.h index d65c9d6e04..85c3c9ed1c 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessCouplings.h +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessCouplings.h @@ -15,7 +15,7 @@ #include "MemoryBuffers.h" // for HostBufferCouplings::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessCouplingsFixed.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessCouplingsFixed.h index dc0d93afff..ffcdf4dbef 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessCouplingsFixed.h +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessCouplingsFixed.h @@ -14,7 +14,7 @@ //#include "MemoryAccessHelpers.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessDenominators.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessDenominators.h index 3bce635718..66f2d32a6b 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessDenominators.h +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessDenominators.h @@ -10,7 +10,7 @@ #include "MemoryAccessGs.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessGs.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessGs.h index 31311aa375..4c726b30f3 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessGs.h 
+++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessGs.h @@ -13,7 +13,7 @@ #include "MemoryBuffers.h" // for HostBufferMatrixElements::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessHelpers.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessHelpers.h index c82a6c7635..db73e4e064 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessHelpers.h +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessHelpers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessHelpers_H #define MemoryAccessHelpers_H 1 @@ -105,7 +105,7 @@ class KernelAccessHelper : public MemoryAccessHelper } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid //printf( "kernelAccessRecord: ievt=%d threadId=%d\n", ievt, threadIdx.x ); return T::ieventAccessRecord( buffer, ievt ); // NB fptype and fptype_sv coincide for CUDA diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessMatrixElements.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessMatrixElements.h index f32e6fea5b..3741011971 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessMatrixElements.h +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessMatrixElements.h @@ -13,7 +13,7 @@ #include "MemoryBuffers.h" // for HostBufferMatrixElements::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessMomenta.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessMomenta.h index 29266de32c..3be229d392 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessMomenta.h +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessMomenta.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
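
The hunk recurring in all of these MemoryAccess*.h headers is one and the same idiom: each translation unit is compiled into mg5amcGpu for GPU builds and into mg5amcCpu for C++ builds, now keyed on the implementation-neutral MGONGPUCPP_GPUIMPL (presumably defined in mgOnGpuConfig.h, which this patch also modifies) instead of the CUDA-specific __CUDACC__. A minimal sketch of the pattern, with a hypothetical helper name:

  #include "mgOnGpuConfig.h" // defines MGONGPUCPP_GPUIMPL for CUDA and HIP builds

  #ifdef MGONGPUCPP_GPUIMPL
  namespace mg5amcGpu
  #else
  namespace mg5amcCpu
  #endif
  {
    // identical source, compiled once per namespace; types such as fptype_sv
    // are defined differently for CPU and GPU builds (see #318 and #725)
    void exampleAccessHelper(); // hypothetical
  }
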
#ifndef MemoryAccessMomenta_H #define MemoryAccessMomenta_H 1 @@ -13,7 +13,7 @@ #include "MemoryAccessVectors.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -30,7 +30,7 @@ namespace mg5amcCpu // Number of Events Per Page in the momenta AOSOA memory buffer layout // (these are all best kept as a compile-time constants: see issue #23) -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ // ----------------------------------------------------------------------------------------------- // --- GPUs: neppM is best set to a power of 2 times the number of fptype's in a 32-byte cacheline // --- This is relevant to ensure coalesced access to momenta in global memory diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessNumerators.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessNumerators.h index b152183b28..18991f4fa6 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessNumerators.h +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessNumerators.h @@ -10,7 +10,7 @@ #include "MemoryAccessGs.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessRandomNumbers.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessRandomNumbers.h index e2988d39f3..40cb089135 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessRandomNumbers.h +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessRandomNumbers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessRandomNumbers_H #define MemoryAccessRandomNumbers_H 1 @@ -11,7 +11,7 @@ #include "CPPProcess.h" #include "MemoryAccessHelpers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessVectors.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessVectors.h index e9b197368e..08faccff0f 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessVectors.h +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessVectors.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
#ifndef MemoryAccessVectors_H
#define MemoryAccessVectors_H 1
@@ -10,7 +10,7 @@
 
 #include "mgOnGpuVectors.h"
 
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
 namespace mg5amcCpu // this is only needed for CPU SIMD vectorization
 {
diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessWavefunctions.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessWavefunctions.h
index 5428aaf933..33bef4559e 100644
--- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessWavefunctions.h
+++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessWavefunctions.h
@@ -15,7 +15,7 @@
 #define MGONGPU_TRIVIAL_WAVEFUNCTIONS 1
 
 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725)
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 namespace mg5amcGpu
 #else
 namespace mg5amcCpu
diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryBuffers.h
index 8109470148..78004e66cc 100644
--- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryBuffers.h
+++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryBuffers.h
@@ -1,7 +1,7 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Dec 2021, based on earlier work by S. Hageboeck) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.
 
 #ifndef MemoryBuffers_H
 #define MemoryBuffers_H 1
@@ -11,12 +11,12 @@
 #include "mgOnGpuCxtypes.h"
 
 #include "CPPProcess.h"
-#include "CudaRuntime.h"
+#include "GpuRuntime.h"
 #include "Parameters_heft.h"
 
 #include <sstream>
 
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 namespace mg5amcGpu
 #else
 namespace mg5amcCpu
@@ -87,7 +87,7 @@ namespace mg5amcCpu
 
   //--------------------------------------------------------------------------
 
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
 
   constexpr bool HostBufferALIGNED = false;   // ismisaligned=false
   constexpr bool HostBufferMISALIGNED = true; // ismisaligned=true
@@ -119,7 +119,7 @@
 
   //--------------------------------------------------------------------------
 
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
   // A class encapsulating a CUDA pinned host buffer
   template
   class PinnedHostBufferBase : public BufferBase
   {
     PinnedHostBufferBase( const size_t size )
       : BufferBase( size, false )
     {
-      checkCuda( cudaMallocHost( &( this->m_data ), this->bytes() ) );
+      gpuMallocHost( &( this->m_data ), this->bytes() );
     }
     virtual ~PinnedHostBufferBase()
     {
-      checkCuda( cudaFreeHost( this->m_data ) );
+      gpuFreeHost( this->m_data );
     }
   };
 #endif
 
   //--------------------------------------------------------------------------
 
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
   // A class encapsulating a CUDA device buffer
   template
   class DeviceBufferBase : public BufferBase
   {
     DeviceBufferBase( const size_t size )
       : BufferBase( size, true )
     {
-      checkCuda( cudaMalloc( &( this->m_data ), this->bytes() ) );
+      gpuMalloc( &( this->m_data ), this->bytes() );
     }
     virtual ~DeviceBufferBase()
     {
-      checkCuda( cudaFree( this->m_data ) );
+      gpuFree( this->m_data );
     }
   };
 #endif
 
   //--------------------------------------------------------------------------
 
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
   // A class encapsulating a C++ host buffer for a given number of events
template class HostBuffer : public HostBufferBase, virtual private NumberOfEvents @@ -175,7 +175,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer for a given number of events template class PinnedHostBuffer : public PinnedHostBufferBase, virtual private NumberOfEvents @@ -191,7 +191,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer for a given number of events template class DeviceBuffer : public DeviceBufferBase, virtual private NumberOfEvents @@ -213,7 +213,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta random numbers constexpr size_t sizePerEventRndNumMomenta = MemoryBuffers::np4 * MemoryBuffers::nparf; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta random numbers typedef HostBuffer HostBufferRndNumMomenta; #else @@ -232,7 +232,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer with ONE fptype per event constexpr size_t sizePerEventOneFp = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer with ONE fptype per event typedef HostBuffer HostBufferOneFp; #else @@ -257,7 +257,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for Gs constexpr size_t sizePerEventGs = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferGs; #else @@ -276,7 +276,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for numerators constexpr size_t sizePerEventNumerators = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferNumerators; #else @@ -296,7 +296,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for denominators constexpr size_t sizePerEventDenominators = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferDenominators; #else @@ -315,7 +315,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for random numbers constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferCouplings; #else @@ -333,7 +333,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta constexpr size_t sizePerEventMomenta = MemoryBuffers::np4 * MemoryBuffers::npar; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta typedef HostBuffer HostBufferMomenta; //typedef HostBuffer HostBufferMomenta; // TEST MISALIGNMENT! 
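
Each of these typedef blocks (momenta random numbers and momenta above, weights, matrix elements and so on below) instantiates the same triple for one physics quantity: a HostBuffer for C++-only builds, and a PinnedHostBuffer plus DeviceBuffer for GPU builds. A hedged sketch of how client code selects them, mirroring the allocation pattern used by check_sa.cc later in this patch:

  #ifndef MGONGPUCPP_GPUIMPL
    HostBufferMomenta hstMomenta( nevt );       // plain host memory (C++ builds)
  #else
    PinnedHostBufferMomenta hstMomenta( nevt ); // pinned host memory via gpuMallocHost
    DeviceBufferMomenta devMomenta( nevt );     // device memory via gpuMalloc
  #endif
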
@@ -352,7 +352,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for sampling weights constexpr size_t sizePerEventWeights = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for sampling weights typedef HostBuffer HostBufferWeights; #else @@ -370,7 +370,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for matrix elements constexpr size_t sizePerEventMatrixElements = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for matrix elements typedef HostBuffer HostBufferMatrixElements; #else @@ -385,7 +385,7 @@ namespace mg5amcCpu // A base class encapsulating a memory buffer for the helicity mask typedef BufferBase BufferHelicityMask; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for the helicity mask typedef HostBufferBase HostBufferHelicityMask; #else @@ -403,7 +403,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for wavefunctions constexpr size_t sizePerEventWavefunctions = MemoryBuffers::nw6 * MemoryBuffers::nx2; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for wavefunctions typedef HostBuffer HostBufferWavefunctions; #else @@ -421,7 +421,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity random numbers constexpr size_t sizePerEventRndNumHelicity = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity random numbers typedef HostBuffer HostBufferRndNumHelicity; #else @@ -439,7 +439,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color random numbers constexpr size_t sizePerEventRndNumColor = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color random numbers typedef HostBuffer HostBufferRndNumColor; #else @@ -457,7 +457,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity selection constexpr size_t sizePerEventSelectedHelicity = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity selection typedef HostBuffer HostBufferSelectedHelicity; #else @@ -475,7 +475,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color selection constexpr size_t sizePerEventSelectedColor = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color selection typedef HostBuffer HostBufferSelectedColor; #else @@ -487,7 +487,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy { @@ -504,13 +504,13 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyHostToDevice ); } #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void copyHostFromDevice( Tdst& dst, const Tsrc& src ) // keep the same order 
of arguments as in memcpy { @@ -527,7 +527,7 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyDeviceToHost ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyDeviceToHost ); } #endif diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/CPPProcess.cc b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/CPPProcess.cc index 526bd7d296..3b6085c784 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/CPPProcess.cc +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_heft.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -46,7 +45,7 @@ // Class member functions for calculating the matrix elements for // Process: g g > h HIG<=1 HIW<=1 WEIGHTED<=2 @1 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -80,7 +79,7 @@ namespace mg5amcCpu //__device__ const fptype* cIPD = nullptr; // unused as nparam=0 __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //__device__ __constant__ fptype* cIPD = nullptr; // unused as nparam=0 __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -90,7 +89,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -118,13 +117,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -151,7 +150,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -187,7 +186,7 @@ namespace 
mg5amcCpu #endif for( int iParity = 0; iParity < nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings @@ -200,8 +199,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -268,7 +269,7 @@ namespace mg5amcCpu // [NB do keep 'static' for these constexpr arrays, see issue #283] static constexpr fptype2 cf[ncolor][ncolor] = { { 2 } }; // 2-D array[1][1] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -325,7 +326,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -384,7 +385,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -419,8 +420,8 @@ namespace mg5amcCpu { -1, 1, 0 }, { 1, -1, 0 }, { 1, 1, 0 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -459,9 +460,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory //const fptype tIPD[0] = { ... }; // nparam=0 //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef __CUDACC__ - //checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 0 * sizeof( fptype ) ) ); // nparam=0 - //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 +#ifdef MGONGPUCPP_GPUIMPL + //gpuMemcpyToSymbol( cIPD, tIPD, 0 * sizeof( fptype ) ); // nparam=0 + //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #else //memcpy( cIPD, tIPD, 0 * sizeof( fptype ) ); // nparam=0 //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -495,7 +496,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] 
// [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -560,12 +561,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -586,7 +587,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -712,9 +713,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -738,7 +739,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -758,7 +759,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 256 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -772,9 +773,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -802,7 +806,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -1012,7 +1016,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/CPPProcess.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/CPPProcess.h index dbc5aa0e4e..e1caef360b 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/CPPProcess.h +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -107,7 +107,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -120,7 +120,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -150,7 +150,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/CudaRuntime.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/GpuAbstraction.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/GpuAbstraction.h new file mode 120000 index 0000000000..72054e19ba --- /dev/null +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/GpuAbstraction.h @@ -0,0 +1 @@ +../GpuAbstraction.h \ No newline at end of file diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/GpuRuntime.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/GpuRuntime.h new file mode 120000 index 0000000000..3920e83be4 --- /dev/null +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/GpuRuntime.h @@ -0,0 +1 @@ +../GpuRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/check_sa.cc 
b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/check_sa.cc index 3fbf0ffbee..7cac5ab47b 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/check_sa.cc +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -65,7 +66,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -96,7 +97,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -134,9 +135,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784) + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) +#elif defined __HIPCC__ +#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random #elif defined __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU if build has curand + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #endif @@ -146,10 +149,10 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) 
bool bridge = false;
@@ -177,7 +180,7 @@ main( int argc, char** argv )
     else if( arg == "--curdev" )
     {
 #ifndef __CUDACC__
-      throw std::runtime_error( "CurandDevice is not supported on CPUs" );
+      throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" );
 #elif defined MGONGPU_HAS_NO_CURAND
       throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" );
 #else
@@ -198,7 +201,7 @@
     }
     else if( arg == "--rmbdev" )
     {
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
       rmbsmp = RamboSamplingMode::RamboDevice;
 #else
       throw std::runtime_error( "RamboDevice is not supported on CPUs" );
@@ -272,13 +275,13 @@
     return usage( argv[0] );
   }
 
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
 #ifdef _OPENMP
   ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1)
 #endif
 #endif
 
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
   // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation
   // Note: this prevents a crash on pmpe04 but not on some github CI nodes?
   // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!]
@@ -296,14 +299,14 @@
 
   // === STEP 0 - INITIALISE
 
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 
-  // --- 00. Initialise cuda
-  // Instantiate a CudaRuntime at the beginnining of the application's main to
-  // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor
-  const std::string cdinKey = "00 CudaInit";
+  // --- 00. Initialise GPU
+  // Instantiate a GpuRuntime at the beginning of the application's main.
+  // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor.
+  const std::string cdinKey = "00 GpuInit";
   timermap.start( cdinKey );
-  CudaRuntime cudaRuntime( debug );
+  GpuRuntime GpuRuntime( debug );
 #endif
 
  // --- 0a.
Initialise physics process @@ -325,7 +328,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -333,7 +336,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -341,7 +344,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -349,7 +352,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -366,7 +369,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -375,7 +378,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -384,7 +387,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -392,7 +395,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -400,7 +403,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -438,7 +441,7 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! 
CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement)
 #endif
   }
@@ -450,7 +453,7 @@
   }
   else
   {
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
     prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) );
 #else
     throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement)
@@ -461,7 +464,7 @@
   std::unique_ptr<MatrixElementKernelBase> pmek;
   if( !bridge )
   {
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
     pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) );
 #else
     pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) );
@@ -469,7 +472,7 @@
   }
   else
   {
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
     pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) );
 #else
     pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) );
@@ -511,7 +514,7 @@
     prnk->generateRnarray();
     //std::cout << "Got random numbers" << std::endl;
 
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
     if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice )
     {
       // --- 1c. Copy rndmom from host to device
@@ -543,7 +546,7 @@
     prsk->getMomentaFinal();
     //std::cout << "Got final momenta" << std::endl;
 
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
     if( rmbsmp == RamboSamplingMode::RamboDevice )
     {
       // --- 2c. CopyDToH Weights
@@ -588,7 +591,7 @@
       dynamic_cast<BridgeKernelBase*>( pmek.get() )->transposeInputMomentaC2F();
     }
 
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
     // --- 2d. CopyHToD Momenta
     const std::string gKey = "0.. CpHTDg";
     rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER!
@@ -617,7 +620,7 @@
     wv3atime += timermap.stop(); // calc only
     wavetime += wv3atime;        // calc plus copy
 
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
     if( !bridge )
     {
       // --- 3b. CopyDToH MEs
@@ -760,18 +763,22 @@
     rndgentxt = "CURAND DEVICE";
 #ifdef __CUDACC__
   rndgentxt += " (CUDA code)";
+#elif defined __HIPCC__
+  rndgentxt += " (HIP code)";
 #else
   rndgentxt += " (C++ code)";
 #endif
 
   // Workflow description summary
   std::string wrkflwtxt;
-  // -- CUDA or C++?
+  // -- CUDA or HIP or C++?
 #ifdef __CUDACC__
   wrkflwtxt += "CUD:";
+#elif defined __HIPCC__
  wrkflwtxt += "HIP:";
 #else
   wrkflwtxt += "CPP:";
-#endif
+#endif /* clang-format off */
   // -- DOUBLE or FLOAT?
 #if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
   wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537)
@@ -781,7 +788,7 @@
   wrkflwtxt += "FLT+";
 #else
   wrkflwtxt += "???+"; // no path to this statement
-#endif
+#endif /* clang-format on */
   // -- CUCOMPLEX or THRUST or STD complex numbers?
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -793,6 +800,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_CUCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -818,7 +831,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -874,7 +887,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -895,6 +908,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -921,21 +936,21 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? " == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -966,7 +981,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1062,14 +1077,14 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1077,7 +1092,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? 
" == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/RamboSamplingKernels.cc b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/RamboSamplingKernels.cc index da68aa9255..79abbcc4f8 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/RamboSamplingKernels.cc +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/RamboSamplingKernels.cc @@ -1,11 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "RamboSamplingKernels.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessRandomNumbers.h" #include "MemoryAccessWeights.h" @@ -14,7 +14,7 @@ #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -92,7 +92,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingKernelDevice::RamboSamplingKernelDevice( const fptype energy, // input: energy const BufferRndNumMomenta& rndmom, // input: random numbers in [0,1] BufferMomenta& momenta, // output: momenta @@ -135,7 +135,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaInitialDevice( const fptype energy, fptype* momenta ) @@ -147,17 +147,17 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaInitial() { - getMomentaInitialDevice<<>>( m_energy, m_momenta.data() ); + gpuLaunchKernel( getMomentaInitialDevice, m_gpublocks, m_gputhreads, m_energy, m_momenta.data() ); } #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaFinalDevice( const fptype energy, const fptype* rndmom, @@ -171,11 +171,11 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaFinal() { - getMomentaFinalDevice<<>>( m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); + gpuLaunchKernel( getMomentaFinalDevice, m_gpublocks, m_gputhreads, m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); } #endif diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/RamboSamplingKernels.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/RamboSamplingKernels.h index 184089efd7..7c214cd74b 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/RamboSamplingKernels.h +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/RamboSamplingKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#ifndef RAMBOSAMPLINGKERNELS_H #define RAMBOSAMPLINGKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating RAMBO phase space sampling on a GPU device class RamboSamplingKernelDevice final : public SamplingKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/RandomNumberKernels.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/RandomNumberKernels.h index 188a72c2c9..21d63beeac 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/RandomNumberKernels.h +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/RandomNumberKernels.h @@ -1,14 +1,14 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef RANDOMNUMBERKERNELS_H #define RANDOMNUMBERKERNELS_H 1 #include "mgOnGpuConfig.h" -// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when __CUDACC__ is not defined +// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when MGONGPUCPP_GPUIMPL is not defined #ifndef MGONGPU_HAS_NO_CURAND //#include "curand.h" struct curandGenerator_st; // forward definition from curand.h @@ -16,7 +16,7 @@ struct curandGenerator_st; // forward definition from curand.h #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/cudacpp.mk index 509307506b..f2cfa349da 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ # Copyright (C) 2020-2023 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +# Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts @@ -42,10 +42,10 @@ endif #------------------------------------------------------------------------------- -#=== Configure common compiler flags for C++ and CUDA +#=== Configure common compiler flags for C++ and CUDA/HIP INCFLAGS = -I. 
-OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here +OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here # Dependency on src directory MG5AMC_COMMONLIB = mg5amc_common @@ -121,24 +121,46 @@ endif #------------------------------------------------------------------------------- -#=== Configure the CUDA compiler +#=== Configure the GPU compiler (CUDA or HIP) -# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) -# This is because it is impossible to pass this to "CUFLAGS += -ccbin " below +# FIXME! (AV 24.01.2024) +# In the current implementation (without separate builds for C++ and CUDA/HIP), we first check for nvcc and hipcc in CUDA_HOME and HIP_HOME. +# If CUDA_HOME or HIP_HOME are not set, try to determine them from the path to nvcc and hipcc. +# While convoluted, this is currently necessary to allow disabling CUDA/HIP builds by setting CUDA_HOME or HIP_HOME to invalid paths. +# This will (probably?) be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775). + +# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA and HIP builds (issue #505) +# This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside - $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") + $(warning CUDA and HIP builds are not supported for multi-word CXX "$(CXX)") override CUDA_HOME=disabled + override HIP_HOME=disabled endif -# If CUDA_HOME is not set, try to set it from the location of nvcc +# If CUDA_HOME is not set, try to set it from the path to nvcc ifndef CUDA_HOME CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null)) $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") endif -# Set NVCC as $(CUDA_HOME)/bin/nvcc if it exists +# If HIP_HOME is not set, try to set it from the path to hipcc +ifndef HIP_HOME + HIP_HOME = $(patsubst %bin/hipcc,%,$(HIP_COMPILER_PATH)) + $(warning HIP_HOME was not set: using "$(HIP_HOME)") +endif + +# FIXME! (AV 24.01.2024) +# In the current implementation (without separate builds for C++ and CUDA/HIP), +# builds are performed for HIP only if CUDA is not found in the path. +# If both CUDA and HIP are installed, HIP builds can be triggered by unsetting CUDA_HOME. +# This will be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775). + +#--- Option 1: CUDA exists -> use CUDA + +# Set GPUCC as $(CUDA_HOME)/bin/nvcc if it exists ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) - NVCC = $(CUDA_HOME)/bin/nvcc + + GPUCC = $(CUDA_HOME)/bin/nvcc USE_NVTX ?=-DUSE_NVTX # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ @@ -158,41 +180,78 @@ ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here!
endif CUOPTFLAGS = -lineinfo - CUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math - ###CUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow - ###NVCC_VERSION = $(shell $(NVCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) - CUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h + ###GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + GPUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + ###GPUCC_VERSION = $(shell $(GPUCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) + GPUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + CUBUILDRULEFLAGS = -Xcompiler -fPIC -c + CCBUILDRULEFLAGS = -Xcompiler -fPIC -c -x cu + CUDATESTFLAGS = -lcuda + + # Set the host C++ compiler for GPUCC via "-ccbin " + # (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) + GPUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) + + # Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) + ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) + GPUFLAGS += -allow-unsupported-compiler + endif + else ifneq ($(origin REQUIRE_CUDA),undefined) + # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) - $(error No cuda installation found (set CUDA_HOME or make nvcc visible in PATH)) + $(error No cuda installation found (set CUDA_HOME or make GPUCC visible in PATH)) + +#--- Option 2: CUDA does not exist, HIP exists -> use HIP + +# Set GPUCC as $(HIP_HOME)/bin/hipcc if it exists +else ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),) + + GPUCC = $(HIP_HOME)/bin/hipcc + #USE_NVTX ?=-DUSE_NVTX # should maybe find something equivalent to this in HIP? 
+ HIPARCHFLAGS = -target x86_64-linux-gnu --offload-arch=gfx90a + HIPINC = -I$(HIP_HOME)/include/ + # Note: -DHIP_FAST_MATH is equivalent to -use_fast_math in HIP + # (but only for single precision line 208: https://rocm-developer-tools.github.io/HIP/hcc__detail_2math__functions_8h_source.html) + GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -DHIP_FAST_MATH -DHIP_PLATFORM=amd -fPIC + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + GPUFLAGS += -std=c++17 + ###GPUFLAGS+= --maxrregcount 255 # (AV: is this option valid on HIP and meaningful on AMD GPUs?) + CUBUILDRULEFLAGS = -fPIC -c + CCBUILDRULEFLAGS = -fPIC -c + +else ifneq ($(origin REQUIRE_HIP),undefined) + + # If REQUIRE_HIP is set but no HIP is found, stop here (e.g. for CI tests on GPU #443) + $(error No hip installation found (set HIP_HOME or make GPUCC visible in PATH)) + +#--- Option 3: CUDA does not exist, HIP does not exist -> switch off both CUDA and HIP + else - # No cuda. Switch cuda compilation off and go to common random numbers in C++ + + # No nvcc and no hipcc: switch CUDA and HIP compilation off and go to common random numbers in C++ $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda) + $(warning HIP_HOME is not set or is invalid: export HIP_HOME to compile with hip) + override GPUCC= override USE_NVTX= override CUINC= override CURANDLIBFLAGS= -endif -export NVCC -export CUFLAGS - -# Set the host C++ compiler for nvcc via "-ccbin " -# (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) -CUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) -# Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) -ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) -CUFLAGS += -allow-unsupported-compiler endif +# Export GPUCC (so that it can also be used in cudacpp_src.mk?) +export GPUCC +export GPUFLAGS + #------------------------------------------------------------------------------- -#=== Configure ccache for C++ and CUDA builds +#=== Configure ccache for C++ and CUDA/HIP builds # Enable ccache if USECCACHE=1 ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) @@ -201,15 +260,15 @@ endif #ifeq ($(USECCACHE)$(shell echo $(AR) | grep ccache),1) # override AR:=ccache $(AR) #endif -ifneq ($(NVCC),) - ifeq ($(USECCACHE)$(shell echo $(NVCC) | grep ccache),1) - override NVCC:=ccache $(NVCC) +ifneq ($(GPUCC),) + ifeq ($(USECCACHE)$(shell echo $(GPUCC) | grep ccache),1) + override GPUCC:=ccache $(GPUCC) endif endif #------------------------------------------------------------------------------- -#=== Configure PowerPC-specific compiler flags for C++ and CUDA +#=== Configure PowerPC-specific compiler flags for C++ and CUDA/HIP # PowerPC-specific CXX compiler flags (being reviewed) ifeq ($(UNAME_P),ppc64le) @@ -225,9 +284,9 @@ else ######CXXFLAGS+= -fno-semantic-interposition # no benefit (neither alone, nor combined with -flto) endif -# PowerPC-specific CUDA compiler flags (to be reviewed!) +# PowerPC-specific CUDA/HIP compiler flags (to be reviewed!)
ifeq ($(UNAME_P),ppc64le) - CUFLAGS+= -Xcompiler -mno-float128 + GPUFLAGS+= -Xcompiler -mno-float128 endif #------------------------------------------------------------------------------- @@ -237,7 +296,7 @@ endif # Set the default OMPFLAGS choice ifneq ($(shell $(CXX) --version | egrep '^Intel'),) override OMPFLAGS = -fopenmp -###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without nvcc but not ok with nvcc before #578) +###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without GPUCC but not ok with GPUCC before #578) else ifneq ($(shell $(CXX) --version | egrep '^(clang)'),) override OMPFLAGS = -fopenmp ###override OMPFLAGS = # disable OpenMP MT on clang (was not ok without or with nvcc before #578) @@ -293,7 +352,10 @@ endif # Set the default RNDGEN (random number generator) choice ifeq ($(RNDGEN),) - ifeq ($(NVCC),) + ifeq ($(GPUCC),) + override RNDGEN = hasNoCurand + # Edgecase for HIP compilation + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) override RNDGEN = hasNoCurand else ifeq ($(RNDGEN),) override RNDGEN = hasCurand @@ -310,7 +372,7 @@ export OMPFLAGS #------------------------------------------------------------------------------- -#=== Set the CUDA/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN +#=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN # Set the build flags appropriate to OMPFLAGS $(info OMPFLAGS=$(OMPFLAGS)) @@ -368,13 +430,13 @@ CXXFLAGS+= $(AVXFLAGS) $(info FPTYPE=$(FPTYPE)) ifeq ($(FPTYPE),d) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE - CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE else ifeq ($(FPTYPE),f) CXXFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT - CUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT else ifeq ($(FPTYPE),m) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT - CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT else $(error Unknown FPTYPE='$(FPTYPE)': only 'd', 'f' and 'm' are supported) endif @@ -383,7 +445,7 @@ endif $(info HELINL=$(HELINL)) ifeq ($(HELINL),1) CXXFLAGS += -DMGONGPU_INLINE_HELAMPS - CUFLAGS += -DMGONGPU_INLINE_HELAMPS + GPUFLAGS += -DMGONGPU_INLINE_HELAMPS else ifneq ($(HELINL),0) $(error Unknown HELINL='$(HELINL)': only '0' and '1' are supported) endif @@ -392,7 +454,7 @@ endif $(info HRDCOD=$(HRDCOD)) ifeq ($(HRDCOD),1) CXXFLAGS += -DMGONGPU_HARDCODE_PARAM - CUFLAGS += -DMGONGPU_HARDCODE_PARAM + GPUFLAGS += -DMGONGPU_HARDCODE_PARAM else ifneq ($(HRDCOD),0) $(error Unknown HRDCOD='$(HRDCOD)': only '0' and '1' are supported) endif @@ -444,11 +506,11 @@ ifeq ($(UNAME_S),Darwin) override CULIBFLAGSRPATH2 = else # RPATH to cuda/cpp libs when linking executables - override CXXLIBFLAGSRPATH = -Wl,-rpath,$(LIBDIRRPATH) - override CULIBFLAGSRPATH = -Xlinker -rpath,$(LIBDIRRPATH) + override CXXLIBFLAGSRPATH = -Wl,-rpath=$(LIBDIRRPATH) + override CULIBFLAGSRPATH = -Xlinker -rpath=$(LIBDIRRPATH) # RPATH to common lib when linking cuda/cpp libs - override CXXLIBFLAGSRPATH2 = -Wl,-rpath,'$$ORIGIN' - override CULIBFLAGSRPATH2 = -Xlinker -rpath,'$$ORIGIN' + override CXXLIBFLAGSRPATH2 = -Wl,-rpath='$$ORIGIN' + override CULIBFLAGSRPATH2 = -Xlinker -rpath='$$ORIGIN' endif # Setting LD_LIBRARY_PATH or DYLD_LIBRARY_PATH in the RUNTIME is no longer necessary 
(neither on Linux nor on Mac) @@ -461,7 +523,7 @@ override RUNTIME = cxx_main=$(BUILDDIR)/check.exe fcxx_main=$(BUILDDIR)/fcheck.exe -ifneq ($(NVCC),) +ifneq ($(GPUCC),) cu_main=$(BUILDDIR)/gcheck.exe fcu_main=$(BUILDDIR)/fgcheck.exe else @@ -492,15 +554,16 @@ $(BUILDDIR)/.build.$(TAG): @touch $(BUILDDIR)/.build.$(TAG) # Generic target and build rules: objects from CUDA compilation -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/%.o : %.cu *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CUBUILDRULEFLAGS) $< -o $@ $(BUILDDIR)/%_cu.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CCBUILDRULEFLAGS) $< -o $@ endif +# -x cu in line above # Generic target and build rules: objects from C++ compilation # (NB do not include CUINC here! add it only for NVTX or curand #679) @@ -509,11 +572,14 @@ $(BUILDDIR)/%.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) $(CXX) $(CPPFLAGS) $(CXXFLAGS) -fPIC -c $< -o $@ # Apply special build flags only to CrossSectionKernel.cc and gCrossSectionKernel.cu (no fast math, see #117 and #516) +# Added edgecase for HIP compilation ifeq ($(shell $(CXX) --version | grep ^nvc++),) $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS := $(filter-out -ffast-math,$(CXXFLAGS)) $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math -ifneq ($(NVCC),) -$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -fno-fast-math +ifeq ($(findstring nvcc,$(GPUCC)),nvcc) + $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -fno-fast-math +else + $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -fno-fast-math endif endif @@ -530,10 +596,10 @@ ifeq ($(RNDGEN),hasCurand) $(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CUINC) endif -# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in nvcc with icx2023 (#592) +# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in GPUCC with icx2023 (#592) ifneq ($(shell $(CXX) --version | egrep '^(Intel)'),) -ifneq ($(NVCC),) -CUFLAGS += -Xcompiler -Wno-deprecated-builtins +ifneq ($(GPUCC),) +GPUFLAGS += -Wno-deprecated-builtins endif endif @@ -541,8 +607,8 @@ endif # This patch does remove the warning, but I prefer to keep it disabled for the moment... 
###ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang|Intel)'),) ###$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -Wno-overriding-t-option -###ifneq ($(NVCC),) -###$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -Wno-overriding-t-option +###ifneq ($(GPUCC),) +###$(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -Wno-overriding-t-option ###endif ###endif @@ -569,7 +635,7 @@ MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp cxx_objects_lib=$(BUILDDIR)/CPPProcess.o $(BUILDDIR)/MatrixElementKernels.o $(BUILDDIR)/BridgeKernels.o $(BUILDDIR)/CrossSectionKernels.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel.o $(BUILDDIR)/RamboSamplingKernels.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) MG5AMC_CULIB = mg5amc_$(processid_short)_cuda cu_objects_lib=$(BUILDDIR)/gCPPProcess.o $(BUILDDIR)/gMatrixElementKernels.o $(BUILDDIR)/gBridgeKernels.o $(BUILDDIR)/gCrossSectionKernels.o cu_objects_exe=$(BUILDDIR)/gCommonRandomNumberKernel.o $(BUILDDIR)/gRamboSamplingKernels.o @@ -581,11 +647,11 @@ $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: cxx_objects_lib += $(BUILDDIR)/fbridge.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(CXX) -shared -o $@ $(cxx_objects_lib) $(CXXLIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: cu_objects_lib += $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cu_objects_lib) - $(NVCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) endif #------------------------------------------------------------------------------- @@ -602,16 +668,16 @@ $(cxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PAT $(cxx_main): $(BUILDDIR)/check_sa.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CXX) -o $@ $(BUILDDIR)/check_sa.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CURANDLIBFLAGS) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(cu_main): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(cu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(cu_main): $(BUILDDIR)/gcheck_sa.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o - $(NVCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) + $(GPUCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) endif 
#------------------------------------------------------------------------------- @@ -637,17 +703,17 @@ $(fcxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PA $(fcxx_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') endif ifeq ($(UNAME_S),Darwin) $(fcu_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(fcu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(fcu_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) - $(NVCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) + $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) endif #------------------------------------------------------------------------------- @@ -659,7 +725,7 @@ $(BUILDDIR)/testxxx.o: testxxx_cc_ref.txt $(testmain): $(BUILDDIR)/testxxx.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testxxx.o # Comment out this line to skip the C++ test of xxx functions -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testxxx_cu.o: $(GTESTLIBS) $(BUILDDIR)/testxxx_cu.o: INCFLAGS += $(GTESTINC) $(BUILDDIR)/testxxx_cu.o: testxxx_cc_ref.txt @@ -672,7 +738,7 @@ $(BUILDDIR)/testmisc.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testmisc.o # Comment out this line to skip the C++ miscellaneous tests -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testmisc_cu.o: $(GTESTLIBS) $(BUILDDIR)/testmisc_cu.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc_cu.o @@ -684,12 +750,12 @@ $(BUILDDIR)/runTest.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/runTest.o $(testmain): cxx_objects_exe += $(BUILDDIR)/runTest.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/runTest_cu.o: $(GTESTLIBS) $(BUILDDIR)/runTest_cu.o: INCFLAGS += $(GTESTINC) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(testmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif @@ -713,14 +779,14 @@ $(testmain): LIBFLAGS += -lgomp endif endif -ifeq 
($(NVCC),) # link only runTest.o +ifeq ($(GPUCC),) # link only runTest.o $(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS) $(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS) else # link both runTest.o and runTest_cu.o $(testmain): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) $(GTESTLIBS) - $(NVCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) -lcuda + $(GPUCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS) endif # Use target gtestlibs to build only googletest @@ -829,9 +895,9 @@ ifeq ($(USECCACHE),1) ccache --version | head -1 endif @echo "" - @echo NVCC=$(NVCC) -ifneq ($(NVCC),) - $(NVCC) --version + @echo GPUCC=$(GPUCC) +ifneq ($(GPUCC),) + $(GPUCC) --version endif @echo "" @echo CXX=$(CXX) @@ -850,7 +916,7 @@ endif # Target: check (run the C++ test executable) # [NB THIS IS WHAT IS USED IN THE GITHUB CI!] -ifneq ($(NVCC),) +ifneq ($(GPUCC),) check: runTest cmpFcheck cmpFGcheck else check: runTest cmpFcheck diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/fbridge.cc b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/fbridge.cc index 2d2b36d560..22ce3f5115 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/fbridge.cc +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/fbridge.cc @@ -1,11 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Oct 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "Bridge.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" extern "C" { @@ -22,7 +22,7 @@ extern "C" * Using the same Fortran MadEvent code, linking to the heterogeneous library would allow access to both CPU and GPU implementations. * The specific heterogeneous configuration (how many GPUs, how many threads on each CPU, etc) could be loaded in CUDA/C++ from a data file.
*/ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -46,8 +46,8 @@ extern "C" */ void fbridgecreate_( CppObjectInFortran** ppbridge, const int* pnevtF, const int* pnparF, const int* pnp4F ) { -#ifdef __CUDACC__ - CudaRuntime::setUp(); +#ifdef MGONGPUCPP_GPUIMPL + GpuRuntime::setUp(); #endif // (NB: CPPProcess::initProc no longer needs to be executed here because it is called in the Bridge constructor) // FIXME: disable OMP in Bridge when called from Fortran @@ -65,8 +65,8 @@ extern "C" Bridge* pbridge = dynamic_cast*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgedelete_: invalid Bridge address" ); delete pbridge; -#ifdef __CUDACC__ - CudaRuntime::tearDown(); +#ifdef MGONGPUCPP_GPUIMPL + GpuRuntime::tearDown(); #endif } @@ -96,7 +96,7 @@ extern "C" { Bridge* pbridge = dynamic_cast*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgesequence_: invalid Bridge address" ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Use the device/GPU implementation in the CUDA library // (there is also a host implementation in this library) pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, *pchannelId, mes, selhel, selcol ); diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/fsampler.cc b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/fsampler.cc index 2fb445372d..3743934f41 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/fsampler.cc +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/fsampler.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Feb 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "mgOnGpuConfig.h" @@ -13,7 +13,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -40,7 +40,7 @@ namespace mg5amcCpu private: const int m_nevt; // The number of events in each iteration int m_iiter; // The iteration counter (for random number seeding) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta m_hstRndmom; // Memory buffers for random numbers HostBufferMomenta m_hstMomenta; // Memory buffers for momenta HostBufferWeights m_hstWeights; // Memory buffers for sampling weights @@ -105,7 +105,7 @@ namespace mg5amcCpu extern "C" { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/runTest.cc b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/runTest.cc index d4a760a71b..de327f2321 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/runTest.cc +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/runTest.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
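For reference, a minimal sketch of the GpuRuntime::setUp/tearDown pair called from fbridge.cc above; this is a hypothetical illustration (the real GpuRuntime.h introduced by this patch may differ, and error checking is omitted for brevity). Backend-neutral aliases map to the corresponding CUDA or HIP runtime calls:

#ifdef __CUDACC__
#define gpuFree cudaFree
#define gpuDeviceReset cudaDeviceReset
#elif defined __HIPCC__
#include <hip/hip_runtime.h>
#define gpuFree hipFree
#define gpuDeviceReset hipDeviceReset
#endif
struct GpuRuntime final
{
  static void setUp() { gpuFree( 0 ); }        // freeing a null pointer is a cheap no-op that forces early context creation
  static void tearDown() { gpuDeviceReset(); } // reset the device on shutdown, e.g. for cuda-memcheck --leak-check full
};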
//---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -18,7 +18,7 @@ #include "RandomNumberKernels.h" #include "epoch_process_id.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -35,7 +35,7 @@ struct CUDA_CPU_TestBase : public TestDriverBase : TestDriverBase( npar, refFileName ) {} }; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL struct CPUTest : public CUDA_CPU_TestBase { // Struct data members (process, and memory structures for random numbers, momenta, matrix elements and weights on host and device) @@ -119,7 +119,7 @@ struct CPUTest : public CUDA_CPU_TestBase }; #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL struct CUDATest : public CUDA_CPU_TestBase { // Reset the device when our test goes out of scope. Note that this should happen after @@ -128,7 +128,7 @@ struct CUDATest : public CUDA_CPU_TestBase { ~DeviceReset() { - checkCuda( cudaDeviceReset() ); // this is needed by cuda-memcheck --leak-check full + checkGpu( gpuDeviceReset() ); // this is needed by cuda-memcheck --leak-check full } } deviceResetter; @@ -256,7 +256,7 @@ INSTANTIATE_TEST_SUITE_P( prefix, \ test_suite_name, \ testing::Values( new CUDATest( MG_EPOCH_REFERENCE_FILE_NAME ) ) ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL MG_INSTANTIATE_TEST_SUITE_GPU( XTESTID_GPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); #else MG_INSTANTIATE_TEST_SUITE_CPU( XTESTID_CPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/testmisc.cc b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/testmisc.cc index 895d6eeb56..ba9e59a8a3 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/testmisc.cc +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/testmisc.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*misc to run only testmisc.cc tests //---------------------------------------------------------------------------- @@ -17,7 +17,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_MISC #else #define TESTID( s ) s##_CPU_MISC @@ -26,7 +26,7 @@ #define XTESTID( s ) TESTID( s ) // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -59,7 +59,7 @@ namespace mg5amcCpu TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testmisc ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/testxxx.cc b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/testxxx.cc index a1c3cdc238..688cb8167b 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/testxxx.cc +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/testxxx.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. 
// Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Apr 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -24,7 +24,7 @@ #include #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_XXX #else #define TESTID( s ) s##_CPU_XXX @@ -32,7 +32,7 @@ #define XTESTID( s ) TESTID( s ) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -42,7 +42,7 @@ namespace mg5amcCpu int FPEhandlerIevt = -1; inline void FPEhandler( int sig ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL std::cerr << "Floating Point Exception (GPU): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; #else std::cerr << "Floating Point Exception (CPU neppV=" << neppV << "): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; @@ -53,7 +53,7 @@ namespace mg5amcCpu TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -77,7 +77,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) assert( nevt % neppM == 0 ); // nevt must be a multiple of neppM assert( nevt % neppV == 0 ); // nevt must be a multiple of neppV // Fill in the input momenta -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL mg5amcGpu::PinnedHostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] #else mg5amcCpu::HostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] @@ -322,7 +322,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { for( int ievt = 0; ievt < nevt; ievt++ ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/heft_gg_h.sa/src/HelAmps_heft.h b/epochX/cudacpp/heft_gg_h.sa/src/HelAmps_heft.h index eae9ff5242..dbff117235 100644 --- a/epochX/cudacpp/heft_gg_h.sa/src/HelAmps_heft.h +++ b/epochX/cudacpp/heft_gg_h.sa/src/HelAmps_heft.h @@ -5,7 +5,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -28,7 +28,7 @@ //#include //#include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/heft_gg_h.sa/src/Parameters_heft.cc b/epochX/cudacpp/heft_gg_h.sa/src/Parameters_heft.cc index e5442756b1..d3d6058b46 100644 --- a/epochX/cudacpp/heft_gg_h.sa/src/Parameters_heft.cc +++ b/epochX/cudacpp/heft_gg_h.sa/src/Parameters_heft.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. 
-// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -17,7 +17,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/heft_gg_h.sa/src/Parameters_heft.h b/epochX/cudacpp/heft_gg_h.sa/src/Parameters_heft.h index 790485fee0..c2be5bba97 100644 --- a/epochX/cudacpp/heft_gg_h.sa/src/Parameters_heft.h +++ b/epochX/cudacpp/heft_gg_h.sa/src/Parameters_heft.h @@ -28,7 +28,7 @@ #include "read_slha.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -94,7 +94,7 @@ namespace mg5amcCpu #include // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -230,7 +230,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -247,7 +247,7 @@ namespace mg5amcCpu #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wunused-variable" // e.g. <> #pragma GCC diagnostic ignored "-Wunused-parameter" // e.g. <> -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma nv_diagnostic push #pragma nv_diag_suppress 177 // e.g. <> #endif @@ -298,7 +298,7 @@ namespace mg5amcCpu // End non-SM (e.g. EFT) implementation - special handling of vectors of floats (#439) return out; } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma GCC diagnostic pop #pragma nv_diagnostic pop #endif diff --git a/epochX/cudacpp/heft_gg_h.sa/src/cudacpp_src.mk b/epochX/cudacpp/heft_gg_h.sa/src/cudacpp_src.mk index 0bd815c9b3..998d3c84fa 100644 --- a/epochX/cudacpp/heft_gg_h.sa/src/cudacpp_src.mk +++ b/epochX/cudacpp/heft_gg_h.sa/src/cudacpp_src.mk @@ -19,7 +19,7 @@ SHELL := /bin/bash #=== Configure common compiler flags for CUDA and C++ INCFLAGS = -I. 
-OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here +OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here #------------------------------------------------------------------------------- @@ -45,13 +45,13 @@ endif #------------------------------------------------------------------------------- -#=== Configure the CUDA compiler (note: NVCC is already exported including ccache) +#=== Configure the CUDA compiler (note: GPUCC is already exported including ccache) -###$(info NVCC=$(NVCC)) +###$(info GPUCC=$(GPUCC)) #------------------------------------------------------------------------------- -#=== Configure ccache for C++ builds (note: NVCC is already exported including ccache) +#=== Configure ccache for C++ builds (note: GPUCC is already exported including ccache) # Enable ccache if USECCACHE=1 ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) @@ -92,6 +92,13 @@ endif ###$(info OMPFLAGS=$(OMPFLAGS)) CXXFLAGS += $(OMPFLAGS) +# Add the build rule flags appropriate to the GPU compiler (nvcc for CUDA, hipcc for HIP) +ifeq ($(findstring nvcc,$(GPUCC)),nvcc) + GPUFLAGS += -Xcompiler -fPIC -c -x cu +else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) + GPUFLAGS += -fPIC -c +endif + # Set the build flags appropriate to each AVX choice (example: "make AVX=none") # [NB MGONGPU_PVW512 is needed because "-mprefer-vector-width=256" is not exposed in a macro] # [See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=96476] @@ -253,20 +260,20 @@ $(BUILDDIR)/%.o : %.cc *.h $(BUILDDIR)/.build.$(TAG) # Generic target and build rules: objects from CUDA compilation $(BUILDDIR)/%_cu.o : %.cc *.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $< -o $@ #------------------------------------------------------------------------------- cxx_objects=$(addprefix $(BUILDDIR)/, Parameters_heft.o read_slha.o) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) cu_objects=$(addprefix $(BUILDDIR)/, Parameters_heft_cu.o) endif # Target (and build rules): common (src) library -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) $(cu_objects) @if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi - $(NVCC) -shared -o $@ $(cxx_objects) $(cu_objects) $(LDFLAGS) + $(GPUCC) -shared -o $@ $(cxx_objects) $(cu_objects) $(LDFLAGS) else $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) @if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi diff --git a/epochX/cudacpp/heft_gg_h.sa/src/mgOnGpuConfig.h b/epochX/cudacpp/heft_gg_h.sa/src/mgOnGpuConfig.h index b247654dcf..da4ba36ad8 100644 --- a/epochX/cudacpp/heft_gg_h.sa/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/heft_gg_h.sa/src/mgOnGpuConfig.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
#ifndef MGONGPUCONFIG_H #define MGONGPUCONFIG_H 1 @@ -10,12 +10,25 @@ // There are two different code bases for standalone_cudacpp (without multichannel) and madevent+cudacpp (with multichannel) #undef MGONGPU_SUPPORTS_MULTICHANNEL +// Is this a GPU (CUDA, HIP) or CPU implementation? +#ifdef __CUDACC__ +#define MGONGPUCPP_GPUIMPL cuda +#elif defined __HIPCC__ +#define MGONGPUCPP_GPUIMPL hip +#else +#undef MGONGPUCPP_GPUIMPL +#endif + // ** NB1 Throughputs (e.g. 6.8E8) are events/sec for "./gcheck.exe -p 65536 128 12" // ** NB2 Baseline on b7g47n0004 fluctuates (probably depends on load on other VMs) // Choose if curand is supported for generating random numbers +// For HIP, by default, do not use curand (common random numbers will be used instead) // For both CUDA and C++, by default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_CURAND -// (there exist CUDA installations, e.g. using the HPC package, which do not include curand - see PR #784) +// (there exist CUDA installations, e.g. using the HPC package, which do not include curand - see PR #784 and #785) +#if defined __HIPCC__ +#define MGONGPU_HAS_NO_CURAND 1 +#else //#ifdef __CUDACC__ //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 @@ -23,6 +36,7 @@ //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 //#endif +#endif // Choose floating point precision (for everything but color algebra #537) // If one of these macros has been set from outside with e.g. -DMGONGPU_FPTYPE_FLOAT, nothing happens (issue #167) @@ -54,23 +68,28 @@ //#undef MGONGPU_HARDCODE_PARAM // default ////#define MGONGPU_HARDCODE_PARAM 1 -// Complex type in c++: std::complex or cxsmpl (CHOOSE ONLY ONE) -#ifndef __CUDACC__ -//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) -#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) -#endif - -// Complex type in cuda: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) +// Complex type in CUDA: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) #ifdef __CUDACC__ #define MGONGPU_CUCXTYPE_THRUST 1 // default (~1.15E9/double, ~3.2E9/float) //#define MGONGPU_CUCXTYPE_CUCOMPLEX 1 // ~10 percent slower (1.03E9/double, ~2.8E9/float) //#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) + +// Complex type in HIP: cxsmpl (ONLY ONE OPTION POSSIBLE) +#elif defined __HIPCC__ +#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) + +// Complex type in C++: std::complex or cxsmpl (CHOOSE ONLY ONE) +#else +//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) +#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif -// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ -#undef MGONGPU_NSIGHT_DEBUG // default +#undef MGONGPU_NSIGHT_DEBUG // default in CUDA //#define MGONGPU_NSIGHT_DEBUG 1 +#else +#undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif // SANITY CHECKS (floating point precision for everything but color algebra #537) @@ -86,17 +105,21 @@ #error You cannot use double precision for color algebra and single precision elsewhere #endif -// SANITY CHECKS (c++ complex number implementation) -#ifndef __CUDACC__ -#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and 
defined MGONGPU_CPPCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL +// SANITY CHECKS (CUDA complex number implementation) +#ifdef __CUDACC__ +#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX for CUDA +#elif defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CXSMPL for CUDA +#elif defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE OF MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL for CUDA #endif #endif -// SANITY CHECKS (cuda complex number implementation) -#ifdef __CUDACC__ -#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL +// SANITY CHECKS (C++ complex number implementation) +#ifndef MGONGPUCPP_GPUIMPL +#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL for C++ #endif #endif @@ -134,7 +157,7 @@ namespace mgOnGpu // Alignment requirement for using reinterpret_cast with SIMD vectorized code // (using reinterpret_cast with non aligned memory may lead to segmentation faults!) // Only needed for C++ code but can be enforced also in NVCC builds of C++ code using CUDA>=11.2 and C++17 (#318, #319, #333) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL constexpr int cppAlign = 64; // alignment requirement for SIMD vectorization (64-byte i.e. 
512-bit) #endif @@ -145,7 +168,7 @@ using mgOnGpu::fptype; using mgOnGpu::fptype2; // C++ SIMD vectorization width (this will be used to set neppV) -#ifdef __CUDACC__ // CUDA implementation has no SIMD +#ifdef MGONGPUCPP_GPUIMPL // CUDA and HIP implementations have no SIMD #undef MGONGPU_CPPSIMD #elif defined __AVX512VL__ && defined MGONGPU_PVW512 // C++ "512z" AVX512 with 512 width (512-bit ie 64-byte): 8 (DOUBLE) or 16 (FLOAT) #ifdef MGONGPU_FPTYPE_DOUBLE @@ -175,9 +198,9 @@ using mgOnGpu::fptype2; #undef MGONGPU_CPPSIMD #endif -// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation // Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) #if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */ #define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; #define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } #define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } @@ -189,8 +212,8 @@ using mgOnGpu::fptype2; #define mgDebugFinalise() { /*noop*/ } #endif /* clang-format on */ -// Define empty CUDA declaration specifiers for C++ -#ifndef __CUDACC__ +// Define empty CUDA/HIP declaration specifiers for C++ +#ifndef MGONGPUCPP_GPUIMPL #define __global__ #define __host__ #define __device__ diff --git a/epochX/cudacpp/heft_gg_h.sa/src/mgOnGpuCxtypes.h b/epochX/cudacpp/heft_gg_h.sa/src/mgOnGpuCxtypes.h index ca9a9f00c0..5532e22fa1 100644 --- a/epochX/cudacpp/heft_gg_h.sa/src/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/heft_gg_h.sa/src/mgOnGpuCxtypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022, based on earlier work by D. Smith) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
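To summarise the convention applied throughout this patch: MGONGPUCPP_GPUIMPL (defined in mgOnGpuConfig.h above whenever __CUDACC__ or __HIPCC__ is defined) selects the GPU code path, while backend-specific choices such as the complex type still key off the individual compiler macros. A minimal usage sketch (illustrative only):

#include "mgOnGpuConfig.h"
#ifdef MGONGPUCPP_GPUIMPL
namespace mg5amcGpu // GPU build: the same sources compiled by nvcc or hipcc
#else
namespace mg5amcCpu // CPU build: the same sources compiled by the host C++ compiler
#endif
{
  // Identical code compiled twice lands in two distinct namespaces (see #318 and #725),
  // so the CUDA/HIP and C++ implementations can coexist in a single application.
}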
#ifndef MGONGPUCXTYPES_H #define MGONGPUCXTYPES_H 1 @@ -19,7 +19,7 @@ #include // Complex type in cuda: thrust or cucomplex or cxsmpl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #if defined MGONGPU_CUCXTYPE_THRUST #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wtautological-compare" // for icpx2021/clang13 (https://stackoverflow.com/a/15864661) @@ -82,7 +82,7 @@ namespace mgOnGpu /* clang-format off */ using mgOnGpu::cxsmpl; // Printout to stream for user defined types -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -92,7 +92,7 @@ namespace mg5amcCpu inline __host__ std::ostream& operator<<( std::ostream& out, const cxsmpl& c ) { - out << std::complex( c.real(), c.imag() ); + out << std::complex( c.real(), c.imag() ); return out; } @@ -215,14 +215,14 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu #endif { // --- Type definitions (complex type: cxtype) -#ifdef __CUDACC__ // cuda +#ifdef MGONGPUCPP_GPUIMPL // cuda #if defined MGONGPU_CUCXTYPE_THRUST typedef thrust::complex cxtype; #elif defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -255,7 +255,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -307,7 +307,7 @@ namespace mg5amcCpu //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust +#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust //------------------------------ // CUDA - using thrust::complex @@ -343,11 +343,11 @@ namespace mg5amcCpu return c; } -#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST +#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex +#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex //------------------------------ // CUDA - using cuComplex @@ -562,11 +562,11 @@ namespace mg5amcCpu return cxmake( c.real(), c.imag() ); } -#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX //========================================================================== -#if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex +#if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex //------------------------------ // C++ - using std::complex @@ -610,7 +610,7 @@ namespace mg5amcCpu } #endif -#endif // #if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX +#endif // #if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX //========================================================================== @@ -633,7 +633,7 @@ namespace mg5amcCpu //========================================================================== // 
NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/heft_gg_h.sa/src/mgOnGpuFptypes.h b/epochX/cudacpp/heft_gg_h.sa/src/mgOnGpuFptypes.h index 905c97d700..fa3a02664b 100644 --- a/epochX/cudacpp/heft_gg_h.sa/src/mgOnGpuFptypes.h +++ b/epochX/cudacpp/heft_gg_h.sa/src/mgOnGpuFptypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUFPTYPES_H #define MGONGPUFPTYPES_H 1 @@ -12,7 +12,7 @@ #include // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // cuda namespace mg5amcGpu #else namespace mg5amcCpu @@ -20,7 +20,7 @@ namespace mg5amcCpu { //========================================================================== -#ifdef __CUDACC__ // cuda +#ifdef MGONGPUCPP_GPUIMPL // cuda //------------------------------ // Floating point types - Cuda @@ -64,11 +64,11 @@ namespace mg5amcCpu #endif } -#endif // #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //------------------------------ // Floating point types - C++ @@ -92,7 +92,7 @@ namespace mg5amcCpu return std::sqrt( f ); } -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== diff --git a/epochX/cudacpp/heft_gg_h.sa/src/mgOnGpuVectors.h b/epochX/cudacpp/heft_gg_h.sa/src/mgOnGpuVectors.h index e1299ba81e..cdae04326b 100644 --- a/epochX/cudacpp/heft_gg_h.sa/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/heft_gg_h.sa/src/mgOnGpuVectors.h @@ -32,7 +32,7 @@ #endif // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -131,7 +131,7 @@ namespace mg5amcCpu #endif #endif -#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef __CUDACC__) +#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef MGONGPUCPP_GPUIMPL) const int neppV = 1; @@ -153,13 +153,13 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu #endif { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Printout to stream for user defined types @@ -805,11 +805,11 @@ namespace mg5amcCpu #endif // #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //------------------------------ // Vector types - CUDA @@ -853,12 +853,12 @@ namespace mg5amcCpu 
return mask; } -#endif // #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== // Scalar-or-vector types: scalar in CUDA, vector or scalar in C++ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL typedef bool bool_sv; typedef fptype fptype_sv; typedef fptype2 fptype2_sv; @@ -879,7 +879,7 @@ namespace mg5amcCpu #endif // Scalar-or-vector zeros: scalar in CUDA, vector or scalar in C++ -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ inline __host__ __device__ cxtype cxzero_sv(){ return cxtype( 0, 0 ); } #elif defined MGONGPU_CPPSIMD inline cxtype_v cxzero_sv() { return cxtype_v(); } // RRRR=0000 IIII=0000 diff --git a/epochX/cudacpp/heft_gg_h.sa/src/rambo.h b/epochX/cudacpp/heft_gg_h.sa/src/rambo.h index e02ea52496..cd7e1008ea 100644 --- a/epochX/cudacpp/heft_gg_h.sa/src/rambo.h +++ b/epochX/cudacpp/heft_gg_h.sa/src/rambo.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -18,7 +18,7 @@ #include // Simplified rambo version for 2 to N (with N>=2) processes with massless particles -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +83,7 @@ namespace mg5amcCpu static bool first = true; if( first ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if constexpr( M_ACCESS::isOnDevice() ) // avoid { const int ievt0 = 0; @@ -166,7 +166,7 @@ namespace mg5amcCpu wt = po2log; if( nparf != 2 ) wt = ( 2. * nparf - 4. ) * log( energy ) + z[nparf - 1]; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // issue warnings if weight is too small or too large static int iwarn[5] = { 0, 0, 0, 0, 0 }; if( wt < -180. ) diff --git a/epochX/cudacpp/pp_tt012j.mad/CODEGEN_mad_pp_tt012j_log.txt b/epochX/cudacpp/pp_tt012j.mad/CODEGEN_mad_pp_tt012j_log.txt index ff161c336f..adfd21027c 100644 --- a/epochX/cudacpp/pp_tt012j.mad/CODEGEN_mad_pp_tt012j_log.txt +++ b/epochX/cudacpp/pp_tt012j.mad/CODEGEN_mad_pp_tt012j_log.txt @@ -52,7 +52,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt -No valid web browser found. Please set in ./input/mg5_configuration.txt +Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j.mg The import format was not given, so we guess it as command set stdout_level DEBUG @@ -61,7 +61,7 @@ set zerowidth_tchannel F define j = p INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005424976348876953  +DEBUG: model prefixing takes 0.0053827762603759766  INFO: Restrict model sm with file models/sm/restrict_default.dat . 
DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -172,7 +172,7 @@ INFO: Process u~ u > t t~ added to mirror process u u~ > t t~ INFO: Process c~ c > t t~ added to mirror process c c~ > t t~ INFO: Process d~ d > t t~ added to mirror process d d~ > t t~ INFO: Process s~ s > t t~ added to mirror process s s~ > t t~ -5 processes with 7 diagrams generated in 0.030 s +5 processes with 7 diagrams generated in 0.029 s Total: 5 processes with 7 diagrams add process p p > t t~ j @1 INFO: Checking for minimal orders which gives processes. @@ -212,7 +212,7 @@ INFO: Process d~ g > t t~ d~ added to mirror process g d~ > t t~ d~ INFO: Process d~ d > t t~ g added to mirror process d d~ > t t~ g INFO: Process s~ g > t t~ s~ added to mirror process g s~ > t t~ s~ INFO: Process s~ s > t t~ g added to mirror process s s~ > t t~ g -13 processes with 76 diagrams generated in 0.139 s +13 processes with 76 diagrams generated in 0.136 s Total: 18 processes with 83 diagrams add process p p > t t~ j j @2 INFO: Checking for minimal orders which gives processes. @@ -378,7 +378,7 @@ INFO: Process s~ u~ > t t~ u~ s~ added to mirror process u~ s~ > t t~ u~ s~ INFO: Process s~ c~ > t t~ c~ s~ added to mirror process c~ s~ > t t~ c~ s~ INFO: Process s~ d~ > t t~ d~ s~ added to mirror process d~ s~ > t t~ d~ s~ INFO: Crossed process found for s~ s~ > t t~ s~ s~, reuse diagrams. -65 processes with 1119 diagrams generated in 1.876 s +65 processes with 1119 diagrams generated in 1.869 s Total: 83 processes with 1202 diagrams output madevent ../TMPOUT/CODEGEN_mad_pp_tt012j --hel_recycling=False --vector_size=32 --me_exporter=standalone_cudacpp Load PLUGIN.CUDACPP_OUTPUT @@ -497,7 +497,7 @@ INFO: Combined process d d~ > t t~ WEIGHTED<=2 with process u u~ > t t~ WEIGHTED INFO: Combined process s s~ > t t~ WEIGHTED<=2 with process u u~ > t t~ WEIGHTED<=2 INFO: Creating files in directory P2_gg_ttxgg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -514,7 +514,7 @@ INFO: Generating Feynman diagrams for Process: g g > t t~ g g WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group gg_ttxgg INFO: Creating files in directory P2_gg_ttxuux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -531,7 +531,7 @@ INFO: Generating Feynman diagrams for Process: g g > t t~ u u~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group gg_ttxuux INFO: Creating files in directory P2_gu_ttxgu DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . 
FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -548,7 +548,7 @@ INFO: Generating Feynman diagrams for Process: g u > t t~ g u WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group gu_ttxgu INFO: Creating files in directory P2_gux_ttxgux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -565,7 +565,7 @@ INFO: Generating Feynman diagrams for Process: g u~ > t t~ g u~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group gux_ttxgux INFO: Creating files in directory P2_uux_ttxgg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -582,7 +582,7 @@ INFO: Generating Feynman diagrams for Process: u u~ > t t~ g g WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uux_ttxgg INFO: Creating files in directory P1_gg_ttxg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -599,7 +599,7 @@ INFO: Generating Feynman diagrams for Process: g g > t t~ g WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxg INFO: Creating files in directory P2_uu_ttxuu DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -616,7 +616,7 @@ INFO: Generating Feynman diagrams for Process: u u > t t~ u u WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uu_ttxuu INFO: Creating files in directory P2_uux_ttxuux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -633,7 +633,7 @@ INFO: Generating Feynman diagrams for Process: u u~ > t t~ u u~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uux_ttxuux INFO: Creating files in directory P2_uxux_ttxuxux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -650,7 +650,7 @@ INFO: Generating Feynman diagrams for Process: u~ u~ > t t~ u~ u~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uxux_ttxuxux INFO: Creating files in directory P2_uc_ttxuc DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . 
FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -667,7 +667,7 @@ INFO: Generating Feynman diagrams for Process: u c > t t~ u c WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uc_ttxuc INFO: Creating files in directory P2_uux_ttxccx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -684,7 +684,7 @@ INFO: Generating Feynman diagrams for Process: u u~ > t t~ c c~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uux_ttxccx INFO: Creating files in directory P2_ucx_ttxucx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -701,7 +701,7 @@ INFO: Generating Feynman diagrams for Process: u c~ > t t~ u c~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group ucx_ttxucx INFO: Creating files in directory P2_uxcx_ttxuxcx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -718,7 +718,7 @@ INFO: Generating Feynman diagrams for Process: u~ c~ > t t~ u~ c~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uxcx_ttxuxcx INFO: Creating files in directory P1_gu_ttxu DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -735,7 +735,7 @@ INFO: Generating Feynman diagrams for Process: g u > t t~ u WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gu_ttxu INFO: Creating files in directory P1_gux_ttxux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -752,7 +752,7 @@ INFO: Generating Feynman diagrams for Process: g u~ > t t~ u~ WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gux_ttxux INFO: Creating files in directory P1_uux_ttxg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -769,7 +769,7 @@ INFO: Generating Feynman diagrams for Process: u u~ > t t~ g WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group uux_ttxg INFO: Creating files in directory P0_gg_ttx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . 
FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -786,7 +786,7 @@ INFO: Generating Feynman diagrams for Process: g g > t t~ WEIGHTED<=2 INFO: Finding symmetric diagrams for subprocess group gg_ttx INFO: Creating files in directory P0_uux_ttx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -801,15 +801,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: u u~ > t t~ WEIGHTED<=2 INFO: Finding symmetric diagrams for subprocess group uux_ttx -Generated helas calls for 18 subprocesses (372 diagrams) in 1.312 s -Wrote files for 810 helas calls in 3.308 s +Generated helas calls for 18 subprocesses (372 diagrams) in 1.297 s +Wrote files for 810 helas calls in 3.533 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.342 s +ALOHA: aloha creates 5 routines in 0.333 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines @@ -817,7 +817,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 10 routines in 0.321 s +ALOHA: aloha creates 10 routines in 0.310 s VVV1 VVV1 FFV1 @@ -1028,9 +1028,9 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. quit -real 0m9.073s -user 0m8.514s -sys 0m0.464s +real 0m9.184s +user 0m8.370s +sys 0m0.508s Code generation completed in 9 seconds ************************************************************ * * @@ -1057,7 +1057,7 @@ INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amc INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards/me5_configuration.txt Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt -No valid web browser found. Please set in ./input/mg5_configuration.txt +Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards run quit INFO: @@ -1087,7 +1087,7 @@ INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amc INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards/me5_configuration.txt Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt -No valid web browser found. Please set in ./input/mg5_configuration.txt +Using default web browser "firefox". 
Set another one in ./input/mg5_configuration.txt treatcards param quit INFO: diff --git a/epochX/cudacpp/pp_tt012j.mad/COPYRIGHT b/epochX/cudacpp/pp_tt012j.mad/COPYRIGHT index a134b5fef9..84a883fbb0 100644 --- a/epochX/cudacpp/pp_tt012j.mad/COPYRIGHT +++ b/epochX/cudacpp/pp_tt012j.mad/COPYRIGHT @@ -15,6 +15,7 @@ The full development team currently includes the following authors : Stephan Hageboeck (CERN) Olivier Mattelaer (Universite Catholique de Louvain, original author) Stefan Roiser (CERN, original author) + Joergen Teig (CERN) Andrea Valassi (CERN, original author) Zenny Wettersten (CERN) See https://github.com/madgraph5/madgraph4gpu for more details. For the full diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/Bridge.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/Bridge.h index bf8b5e024d..89437b4c42 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/Bridge.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/Bridge.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef BRIDGE_H #define BRIDGE_H 1 @@ -23,7 +23,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +83,7 @@ namespace mg5amcCpu Bridge& operator=( const Bridge& ) = delete; Bridge& operator=( Bridge&& ) = delete; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL /** * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != gpublocks*gputhreads * (this is needed for BridgeKernel tests rather than for actual production use in Fortran) @@ -150,7 +150,7 @@ namespace mg5amcCpu unsigned int m_nevt; // number of events int m_nGoodHel; // the number of good helicities (-1 initially when they have not yet been calculated) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL int m_gputhreads; // number of gpu threads (default set from number of events, can be modified) int m_gpublocks; // number of gpu blocks (default set from number of events, can be modified) DeviceBuffer m_devMomentaF; @@ -187,12 +187,12 @@ namespace mg5amcCpu // Forward declare transposition methods // -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template<typename Tin, typename Tout> __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL template<typename Tin, typename Tout> void hst_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); @@ -209,7 +209,7 @@ namespace mg5amcCpu Bridge<FORTRANFPTYPE>::Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ) : m_nevt( nevtF ) , m_nGoodHel( -1 ) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL , m_gputhreads( 256 ) // default number of gpu threads , m_gpublocks( m_nevt / m_gputhreads ) // this ensures m_nevt <= m_gpublocks*m_gputhreads , m_devMomentaF( m_nevt ) @@ -233,7 +233,7 @@ namespace mg5amcCpu { if( nparF != CPPProcess::npar ) throw std::runtime_error( "Bridge constructor: npar mismatch" ); if( np4F != CPPProcess::np4 ) throw std::runtime_error( "Bridge constructor: np4 mismatch" ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( ( m_nevt < s_gputhreadsmin ) || ( m_nevt % s_gputhreadsmin != 0 ) ) throw std::runtime_error( "Bridge constructor: nevt should be a multiple of " + std::to_string( s_gputhreadsmin ) ); while( m_nevt !=
m_gpublocks * m_gputhreads ) @@ -249,7 +249,7 @@ namespace mg5amcCpu #else std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate is called from several Fortran threads? @@ -262,7 +262,7 @@ namespace mg5amcCpu process.initProc( paramCard ); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template<typename FORTRANFPTYPE> void Bridge<FORTRANFPTYPE>::set_gpugrid( const int gpublocks, const int gputhreads ) { @@ -276,7 +276,7 @@ namespace mg5amcCpu } #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template<typename FORTRANFPTYPE> void Bridge<FORTRANFPTYPE>::gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -291,14 +291,14 @@ namespace mg5amcCpu constexpr int neppM = MemoryAccessMomenta::neppM; if constexpr( neppM == 1 && std::is_same_v<FORTRANFPTYPE, fptype> ) { - checkCuda( cudaMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), gpuMemcpyHostToDevice ); } else { - checkCuda( cudaMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), gpuMemcpyHostToDevice ); const int thrPerEvt = CPPProcess::npar * CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 event per thread) //const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... this seems slower - dev_transposeMomentaF2C<<<m_gpublocks * thrPerEvt, m_gputhreads>>>( m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); + gpuLaunchKernel( dev_transposeMomentaF2C, m_gpublocks * thrPerEvt, m_gputhreads, m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); } if constexpr( std::is_same_v<FORTRANFPTYPE, fptype> ) { @@ -341,7 +341,7 @@ namespace mg5amcCpu } #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL template<typename FORTRANFPTYPE> void Bridge<FORTRANFPTYPE>::cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -396,7 +396,7 @@ namespace mg5amcCpu // - C++ array: momenta[npagM][npar][np4][neppM] with nevt=npagM*neppM (AOSOA) // -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template<typename Tin, typename Tout> __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ) { diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/BridgeKernels.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/BridgeKernels.cc index d58066c9c1..eaf4037a24 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/BridgeKernels.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/BridgeKernels.cc @@ -1,17 +1,18 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
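[Editor's note] The Bridge.h hunks above are the template for the whole patch: every raw CUDA call wrapped in checkCuda and every triple-chevron launch becomes a gpu*-prefixed macro from the new GpuAbstraction.h, with the error check folded into the macro. A minimal before/after sketch, not part of the patch; the kernel scaleME and the helper runScale are hypothetical names, and compilation with nvcc is assumed so that the CUDA branch of the macros is selected:

// Editorial sketch only: hypothetical kernel and helper illustrating the migration.
#include "GpuRuntime.h" // pulls in GpuAbstraction.h and defines checkGpu
__global__ void scaleME( double* mes, const double factor, const unsigned int nevt )
{
  const unsigned int ievt = blockDim.x * blockIdx.x + threadIdx.x; // one event per thread
  if( ievt < nevt ) mes[ievt] *= factor;
}
void runScale( double* hstMEs, const unsigned int nevt )
{
  double* devMEs = nullptr;
  gpuMalloc( &devMEs, nevt * sizeof( double ) );                               // was: checkCuda( cudaMalloc( ... ) )
  gpuMemcpy( devMEs, hstMEs, nevt * sizeof( double ), gpuMemcpyHostToDevice ); // was: checkCuda( cudaMemcpy( ... ) )
  gpuLaunchKernel( scaleME, ( nevt + 255 ) / 256, 256, devMEs, 2., nevt );     // was: scaleME<<<( nevt + 255 ) / 256, 256>>>( ... )
  gpuMemcpy( hstMEs, devMEs, nevt * sizeof( double ), gpuMemcpyDeviceToHost );
  gpuFree( devMEs );                                                           // was: checkCuda( cudaFree( ... ) )
}

The same lines compile unchanged with hipcc, where the macros resolve to hipMalloc, hipMemcpy and friends instead.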
#include "BridgeKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMomenta.h" #include //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -45,7 +46,7 @@ namespace mg5amcCpu //============================================================================ -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu { @@ -96,7 +97,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/BridgeKernels.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/BridgeKernels.h index 15eb4bff4d..3efef8ce97 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/BridgeKernels.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/BridgeKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef BRIDGEKERNELS_H #define BRIDGEKERNELS_H 1 @@ -12,7 +12,7 @@ #include "MatrixElementKernels.h" #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -49,7 +49,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a CPU host class BridgeKernelHost final : public BridgeKernelBase { @@ -89,7 +89,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a GPU device class BridgeKernelDevice : public BridgeKernelBase { diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/CommonRandomNumberKernel.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/CommonRandomNumberKernel.cc index 985b39f576..010bc4cbd0 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/CommonRandomNumberKernel.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/CommonRandomNumberKernel.cc @@ -1,15 +1,16 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "CommonRandomNumbers.h" +#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/CrossSectionKernels.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/CrossSectionKernels.cc index 0b355a3c8d..c15b39844d 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/CrossSectionKernels.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/CrossSectionKernels.cc @@ -1,10 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). 
// Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "CrossSectionKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessWeights.h" #include "MemoryBuffers.h" @@ -77,7 +78,7 @@ debug_me_is_abnormal( const fptype& me, size_t ievtALL ) //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -185,7 +186,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/CrossSectionKernels.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/CrossSectionKernels.h index 7933ca4bbf..4d9659e04e 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/CrossSectionKernels.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/CrossSectionKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef CROSSSECTIONKERNELS_H #define CROSSSECTIONKERNELS_H 1 @@ -13,7 +13,7 @@ //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -96,7 +96,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating the calculation of event statistics on a GPU device class CrossSectionKernelDevice : public CrossSectionKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/CudaRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/CudaRuntime.h deleted file mode 100644 index 64ce52f4b3..0000000000 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/CudaRuntime.h +++ /dev/null @@ -1,85 +0,0 @@ -// Copyright (C) 2020-2023 CERN and UCLouvain. -// Licensed under the GNU Lesser General Public License (version 3 or later). -// Created by: S. Roiser (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. - -#ifndef MG5AMC_CUDARUNTIME_H -#define MG5AMC_CUDARUNTIME_H 1 - -// MG5AMC on GPU uses the CUDA runtime API, not the lower level CUDA driver API -// See https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html#driver-vs-runtime-api - -#include -#include - -//-------------------------------------------------------------------------- - -// See https://stackoverflow.com/a/14038590 -#ifdef __CUDACC__ /* clang-format off */ -#define checkCuda( code ) { assertCuda( code, __FILE__, __LINE__ ); } -inline void assertCuda( cudaError_t code, const char* file, int line, bool abort = true ) -{ - if( code != cudaSuccess ) - { - printf( "ERROR! 
assertCuda: '%s' (%d) in %s:%d\n", cudaGetErrorString( code ), code, file, line ); - if( abort ) assert( code == cudaSuccess ); - } -} -#endif /* clang-format on */ - -//-------------------------------------------------------------------------- - -#ifdef __CUDACC__ -namespace mg5amcGpu -{ - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - // *** FIXME! This will all need to be designed differently when going to multi-GPU nodes! *** - struct CudaRuntime final - { - CudaRuntime( const bool debug = true ) - : m_debug( debug ) { setUp( m_debug ); } - ~CudaRuntime() { tearDown( m_debug ); } - CudaRuntime( const CudaRuntime& ) = delete; - CudaRuntime( CudaRuntime&& ) = delete; - CudaRuntime& operator=( const CudaRuntime& ) = delete; - CudaRuntime& operator=( CudaRuntime&& ) = delete; - bool m_debug; - - // Set up CUDA application - // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** - // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization - static void setUp( const bool debug = true ) - { - // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization - // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! - /* - // [We initially added cudaFree(0) to "ease profile analysis" only because it shows up as a big recognizable block!] - // No explicit initialization is needed: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#initialization - // It is not clear what cudaFree(0) does at all: https://stackoverflow.com/questions/69967813/ - if ( debug ) std::cout << "__CudaRuntime: calling cudaFree(0)" << std::endl; - checkCuda( cudaFree( 0 ) ); // SLOW! - */ - // Replace cudaFree(0) by cudaSetDevice(0), even if it is not really needed either - // (but see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs) - if( debug ) std::cout << "__CudaRuntime: calling cudaSetDevice(0)" << std::endl; - checkCuda( cudaSetDevice( 0 ) ); // SLOW! - } - - // Tear down CUDA application (call cudaDeviceReset) - // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** - // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck - // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking - static void tearDown( const bool debug = true ) - { - if( debug ) std::cout << "__CudaRuntime: calling cudaDeviceReset()" << std::endl; - checkCuda( cudaDeviceReset() ); - } - }; - -} -#endif - -//-------------------------------------------------------------------------- - -#endif // MG5AMC_CUDARUNTIME_H diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/CurandRandomNumberKernel.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/CurandRandomNumberKernel.cc index eb56333b03..08a16f6f2c 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/CurandRandomNumberKernel.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/CurandRandomNumberKernel.cc @@ -1,9 +1,9 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. 
Valassi (2021-2023) for the MG5aMC CUDACPP plugin. -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" @@ -22,7 +22,7 @@ inline void assertCurand( curandStatus_t code, const char *file, int line, bool } #endif /* clang-format on */ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -36,7 +36,7 @@ namespace mg5amcCpu { if( m_isOnDevice ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !m_rnarray.isOnDevice() ) throw std::runtime_error( "CurandRandomNumberKernel on device with a host random number array" ); #else @@ -114,7 +114,7 @@ namespace mg5amcCpu /* printf( "\nCurandRandomNumberKernel::generateRnarray size = %d\n", (int)m_rnarray.size() ); fptype* data = m_rnarray.data(); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( m_rnarray.isOnDevice() ) { data = new fptype[m_rnarray.size()](); @@ -123,7 +123,7 @@ namespace mg5amcCpu #endif for( int i = 0; i < ( (int)m_rnarray.size() / 4 ); i++ ) printf( "[%4d] %f %f %f %f\n", i * 4, data[i * 4], data[i * 4 + 2], data[i * 4 + 2], data[i * 4 + 3] ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( m_rnarray.isOnDevice() ) delete[] data; #endif */ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/EventStatistics.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/EventStatistics.h index 48b51e0a49..b425a5bade 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/EventStatistics.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/EventStatistics.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef EventStatistics_H #define EventStatistics_H 1 @@ -16,7 +16,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/GpuAbstraction.h new file mode 100644 index 0000000000..6a7d9c05c0 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/GpuAbstraction.h @@ -0,0 +1,71 @@ +// Copyright (C) 2020-2023 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: J. Teig (Jul 2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
+ +#ifndef MG5AMC_GPUABSTRACTION_H +#define MG5AMC_GPUABSTRACTION_H 1 + +#include + +//-------------------------------------------------------------------------- + +#ifdef __CUDACC__ + +#define gpuError_t cudaError_t +#define gpuPeekAtLastError cudaPeekAtLastError +#define gpuGetErrorString cudaGetErrorString +#define gpuSuccess cudaSuccess + +#define gpuMallocHost( ptr, size ) checkGpu( cudaMallocHost( ptr, size ) ) +#define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) ) + +#define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( cudaMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemcpyHostToDevice cudaMemcpyHostToDevice +#define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost +#define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( cudaMemcpyToSymbol( type1, type2, size ) ) + +#define gpuFree( ptr ) checkGpu( cudaFree( ptr ) ) +#define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) ) + +#define gpuSetDevice cudaSetDevice +#define gpuDeviceSynchronize cudaDeviceSynchronize +#define gpuDeviceReset cudaDeviceReset + +#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ ) +#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ ) + +//-------------------------------------------------------------------------- + +#elif defined __HIPCC__ + +#include "hip/hip_runtime.h" + +#define gpuError_t hipError_t +#define gpuPeekAtLastError hipPeekAtLastError +#define gpuGetErrorString hipGetErrorString +#define gpuSuccess hipSuccess + +#define gpuMallocHost( ptr, size ) checkGpu( hipHostMalloc( ptr, size ) ) // hipHostMalloc is preferred over the deprecated hipMallocHost +#define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) ) + +#define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( hipMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemcpyHostToDevice hipMemcpyHostToDevice +#define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost +#define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( hipMemcpyToSymbol( type1, type2, size ) ) + +#define gpuFree( ptr ) checkGpu( hipFree( ptr ) ) +#define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) ) + +#define gpuSetDevice hipSetDevice +#define gpuDeviceSynchronize hipDeviceSynchronize +#define gpuDeviceReset hipDeviceReset + +#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ ) +#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ ) + +//-------------------------------------------------------------------------- + +#endif + +#endif // MG5AMC_GPUABSTRACTION_H diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/GpuRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/GpuRuntime.h new file mode 100644 index 0000000000..93579ef08b --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/GpuRuntime.h @@ -0,0 +1,85 @@ +// Copyright (C) 2020-2023 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: J. Teig (Jun 2023, based on earlier work by S. Roiser) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
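[Editor's note] GpuAbstraction.h above is pure preprocessor dispatch: one gpu* spelling resolves to the CUDA runtime API under __CUDACC__ and to the HIP API under __HIPCC__, so a single source tree serves both vendors. GpuRuntime.h, whose body follows, wraps device setup and teardown in an RAII struct. A minimal usage sketch, not part of the patch, assuming an application main compiled for a GPU backend:

// Editorial sketch only: RAII setup/teardown via the GpuRuntime struct defined below.
#include "GpuRuntime.h"
int main()
{
#ifdef MGONGPUCPP_GPUIMPL
  mg5amcGpu::GpuRuntime gpuRuntime; // constructor calls gpuSetDevice( 0 )
#endif
  // ... allocate buffers, launch kernels, copy results back ...
  return 0; // gpuDeviceReset() runs here when gpuRuntime goes out of scope
}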
+ +#ifndef MG5AMC_GPURUNTIME_H +#define MG5AMC_GPURUNTIME_H 1 + +// MG5AMC on GPU uses the CUDA runtime API, not the lower level CUDA driver API +// See https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html#driver-vs-runtime-api + +#include "GpuAbstraction.h" + +#include + +//-------------------------------------------------------------------------- + +// See https://stackoverflow.com/a/14038590 +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#define checkGpu( code ) { assertGpu( code, __FILE__, __LINE__ ); } +inline void assertGpu( gpuError_t code, const char* file, int line, bool abort = true ) +{ + if( code != gpuSuccess ) + { + printf( "ERROR! assertGpu: '%s' (%d) in %s:%d\n", gpuGetErrorString( code ), code, file, line ); + if( abort ) assert( code == gpuSuccess ); + } +} +#endif /* clang-format on */ + +//-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +{ + // Instantiate a GpuRuntime at the beginning of the application's main to + // invoke gpuSetDevice(0) in the constructor and book a gpuDeviceReset() call in the destructor + // *** FIXME! This will all need to be designed differently when going to multi-GPU nodes! *** + struct GpuRuntime final + { + GpuRuntime( const bool debug = true ) + : m_debug( debug ) { setUp( m_debug ); } + ~GpuRuntime() { tearDown( m_debug ); } + GpuRuntime( const GpuRuntime& ) = delete; + GpuRuntime( GpuRuntime&& ) = delete; + GpuRuntime& operator=( const GpuRuntime& ) = delete; + GpuRuntime& operator=( GpuRuntime&& ) = delete; + bool m_debug; + + // Set up CUDA application + // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** + // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization + static void setUp( const bool debug = true ) + { + // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization + // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! + /* + // [We initially added cudaFree(0) to "ease profile analysis" only because it shows up as a big recognizable block!] + // No explicit initialization is needed: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#initialization + // It is not clear what cudaFree(0) does at all: https://stackoverflow.com/questions/69967813/ + if ( debug ) std::cout << "__CudaRuntime: calling cudaFree(0)" << std::endl; + checkCuda( cudaFree( 0 ) ); // SLOW! + */ + // Replace cudaFree(0) by cudaSetDevice(0), even if it is not really needed either + // (but see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs) + if( debug ) std::cout << "__GpuRuntime: calling GpuSetDevice(0)" << std::endl; + checkGpu( gpuSetDevice( 0 ) ); // SLOW!
+ } + + // Tear down CUDA application (call cudaDeviceReset) + // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** + // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck + // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking + static void tearDown( const bool debug = true ) + { + if( debug ) std::cout << "__GpuRuntime: calling GpuDeviceReset()" << std::endl; + checkGpu( gpuDeviceReset() ); + } + }; +} +#endif + +//-------------------------------------------------------------------------- + +#endif // MG5AMC_GPURUNTIME_H diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MadgraphTest.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MadgraphTest.h index ef40624c88..a64c05c26a 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MadgraphTest.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MadgraphTest.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Dec 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #ifndef MADGRAPHTEST_H_ #define MADGRAPHTEST_H_ 1 @@ -22,7 +22,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; @@ -201,7 +201,7 @@ class MadgraphTest : public testing::TestWithParam // Since we link both the CPU-only and GPU tests into the same executable, we prevent // a multiply defined symbol by only compiling this in the non-CUDA phase: -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL /// Compare momenta and matrix elements. /// This uses an implementation of TestDriverBase to run a madgraph workflow, @@ -307,6 +307,6 @@ TEST_P( MadgraphTest, CompareMomentaAndME ) } } -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL #endif /* MADGRAPHTEST_H_ */ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MatrixElementKernels.cc index 74b5239ebf..81699dfea9 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MatrixElementKernels.cc @@ -1,12 +1,12 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "MatrixElementKernels.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" // Includes the abstraction for Nvidia/AMD compilation #include "MemoryAccessMomenta.h" #include "MemoryBuffers.h" @@ -14,7 +14,7 @@ //============================================================================ -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu { @@ -150,7 +150,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { @@ -209,13 +209,13 @@ namespace mg5amcGpu PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); DeviceBufferHelicityMask devIsGoodHel( ncomb ); // ... 0d1. 
Compute good helicity mask on the device - computeDependentCouplings<<<m_gpublocks, m_gputhreads>>>( m_gs.data(), m_couplings.data() ); + gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - sigmaKin_getGoodHel<<<m_gpublocks, m_gputhreads>>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); + gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); #else - sigmaKin_getGoodHel<<<m_gpublocks, m_gputhreads>>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); + gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); #endif - checkCuda( cudaPeekAtLastError() ); + checkGpu( gpuPeekAtLastError() ); // ... 0d2. Copy back good helicity mask to the host copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); // ... 0d3. Copy back good helicity list to constant memory on the device @@ -226,19 +226,19 @@ namespace mg5amcGpu void MatrixElementKernelDevice::computeMatrixElements( const unsigned int channelId ) { - computeDependentCouplings<<<m_gpublocks, m_gputhreads>>>( m_gs.data(), m_couplings.data() ); + gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); #ifndef MGONGPU_NSIGHT_DEBUG constexpr unsigned int sharedMemSize = 0; #else constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - sigmaKin<<<m_gpublocks, m_gputhreads, sharedMemSize>>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); + gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); #else - sigmaKin<<<m_gpublocks, m_gputhreads, sharedMemSize>>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); + gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); #endif - checkCuda( cudaPeekAtLastError() ); - checkCuda( cudaDeviceSynchronize() ); + checkGpu( gpuPeekAtLastError() ); + checkGpu( gpuDeviceSynchronize() ); } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MatrixElementKernels.h index 23e84757a2..72bd8f195b 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MatrixElementKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
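[Editor's note] The MatrixElementKernels.cc hunks above preserve the launch-then-check discipline across the rename: launch through the macro, call gpuPeekAtLastError to surface launch-configuration errors, then gpuDeviceSynchronize to surface errors from the asynchronous kernel execution itself. A minimal sketch, not part of the patch; the kernel compute and the helper launchAndCheck are hypothetical names:

// Editorial sketch only: the launch-then-check idiom with optional dynamic shared memory.
#include "GpuRuntime.h"
__global__ void compute( double* buf )
{
  buf[blockDim.x * blockIdx.x + threadIdx.x] = 0.; // placeholder payload
}
void launchAndCheck( double* devBuf, const int gpublocks, const int gputhreads )
{
  constexpr unsigned int sharedMemSize = 0; // bytes of dynamic shared memory (cf. MGONGPU_NSIGHT_DEBUG above)
  gpuLaunchKernelSharedMem( compute, gpublocks, gputhreads, sharedMemSize, devBuf ); // compute<<<gpublocks, gputhreads, sharedMemSize>>>( devBuf )
  checkGpu( gpuPeekAtLastError() );   // catches invalid launch configurations immediately
  checkGpu( gpuDeviceSynchronize() ); // blocks until completion, surfacing asynchronous errors
}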
#ifndef MATRIXELEMENTKERNELS_H #define MATRIXELEMENTKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -81,7 +81,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a CPU host class MatrixElementKernelHost final : public MatrixElementKernelBase, public NumberOfEvents { @@ -130,7 +130,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a GPU device class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessAmplitudes.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessAmplitudes.h index 573b3bbbc9..ffb76e93de 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessAmplitudes.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessAmplitudes.h @@ -15,7 +15,7 @@ #define MGONGPU_TRIVIAL_AMPLITUDES 1 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessCouplings.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessCouplings.h index 35a3af42e0..3afdf3e554 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessCouplings.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessCouplings.h @@ -15,7 +15,7 @@ #include "MemoryBuffers.h" // for HostBufferCouplings::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessCouplingsFixed.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessCouplingsFixed.h index dc0d93afff..ffcdf4dbef 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessCouplingsFixed.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessCouplingsFixed.h @@ -14,7 +14,7 @@ //#include "MemoryAccessHelpers.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessDenominators.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessDenominators.h index 3bce635718..66f2d32a6b 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessDenominators.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessDenominators.h @@ -10,7 +10,7 @@ #include "MemoryAccessGs.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessGs.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessGs.h index 31311aa375..4c726b30f3 100644 --- 
a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessGs.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessGs.h @@ -13,7 +13,7 @@ #include "MemoryBuffers.h" // for HostBufferMatrixElements::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessHelpers.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessHelpers.h index c82a6c7635..db73e4e064 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessHelpers.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessHelpers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessHelpers_H #define MemoryAccessHelpers_H 1 @@ -105,7 +105,7 @@ class KernelAccessHelper : public MemoryAccessHelper } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid //printf( "kernelAccessRecord: ievt=%d threadId=%d\n", ievt, threadIdx.x ); return T::ieventAccessRecord( buffer, ievt ); // NB fptype and fptype_sv coincide for CUDA diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessMatrixElements.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessMatrixElements.h index f32e6fea5b..3741011971 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessMatrixElements.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessMatrixElements.h @@ -13,7 +13,7 @@ #include "MemoryBuffers.h" // for HostBufferMatrixElements::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessMomenta.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessMomenta.h index 29266de32c..3be229d392 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessMomenta.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessMomenta.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#ifndef MemoryAccessMomenta_H #define MemoryAccessMomenta_H 1 @@ -13,7 +13,7 @@ #include "MemoryAccessVectors.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -30,7 +30,7 @@ namespace mg5amcCpu // Number of Events Per Page in the momenta AOSOA memory buffer layout // (these are all best kept as a compile-time constants: see issue #23) -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ // ----------------------------------------------------------------------------------------------- // --- GPUs: neppM is best set to a power of 2 times the number of fptype's in a 32-byte cacheline // --- This is relevant to ensure coalesced access to momenta in global memory diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessNumerators.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessNumerators.h index b152183b28..18991f4fa6 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessNumerators.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessNumerators.h @@ -10,7 +10,7 @@ #include "MemoryAccessGs.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessRandomNumbers.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessRandomNumbers.h index e2988d39f3..40cb089135 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessRandomNumbers.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessRandomNumbers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessRandomNumbers_H #define MemoryAccessRandomNumbers_H 1 @@ -11,7 +11,7 @@ #include "CPPProcess.h" #include "MemoryAccessHelpers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessVectors.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessVectors.h index e9b197368e..08faccff0f 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessVectors.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessVectors.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
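[Editor's note] The neppM remarks above (and the momenta[npagM][npar][np4][neppM] comment in the Bridge.h hunk earlier) are the reason these MemoryAccess helpers exist: momenta are stored as an AOSOA so that consecutive events share a page and their loads coalesce on the GPU. A minimal sketch of the index arithmetic, not part of the patch; momentumAt is a hypothetical helper and double stands in for fptype:

// Editorial sketch only: decoding the AOSOA layout momenta[npagM][npar][np4][neppM], nevt = npagM * neppM.
inline double& momentumAt( double* buffer, const int ievt, const int ipar, const int ip4,
                           const int npar, const int np4, const int neppM )
{
  const int ipagM = ievt / neppM; // AOSOA page holding this event
  const int ieppM = ievt % neppM; // position of the event within the page
  return buffer[( ( ipagM * npar + ipar ) * np4 + ip4 ) * neppM + ieppM];
}

With neppM a power of two, consecutive ievt values (consecutive GPU threads) touch contiguous ieppM slots, which is exactly the coalesced global-memory access the comment above asks for.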
#ifndef MemoryAccessVectors_H #define MemoryAccessVectors_H 1 @@ -10,7 +10,7 @@ #include "mgOnGpuVectors.h" -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu // this is only needed for CPU SIMD vectorization { diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessWavefunctions.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessWavefunctions.h index 5428aaf933..33bef4559e 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessWavefunctions.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessWavefunctions.h @@ -15,7 +15,7 @@ #define MGONGPU_TRIVIAL_WAVEFUNCTIONS 1 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryBuffers.h index 3093e6ed18..7756a71621 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryBuffers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021, based on earlier work by S. Hageboeck) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryBuffers_H #define MemoryBuffers_H 1 @@ -11,12 +11,12 @@ #include "mgOnGpuCxtypes.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "Parameters_sm.h" #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -87,7 +87,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL constexpr bool HostBufferALIGNED = false; // ismisaligned=false constexpr bool HostBufferMISALIGNED = true; // ismisaligned=true @@ -119,7 +119,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer template class PinnedHostBufferBase : public BufferBase @@ -128,18 +128,18 @@ namespace mg5amcCpu PinnedHostBufferBase( const size_t size ) : BufferBase( size, false ) { - checkCuda( cudaMallocHost( &( this->m_data ), this->bytes() ) ); + gpuMallocHost( &( this->m_data ), this->bytes() ); } virtual ~PinnedHostBufferBase() { - checkCuda( cudaFreeHost( this->m_data ) ); + gpuFreeHost( this->m_data ); } }; #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer template class DeviceBufferBase : public BufferBase @@ -148,18 +148,18 @@ namespace mg5amcCpu DeviceBufferBase( const size_t size ) : BufferBase( size, true ) { - checkCuda( cudaMalloc( &( this->m_data ), this->bytes() ) ); + gpuMalloc( &( this->m_data ), this->bytes() ); } virtual ~DeviceBufferBase() { - checkCuda( cudaFree( this->m_data ) ); + gpuFree( this->m_data ); } }; #endif //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for a given number of 
events template class HostBuffer : public HostBufferBase, virtual private NumberOfEvents @@ -175,7 +175,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer for a given number of events template class PinnedHostBuffer : public PinnedHostBufferBase, virtual private NumberOfEvents @@ -191,7 +191,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer for a given number of events template class DeviceBuffer : public DeviceBufferBase, virtual private NumberOfEvents @@ -213,7 +213,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta random numbers constexpr size_t sizePerEventRndNumMomenta = MemoryBuffers::np4 * MemoryBuffers::nparf; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta random numbers typedef HostBuffer HostBufferRndNumMomenta; #else @@ -232,7 +232,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer with ONE fptype per event constexpr size_t sizePerEventOneFp = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer with ONE fptype per event typedef HostBuffer HostBufferOneFp; #else @@ -257,7 +257,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for Gs constexpr size_t sizePerEventGs = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferGs; #else @@ -276,7 +276,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for numerators constexpr size_t sizePerEventNumerators = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferNumerators; #else @@ -296,7 +296,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for denominators constexpr size_t sizePerEventDenominators = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferDenominators; #else @@ -315,7 +315,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for random numbers constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferCouplings; #else @@ -333,7 +333,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta constexpr size_t sizePerEventMomenta = MemoryBuffers::np4 * MemoryBuffers::npar; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta typedef HostBuffer HostBufferMomenta; //typedef HostBuffer HostBufferMomenta; // TEST MISALIGNMENT! 
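The gpuMallocHost/gpuFreeHost, gpuMalloc/gpuFree and gpuMemcpy/gpuMemcpyToSymbol calls that replace the checkCuda( cuda* ) calls throughout these MemoryBuffers.h hunks are provided by the new GpuAbstraction.h (added below as a per-subprocess symlink). Its body is not shown in this part of the patch, so the following is only a sketch of the kind of CUDA/HIP mapping it implies — the gpu* names match the calls above, while the checkGpu wrapper and the exact macro shapes are assumptions:

// Sketch of a backend-agnostic GPU API mapping (illustrative, not the actual header)
#include <cassert>
#if defined __CUDACC__ // CUDA build
#include <cuda_runtime.h>
inline void checkGpu( cudaError_t code ) { assert( code == cudaSuccess ); } // simplistic check
#define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) )
#define gpuFree( ptr ) checkGpu( cudaFree( ptr ) )
#define gpuMallocHost( ptr, size ) checkGpu( cudaMallocHost( ptr, size ) )
#define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) )
#define gpuMemcpy( dst, src, count, kind ) checkGpu( cudaMemcpy( dst, src, count, kind ) )
#define gpuMemcpyToSymbol( symbol, src, count ) checkGpu( cudaMemcpyToSymbol( symbol, src, count ) )
#define gpuMemcpyHostToDevice cudaMemcpyHostToDevice
#define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost
#elif defined __HIPCC__ // HIP build
#include <hip/hip_runtime.h>
inline void checkGpu( hipError_t code ) { assert( code == hipSuccess ); } // simplistic check
#define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) )
#define gpuFree( ptr ) checkGpu( hipFree( ptr ) )
#define gpuMallocHost( ptr, size ) checkGpu( hipHostMalloc( ptr, size ) ) // NB: HIP calls this hipHostMalloc
#define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) )
#define gpuMemcpy( dst, src, count, kind ) checkGpu( hipMemcpy( dst, src, count, kind ) )
#define gpuMemcpyToSymbol( symbol, src, count ) checkGpu( hipMemcpyToSymbol( symbol, src, count ) ) // some HIP versions want HIP_SYMBOL( symbol )
#define gpuMemcpyHostToDevice hipMemcpyHostToDevice
#define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost
#endif

With a mapping of this shape, the buffer classes above compile unchanged for either backend, and error checking is centralised in one wrapper instead of per-call checkCuda invocations.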
@@ -352,7 +352,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for sampling weights constexpr size_t sizePerEventWeights = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for sampling weights typedef HostBuffer HostBufferWeights; #else @@ -370,7 +370,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for matrix elements constexpr size_t sizePerEventMatrixElements = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for matrix elements typedef HostBuffer HostBufferMatrixElements; #else @@ -385,7 +385,7 @@ namespace mg5amcCpu // A base class encapsulating a memory buffer for the helicity mask typedef BufferBase BufferHelicityMask; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for the helicity mask typedef HostBufferBase HostBufferHelicityMask; #else @@ -403,7 +403,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for wavefunctions constexpr size_t sizePerEventWavefunctions = MemoryBuffers::nw6 * MemoryBuffers::nx2; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for wavefunctions typedef HostBuffer HostBufferWavefunctions; #else @@ -421,7 +421,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity random numbers constexpr size_t sizePerEventRndNumHelicity = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity random numbers typedef HostBuffer HostBufferRndNumHelicity; #else @@ -439,7 +439,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color random numbers constexpr size_t sizePerEventRndNumColor = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color random numbers typedef HostBuffer HostBufferRndNumColor; #else @@ -457,7 +457,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity selection constexpr size_t sizePerEventSelectedHelicity = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity selection typedef HostBuffer HostBufferSelectedHelicity; #else @@ -475,7 +475,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color selection constexpr size_t sizePerEventSelectedColor = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color selection typedef HostBuffer HostBufferSelectedColor; #else @@ -487,7 +487,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy { @@ -504,13 +504,13 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyHostToDevice ); } #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void copyHostFromDevice( Tdst& dst, const Tsrc& src ) // keep the same order 
of arguments as in memcpy { @@ -527,7 +527,7 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyDeviceToHost ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyDeviceToHost ); } #endif diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/CPPProcess.cc index 7f14b5e299..40d8bdea5f 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -46,7 +45,7 @@ // Class member functions for calculating the matrix elements for // Process: g g > t t~ WEIGHTED<=2 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -80,7 +79,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -90,7 +89,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -118,13 +117,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -151,7 +150,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -187,7 +186,7 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity < 
nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings @@ -200,8 +199,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -302,7 +303,7 @@ namespace mg5amcCpu { 16, -2 }, { -2, 16 } }; // 2-D array[2][2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -359,7 +360,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -418,7 +419,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -465,8 +466,8 @@ namespace mg5amcCpu { 1, 1, -1, -1 }, { 1, 1, 1, 1 }, { 1, 1, 1, -1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -506,9 +507,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -544,7 +545,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] 
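// [In preprocessor terms the three guards used here differ as follows (sketch):
//  __NVCC__ means the nvcc driver compiled this unit, whether .cc or .cu;
//  __CUDACC__ means CUDA device compilation is active ('nvcc -x cu');
//  MGONGPUCPP_GPUIMPL is never predefined by a compiler - it is set by the plugin
//  itself when a GPU backend (CUDA or HIP) is active. Hence the version string
//  below keys on the compiler driver, while the physics code keys on the backend.]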
// [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -609,12 +610,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -635,7 +636,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -761,9 +762,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -787,7 +788,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -807,7 +808,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 256 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -821,9 +822,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -851,7 +855,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -1061,7 +1065,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/CPPProcess.h index 448175be9d..f8a20b77fc 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -107,7 +107,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -120,7 +120,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -150,7 +150,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/CudaRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/GpuAbstraction.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/GpuAbstraction.h new file mode 120000 index 0000000000..72054e19ba --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/GpuAbstraction.h @@ -0,0 +1 @@ +../GpuAbstraction.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/GpuRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/GpuRuntime.h new file mode 120000 index 0000000000..3920e83be4 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/GpuRuntime.h @@ -0,0 +1 @@ +../GpuRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/check_sa.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/check_sa.cc index 3fbf0ffbee..7cac5ab47b 100644 --- 
a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/check_sa.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -65,7 +66,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -96,7 +97,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -134,9 +135,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784) + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) +#elif defined __HIPCC__ +#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random #elif defined __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU if build has curand + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #endif @@ -146,10 +149,10 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) 
bool bridge = false; @@ -177,7 +180,7 @@ main( int argc, char** argv ) else if( arg == "--curdev" ) { #ifndef __CUDACC__ - throw std::runtime_error( "CurandDevice is not supported on CPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" ); #elif defined MGONGPU_HAS_NO_CURAND throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" ); #else @@ -198,7 +201,7 @@ } else if( arg == "--rmbdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -272,13 +275,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -296,14 +299,14 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL - // --- 00. Initialise cuda - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - const std::string cdinKey = "00 CudaInit"; + // --- 00. Initialise GPU + // Instantiate a GpuRuntime at the beginning of the application's main. + // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. + const std::string cdinKey = "00 GpuInit"; timermap.start( cdinKey ); - CudaRuntime cudaRuntime( debug ); + GpuRuntime GpuRuntime( debug ); #endif // --- 0a.
Initialise physics process @@ -325,7 +328,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -333,7 +336,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -341,7 +344,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -349,7 +352,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -366,7 +369,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -375,7 +378,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -384,7 +387,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -392,7 +395,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -400,7 +403,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -438,7 +441,7 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! 
CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } @@ -450,7 +453,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -461,7 +464,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -469,7 +472,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -511,7 +514,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -543,7 +546,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -588,7 +591,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -617,7 +620,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -760,18 +763,22 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; -#endif +#endif /* clang-format off */ // -- DOUBLE or FLOAT? #if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) @@ -781,7 +788,7 @@ main( int argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif +#endif /* clang-format on */ // -- CUCOMPLEX or THRUST or STD complex numbers? 
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -793,6 +800,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_CUCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -818,7 +831,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -874,7 +887,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -895,6 +908,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -921,21 +936,21 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? " == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -966,7 +981,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1062,14 +1077,14 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1077,7 +1092,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? 
" == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/CPPProcess.cc index 20496eaa70..5f57cf55f3 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -49,7 +48,7 @@ // Process: d d~ > t t~ WEIGHTED<=2 // Process: s s~ > t t~ WEIGHTED<=2 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +82,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -93,7 +92,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -121,13 +120,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -154,7 +153,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -190,7 +189,7 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity < nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings @@ -203,8 
+202,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -279,7 +280,7 @@ namespace mg5amcCpu { 9, 3 }, { 3, 9 } }; // 2-D array[2][2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -336,7 +337,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -395,7 +396,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -442,8 +443,8 @@ namespace mg5amcCpu { -1, 1, -1, -1 }, { -1, 1, 1, 1 }, { -1, 1, 1, -1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -483,9 +484,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -521,7 +522,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] 
// [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -586,12 +587,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -612,7 +613,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -738,9 +739,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -764,7 +765,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -784,7 +785,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 36 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -798,9 +799,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -828,7 +832,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -1038,7 +1042,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/CPPProcess.h index e166fa1652..6498b91441 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -110,7 +110,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -123,7 +123,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -153,7 +153,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/CudaRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/GpuAbstraction.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/GpuAbstraction.h new file mode 120000 index 0000000000..72054e19ba --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/GpuAbstraction.h @@ -0,0 +1 @@ +../GpuAbstraction.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/GpuRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/GpuRuntime.h new file mode 120000 index 0000000000..3920e83be4 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/GpuRuntime.h @@ -0,0 +1 @@ +../GpuRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/check_sa.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/check_sa.cc index 3fbf0ffbee..7cac5ab47b 100644 --- 
a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/check_sa.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -65,7 +66,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -96,7 +97,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -134,9 +135,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784) + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) +#elif defined __HIPCC__ +#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random #elif defined __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU if build has curand + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #endif @@ -146,10 +149,10 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) 
bool bridge = false; @@ -177,7 +180,7 @@ main( int argc, char** argv ) else if( arg == "--curdev" ) { #ifndef __CUDACC__ - throw std::runtime_error( "CurandDevice is not supported on CPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" ); #elif defined MGONGPU_HAS_NO_CURAND throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" ); #else @@ -198,7 +201,7 @@ } else if( arg == "--rmbdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -272,13 +275,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -296,14 +299,14 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL - // --- 00. Initialise cuda - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - const std::string cdinKey = "00 CudaInit"; + // --- 00. Initialise GPU + // Instantiate a GpuRuntime at the beginning of the application's main. + // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. + const std::string cdinKey = "00 GpuInit"; timermap.start( cdinKey ); - CudaRuntime cudaRuntime( debug ); + GpuRuntime GpuRuntime( debug ); #endif // --- 0a.
Initialise physics process @@ -325,7 +328,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -333,7 +336,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -341,7 +344,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -349,7 +352,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -366,7 +369,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -375,7 +378,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -384,7 +387,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -392,7 +395,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -400,7 +403,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -438,7 +441,7 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! 
CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } @@ -450,7 +453,7 @@ main( argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -461,7 +464,7 @@ main( argc, char** argv ) std::unique_ptr<MatrixElementKernelBase> pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -469,7 +472,7 @@ main( argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -511,7 +514,7 @@ main( argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -543,7 +546,7 @@ main( argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -588,7 +591,7 @@ main( argc, char** argv ) dynamic_cast<BridgeKernelBase*>( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -617,7 +620,7 @@ main( argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -760,18 +763,22 @@ main( argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; -#endif +#endif /* clang-format off */ // -- DOUBLE or FLOAT? #if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) @@ -781,7 +788,7 @@ main( argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif +#endif /* clang-format on */ // -- CUCOMPLEX or THRUST or STD complex numbers?
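Note the two-level macro scheme at work in these hunks: device-agnostic code now switches on the new MGONGPUCPP_GPUIMPL macro, while genuinely backend-specific branches keep testing __CUDACC__ or __HIPCC__ directly. The GpuAbstraction.h header that this patch adds is not reproduced in this excerpt; the following is only a plausible minimal sketch, assuming the standard CUDA and HIP runtime APIs, of the kind of mapping it provides for the gpu* aliases (such as gpuMemcpyToSymbol) used in later hunks:

    // Hedged sketch only - not the actual GpuAbstraction.h added by this patch.
    // Map a common gpu* vocabulary onto the CUDA or HIP runtime API.
    #ifdef __CUDACC__ // nvcc: NVidia backend
    #include <cuda_runtime.h>
    #define gpuSetDevice cudaSetDevice
    #define gpuDeviceReset cudaDeviceReset
    #define gpuMemcpyToSymbol( symbol, src, count ) cudaMemcpyToSymbol( symbol, src, count )
    #elif defined __HIPCC__ // hipcc: AMD backend
    #include <hip/hip_runtime.h>
    #define gpuSetDevice hipSetDevice
    #define gpuDeviceReset hipDeviceReset
    #define gpuMemcpyToSymbol( symbol, src, count ) hipMemcpyToSymbol( HIP_SYMBOL( symbol ), src, count )
    #endif

The complex-type triage that resumes below is a case in point for the two-level scheme: cuComplex and thrust types exist only on the CUDA side, so those branches still test __CUDACC__ rather than the new macro.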
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -793,6 +800,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_CUCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -818,7 +831,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -874,7 +887,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -895,6 +908,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -921,21 +936,21 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? " == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -966,7 +981,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1062,14 +1077,14 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1077,7 +1092,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? 
" == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc index afeebde3c6..0e4d5d1157 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -46,7 +45,7 @@ // Class member functions for calculating the matrix elements for // Process: g g > t t~ g WEIGHTED<=3 @1 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -80,7 +79,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -90,7 +89,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -118,13 +117,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -151,7 +150,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -187,7 +186,7 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity < nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and 
independent couplings @@ -200,8 +199,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -505,7 +506,7 @@ namespace mg5amcCpu { 1, -8, 10, 1, 64, -8 }, { 10, 1, 1, -8, -8, 64 } }; // 2-D array[6][6] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -562,7 +563,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -621,7 +622,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -684,8 +685,8 @@ namespace mg5amcCpu { 1, 1, 1, 1, 1 }, { 1, 1, 1, -1, -1 }, { 1, 1, 1, -1, 1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -726,9 +727,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -765,7 +766,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] 
// [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -830,12 +831,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -856,7 +857,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -982,9 +983,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -1008,7 +1009,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -1028,7 +1029,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 256 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -1042,9 +1043,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -1072,7 +1076,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -1282,7 +1286,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/CPPProcess.h index 37d6ebe981..11f562273e 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -107,7 +107,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -120,7 +120,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -150,7 +150,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/CudaRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/GpuAbstraction.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/GpuAbstraction.h new file mode 120000 index 0000000000..72054e19ba --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/GpuAbstraction.h @@ -0,0 +1 @@ +../GpuAbstraction.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/GpuRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/GpuRuntime.h new file mode 120000 index 0000000000..3920e83be4 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/GpuRuntime.h @@ -0,0 +1 @@ +../GpuRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/check_sa.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/check_sa.cc index 3fbf0ffbee..7cac5ab47b 100644 --- 
a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/check_sa.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -65,7 +66,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -96,7 +97,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -134,9 +135,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784) + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) +#elif defined __HIPCC__ +#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random #elif defined __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU if build has curand + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #endif @@ -146,10 +149,10 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) 
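The hunks that follow replace the CudaRuntime object at the top of main with a backend-neutral GpuRuntime. The class body lives in the new GpuRuntime.h, which this excerpt only links in; here is a minimal sketch consistent with the behaviour described in the added comments, assuming the gpuSetDevice/gpuDeviceReset aliases of the abstraction layer (the real header may differ in detail):

    // Hedged sketch of the GpuRuntime idea, not the actual GpuRuntime.h.
    #include "GpuAbstraction.h"
    struct GpuRuntime
    {
      GpuRuntime( const bool debug = true )
        : m_debug( debug )
      {
        gpuSetDevice( 0 ); // cudaSetDevice( 0 ) on CUDA, hipSetDevice( 0 ) on HIP
      }
      ~GpuRuntime() { gpuDeviceReset(); } // the "booked" reset runs when main goes out of scope
      const bool m_debug; // kept only for parity with the GpuRuntime( debug ) call site
    };

Instantiating this as a local variable in main ties device setup and teardown to the lifetime of the whole application, exactly as the old CudaRuntime did for CUDA alone. The bool bridge flag declared next is unrelated to this runtime object: it selects the Fortran-bridge emulation mode flagged in the comment above.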
bool bridge = false; @@ -177,7 +180,7 @@ main( argc, char** argv ) else if( arg == "--curdev" ) { #ifndef __CUDACC__ - throw std::runtime_error( "CurandDevice is not supported on CPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" ); #elif defined MGONGPU_HAS_NO_CURAND throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" ); #else @@ -198,7 +201,7 @@ main( argc, char** argv ) } else if( arg == "--rmbdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -272,13 +275,13 @@ main( argc, char** argv ) return usage( argv[0] ); } -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -296,14 +299,14 @@ main( argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL - // --- 00. Initialise cuda - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - const std::string cdinKey = "00 CudaInit"; + // --- 00. Initialise GPU + // Instantiate a GpuRuntime at the beginning of the application's main. + // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. + const std::string cdinKey = "00 GpuInit"; timermap.start( cdinKey ); - CudaRuntime cudaRuntime( debug ); + GpuRuntime gpuRuntime( debug ); #endif // --- 0a.
Initialise physics process @@ -325,7 +328,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -333,7 +336,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -341,7 +344,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -349,7 +352,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -366,7 +369,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -375,7 +378,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -384,7 +387,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -392,7 +395,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -400,7 +403,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -438,7 +441,7 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! 
CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } @@ -450,7 +453,7 @@ main( argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -461,7 +464,7 @@ main( argc, char** argv ) std::unique_ptr<MatrixElementKernelBase> pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -469,7 +472,7 @@ main( argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -511,7 +514,7 @@ main( argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -543,7 +546,7 @@ main( argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -588,7 +591,7 @@ main( argc, char** argv ) dynamic_cast<BridgeKernelBase*>( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -617,7 +620,7 @@ main( argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -760,18 +763,22 @@ main( argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; -#endif +#endif /* clang-format off */ // -- DOUBLE or FLOAT? #if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) @@ -781,7 +788,7 @@ main( argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif +#endif /* clang-format on */ // -- CUCOMPLEX or THRUST or STD complex numbers?
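Before the summary-printout hunks continue below, one pattern from the buffer allocations above is worth spelling out: CPU-only builds use plain HostBuffer types, while GPU builds pair PinnedHostBuffer variants with device buffers. Pinned (page-locked) host memory cannot be paged out, so host-device copies run faster and can be issued asynchronously. A minimal sketch of the distinction, where gpuMallocHost is an assumed alias for cudaMallocHost or hipHostMalloc, not a name taken from this patch:

    // Hedged sketch: pageable vs pinned host allocation for an event buffer.
    #include <cstdlib>
    template<typename T>
    T* allocHostBuffer( const size_t nevt, const bool pinned )
    {
      if( !pinned ) return static_cast<T*>( std::malloc( nevt * sizeof( T ) ) ); // pageable, CPU-only builds
      T* out = nullptr;
      gpuMallocHost( (void**)&out, nevt * sizeof( T ) ); // page-locked, GPU builds: faster, async-capable copies
      return out;
    }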
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -793,6 +800,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_CUCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -818,7 +831,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -874,7 +887,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -895,6 +908,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -921,21 +936,21 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? " == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -966,7 +981,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1062,14 +1077,14 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1077,7 +1092,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? 
" == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/CPPProcess.cc index b7e3475679..e098c03e3a 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -49,7 +48,7 @@ // Process: g d > t t~ d WEIGHTED<=3 @1 // Process: g s > t t~ s WEIGHTED<=3 @1 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +82,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -93,7 +92,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -121,13 +120,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -154,7 +153,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -190,7 +189,7 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity < nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings @@ 
-203,8 +202,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -341,7 +342,7 @@ namespace mg5amcCpu { 4, 0, 12, 4 }, { 0, 4, 4, 12 } }; // 2-D array[4][4] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -398,7 +399,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -457,7 +458,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -520,8 +521,8 @@ namespace mg5amcCpu { 1, -1, 1, 1, 1 }, { 1, -1, 1, -1, -1 }, { 1, -1, 1, -1, 1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -562,9 +563,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -601,7 +602,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] 
// [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -666,12 +667,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -692,7 +693,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -818,9 +819,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -844,7 +845,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -864,7 +865,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 96 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -878,9 +879,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -908,7 +912,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -1118,7 +1122,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/CPPProcess.h index bf037c6c28..ce22572055 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -110,7 +110,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -123,7 +123,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -153,7 +153,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/CudaRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/GpuAbstraction.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/GpuAbstraction.h new file mode 120000 index 0000000000..72054e19ba --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/GpuAbstraction.h @@ -0,0 +1 @@ +../GpuAbstraction.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/GpuRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/GpuRuntime.h new file mode 120000 index 0000000000..3920e83be4 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/GpuRuntime.h @@ -0,0 +1 @@ +../GpuRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/check_sa.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/check_sa.cc index 3fbf0ffbee..7cac5ab47b 100644 --- 
a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/check_sa.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -65,7 +66,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -96,7 +97,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -134,9 +135,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784) + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) +#elif defined __HIPCC__ +#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random #elif defined __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU if build has curand + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #endif @@ -146,10 +149,10 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) 
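The same check_sa.cc changes now repeat for P1_gu_ttxu: each subprocess directory carries its own generated copy of these files. One further detail from the CPPProcess.cc hunks above: every explicit checkCuda( cudaMemcpyToSymbol( ... ) ) becomes a bare gpuMemcpyToSymbol( ... ), suggesting the status check is folded into the abstraction macro itself. A sketch of that pattern for the CUDA branch, where checkGpu and assertGpu are assumed names rather than names from this patch:

    // Hedged sketch: fold the error check into the gpu* macro so call sites stay clean.
    #include <cassert>
    #include <cstdio>
    #ifdef __CUDACC__
    inline void assertGpu( cudaError_t code, const char* file, int line )
    {
      if( code != cudaSuccess )
      {
        printf( "GPUassert: %s %s:%d\n", cudaGetErrorString( code ), file, line );
        assert( code == cudaSuccess );
      }
    }
    #define checkGpu( code ) assertGpu( code, __FILE__, __LINE__ )
    #define gpuMemcpyToSymbol( symbol, src, count ) checkGpu( cudaMemcpyToSymbol( symbol, src, count ) )
    #endif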
bool bridge = false; @@ -177,7 +180,7 @@ main( argc, char** argv ) else if( arg == "--curdev" ) { #ifndef __CUDACC__ - throw std::runtime_error( "CurandDevice is not supported on CPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" ); #elif defined MGONGPU_HAS_NO_CURAND throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" ); #else @@ -198,7 +201,7 @@ main( argc, char** argv ) } else if( arg == "--rmbdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -272,13 +275,13 @@ main( argc, char** argv ) return usage( argv[0] ); } -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -296,14 +299,14 @@ main( argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL - // --- 00. Initialise cuda - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - const std::string cdinKey = "00 CudaInit"; + // --- 00. Initialise GPU + // Instantiate a GpuRuntime at the beginning of the application's main. + // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. + const std::string cdinKey = "00 GpuInit"; timermap.start( cdinKey ); - CudaRuntime cudaRuntime( debug ); + GpuRuntime gpuRuntime( debug ); #endif // --- 0a.
Initialise physics process @@ -325,7 +328,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -333,7 +336,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -341,7 +344,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -349,7 +352,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -366,7 +369,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -375,7 +378,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -384,7 +387,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -392,7 +395,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -400,7 +403,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -438,7 +441,7 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! 
CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } @@ -450,7 +453,7 @@ main( argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -461,7 +464,7 @@ main( argc, char** argv ) std::unique_ptr<MatrixElementKernelBase> pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -469,7 +472,7 @@ main( argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -511,7 +514,7 @@ main( argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -543,7 +546,7 @@ main( argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -588,7 +591,7 @@ main( argc, char** argv ) dynamic_cast<BridgeKernelBase*>( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -617,7 +620,7 @@ main( argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -760,18 +763,22 @@ main( argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; -#endif +#endif /* clang-format off */ // -- DOUBLE or FLOAT? #if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) @@ -781,7 +788,7 @@ main( argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif +#endif /* clang-format on */ // -- CUCOMPLEX or THRUST or STD complex numbers?
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -793,6 +800,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_CUCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -818,7 +831,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -874,7 +887,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -895,6 +908,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -921,21 +936,21 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? " == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -966,7 +981,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1062,14 +1077,14 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1077,7 +1092,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? 
" == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/CPPProcess.cc index 0f999663da..7308f8a2c7 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -49,7 +48,7 @@ // Process: g d~ > t t~ d~ WEIGHTED<=3 @1 // Process: g s~ > t t~ s~ WEIGHTED<=3 @1 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +82,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -93,7 +92,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -121,13 +120,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -154,7 +153,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -190,7 +189,7 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity < nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent 
couplings @@ -203,8 +202,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -341,7 +342,7 @@ namespace mg5amcCpu { 4, 0, 12, 4 }, { 0, 4, 4, 12 } }; // 2-D array[4][4] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -398,7 +399,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -457,7 +458,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -520,8 +521,8 @@ namespace mg5amcCpu { 1, 1, 1, 1, -1 }, { 1, 1, 1, -1, 1 }, { 1, 1, 1, -1, -1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -562,9 +563,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -601,7 +602,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] 
// [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -666,12 +667,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -692,7 +693,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -818,9 +819,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -844,7 +845,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -864,7 +865,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 96 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -878,9 +879,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -908,7 +912,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -1118,7 +1122,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/CPPProcess.h index 0f49f5247b..46c4347506 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -110,7 +110,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -123,7 +123,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -153,7 +153,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/CudaRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/GpuAbstraction.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/GpuAbstraction.h new file mode 120000 index 0000000000..72054e19ba --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/GpuAbstraction.h @@ -0,0 +1 @@ +../GpuAbstraction.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/GpuRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/GpuRuntime.h new file mode 120000 index 0000000000..3920e83be4 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/GpuRuntime.h @@ -0,0 +1 @@ +../GpuRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/check_sa.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/check_sa.cc index 3fbf0ffbee..7cac5ab47b 100644 --- 
a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/check_sa.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -65,7 +66,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -96,7 +97,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -134,9 +135,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784) + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) +#elif defined __HIPCC__ +#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random #elif defined __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU if build has curand + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #endif @@ -146,10 +149,10 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) 
bool bridge = false;
@@ -177,7 +180,7 @@ main( int argc, char** argv )
 else if( arg == "--curdev" )
 {
#ifndef __CUDACC__
- throw std::runtime_error( "CurandDevice is not supported on CPUs" );
+ throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" );
#elif defined MGONGPU_HAS_NO_CURAND
 throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" );
#else
@@ -198,7 +201,7 @@ main( int argc, char** argv )
 }
 else if( arg == "--rmbdev" )
 {
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 rmbsmp = RamboSamplingMode::RamboDevice;
#else
 throw std::runtime_error( "RamboDevice is not supported on CPUs" );
@@ -272,13 +275,13 @@ main( int argc, char** argv )
 return usage( argv[0] );
 }
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
#ifdef _OPENMP
 ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1)
#endif
#endif
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
 // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation
 // Note: this prevents a crash on pmpe04 but not on some github CI nodes?
 // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!]
@@ -296,14 +299,14 @@ main( int argc, char** argv )
 // === STEP 0 - INITIALISE
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
- // --- 00. Initialise cuda
- // Instantiate a CudaRuntime at the beginnining of the application's main to
- // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor
- const std::string cdinKey = "00 CudaInit";
+ // --- 00. Initialise GPU
+ // Instantiate a GpuRuntime at the beginning of the application's main.
+ // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor.
+ const std::string cdinKey = "00 GpuInit";
 timermap.start( cdinKey );
- CudaRuntime cudaRuntime( debug );
+ GpuRuntime gpuRuntime( debug );
#endif
 // --- 0a.
Initialise physics process @@ -325,7 +328,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -333,7 +336,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -341,7 +344,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -349,7 +352,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -366,7 +369,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -375,7 +378,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -384,7 +387,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -392,7 +395,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -400,7 +403,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -438,7 +441,7 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! 
CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } @@ -450,7 +453,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -461,7 +464,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -469,7 +472,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -511,7 +514,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -543,7 +546,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -588,7 +591,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -617,7 +620,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -760,18 +763,22 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; -#endif +#endif /* clang-format off */ // -- DOUBLE or FLOAT? #if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) @@ -781,7 +788,7 @@ main( int argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif +#endif /* clang-format on */ // -- CUCOMPLEX or THRUST or STD complex numbers? 
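One more note on the buffer hunks above, before the complex-type tags: on GPU builds the host-side buffers switch from HostBuffer* to PinnedHostBuffer*. Pinned (page-locked) host memory lets the CUDA/HIP driver DMA directly between host and device, which is what makes the CopyHToD/CopyDToH steps timed in this file fast. The real buffer templates live in MemoryBuffers.h and are not shown in this section; a minimal sketch of the idea for the CUDA backend (class name and members are hypothetical):

#include <cstddef>
#include <cuda_runtime.h>

// Sketch only: RAII page-locked host buffer for fast host<->device copies
template<typename T>
class PinnedHostBufferSketch
{
public:
  explicit PinnedHostBufferSketch( std::size_t size )
    : m_size( size ), m_data( nullptr )
  {
    cudaMallocHost( reinterpret_cast<void**>( &m_data ), size * sizeof( T ) ); // page-locked allocation
  }
  ~PinnedHostBufferSketch() { cudaFreeHost( m_data ); }
  PinnedHostBufferSketch( const PinnedHostBufferSketch& ) = delete; // owning pointer: no copies
  PinnedHostBufferSketch& operator=( const PinnedHostBufferSketch& ) = delete;
  T* data() { return m_data; }
  std::size_t size() const { return m_size; }
private:
  std::size_t m_size;
  T* m_data;
};

A cudaMemcpyAsync from such a buffer can overlap with kernel execution, whereas a copy from ordinary pageable memory is first staged through an internal pinned buffer by the driver.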
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -793,6 +800,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_CUCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -818,7 +831,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -874,7 +887,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -895,6 +908,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -921,21 +936,21 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? " == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -966,7 +981,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1062,14 +1077,14 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1077,7 +1092,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? 
" == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/CPPProcess.cc index 87830582d7..b37df5d33f 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -49,7 +48,7 @@ // Process: d d~ > t t~ g WEIGHTED<=3 @1 // Process: s s~ > t t~ g WEIGHTED<=3 @1 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +82,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -93,7 +92,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -121,13 +120,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -154,7 +153,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -190,7 +189,7 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity < nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent 
couplings @@ -203,8 +202,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -341,7 +342,7 @@ namespace mg5amcCpu { 4, 0, 12, 4 }, { 0, 4, 4, 12 } }; // 2-D array[4][4] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -398,7 +399,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -457,7 +458,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -520,8 +521,8 @@ namespace mg5amcCpu { -1, 1, 1, 1, 1 }, { -1, 1, 1, -1, -1 }, { -1, 1, 1, -1, 1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -562,9 +563,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -601,7 +602,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] 
// [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -666,12 +667,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -692,7 +693,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -818,9 +819,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -844,7 +845,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -864,7 +865,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 36 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -878,9 +879,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -908,7 +912,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -1118,7 +1122,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/CPPProcess.h index f8bdb38aee..fc7c0d8196 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -110,7 +110,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -123,7 +123,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -153,7 +153,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/CudaRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/GpuAbstraction.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/GpuAbstraction.h new file mode 120000 index 0000000000..72054e19ba --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/GpuAbstraction.h @@ -0,0 +1 @@ +../GpuAbstraction.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/GpuRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/GpuRuntime.h new file mode 120000 index 0000000000..3920e83be4 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/GpuRuntime.h @@ -0,0 +1 @@ +../GpuRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/check_sa.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/check_sa.cc index 3fbf0ffbee..7cac5ab47b 100644 --- 
a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/check_sa.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -65,7 +66,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -96,7 +97,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -134,9 +135,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784) + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) +#elif defined __HIPCC__ +#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random #elif defined __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU if build has curand + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #endif @@ -146,10 +149,10 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) 
bool bridge = false;
@@ -177,7 +180,7 @@ main( int argc, char** argv )
 else if( arg == "--curdev" )
 {
#ifndef __CUDACC__
- throw std::runtime_error( "CurandDevice is not supported on CPUs" );
+ throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" );
#elif defined MGONGPU_HAS_NO_CURAND
 throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" );
#else
@@ -198,7 +201,7 @@ main( int argc, char** argv )
 }
 else if( arg == "--rmbdev" )
 {
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 rmbsmp = RamboSamplingMode::RamboDevice;
#else
 throw std::runtime_error( "RamboDevice is not supported on CPUs" );
@@ -272,13 +275,13 @@ main( int argc, char** argv )
 return usage( argv[0] );
 }
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
#ifdef _OPENMP
 ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1)
#endif
#endif
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
 // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation
 // Note: this prevents a crash on pmpe04 but not on some github CI nodes?
 // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!]
@@ -296,14 +299,14 @@ main( int argc, char** argv )
 // === STEP 0 - INITIALISE
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
- // --- 00. Initialise cuda
- // Instantiate a CudaRuntime at the beginnining of the application's main to
- // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor
- const std::string cdinKey = "00 CudaInit";
+ // --- 00. Initialise GPU
+ // Instantiate a GpuRuntime at the beginning of the application's main.
+ // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor.
+ const std::string cdinKey = "00 GpuInit";
 timermap.start( cdinKey );
- CudaRuntime cudaRuntime( debug );
+ GpuRuntime gpuRuntime( debug );
#endif
 // --- 0a.
Initialise physics process @@ -325,7 +328,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -333,7 +336,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -341,7 +344,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -349,7 +352,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -366,7 +369,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -375,7 +378,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -384,7 +387,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -392,7 +395,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -400,7 +403,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -438,7 +441,7 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! 
CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } @@ -450,7 +453,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -461,7 +464,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -469,7 +472,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -511,7 +514,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -543,7 +546,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -588,7 +591,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -617,7 +620,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -760,18 +763,22 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; -#endif +#endif /* clang-format off */ // -- DOUBLE or FLOAT? #if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) @@ -781,7 +788,7 @@ main( int argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif +#endif /* clang-format on */ // -- CUCOMPLEX or THRUST or STD complex numbers? 
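Stepping back to the '00 GpuInit' step above: the GpuRuntime object replaces the old CudaRuntime, and per the comment in that hunk, for CUDA it invokes cudaSetDevice(0) in its constructor and books a cudaDeviceReset() call in its destructor. The new GpuRuntime.h is only added as a symlink in this section, so the following RAII sketch is an assumption about its shape, not its actual contents:

#include <cuda_runtime.h>
#include <iostream>

// Sketch only: RAII wrapper that initialises the GPU for the whole of main()
struct GpuRuntimeSketch
{
  GpuRuntimeSketch( bool debug = false ) : m_debug( debug )
  {
    if( m_debug ) std::cout << "GpuRuntime: setting device 0" << std::endl;
    cudaSetDevice( 0 ); // create the CUDA context up front, outside the timed loops
  }
  ~GpuRuntimeSketch()
  {
    if( m_debug ) std::cout << "GpuRuntime: resetting device" << std::endl;
    cudaDeviceReset(); // tear the context down so leak checkers see a clean exit
  }
  bool m_debug;
};

Constructing this once at the top of main() keeps the one-off context-creation cost out of the kernel timings reported further down.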
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -793,6 +800,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_CUCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -818,7 +831,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -874,7 +887,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -895,6 +908,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -921,21 +936,21 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? " == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -966,7 +981,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1062,14 +1077,14 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1077,7 +1092,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? 
" == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/CPPProcess.cc index 9051b3108d..b4df38fb35 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -46,7 +45,7 @@ // Class member functions for calculating the matrix elements for // Process: g g > t t~ g g WEIGHTED<=4 @2 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -80,7 +79,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -90,7 +89,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -118,13 +117,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -151,7 +150,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -187,7 +186,7 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity < nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both 
dependent and independent couplings @@ -200,8 +199,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -2417,7 +2418,7 @@ namespace mg5amcCpu { 62, 71, -10, 80, -1, 8, -28, 62, 62, -10, -10, -1, -1, 8, -10, -1, -64, 8, 8, -64, 80, 8, 512, -64 }, { -28, 62, 62, -10, -10, -1, 62, 71, -10, 80, -1, 8, -10, -1, -1, 8, 8, -64, 80, 8, 8, -64, -64, 512 } }; // 2-D array[24][24] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -2474,7 +2475,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -2533,7 +2534,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -2628,8 +2629,8 @@ namespace mg5amcCpu { 1, 1, 1, -1, -1, 1 }, { 1, 1, 1, -1, 1, -1 }, { 1, 1, 1, -1, 1, 1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -2671,9 +2672,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -2711,7 +2712,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] 
// [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -2776,12 +2777,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -2802,7 +2803,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -2928,9 +2929,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -2954,7 +2955,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -2974,7 +2975,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 512 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -2988,9 +2989,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -3018,7 +3022,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -3228,7 +3232,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/CPPProcess.h index 9f43559181..511b053c2a 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -107,7 +107,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -120,7 +120,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -150,7 +150,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/CudaRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/GpuAbstraction.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/GpuAbstraction.h new file mode 120000 index 0000000000..72054e19ba --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/GpuAbstraction.h @@ -0,0 +1 @@ +../GpuAbstraction.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/GpuRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/GpuRuntime.h new file mode 120000 index 0000000000..3920e83be4 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/GpuRuntime.h @@ -0,0 +1 @@ +../GpuRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/check_sa.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/check_sa.cc index 3fbf0ffbee..7cac5ab47b 100644 --- 
a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/check_sa.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -65,7 +66,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -96,7 +97,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -134,9 +135,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784) + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) +#elif defined __HIPCC__ +#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random #elif defined __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU if build has curand + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #endif @@ -146,10 +149,10 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) 
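// ---------------------------------------------------------------------------
// Note on the RandomNumberMode hunk above: curand is an NVidia-only library, so a
// HIP build is expected to be configured with MGONGPU_HAS_NO_CURAND and to default
// to CommonRandom; the new #error branch merely guards against a misconfigured
// build (hence "Internal error"). A self-contained restatement of the defaults
// (sketch only; in the real code RandomNumberMode is an enum local to main, and
// only CurandDevice = 2 is visible in this excerpt):
enum class RndMode { CommonRandom = 0, CurandHost = 1, CurandDevice = 2 }; // hypothetical mirror of the local enum
inline RndMode defaultRndMode()
{
#ifdef MGONGPU_HAS_NO_CURAND
  return RndMode::CommonRandom; // the only option without curand (includes HIP/AMD builds)
#elif defined __CUDACC__
  return RndMode::CurandDevice; // curand generates directly on the NVidia GPU
#else
  return RndMode::CurandHost; // curand generates on the host CPU (C++ builds with curand)
#endif
}
// ---------------------------------------------------------------------------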
bool bridge = false; @@ -177,7 +180,7 @@ main( int argc, char** argv ) else if( arg == "--curdev" ) { #ifndef __CUDACC__ - throw std::runtime_error( "CurandDevice is not supported on CPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" ); #elif defined MGONGPU_HAS_NO_CURAND throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" ); #else @@ -198,7 +201,7 @@ main( int argc, char** argv ) } else if( arg == "--rmbdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -272,13 +275,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -296,14 +299,14 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL - // --- 00. Initialise cuda - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - const std::string cdinKey = "00 CudaInit"; + // --- 00. Initialise GPU + // Instantiate a GpuRuntime at the beginning of the application's main. + // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. + const std::string cdinKey = "00 GpuInit"; timermap.start( cdinKey ); - CudaRuntime cudaRuntime( debug ); + GpuRuntime gpuRuntime( debug ); #endif // --- 0a.
Initialise physics process @@ -325,7 +328,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -333,7 +336,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -341,7 +344,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -349,7 +352,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -366,7 +369,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -375,7 +378,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -384,7 +387,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -392,7 +395,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -400,7 +403,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -438,7 +441,7 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! 
CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } @@ -450,7 +453,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -461,7 +464,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -469,7 +472,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -511,7 +514,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -543,7 +546,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -588,7 +591,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -617,7 +620,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -760,18 +763,22 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; -#endif +#endif /* clang-format off */ // -- DOUBLE or FLOAT? #if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) @@ -781,7 +788,7 @@ main( int argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif +#endif /* clang-format on */ // -- CUCOMPLEX or THRUST or STD complex numbers? 
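// ---------------------------------------------------------------------------
// The CPPProcess.cc hunks elsewhere in this patch replace direct
// checkCuda( cudaMemcpyToSymbol( ... ) ) calls with the backend-neutral
// gpuMemcpyToSymbol( ... ). GpuAbstraction.h itself is not shown in this excerpt;
// a minimal sketch of the kind of mapping it is expected to provide (checkGpu is
// a hypothetical error-checking wrapper in the spirit of the old checkCuda):
#if defined __CUDACC__ // CUDA backend
#define gpuMemcpyToSymbol( symbol, src, bytes ) checkGpu( cudaMemcpyToSymbol( symbol, src, bytes ) )
#elif defined __HIPCC__ // HIP backend (HIP_SYMBOL wraps the symbol name for hipMemcpyToSymbol)
#define gpuMemcpyToSymbol( symbol, src, bytes ) checkGpu( hipMemcpyToSymbol( HIP_SYMBOL( symbol ), src, bytes ) )
#endif
// ---------------------------------------------------------------------------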
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -793,6 +800,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_CUCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -818,7 +831,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -874,7 +887,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -895,6 +908,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -921,21 +936,21 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? " == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -966,7 +981,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1062,14 +1077,14 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1077,7 +1092,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? 
" == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/CPPProcess.cc index 866433ae8b..bc38d1f109 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -49,7 +48,7 @@ // Process: g g > t t~ d d~ WEIGHTED<=4 @2 // Process: g g > t t~ s s~ WEIGHTED<=4 @2 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +82,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -93,7 +92,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -121,13 +120,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -154,7 +153,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -190,7 +189,7 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity < nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent 
couplings @@ -203,8 +202,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -812,7 +813,7 @@ namespace mg5amcCpu { -2, 6, -6, -2, 16, 0, 0, -2, 16, 6, 48, 16 }, { 6, -2, -2, -6, 0, 16, -2, 0, 6, 16, 16, 48 } }; // 2-D array[12][12] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -869,7 +870,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -928,7 +929,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -1023,8 +1024,8 @@ namespace mg5amcCpu { 1, 1, 1, -1, -1, -1 }, { 1, 1, 1, -1, 1, 1 }, { 1, 1, 1, -1, 1, -1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -1066,9 +1067,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -1106,7 +1107,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] 
// [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -1171,12 +1172,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -1197,7 +1198,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -1323,9 +1324,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -1349,7 +1350,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -1369,7 +1370,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 256 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -1383,9 +1384,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -1413,7 +1417,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -1623,7 +1627,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/CPPProcess.h index f26b60c5bb..c411623fc8 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -110,7 +110,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -123,7 +123,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -153,7 +153,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/CudaRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/GpuAbstraction.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/GpuAbstraction.h new file mode 120000 index 0000000000..72054e19ba --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/GpuAbstraction.h @@ -0,0 +1 @@ +../GpuAbstraction.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/GpuRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/GpuRuntime.h new file mode 120000 index 0000000000..3920e83be4 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/GpuRuntime.h @@ -0,0 +1 @@ +../GpuRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/check_sa.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/check_sa.cc index 3fbf0ffbee..7cac5ab47b 100644 --- 
a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/check_sa.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -65,7 +66,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -96,7 +97,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -134,9 +135,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784) + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) +#elif defined __HIPCC__ +#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random #elif defined __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU if build has curand + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #endif @@ -146,10 +149,10 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) 
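// ---------------------------------------------------------------------------
// The "00 GpuInit" hunk below swaps the CUDA-only CudaRuntime helper for the new
// GpuRuntime one. GpuRuntime.h itself is not shown in this excerpt; going by the
// comments in the hunk, its CUDA side is expected to be an RAII wrapper roughly
// like this sketch (the name GpuRuntimeSketch and the checkGpu error wrapper are
// illustrative assumptions):
#include <cuda_runtime.h>
struct GpuRuntimeSketch
{
  GpuRuntimeSketch( bool debug ) : m_debug( debug ) { checkGpu( cudaSetDevice( 0 ) ); } // select device 0 at startup
  ~GpuRuntimeSketch() { cudaDeviceReset(); } // book a device reset for application exit (flushes profiler data)
  const bool m_debug; // verbosity flag forwarded from check_sa's 'debug' variable
};
// ---------------------------------------------------------------------------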
bool bridge = false; @@ -177,7 +180,7 @@ main( int argc, char** argv ) else if( arg == "--curdev" ) { #ifndef __CUDACC__ - throw std::runtime_error( "CurandDevice is not supported on CPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" ); #elif defined MGONGPU_HAS_NO_CURAND throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" ); #else @@ -198,7 +201,7 @@ main( int argc, char** argv ) } else if( arg == "--rmbdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -272,13 +275,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -296,14 +299,14 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL - // --- 00. Initialise cuda - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - const std::string cdinKey = "00 CudaInit"; + // --- 00. Initialise GPU + // Instantiate a GpuRuntime at the beginning of the application's main. + // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. + const std::string cdinKey = "00 GpuInit"; timermap.start( cdinKey ); - CudaRuntime cudaRuntime( debug ); + GpuRuntime gpuRuntime( debug ); #endif // --- 0a.
Initialise physics process @@ -325,7 +328,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -333,7 +336,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -341,7 +344,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -349,7 +352,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -366,7 +369,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -375,7 +378,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -384,7 +387,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -392,7 +395,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -400,7 +403,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -438,7 +441,7 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! 
CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } @@ -450,7 +453,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -461,7 +464,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -469,7 +472,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -511,7 +514,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -543,7 +546,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -588,7 +591,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -617,7 +620,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -760,18 +763,22 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; -#endif +#endif /* clang-format off */ // -- DOUBLE or FLOAT? #if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) @@ -781,7 +788,7 @@ main( int argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif +#endif /* clang-format on */ // -- CUCOMPLEX or THRUST or STD complex numbers? 
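// ---------------------------------------------------------------------------
// Aside on the helcolDenominators constants in the three CPPProcess.cc copies
// touched by this patch (512 for P2_gg_ttxgg and 256 for P2_gg_ttxuux above, 96
// for P2_gu_ttxgu below): each folds together the average over initial-state
// helicities and colors and the final-state identical-particle factor. A worked
// compile-time check of that arithmetic:
constexpr int denomGGttxGG = ( 2 * 2 ) * ( 8 * 8 ) * 2;  // 4 initial helicities * 64 gg colors * 2! identical final gluons
constexpr int denomGGttxUUx = ( 2 * 2 ) * ( 8 * 8 ) * 1; // 4 initial helicities * 64 gg colors, no identical final particles
constexpr int denomGUttxGU = ( 2 * 2 ) * ( 8 * 3 ) * 1;  // 4 initial helicities * 24 gu colors
static_assert( denomGGttxGG == 512, "P2_gg_ttxgg denominator" );
static_assert( denomGGttxUUx == 256, "P2_gg_ttxuux denominator" );
static_assert( denomGUttxGU == 96, "P2_gu_ttxgu denominator" );
// ---------------------------------------------------------------------------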
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -793,6 +800,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_CUCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -818,7 +831,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -874,7 +887,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -895,6 +908,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -921,21 +936,21 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? " == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -966,7 +981,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1062,14 +1077,14 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1077,7 +1092,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? 
" == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/CPPProcess.cc index 1be98364ee..a17bd3518e 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -49,7 +48,7 @@ // Process: g d > t t~ g d WEIGHTED<=4 @2 // Process: g s > t t~ g s WEIGHTED<=4 @2 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +82,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -93,7 +92,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -121,13 +120,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -154,7 +153,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -190,7 +189,7 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity < nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent 
couplings @@ -203,8 +202,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -812,7 +813,7 @@ namespace mg5amcCpu { 0, -2, -6, -2, -2, 6, 16, 0, 6, 16, 48, 16 }, { -2, 0, -2, -6, 6, -2, 0, 16, 16, 6, 16, 48 } }; // 2-D array[12][12] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -869,7 +870,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -928,7 +929,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -1023,8 +1024,8 @@ namespace mg5amcCpu { 1, -1, 1, -1, -1, 1 }, { 1, -1, 1, -1, 1, -1 }, { 1, -1, 1, -1, 1, 1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -1066,9 +1067,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -1106,7 +1107,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] 
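A note on the hunks above: checkCuda( cudaMemcpyToSymbol( ... ) ) becomes a vendor-neutral gpuMemcpyToSymbol( ... ), and the nvcc-only #pragma nv_diagnostic keeps an inner #ifdef __CUDACC__ guard inside the generic MGONGPUCPP_GPUIMPL block. The gpu* names come from the new GpuAbstraction.h, which this patch adds but which is not reproduced in this excerpt. What follows is only a sketch of the alias-layer pattern those call sites imply; the checkGpu wrapper and the exact macro signatures are assumptions, not the literal header contents:

// Sketch only (assumed shape, not the literal GpuAbstraction.h added by this patch):
// one alias layer per vendor API, so call sites can be written once with gpu* names.
#if defined __CUDACC__ // compiling with nvcc for NVidia GPUs
#include <cuda_runtime.h>
#define gpuError_t cudaError_t
#define gpuSuccess cudaSuccess
#define gpuMemcpyToSymbol( symbol, src, count ) \
  checkGpu( cudaMemcpyToSymbol( symbol, src, count ) ) // checkGpu: assumed error-check wrapper (the old code spelled out checkCuda)
#elif defined __HIPCC__ // compiling with hipcc for AMD GPUs
#include <hip/hip_runtime.h>
#define gpuError_t hipError_t
#define gpuSuccess hipSuccess
#define gpuMemcpyToSymbol( symbol, src, count ) \
  checkGpu( hipMemcpyToSymbol( HIP_SYMBOL( symbol ), src, count ) )
#endif

With error checking folded into the macro once, the call sites can use the bare gpuMemcpyToSymbol form, which is exactly the pattern visible in the cHel, cIPD, cNGoodHel and cGoodHel hunks here.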
// [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -1171,12 +1172,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -1197,7 +1198,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -1323,9 +1324,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -1349,7 +1350,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: color selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -1369,7 +1370,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 96 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in C++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -1383,9 +1384,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -1413,7 +1417,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -1623,7 +1627,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef
MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/CPPProcess.h index 853175b477..9c820a5ddb 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -110,7 +110,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -123,7 +123,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -153,7 +153,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/CudaRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/GpuAbstraction.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/GpuAbstraction.h new file mode 120000 index 0000000000..72054e19ba --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/GpuAbstraction.h @@ -0,0 +1 @@ +../GpuAbstraction.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/GpuRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/GpuRuntime.h new file mode 120000 index 0000000000..3920e83be4 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/GpuRuntime.h @@ -0,0 +1 @@ +../GpuRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/check_sa.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/check_sa.cc index 3fbf0ffbee..7cac5ab47b 100644 --- 
a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/check_sa.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -65,7 +66,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -96,7 +97,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -134,9 +135,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784) + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) +#elif defined __HIPCC__ +#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random #elif defined __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU if build has curand + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #endif @@ -146,10 +149,10 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) 
bool bridge = false; @@ -177,7 +180,7 @@ main( int argc, char** argv ) else if( arg == "--curdev" ) { #ifndef __CUDACC__ - throw std::runtime_error( "CurandDevice is not supported on CPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" ); #elif defined MGONGPU_HAS_NO_CURAND throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" ); #else @@ -198,7 +201,7 @@ main( int argc, char** argv ) } else if( arg == "--rmbdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -272,13 +275,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -296,14 +299,14 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL - // --- 00. Initialise cuda - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - const std::string cdinKey = "00 CudaInit"; + // --- 00. Initialise GPU + // Instantiate a GpuRuntime at the beginning of the application's main. + // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. + const std::string cdinKey = "00 GpuInit"; timermap.start( cdinKey ); - CudaRuntime cudaRuntime( debug ); + GpuRuntime gpuRuntime( debug ); #endif // --- 0a.
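The "00 GpuInit" step above pins down the contract of the new GpuRuntime class: pick a device when the object is constructed, reset it when the object goes out of scope, so the reset also runs on early exits from main. GpuRuntime.h itself is not reproduced in this excerpt, so the following is only a sketch of the RAII shape that the comment describes; the setUp/tearDown split, the gpuSetDevice/gpuDeviceReset aliases and the debug messages are assumptions:

#include <iostream>
#include "GpuAbstraction.h" // assumed to provide the gpuSetDevice / gpuDeviceReset aliases

// Sketch only: RAII wrapper matching the "00 GpuInit" comment above.
struct GpuRuntime final
{
  GpuRuntime( const bool debug = true ) : m_debug( debug ) { setUp( m_debug ); }
  ~GpuRuntime() { tearDown( m_debug ); } // runs on any exit path from main
  GpuRuntime( const GpuRuntime& ) = delete; // a single owner of the device context
  static void setUp( const bool debug )
  {
    if( debug ) std::cout << "GpuRuntime: setting device 0" << std::endl;
    gpuSetDevice( 0 ); // cudaSetDevice(0) or hipSetDevice(0) behind the alias
  }
  static void tearDown( const bool debug )
  {
    if( debug ) std::cout << "GpuRuntime: resetting device" << std::endl;
    gpuDeviceReset(); // cudaDeviceReset() or hipDeviceReset() behind the alias
  }
  const bool m_debug;
};

Instantiated once at the top of main, as in the hunk above, this guarantees that every later gpu* call happens between device setup and device reset.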
Initialise physics process @@ -325,7 +328,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -333,7 +336,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -341,7 +344,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -349,7 +352,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -366,7 +369,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -375,7 +378,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -384,7 +387,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -392,7 +395,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -400,7 +403,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -438,7 +441,7 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! 
CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } @@ -450,7 +453,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -461,7 +464,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -469,7 +472,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -511,7 +514,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -543,7 +546,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -588,7 +591,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -617,7 +620,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -760,18 +763,22 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; -#endif +#endif /* clang-format off */ // -- DOUBLE or FLOAT? #if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) @@ -781,7 +788,7 @@ main( int argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif +#endif /* clang-format on */ // -- CUCOMPLEX or THRUST or STD complex numbers? 
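The complex-type tag chain that follows (and the matching hunk already visible at the top of this excerpt) gains a HIP branch: cuComplex and thrust::complex are CUDA-ecosystem types, so a HIP build falls back to the plugin's own simple complex template, reported as the "CXS:" (MGONGPU_CUCXTYPE_CXSMPL) workflow tag. For orientation, a minimal sketch of the shape of such a type; the real one lives in mgOnGpuCxtypes.h, also touched by this patch, and the members and operators here are illustrative:

// Sketch only: a backend-neutral complex type in the spirit of cxsmpl.
template<typename FP>
class cxsmpl
{
public:
  __host__ __device__ constexpr cxsmpl( const FP r = FP{}, const FP i = FP{} )
    : m_real( r ), m_imag( i ) {}
  __host__ __device__ constexpr FP real() const { return m_real; }
  __host__ __device__ constexpr FP imag() const { return m_imag; }
private:
  FP m_real, m_imag;
};

// Complex multiplication works identically on host and device, with no
// dependency on CUDA toolkit headers.
template<typename FP>
__host__ __device__ constexpr cxsmpl<FP> operator*( const cxsmpl<FP>& a, const cxsmpl<FP>& b )
{
  return cxsmpl<FP>( a.real() * b.real() - a.imag() * b.imag(),
                     a.real() * b.imag() + a.imag() * b.real() );
}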
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -793,6 +800,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_CUCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -818,7 +831,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -874,7 +887,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -895,6 +908,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -921,21 +936,21 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? " == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -966,7 +981,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1062,14 +1077,14 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1077,7 +1092,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? 
" == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/CPPProcess.cc index dfb05016f5..6a53d09c8e 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -49,7 +48,7 @@ // Process: g d~ > t t~ g d~ WEIGHTED<=4 @2 // Process: g s~ > t t~ g s~ WEIGHTED<=4 @2 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +82,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -93,7 +92,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -121,13 +120,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -154,7 +153,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -190,7 +189,7 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity < nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and 
independent couplings @@ -203,8 +202,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -812,7 +813,7 @@ namespace mg5amcCpu { -2, 0, 0, 16, -2, -6, 6, -2, 16, 6, 48, 16 }, { 0, -2, 16, 0, -6, -2, -2, 6, 6, 16, 16, 48 } }; // 2-D array[12][12] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -869,7 +870,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -928,7 +929,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -1023,8 +1024,8 @@ namespace mg5amcCpu { 1, 1, 1, -1, -1, -1 }, { 1, 1, 1, -1, 1, 1 }, { 1, 1, 1, -1, 1, -1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -1066,9 +1067,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -1106,7 +1107,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] 
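The cHel and cIPD hunks above show the other half of the abstraction: on GPU the data still lives in __device__ __constant__ memory, with a file-scope static array as the C++ emulation, and only the copy call is now vendor-neutral. Stripped of the surrounding generated code, the pattern is the following self-contained sketch (gpuMemcpyToSymbol is the alias from GpuAbstraction.h; ncomb = 64 and npar = 6 match the 2^6 helicity combinations of the six external legs in these processes):

#include <cstring>

constexpr int ncomb = 64; // 2^6 helicity combinations for 6 external legs
constexpr int npar = 6;   // number of external particles

#ifdef MGONGPUCPP_GPUIMPL
__device__ __constant__ short cHel[ncomb][npar]; // true GPU constant memory
#else
static short cHel[ncomb][npar]; // C++ build: file-scope static emulation
#endif

// One-time copy of the helicity table at initialisation (the pattern in the hunks above).
void copyHelicityTable( const short( &tHel )[ncomb][npar] )
{
#ifdef MGONGPUCPP_GPUIMPL
  gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); // host-to-device, via the alias
#else
  memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); // plain host copy
#endif
}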
// [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -1171,12 +1172,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -1197,7 +1198,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -1323,9 +1324,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -1349,7 +1350,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: color selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -1369,7 +1370,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 96 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in C++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -1383,9 +1384,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -1413,7 +1417,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per GPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -1623,7 +1627,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef
MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/CPPProcess.h index e60cb5b6d7..a5a285b22d 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -110,7 +110,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -123,7 +123,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -153,7 +153,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/CudaRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/GpuAbstraction.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/GpuAbstraction.h new file mode 120000 index 0000000000..72054e19ba --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/GpuAbstraction.h @@ -0,0 +1 @@ +../GpuAbstraction.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/GpuRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/GpuRuntime.h new file mode 120000 index 0000000000..3920e83be4 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/GpuRuntime.h @@ -0,0 +1 @@ +../GpuRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/check_sa.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/check_sa.cc index 3fbf0ffbee..7cac5ab47b 100644 
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/check_sa.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -65,7 +66,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -96,7 +97,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -134,9 +135,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784) + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) +#elif defined __HIPCC__ +#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random #elif defined __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU if build has curand + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #endif @@ -146,10 +149,10 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) 
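"Bridge implies RamboHost" because the Bridge emulates the Fortran MadEvent flow, in which momenta are produced on the host: even the device variant of the bridge kernel is constructed from host buffers (see BridgeKernelDevice( hstMomenta, hstGs, ... ) further down) and manages its own internal transposition and host-to-device copies. As an interface sketch only; apart from transposeInputMomentaC2F, which appears below, the method names are assumptions:

using fptype = double; // or float, depending on MGONGPU_FPTYPE_*

// Sketch only: the device bridge consumes *host* buffers, unlike MatrixElementKernelDevice.
class BridgeKernelDevice
{
public:
  BridgeKernelDevice( const fptype* hstMomenta, // host-side momenta, as a Fortran caller would supply
                      /* ... other host buffers ... */
                      const int gpublocks,
                      const int gputhreads );
  void transposeInputMomentaC2F(); // reshuffle C++ AOSOA momenta into the Fortran layout
  void computeMatrixElements();    // assumed name: copies H2D, runs the kernels, copies MEs back D2H
};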
bool bridge = false; @@ -177,7 +180,7 @@ main( int argc, char** argv ) else if( arg == "--curdev" ) { #ifndef __CUDACC__ - throw std::runtime_error( "CurandDevice is not supported on CPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" ); #elif defined MGONGPU_HAS_NO_CURAND throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" ); #else @@ -198,7 +201,7 @@ main( int argc, char** argv ) } else if( arg == "--rmbdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -272,13 +275,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -296,14 +299,14 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL - // --- 00. Initialise cuda - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - const std::string cdinKey = "00 CudaInit"; + // --- 00. Initialise GPU + // Instantiate a GpuRuntime at the beginning of the application's main. + // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. + const std::string cdinKey = "00 GpuInit"; timermap.start( cdinKey ); - CudaRuntime cudaRuntime( debug ); + GpuRuntime gpuRuntime( debug ); #endif // --- 0a.
Initialise physics process @@ -325,7 +328,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -333,7 +336,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -341,7 +344,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -349,7 +352,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -366,7 +369,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -375,7 +378,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -384,7 +387,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -392,7 +395,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -400,7 +403,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -438,7 +441,7 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! 
CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } @@ -450,7 +453,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -461,7 +464,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -469,7 +472,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -511,7 +514,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -543,7 +546,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -588,7 +591,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -617,7 +620,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -760,18 +763,22 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; -#endif +#endif /* clang-format off */ // -- DOUBLE or FLOAT? #if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) @@ -781,7 +788,7 @@ main( int argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif +#endif /* clang-format on */ // -- CUCOMPLEX or THRUST or STD complex numbers? 
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -793,6 +800,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_CUCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -818,7 +831,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -874,7 +887,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -895,6 +908,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -921,21 +936,21 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? " == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -966,7 +981,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1062,14 +1077,14 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1077,7 +1092,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? 
" == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/CPPProcess.cc index ecef3e57ca..fedf955b6a 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -51,7 +50,7 @@ // Process: c s > t t~ c s WEIGHTED<=4 @2 // Process: d s > t t~ d s WEIGHTED<=4 @2 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -85,7 +84,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -95,7 +94,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -123,13 +122,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -156,7 +155,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -192,7 +191,7 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity < nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent 
couplings @@ -205,8 +204,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -387,7 +388,7 @@ namespace mg5amcCpu { 3, 9, 9, 3, 27, 9 }, { 9, 3, 3, 9, 9, 27 } }; // 2-D array[6][6] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -444,7 +445,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -503,7 +504,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -598,8 +599,8 @@ namespace mg5amcCpu { -1, -1, 1, -1, -1, 1 }, { -1, -1, 1, -1, 1, -1 }, { -1, -1, 1, -1, 1, 1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -641,9 +642,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -681,7 +682,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] 
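The C++ branch above again pre-computes a constexpr "triangular normalized color matrix" (#475): cf is symmetric and each row shares a normalization denominator, so the color sum only needs j <= i if off-diagonal terms are counted twice. A self-contained sketch of the idea with illustrative 2x2 numbers (the real cf is the array[6][6] above for u c > t t~ u c; the member layout and a row-independent denom are assumptions in line with the generated code):

using fptype = double;
constexpr int ncolor = 2; // illustrative only (the process above has ncolor = 6)
constexpr fptype cf[ncolor][ncolor] = { { 16, -2 }, { -2, 16 } }; // symmetric, illustrative values
constexpr fptype denom[ncolor] = { 3, 3 }; // per-row normalization, illustrative values

struct TriangularNormalizedColorMatrix
{
  fptype value[ncolor][ncolor]; // only j <= i is ever read
  constexpr TriangularNormalizedColorMatrix() : value()
  {
    for( int i = 0; i < ncolor; i++ )
    {
      value[i][i] = cf[i][i] / denom[i]; // diagonal term
      for( int j = 0; j < i; j++ )
        value[i][j] = 2 * cf[i][j] / denom[i]; // fold in the symmetric upper triangle
    }
  }
};
static constexpr TriangularNormalizedColorMatrix cf2{}; // evaluated at compile time

// The per-event color sum then visits only the lower triangle:
// |M|^2 += cf2.value[i][j] * ( jampR[i] * jampR[j] + jampI[i] * jampI[j] ) for j <= i.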
// [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -746,12 +747,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -772,7 +773,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -898,9 +899,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -924,7 +925,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -944,7 +945,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 36 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -958,9 +959,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -988,7 +992,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -1198,7 +1202,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/CPPProcess.h index 5329710b87..8c84687f8a 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -112,7 +112,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -125,7 +125,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -155,7 +155,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/CudaRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/GpuAbstraction.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/GpuAbstraction.h new file mode 120000 index 0000000000..72054e19ba --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/GpuAbstraction.h @@ -0,0 +1 @@ +../GpuAbstraction.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/GpuRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/GpuRuntime.h new file mode 120000 index 0000000000..3920e83be4 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/GpuRuntime.h @@ -0,0 +1 @@ +../GpuRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/check_sa.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/check_sa.cc index 3fbf0ffbee..7cac5ab47b 100644 --- 
a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/check_sa.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -65,7 +66,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -96,7 +97,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -134,9 +135,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784) + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) +#elif defined __HIPCC__ +#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random #elif defined __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU if build has curand + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #endif @@ -146,10 +149,10 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) 
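// [The recurring change in these hunks replaces the compiler-specific __CUDACC__ guard with
//  MGONGPUCPP_GPUIMPL, a single switch covering both CUDA and HIP device builds. A minimal
//  sketch of how such a macro could be derived in a config header (hypothetical spelling,
//  shown only to make the convention concrete):]
#if defined __CUDACC__ || defined __HIPCC__ // any supported GPU compiler
#define MGONGPUCPP_GPUIMPL // enable the GPU implementation paths
#endif
// [Code that only distinguishes GPU from CPU tests MGONGPUCPP_GPUIMPL; genuinely CUDA-only
//  features, such as the curand device generator above, keep testing __CUDACC__.]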
bool bridge = false; @@ -177,7 +180,7 @@ main( int argc, char** argv ) else if( arg == "--curdev" ) { #ifndef __CUDACC__ - throw std::runtime_error( "CurandDevice is not supported on CPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" ); #elif defined MGONGPU_HAS_NO_CURAND throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" ); #else @@ -198,7 +201,7 @@ main( int argc, char** argv ) } else if( arg == "--rmbdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -272,13 +275,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -296,14 +299,14 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL - // --- 00. Initialise cuda - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - const std::string cdinKey = "00 CudaInit"; + // --- 00. Initialise GPU + // Instantiate a GpuRuntime at the beginning of the application's main. + // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. + const std::string cdinKey = "00 GpuInit"; timermap.start( cdinKey ); - CudaRuntime cudaRuntime( debug ); + GpuRuntime GpuRuntime( debug ); #endif // --- 0a.
Initialise physics process @@ -325,7 +328,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -333,7 +336,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -341,7 +344,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -349,7 +352,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -366,7 +369,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -375,7 +378,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -384,7 +387,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -392,7 +395,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -400,7 +403,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -438,7 +441,7 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! 
CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } @@ -450,7 +453,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -461,7 +464,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -469,7 +472,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -511,7 +514,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -543,7 +546,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -588,7 +591,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -617,7 +620,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -760,18 +763,22 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; -#endif +#endif /* clang-format off */ // -- DOUBLE or FLOAT? #if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) @@ -781,7 +788,7 @@ main( int argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif +#endif /* clang-format on */ // -- CUCOMPLEX or THRUST or STD complex numbers? 
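// [A minimal sketch of the GpuRuntime helper instantiated under "00 GpuInit" above, assuming
//  only the behaviour stated in its comments (cudaSetDevice(0) on construction, cudaDeviceReset()
//  booked for destruction); names are illustrative and error checking is omitted:]
#include <cuda_runtime.h>
struct GpuRuntime
{
  GpuRuntime( bool debug = false ) : m_debug( debug ) { cudaSetDevice( 0 ); } // select device 0 at startup
  ~GpuRuntime() { cudaDeviceReset(); } // release the device cleanly at exit
  const bool m_debug;
};
// [Note on the call site above: 'GpuRuntime GpuRuntime( debug );' is legal but shadows the type
//  name inside main; a lower-case variable, 'GpuRuntime gpuRuntime( debug );', would avoid that.]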
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -793,6 +800,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_CUCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -818,7 +831,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -874,7 +887,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -895,6 +908,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -921,21 +936,21 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? " == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -966,7 +981,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1062,14 +1077,14 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1077,7 +1092,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? 
" == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/CPPProcess.cc index e4f9dee3a2..fc99b3bfae 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -57,7 +56,7 @@ // Process: s c~ > t t~ s c~ WEIGHTED<=4 @2 // Process: s d~ > t t~ s d~ WEIGHTED<=4 @2 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -91,7 +90,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -101,7 +100,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -129,13 +128,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -162,7 +161,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -198,7 +197,7 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity < nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and 
independent couplings @@ -211,8 +210,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -393,7 +394,7 @@ namespace mg5amcCpu { 3, 9, 9, 3, 27, 9 }, { 9, 3, 3, 9, 9, 27 } }; // 2-D array[6][6] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -450,7 +451,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -509,7 +510,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -604,8 +605,8 @@ namespace mg5amcCpu { -1, 1, 1, -1, -1, -1 }, { -1, 1, 1, -1, 1, 1 }, { -1, 1, 1, -1, 1, -1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -647,9 +648,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -687,7 +688,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] 
// [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -752,12 +753,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -778,7 +779,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -904,9 +905,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -930,7 +931,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -950,7 +951,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 36 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -964,9 +965,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -994,7 +998,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -1204,7 +1208,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/CPPProcess.h index 391789dc81..da747c3465 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -118,7 +118,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -131,7 +131,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -161,7 +161,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/CudaRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/GpuAbstraction.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/GpuAbstraction.h new file mode 120000 index 0000000000..72054e19ba --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/GpuAbstraction.h @@ -0,0 +1 @@ +../GpuAbstraction.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/GpuRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/GpuRuntime.h new file mode 120000 index 0000000000..3920e83be4 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/GpuRuntime.h @@ -0,0 +1 @@ +../GpuRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/check_sa.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/check_sa.cc index 3fbf0ffbee..7cac5ab47b 100644 
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/check_sa.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -65,7 +66,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -96,7 +97,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -134,9 +135,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784) + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) +#elif defined __HIPCC__ +#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random #elif defined __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU if build has curand + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #endif @@ -146,10 +149,10 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) 
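// [A sketch of the buffer split used in the allocation hunks: CPU-only builds use plain pageable
//  host buffers, while GPU builds pair each device buffer with a pinned (page-locked) host buffer,
//  which makes host-device copies faster and async-capable. Illustrative types only; the real
//  HostBuffer*/PinnedHostBuffer* classes are defined elsewhere in the plugin:]
#include <cstddef>
#include <cuda_runtime.h>
template<typename T>
struct HostBuffer // pageable host memory
{
  explicit HostBuffer( std::size_t n ) : m_data( new T[n] ) {}
  ~HostBuffer() { delete[] m_data; }
  T* m_data;
};
template<typename T>
struct PinnedHostBuffer // page-locked host memory (CUDA spelling; error checks omitted)
{
  explicit PinnedHostBuffer( std::size_t n ) { cudaMallocHost( (void**)&m_data, n * sizeof( T ) ); }
  ~PinnedHostBuffer() { cudaFreeHost( m_data ); }
  T* m_data;
};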
bool bridge = false; @@ -177,7 +180,7 @@ main( int argc, char** argv ) else if( arg == "--curdev" ) { #ifndef __CUDACC__ - throw std::runtime_error( "CurandDevice is not supported on CPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" ); #elif defined MGONGPU_HAS_NO_CURAND throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" ); #else @@ -198,7 +201,7 @@ main( int argc, char** argv ) } else if( arg == "--rmbdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -272,13 +275,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -296,14 +299,14 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL - // --- 00. Initialise cuda - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - const std::string cdinKey = "00 CudaInit"; + // --- 00. Initialise GPU + // Instantiate a GpuRuntime at the beginning of the application's main. + // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. + const std::string cdinKey = "00 GpuInit"; timermap.start( cdinKey ); - CudaRuntime cudaRuntime( debug ); + GpuRuntime GpuRuntime( debug ); #endif // --- 0a.
Initialise physics process @@ -325,7 +328,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -333,7 +336,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -341,7 +344,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -349,7 +352,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -366,7 +369,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -375,7 +378,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -384,7 +387,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -392,7 +395,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -400,7 +403,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -438,7 +441,7 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! 
CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } @@ -450,7 +453,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -461,7 +464,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -469,7 +472,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -511,7 +514,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -543,7 +546,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -588,7 +591,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -617,7 +620,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -760,18 +763,22 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; -#endif +#endif /* clang-format off */ // -- DOUBLE or FLOAT? #if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) @@ -781,7 +788,7 @@ main( int argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif +#endif /* clang-format on */ // -- CUCOMPLEX or THRUST or STD complex numbers? 
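// [A sketch of the vendor-neutral copy used in the CPPProcess.cc hunks, where
//  checkCuda( cudaMemcpyToSymbol( ... ) ) becomes gpuMemcpyToSymbol( ... ). Assuming the
//  abstraction header wraps the native call together with its error check, it could read
//  (checkHip is an assumed HIP analogue of checkCuda):]
#if defined __CUDACC__
#define gpuMemcpyToSymbol( symbol, src, bytes ) checkCuda( cudaMemcpyToSymbol( symbol, src, bytes ) )
#elif defined __HIPCC__
#define gpuMemcpyToSymbol( symbol, src, bytes ) checkHip( hipMemcpyToSymbol( HIP_SYMBOL( symbol ), src, bytes ) )
#endif
// [Typical call, as in the helicity setup above, filling the __device__ __constant__ table once
//  per process initialisation: gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) );]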
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -793,6 +800,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_CUCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -818,7 +831,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -874,7 +887,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -895,6 +908,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -921,21 +936,21 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? " == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -966,7 +981,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1062,14 +1077,14 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1077,7 +1092,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? 
" == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/CPPProcess.cc index 302d63e31d..97912e5855 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -49,7 +48,7 @@ // Process: d d > t t~ d d WEIGHTED<=4 @2 // Process: s s > t t~ s s WEIGHTED<=4 @2 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +82,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -93,7 +92,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -121,13 +120,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -154,7 +153,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -190,7 +189,7 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity < nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent 
couplings @@ -203,8 +202,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -497,7 +498,7 @@ namespace mg5amcCpu { 3, 9, 9, 3, 27, 9 }, { 9, 3, 3, 9, 9, 27 } }; // 2-D array[6][6] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -554,7 +555,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -613,7 +614,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -708,8 +709,8 @@ namespace mg5amcCpu { -1, -1, 1, -1, -1, 1 }, { -1, -1, 1, -1, 1, -1 }, { -1, -1, 1, -1, 1, 1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -751,9 +752,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -791,7 +792,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] 
// [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -856,12 +857,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -882,7 +883,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -1008,9 +1009,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -1034,7 +1035,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -1054,7 +1055,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 72 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -1068,9 +1069,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -1098,7 +1102,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -1308,7 +1312,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/CPPProcess.h index 2d95f4b170..d8232ea652 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -110,7 +110,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -123,7 +123,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -153,7 +153,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/CudaRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/GpuAbstraction.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/GpuAbstraction.h new file mode 120000 index 0000000000..72054e19ba --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/GpuAbstraction.h @@ -0,0 +1 @@ +../GpuAbstraction.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/GpuRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/GpuRuntime.h new file mode 120000 index 0000000000..3920e83be4 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/GpuRuntime.h @@ -0,0 +1 @@ +../GpuRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/check_sa.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/check_sa.cc index 3fbf0ffbee..7cac5ab47b 100644 --- 
a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/check_sa.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -65,7 +66,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -96,7 +97,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -134,9 +135,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784) + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) +#elif defined __HIPCC__ +#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random #elif defined __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU if build has curand + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #endif @@ -146,10 +149,10 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) 
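The GpuInit hunk just below swaps the CudaRuntime object for a GpuRuntime instantiated at the top of main. The new GpuRuntime.h is not shown in this excerpt; this is a minimal RAII sketch of the behaviour its comments describe, where gpuSetDevice and gpuDeviceReset are assumed GpuAbstraction.h aliases for cudaSetDevice/cudaDeviceReset (or their HIP equivalents) and the handling of the verbose flag is a guess:

#include <iostream>
struct GpuRuntime
{
  GpuRuntime( bool verbose = false )
    : m_verbose( verbose )
  {
    gpuSetDevice( 0 ); // bind this host thread to device 0 on construction
    if( m_verbose ) std::cout << "GpuRuntime: device 0 initialised" << std::endl;
  }
  ~GpuRuntime()
  {
    gpuDeviceReset(); // tear down the device context when main returns
  }
  bool m_verbose;
};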
bool bridge = false; @@ -177,7 +180,7 @@ main( int argc, char** argv ) else if( arg == "--curdev" ) { #ifndef __CUDACC__ - throw std::runtime_error( "CurandDevice is not supported on CPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" ); #elif defined MGONGPU_HAS_NO_CURAND throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" ); #else @@ -198,7 +201,7 @@ main( int argc, char** argv ) } else if( arg == "--rmbdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -272,13 +275,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -296,14 +299,14 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL - // --- 00. Initialise cuda - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - const std::string cdinKey = "00 CudaInit"; + // --- 00. Initialise GPU + // Instantiate a GpuRuntime at the beginning of the application's main. + // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. + const std::string cdinKey = "00 GpuInit"; timermap.start( cdinKey ); - CudaRuntime cudaRuntime( debug ); + GpuRuntime gpuRuntime( debug ); #endif // --- 0a.
Initialise physics process @@ -325,7 +328,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -333,7 +336,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -341,7 +344,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -349,7 +352,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -366,7 +369,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -375,7 +378,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -384,7 +387,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -392,7 +395,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -400,7 +403,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -438,7 +441,7 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! 
CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } @@ -450,7 +453,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -461,7 +464,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -469,7 +472,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -511,7 +514,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -543,7 +546,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -588,7 +591,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -617,7 +620,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -760,18 +763,22 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; -#endif +#endif /* clang-format off */ // -- DOUBLE or FLOAT? #if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) @@ -781,7 +788,7 @@ main( int argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif +#endif /* clang-format on */ // -- CUCOMPLEX or THRUST or STD complex numbers? 
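The workflow tag assembled below distinguishes three complex-number backends. As a rough sketch of the compile-time typedefs behind those tags (the real definitions live in mgOnGpuCxtypes.h, outside this excerpt, and the exact macro combinations shown here are assumptions):

#if defined __CUDACC__ && defined MGONGPU_CUCXTYPE_THRUST
#include <thrust/complex.h>
typedef thrust::complex<fptype> cxtype; // CUDA: thrust complex
#elif defined __CUDACC__ && defined MGONGPU_CUCXTYPE_CUCOMPLEX
#include <cuComplex.h>
typedef cuDoubleComplex cxtype; // CUDA: cuComplex (double precision shown)
#elif defined __HIPCC__ && defined MGONGPU_CUCXTYPE_CXSMPL
typedef mgOnGpu::cxsmpl<fptype> cxtype; // HIP: in-house complex ('CXS:' tag)
#else
#include <complex>
typedef std::complex<fptype> cxtype; // C++: standard complex ('STX:' tag)
#endif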
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -793,6 +800,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_CUCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -818,7 +831,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -874,7 +887,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -895,6 +908,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -921,21 +936,21 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? " == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -966,7 +981,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1062,14 +1077,14 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1077,7 +1092,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? 
" == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/CPPProcess.cc index d0be5131af..be2315b035 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -57,7 +56,7 @@ // Process: s s~ > t t~ c c~ WEIGHTED<=4 @2 // Process: s s~ > t t~ d d~ WEIGHTED<=4 @2 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -91,7 +90,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -101,7 +100,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -129,13 +128,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -162,7 +161,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -198,7 +197,7 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity < nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and 
independent couplings @@ -211,8 +210,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -393,7 +394,7 @@ namespace mg5amcCpu { 3, 9, 9, 3, 27, 9 }, { 9, 3, 3, 9, 9, 27 } }; // 2-D array[6][6] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -450,7 +451,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -509,7 +510,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -604,8 +605,8 @@ namespace mg5amcCpu { -1, 1, 1, -1, -1, -1 }, { -1, 1, 1, -1, 1, 1 }, { -1, 1, 1, -1, 1, -1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -647,9 +648,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -687,7 +688,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] 
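Concretely, the distinction drawn in these bracketed notes is between the nvcc driver and CUDA compilation proper; a compile-time probe of the three macro regimes, as a sketch (the #pragma message lines are illustrative only):

// nvcc -c file.cc       -> __NVCC__ defined, __CUDACC__ not defined
// nvcc -x cu -c file.cu -> __NVCC__ and __CUDACC__ both defined
// hipcc -c file.cc      -> __HIPCC__ defined instead
#if defined __CUDACC__
#pragma message "compiled as CUDA by nvcc"
#elif defined __NVCC__
#pragma message "nvcc driver, but compiled as plain C++"
#elif defined __HIPCC__
#pragma message "compiled by hipcc"
#endif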
// [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -752,12 +753,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -778,7 +779,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -904,9 +905,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -930,7 +931,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -950,7 +951,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 36 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -964,9 +965,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -994,7 +998,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -1204,7 +1208,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/CPPProcess.h index 14490d782f..71fdc6e547 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -118,7 +118,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -131,7 +131,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -161,7 +161,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/CudaRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/GpuAbstraction.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/GpuAbstraction.h new file mode 120000 index 0000000000..72054e19ba --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/GpuAbstraction.h @@ -0,0 +1 @@ +../GpuAbstraction.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/GpuRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/GpuRuntime.h new file mode 120000 index 0000000000..3920e83be4 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/GpuRuntime.h @@ -0,0 +1 @@ +../GpuRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/check_sa.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/check_sa.cc index 3fbf0ffbee..7cac5ab47b 100644 
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/check_sa.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -65,7 +66,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -96,7 +97,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -134,9 +135,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784) + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) +#elif defined __HIPCC__ +#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random #elif defined __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU if build has curand + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #endif @@ -146,10 +149,10 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) 
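The new #error guard in the rndgen hunk above relies on the build configuration forcing curand off whenever hipcc compiles this code. A sketch of the kind of guard that assumption implies (the real logic would belong in mgOnGpuConfig.h, which this excerpt does not show):

#if defined __HIPCC__ && !defined MGONGPU_HAS_NO_CURAND
#define MGONGPU_HAS_NO_CURAND 1 // curand is a CUDA library: not available on AMD GPUs
#endif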
bool bridge = false; @@ -177,7 +180,7 @@ main( int argc, char** argv ) else if( arg == "--curdev" ) { #ifndef __CUDACC__ - throw std::runtime_error( "CurandDevice is not supported on CPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" ); #elif defined MGONGPU_HAS_NO_CURAND throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" ); #else @@ -198,7 +201,7 @@ main( int argc, char** argv ) } else if( arg == "--rmbdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -272,13 +275,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -296,14 +299,14 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL - // --- 00. Initialise cuda - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - const std::string cdinKey = "00 CudaInit"; + // --- 00. Initialise GPU + // Instantiate a GpuRuntime at the beginning of the application's main. + // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. + const std::string cdinKey = "00 GpuInit"; timermap.start( cdinKey ); - CudaRuntime cudaRuntime( debug ); + GpuRuntime gpuRuntime( debug ); #endif // --- 0a.
Initialise physics process @@ -325,7 +328,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -333,7 +336,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -341,7 +344,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -349,7 +352,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -366,7 +369,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -375,7 +378,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -384,7 +387,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -392,7 +395,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -400,7 +403,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -438,7 +441,7 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! 
CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } @@ -450,7 +453,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -461,7 +464,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -469,7 +472,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -511,7 +514,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -543,7 +546,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -588,7 +591,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -617,7 +620,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -760,18 +763,22 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; -#endif +#endif /* clang-format off */ // -- DOUBLE or FLOAT? #if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) @@ -781,7 +788,7 @@ main( int argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif +#endif /* clang-format on */ // -- CUCOMPLEX or THRUST or STD complex numbers? 
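The CXS: branch added below reports the in-house cxsmpl complex type, the only option on HIP where neither cucomplex nor thrust is available. A minimal sketch of what such a type looks like (the real mgOnGpu::cxsmpl in mgOnGpuCxtypes.h is more complete):

template<typename FP>
struct cxsmpl
{
  FP r, i; // real and imaginary parts
  constexpr cxsmpl( FP r_ = 0, FP i_ = 0 ) : r( r_ ), i( i_ ) {}
  constexpr FP real() const { return r; }
  constexpr FP imag() const { return i; }
};
// complex multiplication, usable in both host and device code
template<typename FP>
constexpr cxsmpl<FP> operator*( const cxsmpl<FP>& a, const cxsmpl<FP>& b )
{
  return cxsmpl<FP>( a.r * b.r - a.i * b.i, a.r * b.i + a.i * b.r );
}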
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -793,6 +800,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_CUCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -818,7 +831,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -874,7 +887,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -895,6 +908,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -921,21 +936,21 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? " == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -966,7 +981,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1062,14 +1077,14 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1077,7 +1092,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? 
" == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/CPPProcess.cc index 3a2178d534..c83b7be449 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -49,7 +48,7 @@ // Process: d d~ > t t~ g g WEIGHTED<=4 @2 // Process: s s~ > t t~ g g WEIGHTED<=4 @2 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +82,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -93,7 +92,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -121,13 +120,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -154,7 +153,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -190,7 +189,7 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity < nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent 
couplings @@ -203,8 +202,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -812,7 +813,7 @@ namespace mg5amcCpu { 16, -2, 0, 0, 0, 0, -2, 16, 16, 6, 48, 16 }, { 0, 0, 16, -2, -2, 16, 0, 0, 6, 16, 16, 48 } }; // 2-D array[12][12] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -869,7 +870,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -928,7 +929,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -1023,8 +1024,8 @@ namespace mg5amcCpu { -1, 1, 1, -1, -1, 1 }, { -1, 1, 1, -1, 1, -1 }, { -1, 1, 1, -1, 1, 1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -1066,9 +1067,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -1106,7 +1107,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] 
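The cHel, cIPD and cGoodHel hunks in this file all reduce to one recurring constant-memory pattern: a __device__ __constant__ symbol on GPU, a file-scope static as its C++ emulation, and a single copy helper hiding cudaMemcpyToSymbol/hipMemcpyToSymbol/memcpy. Shown here in isolation as a sketch (cExample and setExample are hypothetical names):

#include <cstring>
#ifdef MGONGPUCPP_GPUIMPL
__device__ __constant__ fptype cExample[2]; // GPU: true constant memory
#else
static fptype cExample[2]; // C++: file-scope static emulation of constant memory
#endif
inline void setExample( const fptype* tExample )
{
#ifdef MGONGPUCPP_GPUIMPL
  gpuMemcpyToSymbol( cExample, tExample, 2 * sizeof( fptype ) ); // CUDA or HIP copy
#else
  memcpy( cExample, tExample, 2 * sizeof( fptype ) ); // plain host copy
#endif
}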
// [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -1171,12 +1172,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -1197,7 +1198,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -1323,9 +1324,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -1349,7 +1350,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -1369,7 +1370,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 72 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -1383,9 +1384,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -1413,7 +1417,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -1623,7 +1627,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/CPPProcess.h index 1543c29649..e9a24f516d 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -110,7 +110,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -123,7 +123,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -153,7 +153,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/CudaRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/GpuAbstraction.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/GpuAbstraction.h new file mode 120000 index 0000000000..72054e19ba --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/GpuAbstraction.h @@ -0,0 +1 @@ +../GpuAbstraction.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/GpuRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/GpuRuntime.h new file mode 120000 index 0000000000..3920e83be4 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/GpuRuntime.h @@ -0,0 +1 @@ +../GpuRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/check_sa.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/check_sa.cc index 3fbf0ffbee..7cac5ab47b 100644 --- 
a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/check_sa.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -65,7 +66,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -96,7 +97,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -134,9 +135,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784) + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) +#elif defined __HIPCC__ +#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random #elif defined __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU if build has curand + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #endif @@ -146,10 +149,10 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) 
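//--------------------------------------------------------------------------
// A minimal sketch (an illustration, not a verbatim quote of this patch) of
// how the MGONGPUCPP_GPUIMPL switch used throughout these hunks could be
// defined, presumably in mgOnGpuConfig.h: a single macro flags "any GPU
// backend" (CUDA or HIP), while __CUDACC__/__HIPCC__ still distinguish the
// two backends where it matters, e.g. for curand, which is CUDA-only (hence
// the #error guard above).
#if defined __CUDACC__ || defined __HIPCC__
#define MGONGPUCPP_GPUIMPL // GPU build (CUDA or HIP): mg5amcGpu namespace, device buffers and kernels
#endif
#if defined __HIPCC__ && !defined MGONGPU_HAS_NO_CURAND
#define MGONGPU_HAS_NO_CURAND // curand does not exist on AMD GPUs: fall back to CommonRandom
#endif
//--------------------------------------------------------------------------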
bool bridge = false; @@ -177,7 +180,7 @@ main( int argc, char** argv ) else if( arg == "--curdev" ) { #ifndef __CUDACC__ - throw std::runtime_error( "CurandDevice is not supported on CPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" ); #elif defined MGONGPU_HAS_NO_CURAND throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" ); #else @@ -198,7 +201,7 @@ main( int argc, char** argv ) } else if( arg == "--rmbdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -272,13 +275,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -296,14 +299,14 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL - // --- 00. Initialise cuda - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - const std::string cdinKey = "00 CudaInit"; + // --- 00. Initialise GPU + // Instantiate a GpuRuntime at the beginning of the application's main. + // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. + const std::string cdinKey = "00 GpuInit"; timermap.start( cdinKey ); - CudaRuntime cudaRuntime( debug ); + GpuRuntime gpuRuntime( debug ); #endif // --- 0a. 
Initialise physics process @@ -325,7 +328,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -333,7 +336,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -341,7 +344,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -349,7 +352,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -366,7 +369,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -375,7 +378,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -384,7 +387,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -392,7 +395,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -400,7 +403,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -438,7 +441,7 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! 
CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } @@ -450,7 +453,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -461,7 +464,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -469,7 +472,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -511,7 +514,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -543,7 +546,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -588,7 +591,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -617,7 +620,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -760,18 +763,22 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; -#endif +#endif /* clang-format off */ // -- DOUBLE or FLOAT? #if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) @@ -781,7 +788,7 @@ main( int argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif +#endif /* clang-format on */ // -- CUCOMPLEX or THRUST or STD complex numbers? 
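//--------------------------------------------------------------------------
// For context on step 00 above: a sketch of the GpuRuntime RAII helper that
// replaces CudaRuntime (an illustration of the intended shape only, assuming
// the new GpuRuntime.h mirrors the old class via hypothetical gpuSetDevice /
// gpuDeviceReset / checkGpu wrappers that map to the CUDA or HIP runtime API).
struct GpuRuntimeSketch final
{
  GpuRuntimeSketch( const bool debug = true )
    : m_debug( debug ) { setUp( m_debug ); }
  ~GpuRuntimeSketch() { tearDown( m_debug ); }
  static void setUp( const bool debug = true )
  {
    if( debug ) std::cout << "__GpuRuntime: calling gpuSetDevice(0)" << std::endl;
    checkGpu( gpuSetDevice( 0 ) ); // establish the device context early, not lazily at the first kernel launch
  }
  static void tearDown( const bool debug = true )
  {
    if( debug ) std::cout << "__GpuRuntime: calling gpuDeviceReset()" << std::endl;
    checkGpu( gpuDeviceReset() ); // needed for clean memcheck/profiler output at application exit
  }
  const bool m_debug;
};
//--------------------------------------------------------------------------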
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -793,6 +800,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_CUCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -818,7 +831,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -874,7 +887,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -895,6 +908,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -921,21 +936,21 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? " == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -966,7 +981,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1062,14 +1077,14 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1077,7 +1092,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? 
" == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/CPPProcess.cc index 70fbbee59f..3ecdb48914 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -49,7 +48,7 @@ // Process: d d~ > t t~ d d~ WEIGHTED<=4 @2 // Process: s s~ > t t~ s s~ WEIGHTED<=4 @2 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +82,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -93,7 +92,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -121,13 +120,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -154,7 +153,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -190,7 +189,7 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity < nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and 
independent couplings @@ -203,8 +202,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -497,7 +498,7 @@ namespace mg5amcCpu { 3, 9, 9, 3, 27, 9 }, { 9, 3, 3, 9, 9, 27 } }; // 2-D array[6][6] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -554,7 +555,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -613,7 +614,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -708,8 +709,8 @@ namespace mg5amcCpu { -1, 1, 1, -1, -1, -1 }, { -1, 1, 1, -1, 1, 1 }, { -1, 1, 1, -1, 1, -1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -751,9 +752,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -791,7 +792,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] 
// [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -856,12 +857,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -882,7 +883,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -1008,9 +1009,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -1034,7 +1035,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -1054,7 +1055,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 36 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -1068,9 +1069,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -1098,7 +1102,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -1308,7 +1312,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/CPPProcess.h index 58cece5c62..d8d3d481ea 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -110,7 +110,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -123,7 +123,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -153,7 +153,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/CudaRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/GpuAbstraction.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/GpuAbstraction.h new file mode 120000 index 0000000000..72054e19ba --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/GpuAbstraction.h @@ -0,0 +1 @@ +../GpuAbstraction.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/GpuRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/GpuRuntime.h new file mode 120000 index 0000000000..3920e83be4 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/GpuRuntime.h @@ -0,0 +1 @@ +../GpuRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/check_sa.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/check_sa.cc index 3fbf0ffbee..7cac5ab47b 100644 
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/check_sa.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -65,7 +66,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -96,7 +97,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -134,9 +135,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784) + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) +#elif defined __HIPCC__ +#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random #elif defined __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU if build has curand + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #endif @@ -146,10 +149,10 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) 
bool bridge = false; @@ -177,7 +180,7 @@ main( int argc, char** argv ) else if( arg == "--curdev" ) { #ifndef __CUDACC__ - throw std::runtime_error( "CurandDevice is not supported on CPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" ); #elif defined MGONGPU_HAS_NO_CURAND throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" ); #else @@ -198,7 +201,7 @@ main( int argc, char** argv ) } else if( arg == "--rmbdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -272,13 +275,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -296,14 +299,14 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL - // --- 00. Initialise cuda - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - const std::string cdinKey = "00 CudaInit"; + // --- 00. Initialise GPU + // Instantiate a GpuRuntime at the beginning of the application's main. + // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. + const std::string cdinKey = "00 GpuInit"; timermap.start( cdinKey ); - CudaRuntime cudaRuntime( debug ); + GpuRuntime gpuRuntime( debug ); #endif // --- 0a. 
Initialise physics process @@ -325,7 +328,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -333,7 +336,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -341,7 +344,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -349,7 +352,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -366,7 +369,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -375,7 +378,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -384,7 +387,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -392,7 +395,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -400,7 +403,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -438,7 +441,7 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! 
CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } @@ -450,7 +453,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -461,7 +464,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -469,7 +472,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -511,7 +514,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -543,7 +546,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -588,7 +591,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -617,7 +620,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -760,18 +763,22 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; -#endif +#endif /* clang-format off */ // -- DOUBLE or FLOAT? #if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) @@ -781,7 +788,7 @@ main( int argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif +#endif /* clang-format on */ // -- CUCOMPLEX or THRUST or STD complex numbers? 
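//--------------------------------------------------------------------------
// Context for the PinnedHostBuffer hunks above: on GPU builds the host-side
// staging buffers are page-locked ("pinned") so that host<->device copies can
// run as true asynchronous DMA transfers. A sketch of the underlying
// allocation switch, using a hypothetical gpuMallocHost wrapper (cudaMallocHost
// on CUDA, hipHostMalloc on HIP) that GpuAbstraction.h would be expected to
// provide; checkGpu is again an assumed error-checking helper.
inline fptype* newHostBuffer( const int nevt )
{
#ifndef MGONGPUCPP_GPUIMPL
  return new fptype[nevt]; // plain pageable memory is enough without a device
#else
  fptype* buffer = nullptr;
  checkGpu( gpuMallocHost( (void**)&buffer, nevt * sizeof( fptype ) ) ); // pinned host memory
  return buffer;
#endif
}
//--------------------------------------------------------------------------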
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -793,6 +800,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_CUCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -818,7 +831,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -874,7 +887,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -895,6 +908,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -921,21 +936,21 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? " == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -966,7 +981,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1062,14 +1077,14 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1077,7 +1092,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? 
" == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/CPPProcess.cc index 7df13a2341..e21d1f0c48 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -51,7 +50,7 @@ // Process: c~ s~ > t t~ c~ s~ WEIGHTED<=4 @2 // Process: d~ s~ > t t~ d~ s~ WEIGHTED<=4 @2 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -85,7 +84,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -95,7 +94,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -123,13 +122,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -156,7 +155,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -192,7 +191,7 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity < nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both 
dependent and independent couplings @@ -205,8 +204,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -387,7 +388,7 @@ namespace mg5amcCpu { 3, 9, 9, 3, 27, 9 }, { 9, 3, 3, 9, 9, 27 } }; // 2-D array[6][6] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -444,7 +445,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -503,7 +504,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -598,8 +599,8 @@ namespace mg5amcCpu { 1, 1, 1, -1, 1, -1 }, { 1, 1, 1, -1, -1, 1 }, { 1, 1, 1, -1, -1, -1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -641,9 +642,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -681,7 +682,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] 
// [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -746,12 +747,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -772,7 +773,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -898,9 +899,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -924,7 +925,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -944,7 +945,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 36 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -958,9 +959,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -988,7 +992,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -1198,7 +1202,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/CPPProcess.h index 6bd3135c3c..901c6dfcc9 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -112,7 +112,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -125,7 +125,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -155,7 +155,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/CudaRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/GpuAbstraction.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/GpuAbstraction.h new file mode 120000 index 0000000000..72054e19ba --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/GpuAbstraction.h @@ -0,0 +1 @@ +../GpuAbstraction.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/GpuRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/GpuRuntime.h new file mode 120000 index 0000000000..3920e83be4 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/GpuRuntime.h @@ -0,0 +1 @@ +../GpuRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/check_sa.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/check_sa.cc index 
3fbf0ffbee..7cac5ab47b 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/check_sa.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -65,7 +66,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -96,7 +97,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -134,9 +135,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784) + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) +#elif defined __HIPCC__ +#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random #elif defined __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU if build has curand + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #endif @@ -146,10 +149,10 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) 
bool bridge = false; @@ -177,7 +180,7 @@ main( int argc, char** argv ) else if( arg == "--curdev" ) { #ifndef __CUDACC__ - throw std::runtime_error( "CurandDevice is not supported on CPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" ); #elif defined MGONGPU_HAS_NO_CURAND throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" ); #else @@ -198,7 +201,7 @@ main( int argc, char** argv ) } else if( arg == "--rmbdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -272,13 +275,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -296,14 +299,14 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL - // --- 00. Initialise cuda - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - const std::string cdinKey = "00 CudaInit"; + // --- 00. Initialise GPU + // Instantiate a GpuRuntime at the beginning of the application's main. + // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. + const std::string cdinKey = "00 GpuInit"; timermap.start( cdinKey ); - CudaRuntime cudaRuntime( debug ); + GpuRuntime gpuRuntime( debug ); #endif // --- 0a.
Initialise physics process @@ -325,7 +328,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -333,7 +336,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -341,7 +344,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -349,7 +352,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -366,7 +369,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -375,7 +378,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -384,7 +387,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -392,7 +395,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -400,7 +403,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -438,7 +441,7 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! 
CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } @@ -450,7 +453,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -461,7 +464,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -469,7 +472,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -511,7 +514,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -543,7 +546,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -588,7 +591,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -617,7 +620,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -760,18 +763,22 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; -#endif +#endif /* clang-format off */ // -- DOUBLE or FLOAT? #if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) @@ -781,7 +788,7 @@ main( int argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif +#endif /* clang-format on */ // -- CUCOMPLEX or THRUST or STD complex numbers? 
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -793,6 +800,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_CUCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -818,7 +831,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -874,7 +887,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -895,6 +908,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -921,21 +936,21 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? " == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -966,7 +981,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1062,14 +1077,14 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1077,7 +1092,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? 
" == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/CPPProcess.cc index f464c27160..527b1d3c8f 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -49,7 +48,7 @@ // Process: d~ d~ > t t~ d~ d~ WEIGHTED<=4 @2 // Process: s~ s~ > t t~ s~ s~ WEIGHTED<=4 @2 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +82,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -93,7 +92,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -121,13 +120,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -154,7 +153,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -190,7 +189,7 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity < nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both 
dependent and independent couplings @@ -203,8 +202,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -497,7 +498,7 @@ namespace mg5amcCpu { 3, 9, 9, 3, 27, 9 }, { 9, 3, 3, 9, 9, 27 } }; // 2-D array[6][6] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -554,7 +555,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -613,7 +614,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -708,8 +709,8 @@ namespace mg5amcCpu { 1, 1, 1, -1, 1, -1 }, { 1, 1, 1, -1, -1, 1 }, { 1, 1, 1, -1, -1, -1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -751,9 +752,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -791,7 +792,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] 
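Throughout the hunks above, direct checkCuda( cudaMemcpyToSymbol( ... ) ) calls become vendor-neutral gpuMemcpyToSymbol( ... ) calls, and __CUDACC__ compile guards become MGONGPUCPP_GPUIMPL. The new GpuAbstraction.h header (added only as a symlink in this excerpt, so its body is not shown) presumably provides that mapping layer. A minimal sketch of what such a layer could look like; the checkGpu/assertGpu names and the HIP_SYMBOL spelling are illustrative assumptions, not quotations from the header:

// GpuAbstraction.h sketch (illustrative only): one spelling per runtime call,
// resolved at preprocessing time to the CUDA or HIP back-end.
#include <cassert>
#include <cstdio>
#if defined __CUDACC__ // nvcc: map gpu* onto the CUDA runtime
#include <cuda_runtime.h>
#define MGONGPUCPP_GPUIMPL
#define gpuError_t cudaError_t
#define gpuSuccess cudaSuccess
#define gpuGetErrorString cudaGetErrorString
#define gpuMemcpyToSymbol( symbol, src, count ) checkGpu( cudaMemcpyToSymbol( symbol, src, count ) )
#elif defined __HIPCC__ // hipcc: map gpu* onto the HIP runtime
#include <hip/hip_runtime.h>
#define MGONGPUCPP_GPUIMPL
#define gpuError_t hipError_t
#define gpuSuccess hipSuccess
#define gpuGetErrorString hipGetErrorString
#define gpuMemcpyToSymbol( symbol, src, count ) checkGpu( hipMemcpyToSymbol( HIP_SYMBOL( symbol ), src, count ) )
#endif
#ifdef MGONGPUCPP_GPUIMPL
// Fold the old checkCuda-style error check into the single gpu* spelling,
// so call sites like gpuMemcpyToSymbol( cHel, tHel, ... ) stay one-liners.
#define checkGpu( code ) assertGpu( code, __FILE__, __LINE__ )
inline void assertGpu( gpuError_t code, const char* file, int line )
{
  if( code != gpuSuccess )
  {
    printf( "GPUassert: %s %s:%d\n", gpuGetErrorString( code ), file, line );
    assert( code == gpuSuccess );
  }
}
#endif

With a layer of this shape in place, the per-file changes above become purely mechanical renames, which is why the same hunks repeat identically across the P2_* subprocess directories.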
// [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -856,12 +857,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -882,7 +883,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -1008,9 +1009,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -1034,7 +1035,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -1054,7 +1055,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 72 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -1068,9 +1069,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -1098,7 +1102,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -1308,7 +1312,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef 
MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/CPPProcess.h index 4e53fa1250..c2ca443c0e 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -110,7 +110,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -123,7 +123,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -153,7 +153,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/CudaRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/GpuAbstraction.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/GpuAbstraction.h new file mode 120000 index 0000000000..72054e19ba --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/GpuAbstraction.h @@ -0,0 +1 @@ +../GpuAbstraction.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/GpuRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/GpuRuntime.h new file mode 120000 index 0000000000..3920e83be4 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/GpuRuntime.h @@ -0,0 +1 @@ +../GpuRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/check_sa.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/check_sa.cc index 
3fbf0ffbee..7cac5ab47b 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/check_sa.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -65,7 +66,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -96,7 +97,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -134,9 +135,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784) + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) +#elif defined __HIPCC__ +#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random #elif defined __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU if build has curand + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #endif @@ -146,10 +149,10 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) 
bool bridge = false; @@ -177,7 +180,7 @@ main( int argc, char** argv ) else if( arg == "--curdev" ) { #ifndef __CUDACC__ - throw std::runtime_error( "CurandDevice is not supported on CPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" ); #elif defined MGONGPU_HAS_NO_CURAND throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" ); #else @@ -198,7 +201,7 @@ main( int argc, char** argv ) } else if( arg == "--rmbdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -272,13 +275,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -296,14 +299,14 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL - // --- 00. Initialise cuda - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - const std::string cdinKey = "00 CudaInit"; + // --- 00. Initialise GPU + // Instantiate a GpuRuntime at the beginning of the application's main. + // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. + const std::string cdinKey = "00 GpuInit"; timermap.start( cdinKey ); - CudaRuntime cudaRuntime( debug ); + GpuRuntime gpuRuntime( debug ); #endif // --- 0a.
Initialise physics process @@ -325,7 +328,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -333,7 +336,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -341,7 +344,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -349,7 +352,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -366,7 +369,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -375,7 +378,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -384,7 +387,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -392,7 +395,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -400,7 +403,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -438,7 +441,7 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! 
CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } @@ -450,7 +453,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -461,7 +464,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -469,7 +472,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -511,7 +514,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -543,7 +546,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -588,7 +591,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -617,7 +620,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -760,18 +763,22 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; -#endif +#endif /* clang-format off */ // -- DOUBLE or FLOAT? #if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) @@ -781,7 +788,7 @@ main( int argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif +#endif /* clang-format on */ // -- CUCOMPLEX or THRUST or STD complex numbers? 
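Both copies of check_sa.cc above now open main() by instantiating a GpuRuntime in place of the old CudaRuntime. The real GpuRuntime.h appears in this excerpt only as a symlink; a minimal sketch of the RAII behaviour described by the comments above (select device 0 on construction, book a device reset in the destructor), where gpuSetDevice/gpuDeviceReset are assumed abstraction spellings of cudaSetDevice/hipSetDevice and cudaDeviceReset/hipDeviceReset, and checkGpu is the assumed error-check helper from the GpuAbstraction sketch earlier:

// GpuRuntime sketch (illustrative only): construction and destruction bracket
// the lifetime of the GPU context used by the application's main program.
#include <cstdio>
struct GpuRuntime final
{
  GpuRuntime( const bool debug = true )
    : m_debug( debug )
  {
    if( m_debug ) printf( "__GpuRuntime: calling gpuSetDevice(0)\n" );
    checkGpu( gpuSetDevice( 0 ) ); // pay the context-creation cost up front, not in the first kernel launch
  }
  ~GpuRuntime()
  {
    if( m_debug ) printf( "__GpuRuntime: calling gpuDeviceReset()\n" );
    checkGpu( gpuDeviceReset() ); // flush device buffers before the process exits
  }
  GpuRuntime( const GpuRuntime& ) = delete;
  GpuRuntime& operator=( const GpuRuntime& ) = delete;
  const bool m_debug;
};

Usage then matches the hunks above: a single GpuRuntime gpuRuntime( debug ); at the top of main, with no further calls needed at shutdown.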
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -793,6 +800,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_CUCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -818,7 +831,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -874,7 +887,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -895,6 +908,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -921,21 +936,21 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? " == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -966,7 +981,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1062,14 +1077,14 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1077,7 +1092,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? 
" == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/RamboSamplingKernels.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/RamboSamplingKernels.cc index da68aa9255..79abbcc4f8 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/RamboSamplingKernels.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/RamboSamplingKernels.cc @@ -1,11 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "RamboSamplingKernels.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessRandomNumbers.h" #include "MemoryAccessWeights.h" @@ -14,7 +14,7 @@ #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -92,7 +92,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingKernelDevice::RamboSamplingKernelDevice( const fptype energy, // input: energy const BufferRndNumMomenta& rndmom, // input: random numbers in [0,1] BufferMomenta& momenta, // output: momenta @@ -135,7 +135,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaInitialDevice( const fptype energy, fptype* momenta ) @@ -147,17 +147,17 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaInitial() { - getMomentaInitialDevice<<>>( m_energy, m_momenta.data() ); + gpuLaunchKernel( getMomentaInitialDevice, m_gpublocks, m_gputhreads, m_energy, m_momenta.data() ); } #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaFinalDevice( const fptype energy, const fptype* rndmom, @@ -171,11 +171,11 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaFinal() { - getMomentaFinalDevice<<>>( m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); + gpuLaunchKernel( getMomentaFinalDevice, m_gpublocks, m_gputhreads, m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); } #endif diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/RamboSamplingKernels.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/RamboSamplingKernels.h index 184089efd7..7c214cd74b 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/RamboSamplingKernels.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/RamboSamplingKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. 
Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef RAMBOSAMPLINGKERNELS_H #define RAMBOSAMPLINGKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating RAMBO phase space sampling on a GPU device class RamboSamplingKernelDevice final : public SamplingKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/RandomNumberKernels.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/RandomNumberKernels.h index 188a72c2c9..21d63beeac 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/RandomNumberKernels.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/RandomNumberKernels.h @@ -1,14 +1,14 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef RANDOMNUMBERKERNELS_H #define RANDOMNUMBERKERNELS_H 1 #include "mgOnGpuConfig.h" -// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when __CUDACC__ is not defined +// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when MGONGPUCPP_GPUIMPL is not defined #ifndef MGONGPU_HAS_NO_CURAND //#include "curand.h" struct curandGenerator_st; // forward definition from curand.h @@ -16,7 +16,7 @@ struct curandGenerator_st; // forward definition from curand.h #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/cudacpp.mk index 509307506b..f2cfa349da 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ # Copyright (C) 2020-2023 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +# Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts @@ -42,10 +42,10 @@ endif #------------------------------------------------------------------------------- -#=== Configure common compiler flags for C++ and CUDA +#=== Configure common compiler flags for C++ and CUDA/HIP INCFLAGS = -I. 
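In RamboSamplingKernels.cc above, the CUDA-only triple-chevron launches, kernel<<<blocks, threads>>>( args... ), become gpuLaunchKernel( kernel, blocks, threads, args... ). One plausible implementation in the abstraction header, sketched here as a variadic macro (the macro form and the HIP zero-shared-memory/default-stream arguments are assumptions):

// gpuLaunchKernel sketch (illustrative only): hide the launch syntax, which is
// the one construct that differs textually between the CUDA and HIP back-ends.
#if defined __CUDACC__
#define gpuLaunchKernel( kernel, blocks, threads, ... ) \
  kernel<<<( blocks ), ( threads )>>>( __VA_ARGS__ )
#elif defined __HIPCC__
#define gpuLaunchKernel( kernel, blocks, threads, ... ) \
  hipLaunchKernelGGL( kernel, dim3( blocks ), dim3( threads ), 0, 0, __VA_ARGS__ ) // 0 bytes shared mem, default stream
#endif

The call sites in the diff, e.g. gpuLaunchKernel( getMomentaFinalDevice, m_gpublocks, m_gputhreads, m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ), then expand back, in the CUDA case, to exactly the launches they replaced.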
-OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here +OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here # Dependency on src directory MG5AMC_COMMONLIB = mg5amc_common @@ -121,24 +121,46 @@ endif #------------------------------------------------------------------------------- -#=== Configure the CUDA compiler +#=== Configure the GPU compiler (CUDA or HIP) -# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) -# This is because it is impossible to pass this to "CUFLAGS += -ccbin " below +# FIXME! (AV 24.01.2024) +# In the current implementation (without separate builds for C++ and CUDA/HIP), we first check for nvcc and hipcc in CUDA_HOME and HIP_HOME. +# If CUDA_HOME or HIP_HOME are not set, try to determine them from the paths to nvcc and hipcc. +# While convoluted, this is currently necessary to allow disabling CUDA/HIP builds by setting CUDA_HOME or HIP_HOME to invalid paths. +# This will (probably?) be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775). + +# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA and HIP builds (issue #505) +# This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside - $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") + $(warning CUDA and HIP builds are not supported for multi-word CXX "$(CXX)") override CUDA_HOME=disabled + override HIP_HOME=disabled endif -# If CUDA_HOME is not set, try to set it from the location of nvcc +# If CUDA_HOME is not set, try to set it from the path to nvcc ifndef CUDA_HOME CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null)) $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") endif -# Set NVCC as $(CUDA_HOME)/bin/nvcc if it exists +# If HIP_HOME is not set, try to set it from the path to hipcc ifndef HIP_HOME + HIP_HOME = $(patsubst %bin/hipcc,%,$(HIP_COMPILER_PATH)) + $(warning HIP_HOME was not set: using "$(HIP_HOME)") +endif + +# FIXME! (AV 24.01.2024) +# In the current implementation (without separate builds for C++ and CUDA/HIP), +# builds are performed for HIP only if CUDA is not found in the path. +# If both CUDA and HIP are installed, HIP builds can be triggered by unsetting CUDA_HOME. +# This will be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775). + +#--- Option 1: CUDA exists -> use CUDA + +# Set GPUCC as $(CUDA_HOME)/bin/nvcc if it exists ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) - NVCC = $(CUDA_HOME)/bin/nvcc + + GPUCC = $(CUDA_HOME)/bin/nvcc USE_NVTX ?=-DUSE_NVTX # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ @@ -158,41 +180,78 @@ ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here!
endif CUOPTFLAGS = -lineinfo - CUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math - ###CUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow - ###NVCC_VERSION = $(shell $(NVCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) - CUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h + ###GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + GPUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + ###GPUCC_VERSION = $(shell $(GPUCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) + GPUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + CUBUILDRULEFLAGS = -Xcompiler -fPIC -c + CCBUILDRULEFLAGS = -Xcompiler -fPIC -c -x cu + CUDATESTFLAGS = -lcuda + + # Set the host C++ compiler for GPUCC via "-ccbin " + # (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) + GPUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) + + # Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) + ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) + GPUFLAGS += -allow-unsupported-compiler + endif + else ifneq ($(origin REQUIRE_CUDA),undefined) + # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) - $(error No cuda installation found (set CUDA_HOME or make nvcc visible in PATH)) + $(error No cuda installation found (set CUDA_HOME or make GPUCC visible in PATH)) + +#--- Option 2: CUDA does not exist, HIP exists -> use HIP + +# Set GPUCC as $(HIP_HOME)/bin/hipcc if it exists +else ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),) + + GPUCC = $(HIP_HOME)/bin/hipcc + #USE_NVTX ?=-DUSE_NVTX # should maybe find something equivalent to this in HIP? 
+ HIPARCHFLAGS = -target x86_64-linux-gnu --offload-arch=gfx90a + HIPINC = -I$(HIP_HOME)/include/ + # Note: -DHIP_FAST_MATH is equivalent to -use_fast_math in HIP + # (but only for single precision line 208: https://rocm-developer-tools.github.io/HIP/hcc__detail_2math__functions_8h_source.html) + GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -DHIP_FAST_MATH -DHIP_PLATFORM=amd -fPIC + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + GPUFLAGS += -std=c++17 + ###GPUFLAGS+= --maxrregcount 255 # (AV: is this option valid on HIP and meaningful on AMD GPUs?) + CUBUILDRULEFLAGS = -fPIC -c + CCBUILDRULEFLAGS = -fPIC -c + +else ifneq ($(origin REQUIRE_HIP),undefined) + + # If REQUIRE_HIP is set but no HIP is found, stop here (e.g. for CI tests on GPU #443) + $(error No hip installation found (set HIP_HOME or make GPUCC visible in PATH)) + +#--- Option 3: CUDA does not exist, HIP does not exist -> switch off both CUDA and HIP + else - # No cuda. Switch cuda compilation off and go to common random numbers in C++ + + # No cudacc and no hipcc: switch CUDA and HIP compilation off and go to common random numbers in C++ $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda) - override NVCC= + $(warning HIP_HOME is not set or is invalid: export HIP_HOME to compile with hip) + override GPUCC= override USE_NVTX= override CUINC= override CURANDLIBFLAGS= -endif -export NVCC -export CUFLAGS - -# Set the host C++ compiler for nvcc via "-ccbin " -# (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) -CUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) -# Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) -ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) -CUFLAGS += -allow-unsupported-compiler endif +# Export GPUCC (so that it can also be used in cudacpp_src.mk?) +export GPUCC +export GPUFLAGS + #------------------------------------------------------------------------------- -#=== Configure ccache for C++ and CUDA builds +#=== Configure ccache for C++ and CUDA/HIP builds # Enable ccache if USECCACHE=1 ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) @@ -201,15 +260,15 @@ endif #ifeq ($(USECCACHE)$(shell echo $(AR) | grep ccache),1) # override AR:=ccache $(AR) #endif -ifneq ($(NVCC),) - ifeq ($(USECCACHE)$(shell echo $(NVCC) | grep ccache),1) - override NVCC:=ccache $(NVCC) +ifneq ($(GPUCC),) + ifeq ($(USECCACHE)$(shell echo $(GPUCC) | grep ccache),1) + override GPUCC:=ccache $(GPUCC) endif endif #------------------------------------------------------------------------------- -#=== Configure PowerPC-specific compiler flags for C++ and CUDA +#=== Configure PowerPC-specific compiler flags for C++ and CUDA/HIP # PowerPC-specific CXX compiler flags (being reviewed) ifeq ($(UNAME_P),ppc64le) @@ -225,9 +284,9 @@ else ######CXXFLAGS+= -fno-semantic-interposition # no benefit (neither alone, nor combined with -flto) endif -# PowerPC-specific CUDA compiler flags (to be reviewed!) +# PowerPC-specific CUDA/HIP compiler flags (to be reviewed!) 
ifeq ($(UNAME_P),ppc64le) - CUFLAGS+= -Xcompiler -mno-float128 + GPUFLAGS+= -Xcompiler -mno-float128 endif #------------------------------------------------------------------------------- @@ -237,7 +296,7 @@ endif # Set the default OMPFLAGS choice ifneq ($(shell $(CXX) --version | egrep '^Intel'),) override OMPFLAGS = -fopenmp -###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without nvcc but not ok with nvcc before #578) +###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without GPUCC but not ok with GPUCC before #578) else ifneq ($(shell $(CXX) --version | egrep '^(clang)'),) override OMPFLAGS = -fopenmp ###override OMPFLAGS = # disable OpenMP MT on clang (was not ok without or with nvcc before #578) @@ -293,7 +352,10 @@ endif # Set the default RNDGEN (random number generator) choice ifeq ($(RNDGEN),) - ifeq ($(NVCC),) + ifeq ($(GPUCC),) + override RNDGEN = hasNoCurand + # Edgecase for HIP compilation + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) override RNDGEN = hasNoCurand else ifeq ($(RNDGEN),) override RNDGEN = hasCurand @@ -310,7 +372,7 @@ export OMPFLAGS #------------------------------------------------------------------------------- -#=== Set the CUDA/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN +#=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN # Set the build flags appropriate to OMPFLAGS $(info OMPFLAGS=$(OMPFLAGS)) @@ -368,13 +430,13 @@ CXXFLAGS+= $(AVXFLAGS) $(info FPTYPE=$(FPTYPE)) ifeq ($(FPTYPE),d) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE - CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE else ifeq ($(FPTYPE),f) CXXFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT - CUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT else ifeq ($(FPTYPE),m) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT - CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT else $(error Unknown FPTYPE='$(FPTYPE)': only 'd', 'f' and 'm' are supported) endif @@ -383,7 +445,7 @@ endif $(info HELINL=$(HELINL)) ifeq ($(HELINL),1) CXXFLAGS += -DMGONGPU_INLINE_HELAMPS - CUFLAGS += -DMGONGPU_INLINE_HELAMPS + GPUFLAGS += -DMGONGPU_INLINE_HELAMPS else ifneq ($(HELINL),0) $(error Unknown HELINL='$(HELINL)': only '0' and '1' are supported) endif @@ -392,7 +454,7 @@ endif $(info HRDCOD=$(HRDCOD)) ifeq ($(HRDCOD),1) CXXFLAGS += -DMGONGPU_HARDCODE_PARAM - CUFLAGS += -DMGONGPU_HARDCODE_PARAM + GPUFLAGS += -DMGONGPU_HARDCODE_PARAM else ifneq ($(HRDCOD),0) $(error Unknown HRDCOD='$(HRDCOD)': only '0' and '1' are supported) endif @@ -444,11 +506,11 @@ ifeq ($(UNAME_S),Darwin) override CULIBFLAGSRPATH2 = else # RPATH to cuda/cpp libs when linking executables - override CXXLIBFLAGSRPATH = -Wl,-rpath,$(LIBDIRRPATH) - override CULIBFLAGSRPATH = -Xlinker -rpath,$(LIBDIRRPATH) + override CXXLIBFLAGSRPATH = -Wl,-rpath=$(LIBDIRRPATH) + override CULIBFLAGSRPATH = -Xlinker -rpath=$(LIBDIRRPATH) # RPATH to common lib when linking cuda/cpp libs - override CXXLIBFLAGSRPATH2 = -Wl,-rpath,'$$ORIGIN' - override CULIBFLAGSRPATH2 = -Xlinker -rpath,'$$ORIGIN' + override CXXLIBFLAGSRPATH2 = -Wl,-rpath='$$ORIGIN' + override CULIBFLAGSRPATH2 = -Xlinker -rpath='$$ORIGIN' endif # Setting LD_LIBRARY_PATH or DYLD_LIBRARY_PATH in the RUNTIME is no longer necessary 
(neither on Linux nor on Mac)
@@ -461,7 +523,7 @@ override RUNTIME =

 cxx_main=$(BUILDDIR)/check.exe
 fcxx_main=$(BUILDDIR)/fcheck.exe
-ifneq ($(NVCC),)
+ifneq ($(GPUCC),)
 cu_main=$(BUILDDIR)/gcheck.exe
 fcu_main=$(BUILDDIR)/fgcheck.exe
 else
@@ -492,15 +554,16 @@ $(BUILDDIR)/.build.$(TAG):
	@touch $(BUILDDIR)/.build.$(TAG)

 # Generic target and build rules: objects from CUDA compilation
-ifneq ($(NVCC),)
+ifneq ($(GPUCC),)
 $(BUILDDIR)/%.o : %.cu *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c $< -o $@
+	$(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CUBUILDRULEFLAGS) $< -o $@
 $(BUILDDIR)/%_cu.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@
+	$(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CCBUILDRULEFLAGS) $< -o $@
 endif
+# NB: the '-x cu' flag previously hardcoded in the %_cu.o rule above is now expected to come from CCBUILDRULEFLAGS (nvcc builds only)

 # Generic target and build rules: objects from C++ compilation
 # (NB do not include CUINC here! add it only for NVTX or curand #679)
@@ -509,11 +572,14 @@ $(BUILDDIR)/%.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
	$(CXX) $(CPPFLAGS) $(CXXFLAGS) -fPIC -c $< -o $@

 # Apply special build flags only to CrossSectionKernel.cc and gCrossSectionKernel.cu (no fast math, see #117 and #516)
+# Added edgecase for HIP compilation
 ifeq ($(shell $(CXX) --version | grep ^nvc++),)
 $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS := $(filter-out -ffast-math,$(CXXFLAGS))
 $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math
-ifneq ($(NVCC),)
-$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -fno-fast-math
+ifeq ($(findstring nvcc,$(GPUCC)),nvcc)
+  $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -fno-fast-math
+else
+  $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -fno-fast-math
 endif
 endif
@@ -530,10 +596,10 @@ ifeq ($(RNDGEN),hasCurand)
 $(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CUINC)
 endif

-# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in nvcc with icx2023 (#592)
+# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in GPUCC with icx2023 (#592)
 ifneq ($(shell $(CXX) --version | egrep '^(Intel)'),)
-ifneq ($(NVCC),)
-CUFLAGS += -Xcompiler -Wno-deprecated-builtins
+ifneq ($(GPUCC),)
+GPUFLAGS += -Wno-deprecated-builtins
 endif
 endif

@@ -541,8 +607,8 @@ endif
 # This patch does remove the warning, but I prefer to keep it disabled for the moment...
###ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang|Intel)'),) ###$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -Wno-overriding-t-option -###ifneq ($(NVCC),) -###$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -Wno-overriding-t-option +###ifneq ($(GPUCC),) +###$(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -Wno-overriding-t-option ###endif ###endif @@ -569,7 +635,7 @@ MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp cxx_objects_lib=$(BUILDDIR)/CPPProcess.o $(BUILDDIR)/MatrixElementKernels.o $(BUILDDIR)/BridgeKernels.o $(BUILDDIR)/CrossSectionKernels.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel.o $(BUILDDIR)/RamboSamplingKernels.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) MG5AMC_CULIB = mg5amc_$(processid_short)_cuda cu_objects_lib=$(BUILDDIR)/gCPPProcess.o $(BUILDDIR)/gMatrixElementKernels.o $(BUILDDIR)/gBridgeKernels.o $(BUILDDIR)/gCrossSectionKernels.o cu_objects_exe=$(BUILDDIR)/gCommonRandomNumberKernel.o $(BUILDDIR)/gRamboSamplingKernels.o @@ -581,11 +647,11 @@ $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: cxx_objects_lib += $(BUILDDIR)/fbridge.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(CXX) -shared -o $@ $(cxx_objects_lib) $(CXXLIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: cu_objects_lib += $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cu_objects_lib) - $(NVCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) endif #------------------------------------------------------------------------------- @@ -602,16 +668,16 @@ $(cxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PAT $(cxx_main): $(BUILDDIR)/check_sa.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CXX) -o $@ $(BUILDDIR)/check_sa.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CURANDLIBFLAGS) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(cu_main): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(cu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(cu_main): $(BUILDDIR)/gcheck_sa.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o - $(NVCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) + $(GPUCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) endif 
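Note (illustrative reading aid, not part of the patch): the NVCC->GPUCC renaming above works because each source file can be built twice from the same .cc code, once by $(CXX) into %.o and once by $(GPUCC) into %_cu.o; the compile-time dispatch that makes this possible is sketched below. The MGONGPUCPP_GPUIMPL macro, the empty __global__/__host__/__device__ fallbacks and the mg5amcGpu/mg5amcCpu namespaces are those of the plugin; the file name and the hello function are hypothetical.

  // sketch.cc - one source, three builds: nvcc (as CUDA), hipcc (as HIP), or plain C++
  #include <cstdio>

  // Dispatch on the compiler driving the build, as in mgOnGpuConfig.h
  #ifdef __CUDACC__
  #define MGONGPUCPP_GPUIMPL cuda
  #elif defined __HIPCC__
  #define MGONGPUCPP_GPUIMPL hip
  #include "hip/hip_runtime.h" // hipcc needs this for blockDim, blockIdx, threadIdx
  #endif

  // Empty GPU declaration specifiers for the C++-only build, as in mgOnGpuConfig.h
  #ifndef MGONGPUCPP_GPUIMPL
  #define __global__
  #define __host__
  #define __device__
  #endif

  // Same namespace split as in the sources patched above
  #ifdef MGONGPUCPP_GPUIMPL
  namespace mg5amcGpu
  #else
  namespace mg5amcCpu
  #endif
  {
    // A GPU kernel under nvcc/hipcc, a plain host function in the C++ build
    __global__ void hello()
    {
  #ifdef MGONGPUCPP_GPUIMPL
      printf( "hello from GPU thread %d\n", (int)threadIdx.x );
  #else
      printf( "hello from the C++ build\n" );
  #endif
    }
  }

This is the mechanism that lets the %_cu.o build rule above feed ordinary .cc sources to $(GPUCC).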
#------------------------------------------------------------------------------- @@ -637,17 +703,17 @@ $(fcxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PA $(fcxx_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') endif ifeq ($(UNAME_S),Darwin) $(fcu_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(fcu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(fcu_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) - $(NVCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) + $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) endif #------------------------------------------------------------------------------- @@ -659,7 +725,7 @@ $(BUILDDIR)/testxxx.o: testxxx_cc_ref.txt $(testmain): $(BUILDDIR)/testxxx.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testxxx.o # Comment out this line to skip the C++ test of xxx functions -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testxxx_cu.o: $(GTESTLIBS) $(BUILDDIR)/testxxx_cu.o: INCFLAGS += $(GTESTINC) $(BUILDDIR)/testxxx_cu.o: testxxx_cc_ref.txt @@ -672,7 +738,7 @@ $(BUILDDIR)/testmisc.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testmisc.o # Comment out this line to skip the C++ miscellaneous tests -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testmisc_cu.o: $(GTESTLIBS) $(BUILDDIR)/testmisc_cu.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc_cu.o @@ -684,12 +750,12 @@ $(BUILDDIR)/runTest.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/runTest.o $(testmain): cxx_objects_exe += $(BUILDDIR)/runTest.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/runTest_cu.o: $(GTESTLIBS) $(BUILDDIR)/runTest_cu.o: INCFLAGS += $(GTESTINC) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(testmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif @@ -713,14 +779,14 @@ $(testmain): LIBFLAGS += -lgomp endif endif -ifeq 
($(NVCC),) # link only runTest.o
+ifeq ($(GPUCC),) # link only runTest.o
 $(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH
 $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS)
	$(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS)
 else # link both runTest.o and runTest_cu.o
 $(testmain): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH
 $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) $(GTESTLIBS)
-	$(NVCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) -lcuda
+	$(GPUCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS)
 endif

 # Use target gtestlibs to build only googletest
@@ -829,9 +895,9 @@ ifeq ($(USECCACHE),1)
	ccache --version | head -1
 endif
	@echo ""
-	@echo NVCC=$(NVCC)
-ifneq ($(NVCC),)
-	$(NVCC) --version
+	@echo GPUCC=$(GPUCC)
+ifneq ($(GPUCC),)
+	$(GPUCC) --version
 endif
	@echo ""
	@echo CXX=$(CXX)
@@ -850,7 +916,7 @@ endif

 # Target: check (run the C++ test executable)
 # [NB THIS IS WHAT IS USED IN THE GITHUB CI!]
-ifneq ($(NVCC),)
+ifneq ($(GPUCC),)
 check: runTest cmpFcheck cmpFGcheck
 else
 check: runTest cmpFcheck
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/fbridge.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/fbridge.cc
index 2d2b36d560..22ce3f5115 100644
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/fbridge.cc
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/fbridge.cc
@@ -1,11 +1,11 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: S. Roiser (Oct 2021) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin.

 #include "Bridge.h"
 #include "CPPProcess.h"
-#include "CudaRuntime.h"
+#include "GpuRuntime.h"

 extern "C"
 {
@@ -22,7 +22,7 @@ extern "C"
   * Using the same Fortran MadEvent code, linking to the heterogeneous library would allow access to both CPU and GPU implementations.
   * The specific heterogeneous configuration (how many GPUs, how many threads on each CPU, etc) could be loaded in CUDA/C++ from a data file.
*/ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -46,8 +46,8 @@ extern "C" */ void fbridgecreate_( CppObjectInFortran** ppbridge, const int* pnevtF, const int* pnparF, const int* pnp4F ) { -#ifdef __CUDACC__ - CudaRuntime::setUp(); +#ifdef MGONGPUCPP_GPUIMPL + GpuRuntime::setUp(); #endif // (NB: CPPProcess::initProc no longer needs to be executed here because it is called in the Bridge constructor) // FIXME: disable OMP in Bridge when called from Fortran @@ -65,8 +65,8 @@ extern "C" Bridge* pbridge = dynamic_cast*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgedelete_: invalid Bridge address" ); delete pbridge; -#ifdef __CUDACC__ - CudaRuntime::tearDown(); +#ifdef MGONGPUCPP_GPUIMPL + GpuRuntime::tearDown(); #endif } @@ -96,7 +96,7 @@ extern "C" { Bridge* pbridge = dynamic_cast*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgesequence_: invalid Bridge address" ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Use the device/GPU implementation in the CUDA library // (there is also a host implementation in this library) pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, *pchannelId, mes, selhel, selcol ); diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/fsampler.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/fsampler.cc index 2fb445372d..3743934f41 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/fsampler.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/fsampler.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Feb 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "mgOnGpuConfig.h" @@ -13,7 +13,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -40,7 +40,7 @@ namespace mg5amcCpu private: const int m_nevt; // The number of events in each iteration int m_iiter; // The iteration counter (for random number seeding) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta m_hstRndmom; // Memory buffers for random numbers HostBufferMomenta m_hstMomenta; // Memory buffers for momenta HostBufferWeights m_hstWeights; // Memory buffers for sampling weights @@ -105,7 +105,7 @@ namespace mg5amcCpu extern "C" { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/runTest.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/runTest.cc index d4a760a71b..de327f2321 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/runTest.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/runTest.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
//---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -18,7 +18,7 @@ #include "RandomNumberKernels.h" #include "epoch_process_id.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -35,7 +35,7 @@ struct CUDA_CPU_TestBase : public TestDriverBase : TestDriverBase( npar, refFileName ) {} }; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL struct CPUTest : public CUDA_CPU_TestBase { // Struct data members (process, and memory structures for random numbers, momenta, matrix elements and weights on host and device) @@ -119,7 +119,7 @@ struct CPUTest : public CUDA_CPU_TestBase }; #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL struct CUDATest : public CUDA_CPU_TestBase { // Reset the device when our test goes out of scope. Note that this should happen after @@ -128,7 +128,7 @@ struct CUDATest : public CUDA_CPU_TestBase { ~DeviceReset() { - checkCuda( cudaDeviceReset() ); // this is needed by cuda-memcheck --leak-check full + checkGpu( gpuDeviceReset() ); // this is needed by cuda-memcheck --leak-check full } } deviceResetter; @@ -256,7 +256,7 @@ INSTANTIATE_TEST_SUITE_P( prefix, \ test_suite_name, \ testing::Values( new CUDATest( MG_EPOCH_REFERENCE_FILE_NAME ) ) ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL MG_INSTANTIATE_TEST_SUITE_GPU( XTESTID_GPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); #else MG_INSTANTIATE_TEST_SUITE_CPU( XTESTID_CPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/testmisc.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/testmisc.cc index 895d6eeb56..ba9e59a8a3 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/testmisc.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/testmisc.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*misc to run only testmisc.cc tests //---------------------------------------------------------------------------- @@ -17,7 +17,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_MISC #else #define TESTID( s ) s##_CPU_MISC @@ -26,7 +26,7 @@ #define XTESTID( s ) TESTID( s ) // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -59,7 +59,7 @@ namespace mg5amcCpu TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testmisc ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/testxxx.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/testxxx.cc index 3361fe5aa9..e5167de00c 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/testxxx.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/testxxx.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. 
// Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Apr 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -24,7 +24,7 @@ #include #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_XXX #else #define TESTID( s ) s##_CPU_XXX @@ -32,7 +32,7 @@ #define XTESTID( s ) TESTID( s ) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -42,7 +42,7 @@ namespace mg5amcCpu int FPEhandlerIevt = -1; inline void FPEhandler( int sig ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL std::cerr << "Floating Point Exception (GPU): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; #else std::cerr << "Floating Point Exception (CPU neppV=" << neppV << "): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; @@ -53,7 +53,7 @@ namespace mg5amcCpu TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -77,7 +77,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) assert( nevt % neppM == 0 ); // nevt must be a multiple of neppM assert( nevt % neppV == 0 ); // nevt must be a multiple of neppV // Fill in the input momenta -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL mg5amcGpu::PinnedHostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] #else mg5amcCpu::HostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] @@ -322,7 +322,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { for( int ievt = 0; ievt < nevt; ievt++ ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/pp_tt012j.mad/src/HelAmps_sm.h b/epochX/cudacpp/pp_tt012j.mad/src/HelAmps_sm.h index 8df465ad6d..8b4ad719be 100644 --- a/epochX/cudacpp/pp_tt012j.mad/src/HelAmps_sm.h +++ b/epochX/cudacpp/pp_tt012j.mad/src/HelAmps_sm.h @@ -5,7 +5,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -28,7 +28,7 @@ //#include //#include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/pp_tt012j.mad/src/Parameters_sm.cc b/epochX/cudacpp/pp_tt012j.mad/src/Parameters_sm.cc index 64fc3fea62..067445b198 100644 --- a/epochX/cudacpp/pp_tt012j.mad/src/Parameters_sm.cc +++ b/epochX/cudacpp/pp_tt012j.mad/src/Parameters_sm.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. 
-// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -17,7 +17,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/pp_tt012j.mad/src/Parameters_sm.h b/epochX/cudacpp/pp_tt012j.mad/src/Parameters_sm.h index b6568d3761..9581d66e0e 100644 --- a/epochX/cudacpp/pp_tt012j.mad/src/Parameters_sm.h +++ b/epochX/cudacpp/pp_tt012j.mad/src/Parameters_sm.h @@ -27,7 +27,7 @@ #include "read_slha.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu #include // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -218,7 +218,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -239,7 +239,7 @@ namespace mg5amcCpu #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wunused-variable" // e.g. <> #pragma GCC diagnostic ignored "-Wunused-parameter" // e.g. <> -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma nv_diagnostic push #pragma nv_diag_suppress 177 // e.g. <> #endif @@ -267,7 +267,7 @@ namespace mg5amcCpu // End SM implementation - no special handling of vectors of floats as in EFT (#439) return out; } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma GCC diagnostic pop #pragma nv_diagnostic pop #endif diff --git a/epochX/cudacpp/pp_tt012j.mad/src/cudacpp_src.mk b/epochX/cudacpp/pp_tt012j.mad/src/cudacpp_src.mk index d4cc628aec..159e19a46d 100644 --- a/epochX/cudacpp/pp_tt012j.mad/src/cudacpp_src.mk +++ b/epochX/cudacpp/pp_tt012j.mad/src/cudacpp_src.mk @@ -19,7 +19,7 @@ SHELL := /bin/bash #=== Configure common compiler flags for CUDA and C++ INCFLAGS = -I. 
-OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here
+OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here

 #-------------------------------------------------------------------------------
@@ -45,13 +45,13 @@ endif

 #-------------------------------------------------------------------------------

-#=== Configure the CUDA compiler (note: NVCC is already exported including ccache)
+#=== Configure the CUDA compiler (note: GPUCC is already exported including ccache)

-###$(info NVCC=$(NVCC))
+###$(info GPUCC=$(GPUCC))

 #-------------------------------------------------------------------------------

-#=== Configure ccache for C++ builds (note: NVCC is already exported including ccache)
+#=== Configure ccache for C++ builds (note: GPUCC is already exported including ccache)

 # Enable ccache if USECCACHE=1
 ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1)
@@ -92,6 +92,13 @@ endif
 ###$(info OMPFLAGS=$(OMPFLAGS))
 CXXFLAGS += $(OMPFLAGS)

+# Add the correct build-rule flags when compiling for CUDA (nvcc) or HIP (hipcc)
+ifeq ($(findstring nvcc,$(GPUCC)),nvcc)
+  GPUFLAGS += -Xcompiler -fPIC -c -x cu
+else ifeq ($(findstring hipcc,$(GPUCC)),hipcc)
+  GPUFLAGS += -fPIC -c
+endif
+
 # Set the build flags appropriate to each AVX choice (example: "make AVX=none")
 # [NB MGONGPU_PVW512 is needed because "-mprefer-vector-width=256" is not exposed in a macro]
 # [See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=96476]
@@ -253,20 +260,20 @@ $(BUILDDIR)/%.o : %.cc *.h $(BUILDDIR)/.build.$(TAG)

 # Generic target and build rules: objects from CUDA compilation
 $(BUILDDIR)/%_cu.o : %.cc *.h $(BUILDDIR)/.build.$(TAG)
	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
-	$(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@
+	$(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $< -o $@

 #-------------------------------------------------------------------------------

 cxx_objects=$(addprefix $(BUILDDIR)/, Parameters_sm.o read_slha.o)
-ifneq ($(NVCC),)
+ifneq ($(GPUCC),)
 cu_objects=$(addprefix $(BUILDDIR)/, Parameters_sm_cu.o)
 endif

 # Target (and build rules): common (src) library
-ifneq ($(NVCC),)
+ifneq ($(GPUCC),)
 $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) $(cu_objects)
	@if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi
-	$(NVCC) -shared -o $@ $(cxx_objects) $(cu_objects) $(LDFLAGS)
+	$(GPUCC) -shared -o $@ $(cxx_objects) $(cu_objects) $(LDFLAGS)
 else
 $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects)
	@if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi
diff --git a/epochX/cudacpp/pp_tt012j.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/pp_tt012j.mad/src/mgOnGpuConfig.h
index 80032e528b..55d03f1252 100644
--- a/epochX/cudacpp/pp_tt012j.mad/src/mgOnGpuConfig.h
+++ b/epochX/cudacpp/pp_tt012j.mad/src/mgOnGpuConfig.h
@@ -1,7 +1,7 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Jul 2020) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
#ifndef MGONGPUCONFIG_H
 #define MGONGPUCONFIG_H 1
@@ -10,12 +10,25 @@
 // There are two different code bases for standalone_cudacpp (without multichannel) and madevent+cudacpp (with multichannel)
 #define MGONGPU_SUPPORTS_MULTICHANNEL 1

+// Is this a GPU (CUDA, HIP) or CPU implementation?
+#ifdef __CUDACC__
+#define MGONGPUCPP_GPUIMPL cuda
+#elif defined __HIPCC__
+#define MGONGPUCPP_GPUIMPL hip
+#else
+#undef MGONGPUCPP_GPUIMPL
+#endif
+
 // ** NB1 Throughputs (e.g. 6.8E8) are events/sec for "./gcheck.exe -p 65536 128 12"
 // ** NB2 Baseline on b7g47n0004 fluctuates (probably depends on load on other VMs)

 // Choose if curand is supported for generating random numbers
+// For HIP, by default, do not use curand (common random numbers will be used instead)
 // For both CUDA and C++, by default, do not skip curand, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_CURAND
-// (there exist CUDA installations, e.g. using the HPC package, which do not include curand - see PR #784)
+// (there exist CUDA installations, e.g. using the HPC package, which do not include curand - see PR #784 and #785)
+#if defined __HIPCC__
+#define MGONGPU_HAS_NO_CURAND 1
+#else
 //#ifdef __CUDACC__
 //#undef MGONGPU_HAS_NO_CURAND // default
 ////#define MGONGPU_HAS_NO_CURAND 1
@@ -23,6 +36,7 @@
 //#undef MGONGPU_HAS_NO_CURAND // default
 ////#define MGONGPU_HAS_NO_CURAND 1
 //#endif
+#endif

 // Choose floating point precision (for everything but color algebra #537)
 // If one of these macros has been set from outside with e.g. -DMGONGPU_FPTYPE_FLOAT, nothing happens (issue #167)
@@ -54,23 +68,28 @@
 //#undef MGONGPU_HARDCODE_PARAM // default
 ////#define MGONGPU_HARDCODE_PARAM 1

-// Complex type in c++: std::complex or cxsmpl (CHOOSE ONLY ONE)
-#ifndef __CUDACC__
-//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float)
-#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float)
-#endif
-
-// Complex type in cuda: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE)
+// Complex type in CUDA: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE)
 #ifdef __CUDACC__
 #define MGONGPU_CUCXTYPE_THRUST 1 // default (~1.15E9/double, ~3.2E9/float)
 //#define MGONGPU_CUCXTYPE_CUCOMPLEX 1 // ~10 percent slower (1.03E9/double, ~2.8E9/float)
 //#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float)
+
+// Complex type in HIP: cxsmpl (ONLY ONE OPTION POSSIBLE)
+#elif defined __HIPCC__
+#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float)
+
+// Complex type in C++: std::complex or cxsmpl (CHOOSE ONLY ONE)
+#else
+//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float)
+#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float)
 #endif

-// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation
+// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation
 #ifdef __CUDACC__
-#undef MGONGPU_NSIGHT_DEBUG // default
+#undef MGONGPU_NSIGHT_DEBUG // default in CUDA
 //#define MGONGPU_NSIGHT_DEBUG 1
+#else
+#undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++
 #endif

 // SANITY CHECKS (floating point precision for everything but color algebra #537)
@@ -86,17 +105,21 @@
 #error You cannot use double precision for color algebra and single precision elsewhere
 #endif

-// SANITY CHECKS (c++ complex number implementation)
-#ifndef __CUDACC__
-#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and
defined MGONGPU_CPPCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL +// SANITY CHECKS (CUDA complex number implementation) +#ifdef __CUDACC__ +#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX for CUDA +#elif defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CXSMPL for CUDA +#elif defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE OF MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL for CUDA #endif #endif -// SANITY CHECKS (cuda complex number implementation) -#ifdef __CUDACC__ -#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL +// SANITY CHECKS (C++ complex number implementation) +#ifndef MGONGPUCPP_GPUIMPL +#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL for C++ #endif #endif @@ -134,7 +157,7 @@ namespace mgOnGpu // Alignment requirement for using reinterpret_cast with SIMD vectorized code // (using reinterpret_cast with non aligned memory may lead to segmentation faults!) // Only needed for C++ code but can be enforced also in NVCC builds of C++ code using CUDA>=11.2 and C++17 (#318, #319, #333) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL constexpr int cppAlign = 64; // alignment requirement for SIMD vectorization (64-byte i.e. 
512-bit)
 #endif
@@ -145,7 +168,7 @@ using mgOnGpu::fptype;
 using mgOnGpu::fptype2;

 // C++ SIMD vectorization width (this will be used to set neppV)
-#ifdef __CUDACC__ // CUDA implementation has no SIMD
+#ifdef MGONGPUCPP_GPUIMPL // CUDA and HIP implementations have no SIMD
 #undef MGONGPU_CPPSIMD
 #elif defined __AVX512VL__ && defined MGONGPU_PVW512 // C++ "512z" AVX512 with 512 width (512-bit ie 64-byte): 8 (DOUBLE) or 16 (FLOAT)
 #ifdef MGONGPU_FPTYPE_DOUBLE
@@ -175,9 +198,9 @@ using mgOnGpu::fptype2;
 #undef MGONGPU_CPPSIMD
 #endif

-// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation
+// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation
 // Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end)
 #if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */
 #define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX];
 #define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; }
 #define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; }
@@ -189,8 +212,8 @@ using mgOnGpu::fptype2;
 #define mgDebugFinalise() { /*noop*/ }
 #endif /* clang-format on */

-// Define empty CUDA declaration specifiers for C++
-#ifndef __CUDACC__
+// Define empty CUDA/HIP declaration specifiers for C++
+#ifndef MGONGPUCPP_GPUIMPL
 #define __global__
 #define __host__
 #define __device__
diff --git a/epochX/cudacpp/pp_tt012j.mad/src/mgOnGpuCxtypes.h b/epochX/cudacpp/pp_tt012j.mad/src/mgOnGpuCxtypes.h
index ca9a9f00c0..5532e22fa1 100644
--- a/epochX/cudacpp/pp_tt012j.mad/src/mgOnGpuCxtypes.h
+++ b/epochX/cudacpp/pp_tt012j.mad/src/mgOnGpuCxtypes.h
@@ -1,7 +1,7 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: A. Valassi (Jan 2022, based on earlier work by D. Smith) for the MG5aMC CUDACPP plugin.
-// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
#ifndef MGONGPUCXTYPES_H #define MGONGPUCXTYPES_H 1 @@ -19,7 +19,7 @@ #include // Complex type in cuda: thrust or cucomplex or cxsmpl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #if defined MGONGPU_CUCXTYPE_THRUST #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wtautological-compare" // for icpx2021/clang13 (https://stackoverflow.com/a/15864661) @@ -82,7 +82,7 @@ namespace mgOnGpu /* clang-format off */ using mgOnGpu::cxsmpl; // Printout to stream for user defined types -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -92,7 +92,7 @@ namespace mg5amcCpu inline __host__ std::ostream& operator<<( std::ostream& out, const cxsmpl& c ) { - out << std::complex( c.real(), c.imag() ); + out << std::complex( c.real(), c.imag() ); return out; } @@ -215,14 +215,14 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu #endif { // --- Type definitions (complex type: cxtype) -#ifdef __CUDACC__ // cuda +#ifdef MGONGPUCPP_GPUIMPL // cuda #if defined MGONGPU_CUCXTYPE_THRUST typedef thrust::complex cxtype; #elif defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -255,7 +255,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -307,7 +307,7 @@ namespace mg5amcCpu //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust +#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust //------------------------------ // CUDA - using thrust::complex @@ -343,11 +343,11 @@ namespace mg5amcCpu return c; } -#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST +#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex +#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex //------------------------------ // CUDA - using cuComplex @@ -562,11 +562,11 @@ namespace mg5amcCpu return cxmake( c.real(), c.imag() ); } -#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX //========================================================================== -#if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex +#if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex //------------------------------ // C++ - using std::complex @@ -610,7 +610,7 @@ namespace mg5amcCpu } #endif -#endif // #if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX +#endif // #if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX //========================================================================== @@ -633,7 +633,7 @@ namespace mg5amcCpu //========================================================================== // 
NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/pp_tt012j.mad/src/mgOnGpuFptypes.h b/epochX/cudacpp/pp_tt012j.mad/src/mgOnGpuFptypes.h index 905c97d700..fa3a02664b 100644 --- a/epochX/cudacpp/pp_tt012j.mad/src/mgOnGpuFptypes.h +++ b/epochX/cudacpp/pp_tt012j.mad/src/mgOnGpuFptypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUFPTYPES_H #define MGONGPUFPTYPES_H 1 @@ -12,7 +12,7 @@ #include // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // cuda namespace mg5amcGpu #else namespace mg5amcCpu @@ -20,7 +20,7 @@ namespace mg5amcCpu { //========================================================================== -#ifdef __CUDACC__ // cuda +#ifdef MGONGPUCPP_GPUIMPL // cuda //------------------------------ // Floating point types - Cuda @@ -64,11 +64,11 @@ namespace mg5amcCpu #endif } -#endif // #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //------------------------------ // Floating point types - C++ @@ -92,7 +92,7 @@ namespace mg5amcCpu return std::sqrt( f ); } -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== diff --git a/epochX/cudacpp/pp_tt012j.mad/src/mgOnGpuVectors.h b/epochX/cudacpp/pp_tt012j.mad/src/mgOnGpuVectors.h index e1299ba81e..cdae04326b 100644 --- a/epochX/cudacpp/pp_tt012j.mad/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/pp_tt012j.mad/src/mgOnGpuVectors.h @@ -32,7 +32,7 @@ #endif // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -131,7 +131,7 @@ namespace mg5amcCpu #endif #endif -#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef __CUDACC__) +#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef MGONGPUCPP_GPUIMPL) const int neppV = 1; @@ -153,13 +153,13 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu #endif { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Printout to stream for user defined types @@ -805,11 +805,11 @@ namespace mg5amcCpu #endif // #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //------------------------------ // Vector types - CUDA @@ -853,12 +853,12 @@ namespace 
mg5amcCpu return mask; } -#endif // #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== // Scalar-or-vector types: scalar in CUDA, vector or scalar in C++ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL typedef bool bool_sv; typedef fptype fptype_sv; typedef fptype2 fptype2_sv; @@ -879,7 +879,7 @@ namespace mg5amcCpu #endif // Scalar-or-vector zeros: scalar in CUDA, vector or scalar in C++ -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ inline __host__ __device__ cxtype cxzero_sv(){ return cxtype( 0, 0 ); } #elif defined MGONGPU_CPPSIMD inline cxtype_v cxzero_sv() { return cxtype_v(); } // RRRR=0000 IIII=0000 diff --git a/epochX/cudacpp/pp_tt012j.mad/src/rambo.h b/epochX/cudacpp/pp_tt012j.mad/src/rambo.h index e02ea52496..cd7e1008ea 100644 --- a/epochX/cudacpp/pp_tt012j.mad/src/rambo.h +++ b/epochX/cudacpp/pp_tt012j.mad/src/rambo.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -18,7 +18,7 @@ #include // Simplified rambo version for 2 to N (with N>=2) processes with massless particles -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +83,7 @@ namespace mg5amcCpu static bool first = true; if( first ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if constexpr( M_ACCESS::isOnDevice() ) // avoid { const int ievt0 = 0; @@ -166,7 +166,7 @@ namespace mg5amcCpu wt = po2log; if( nparf != 2 ) wt = ( 2. * nparf - 4. ) * log( energy ) + z[nparf - 1]; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // issue warnings if weight is too small or too large static int iwarn[5] = { 0, 0, 0, 0, 0 }; if( wt < -180. ) From fa56bf876a1d8fc0bd4ebfe37e3bcf23c40bb276 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Thu, 25 Jan 2024 20:34:54 +0200 Subject: [PATCH 38/96] [jt774] in gg_tt.sa cudacpp.mk, fix autodiscovery of HIP_HOME from 'which hipcc' (on LUMI!) 
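Note for readers of patches 38-43: they exercise on LUMI the CUDA/HIP abstraction layer (GpuAbstraction.h, GpuRuntime.h) introduced in patch 1. As a reading aid, here is a minimal standalone sketch of the naming scheme those headers implement; the gpu* aliases and the checkGpu name do appear in the diffs of this series, while gpuSuccess, the assertGpu/checkGpu body and the main function are illustrative assumptions rather than the actual plugin code.

  // gpu_sketch.cc - compile with nvcc (as .cu or with -x cu) or with hipcc
  #include <cassert>
  #include <cstdio>

  // Map a single gpu* name onto the CUDA or HIP API, in the style of GpuAbstraction.h
  #ifdef __CUDACC__
  #define gpuError_t cudaError_t
  #define gpuSuccess cudaSuccess
  #define gpuGetErrorString cudaGetErrorString
  #define gpuDeviceReset cudaDeviceReset
  #elif defined __HIPCC__
  #include "hip/hip_runtime.h"
  #define gpuError_t hipError_t
  #define gpuSuccess hipSuccess
  #define gpuGetErrorString hipGetErrorString
  #define gpuDeviceReset hipDeviceReset
  #endif

  // Hypothetical error-check helper: the checkGpu name is used in runTest.cc
  // above, but this body is an assumption, not the plugin implementation
  inline void assertGpu( gpuError_t code, const char* file, int line )
  {
    if( code != gpuSuccess )
    {
      printf( "GPUassert: %s %s:%d\n", gpuGetErrorString( code ), file, line );
      assert( code == gpuSuccess );
    }
  }
  #define checkGpu( code ) assertGpu( code, __FILE__, __LINE__ )

  int main()
  {
    checkGpu( gpuDeviceReset() ); // identical call on CUDA and HIP
    return 0;
  }

The same single-name mapping is what lets fbridge.cc above call GpuRuntime::setUp() and GpuRuntime::tearDown() without knowing which backend is active.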
--- epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk index f2cfa349da..c3ebec942a 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk @@ -145,7 +145,7 @@ endif # If HIP_HOME is not set, try to set it from the path to hipcc ifndef HIP_HOME - HIP_HOME = $(patsubst %bin/hipcc,%,$(HIP_COMPILER_PATH)) + HIP_HOME = $(patsubst %bin/hipcc,%,$(shell which hipcc 2>/dev/null)) $(warning HIP_HOME was not set: using "$(HIP_HOME)") endif From 41f08d7e47134550d1c86afd64c12f5650eeba02 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Thu, 25 Jan 2024 23:36:33 +0200 Subject: [PATCH 39/96] [jt774] in gg_tt.sa cudacpp.mk, remove trailing '/' from CUDA_HOME and HIP_HOME auto discovered from 'which nvcc' and 'which hipcc' --- epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk index c3ebec942a..3fb94ce3bc 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk @@ -139,13 +139,13 @@ endif # If CUDA_HOME is not set, try to set it from the path to nvcc ifndef CUDA_HOME - CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null)) + CUDA_HOME = $(patsubst %/bin/nvcc,%,$(shell which nvcc 2>/dev/null)) $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") endif # If HIP_HOME is not set, try to set it from the path to hipcc ifndef HIP_HOME - HIP_HOME = $(patsubst %bin/hipcc,%,$(shell which hipcc 2>/dev/null)) + HIP_HOME = $(patsubst %/bin/hipcc,%,$(shell which hipcc 2>/dev/null)) $(warning HIP_HOME was not set: using "$(HIP_HOME)") endif From 9809728ede533bb675cf18804dca891cc7af9f53 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Fri, 26 Jan 2024 00:15:14 +0200 Subject: [PATCH 40/96] [jt774] in gg_tt.sa, replace filesystem by experimental/filesystem for __HIPCC__ in three source code files --- epochX/cudacpp/gg_tt.sa/SubProcesses/Bridge.h | 13 +++++++++---- epochX/cudacpp/gg_tt.sa/SubProcesses/MadgraphTest.h | 8 ++++++++ epochX/cudacpp/gg_tt.sa/src/read_slha.cc | 8 ++++++++ 3 files changed, 25 insertions(+), 4 deletions(-) diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/Bridge.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/Bridge.h index 89437b4c42..85dcf2763e 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/Bridge.h @@ -18,7 +18,11 @@ #include #include #include +#ifdef __HIPCC__ +#include // see https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 +#else #include +#endif #include #include #include @@ -255,10 +259,11 @@ namespace mg5amcCpu // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate is called from several Fortran threads? 
CPPProcess process( /*verbose=*/false ); std::string paramCard = "../../Cards/param_card.dat"; - if( !std::filesystem::exists( paramCard ) ) - { - paramCard = "../" + paramCard; - } +#ifdef __HIPCC__ + if( !std::experimental::filesystem::exists( paramCard ) ) paramCard = "../" + paramCard; +#else + if( !std::filesystem::exists( paramCard ) ) paramCard = "../" + paramCard; +#endif process.initProc( paramCard ); } diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/MadgraphTest.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/MadgraphTest.h index a64c05c26a..972ef2d4a6 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/MadgraphTest.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/MadgraphTest.h @@ -14,7 +14,11 @@ #include #include +#ifdef __HIPCC__ +#include // see https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 +#else #include +#endif #include #include #include @@ -219,7 +223,11 @@ TEST_P( MadgraphTest, CompareMomentaAndME ) const char* dumpEventsC = getenv( "CUDACPP_RUNTEST_DUMPEVENTS" ); const bool dumpEvents = ( dumpEventsC != 0 ) && ( std::string( dumpEventsC ) != "" ); const std::string refFileName = testDriver->getRefFileName(); +#ifdef __HIPCC__ + const std::string dumpFileName = std::experimental::filesystem::path( refFileName ).filename(); +#else const std::string dumpFileName = std::filesystem::path( refFileName ).filename(); +#endif std::ofstream dumpFile; if( dumpEvents ) { diff --git a/epochX/cudacpp/gg_tt.sa/src/read_slha.cc b/epochX/cudacpp/gg_tt.sa/src/read_slha.cc index 055b19a779..5aa08bb503 100644 --- a/epochX/cudacpp/gg_tt.sa/src/read_slha.cc +++ b/epochX/cudacpp/gg_tt.sa/src/read_slha.cc @@ -11,7 +11,11 @@ #include #include +#ifdef __HIPCC__ +#include // see https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 +#else #include +#endif #include #include @@ -60,7 +64,11 @@ SLHAReader::read_slha_file( std::string file_name, bool verbose ) { std::cout << "WARNING! 
Card file '" << file_name << "' does not exist:"
                << " look for the file in directory $" << envpath << "='" << getenv( envpath ) << "'" << std::endl;
+#ifdef __HIPCC__
+    const std::string file_name2 = std::experimental::filesystem::path( getenv( envpath ) ) / std::experimental::filesystem::path( file_name ).filename();
+#else
     const std::string file_name2 = std::filesystem::path( getenv( envpath ) ) / std::filesystem::path( file_name ).filename();
+#endif
     param_card.open( file_name2.c_str(), std::ifstream::in );
     if( param_card.good() )
     {

From 50211c720472cfda1b44c8ab999299e083883883 Mon Sep 17 00:00:00 2001
From: Andrea Valassi
Date: Fri, 26 Jan 2024 00:18:49 +0200
Subject: [PATCH 41/96] [jt774] in gg_tt.sa check_sa.cc, replace some
 __CUDACC__ by MGONGPUCPP_GPUIMPL (in code added after Jorgen's work)

---
 .../gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/check_sa.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/check_sa.cc b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/check_sa.cc
index 7cac5ab47b..aab490dc5b 100644
--- a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/check_sa.cc
+++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/check_sa.cc
@@ -76,7 +76,7 @@ usage( char* argv0, int ret = 1 )
   return ret;
 }

-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 namespace mg5amcGpu
 #else
 namespace mg5amcCpu
@@ -84,7 +84,7 @@ namespace mg5amcCpu
 {
   inline void FPEhandler( int sig )
   {
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
     std::cerr << "Floating Point Exception (GPU)" << std::endl;
 #else
     std::cerr << "Floating Point Exception (CPU)" << std::endl;

From 034822e67ea4a74d5a01816b7ffd6de0f6b0b6aa Mon Sep 17 00:00:00 2001
From: Andrea Valassi
Date: Fri, 26 Jan 2024 00:43:46 +0200
Subject: [PATCH 42/96] [jt774] in gg_tt.sa, move back hip_runtime.h from
 GpuAbstraction.h to mgOnGpuConfig.h (needed for blockDim, blockIdx, threadIdx)

---
 epochX/cudacpp/gg_tt.sa/SubProcesses/GpuAbstraction.h | 2 --
 epochX/cudacpp/gg_tt.sa/src/mgOnGpuConfig.h           | 1 +
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/GpuAbstraction.h
index 6a7d9c05c0..9c467b1e04 100644
--- a/epochX/cudacpp/gg_tt.sa/SubProcesses/GpuAbstraction.h
+++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/GpuAbstraction.h
@@ -39,8 +39,6 @@

 #elif defined __HIPCC__

-#include "hip/hip_runtime.h"
-
 #define gpuError_t hipError_t
 #define gpuPeekAtLastError hipPeekAtLastError
 #define gpuGetErrorString hipGetErrorString
diff --git a/epochX/cudacpp/gg_tt.sa/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_tt.sa/src/mgOnGpuConfig.h
index da4ba36ad8..06787c1c5e 100644
--- a/epochX/cudacpp/gg_tt.sa/src/mgOnGpuConfig.h
+++ b/epochX/cudacpp/gg_tt.sa/src/mgOnGpuConfig.h
@@ -15,6 +15,7 @@
 #define MGONGPUCPP_GPUIMPL cuda
 #elif defined __HIPCC__
 #define MGONGPUCPP_GPUIMPL hip
+#include "hip/hip_runtime.h" // needed for blockDim, blockIdx, threadIdx: better in mgOnGpuConfig.h than in GpuAbstraction.h
 #else
 #undef MGONGPUCPP_GPUIMPL
 #endif

From 0b2060a30b623367bd4361594748cda6603b5157 Mon Sep 17 00:00:00 2001
From: Andrea Valassi
Date: Fri, 26 Jan 2024 00:58:30 +0200
Subject: [PATCH 43/96] [jt774] in gg_tt.sa cudacpp.mk, add -lstdc++fs when
 linking the gpu library and runTest.exe on HIP (otherwise again
 std::experimental::filesystem is not found)

Note: gg_tt.sa now builds successfully for C++ and HIP on LUMI
---
 epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk | 10 +++++++++-
 1 file
changed, 9 insertions(+), 1 deletion(-)

diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk
index 3fb94ce3bc..43453cf816 100644
--- a/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk
+++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk
@@ -651,8 +651,12 @@ ifneq ($(GPUCC),)
 $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(BUILDDIR)/fbridge_cu.o
 $(LIBDIR)/lib$(MG5AMC_CULIB).so: cu_objects_lib += $(BUILDDIR)/fbridge_cu.o
 $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cu_objects_lib)
+ifneq ($(findstring hipcc,$(GPUCC)),)
+	$(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -lstdc++fs
+else
	$(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB)
 endif
+endif

 #-------------------------------------------------------------------------------

@@ -779,6 +783,10 @@ $(testmain): LIBFLAGS += -lgomp
 endif
 endif

+ifneq ($(findstring hipcc,$(GPUCC)),)
+$(testmain): LIBFLAGS += -lstdc++fs
+endif
+
 ifeq ($(GPUCC),) # link only runTest.o
 $(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH
 $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS)
@@ -786,7 +794,7 @@ $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_object
 else # link both runTest.o and runTest_cu.o
 $(testmain): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH
 $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) $(GTESTLIBS)
-	$(GPUCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS) 
+	$(GPUCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS)
 endif

 # Use target gtestlibs to build only googletest

From cca2658f1df15fa6da2c6ae18cb44c04912d8fcb Mon Sep 17 00:00:00 2001
From: Andrea Valassi
Date: Fri, 26 Jan 2024 11:47:12 +0100
Subject: [PATCH 44/96] [jt774] rerun all 78 tput tests, all ok

STARTED AT Thu Jan 25 07:38:23 PM CET 2024
./tput/teeThroughputX.sh -mix -hrd -makej -eemumu -ggtt -ggttg -ggttgg -gqttq -ggttggg -makeclean
ENDED(1) AT Thu Jan 25 11:20:08 PM CET 2024 [Status=0]
./tput/teeThroughputX.sh -flt -hrd -makej -eemumu -ggtt -ggttgg -inlonly -makeclean
ENDED(2) AT Thu Jan 25 11:48:46 PM CET 2024 [Status=0]
./tput/teeThroughputX.sh -makej -eemumu -ggtt -ggttg -gqttq -ggttgg -ggttggg -flt -bridge -makeclean
ENDED(3) AT Thu Jan 25 11:58:40 PM CET 2024 [Status=0]
./tput/teeThroughputX.sh -eemumu -ggtt -ggttgg -flt -rmbhst
ENDED(4) AT Fri Jan 26 12:01:58 AM CET 2024 [Status=0]
./tput/teeThroughputX.sh -eemumu -ggtt -ggttgg -flt -curhst
ENDED(5) AT Fri Jan 26 12:05:15 AM CET 2024 [Status=0]
---
 .../log_eemumu_mad_d_inl0_hrd0.txt            | 120 ++++++-------
 .../log_eemumu_mad_d_inl0_hrd0_bridge.txt     | 120 ++++++-------
 .../log_eemumu_mad_d_inl0_hrd0_common.txt     | 120 ++++++-------
 .../log_eemumu_mad_d_inl0_hrd0_curhst.txt     | 120 ++++++-------
 .../log_eemumu_mad_d_inl0_hrd0_rmbhst.txt     | 120 ++++++-------
 .../log_eemumu_mad_d_inl0_hrd1.txt            | 108 ++++++------
 .../log_eemumu_mad_d_inl1_hrd0.txt            | 120 ++++++-------
 .../log_eemumu_mad_d_inl1_hrd1.txt            | 108 ++++++------
 .../log_eemumu_mad_f_inl0_hrd0.txt            | 116 ++++++-------
 .../log_eemumu_mad_f_inl0_hrd0_bridge.txt     | 116 ++++++-------
 .../log_eemumu_mad_f_inl0_hrd0_common.txt     | 116 ++++++-------
 .../log_eemumu_mad_f_inl0_hrd0_curhst.txt     | 116
From cca2658f1df15fa6da2c6ae18cb44c04912d8fcb Mon Sep 17 00:00:00 2001
From: Andrea Valassi
Date: Fri, 26 Jan 2024 11:47:12 +0100
Subject: [PATCH 44/96] [jt774] rerun all 78 tput tests, all ok

STARTED AT Thu Jan 25 07:38:23 PM CET 2024
./tput/teeThroughputX.sh -mix -hrd -makej -eemumu -ggtt -ggttg -ggttgg -gqttq -ggttggg -makeclean
ENDED(1) AT Thu Jan 25 11:20:08 PM CET 2024 [Status=0]
./tput/teeThroughputX.sh -flt -hrd -makej -eemumu -ggtt -ggttgg -inlonly -makeclean
ENDED(2) AT Thu Jan 25 11:48:46 PM CET 2024 [Status=0]
./tput/teeThroughputX.sh -makej -eemumu -ggtt -ggttg -gqttq -ggttgg -ggttggg -flt -bridge -makeclean
ENDED(3) AT Thu Jan 25 11:58:40 PM CET 2024 [Status=0]
./tput/teeThroughputX.sh -eemumu -ggtt -ggttgg -flt -rmbhst
ENDED(4) AT Fri Jan 26 12:01:58 AM CET 2024 [Status=0]
./tput/teeThroughputX.sh -eemumu -ggtt -ggttgg -flt -curhst
ENDED(5) AT Fri Jan 26 12:05:15 AM CET 2024 [Status=0]
---
 .../log_eemumu_mad_d_inl0_hrd0.txt | 120 ++++++-------
 .../log_eemumu_mad_d_inl0_hrd0_bridge.txt | 120 ++++++-------
 .../log_eemumu_mad_d_inl0_hrd0_common.txt | 120 ++++++-------
 .../log_eemumu_mad_d_inl0_hrd0_curhst.txt | 120 ++++++-------
 .../log_eemumu_mad_d_inl0_hrd0_rmbhst.txt | 120 ++++++-------
 .../log_eemumu_mad_d_inl0_hrd1.txt | 108 ++++++------
 .../log_eemumu_mad_d_inl1_hrd0.txt | 120 ++++++-------
 .../log_eemumu_mad_d_inl1_hrd1.txt | 108 ++++++------
 .../log_eemumu_mad_f_inl0_hrd0.txt | 116 ++++++-------
 .../log_eemumu_mad_f_inl0_hrd0_bridge.txt | 116 ++++++-------
 .../log_eemumu_mad_f_inl0_hrd0_common.txt | 116 ++++++-------
 .../log_eemumu_mad_f_inl0_hrd0_curhst.txt | 116 ++++++-------
 .../log_eemumu_mad_f_inl0_hrd0_rmbhst.txt | 116 ++++++-------
 .../log_eemumu_mad_f_inl0_hrd1.txt | 116 ++++++-------
 .../log_eemumu_mad_f_inl1_hrd0.txt | 128 +++++++-------
 .../log_eemumu_mad_f_inl1_hrd1.txt | 124 +++++++-------
 .../log_eemumu_mad_m_inl0_hrd0.txt | 108 ++++++------
 .../log_eemumu_mad_m_inl0_hrd1.txt | 108 ++++++------
 .../log_ggtt_mad_d_inl0_hrd0.txt | 112 ++++++------
 .../log_ggtt_mad_d_inl0_hrd0_bridge.txt | 112 ++++++------
 .../log_ggtt_mad_d_inl0_hrd0_common.txt | 112 ++++++------
 .../log_ggtt_mad_d_inl0_hrd0_curhst.txt | 112 ++++++------
 .../log_ggtt_mad_d_inl0_hrd0_rmbhst.txt | 112 ++++++------
 .../log_ggtt_mad_d_inl0_hrd1.txt | 108 ++++++------
 .../log_ggtt_mad_d_inl1_hrd0.txt | 116 ++++++-------
 .../log_ggtt_mad_d_inl1_hrd1.txt | 108 ++++++------
 .../log_ggtt_mad_f_inl0_hrd0.txt | 132 +++++++--------
 .../log_ggtt_mad_f_inl0_hrd0_bridge.txt | 132 +++++++--------
 .../log_ggtt_mad_f_inl0_hrd0_common.txt | 132 +++++++--------
 .../log_ggtt_mad_f_inl0_hrd0_curhst.txt | 132 +++++++--------
 .../log_ggtt_mad_f_inl0_hrd0_rmbhst.txt | 132 +++++++--------
 .../log_ggtt_mad_f_inl0_hrd1.txt | 132 +++++++--------
 .../log_ggtt_mad_f_inl1_hrd0.txt | 132 +++++++--------
 .../log_ggtt_mad_f_inl1_hrd1.txt | 132 +++++++--------
 .../log_ggtt_mad_m_inl0_hrd0.txt | 120 ++++++-------
 .../log_ggtt_mad_m_inl0_hrd1.txt | 120 ++++++-------
 .../log_ggttg_mad_d_inl0_hrd0.txt | 128 +++++++-------
 .../log_ggttg_mad_d_inl0_hrd0_bridge.txt | 128 +++++++-------
 .../log_ggttg_mad_d_inl0_hrd1.txt | 128 +++++++-------
 .../log_ggttg_mad_f_inl0_hrd0.txt | 148 ++++++++--------
 .../log_ggttg_mad_f_inl0_hrd0_bridge.txt | 148 ++++++++--------
 .../log_ggttg_mad_f_inl0_hrd1.txt | 148 ++++++++--------
 .../log_ggttg_mad_m_inl0_hrd0.txt | 128 +++++++-------
 .../log_ggttg_mad_m_inl0_hrd1.txt | 128 +++++++-------
 .../log_ggttgg_mad_d_inl0_hrd0.txt | 136 +++++++--------
 .../log_ggttgg_mad_d_inl0_hrd0_bridge.txt | 136 +++++++--------
 .../log_ggttgg_mad_d_inl0_hrd0_common.txt | 136 +++++++--------
 .../log_ggttgg_mad_d_inl0_hrd0_curhst.txt | 136 +++++++--------
 .../log_ggttgg_mad_d_inl0_hrd0_rmbhst.txt | 136 +++++++--------
 .../log_ggttgg_mad_d_inl0_hrd1.txt | 136 +++++++--------
 .../log_ggttgg_mad_d_inl1_hrd0.txt | 140 +++++++--------
 .../log_ggttgg_mad_d_inl1_hrd1.txt | 140 +++++++--------
 .../log_ggttgg_mad_f_inl0_hrd0.txt | 152 ++++++++---------
 .../log_ggttgg_mad_f_inl0_hrd0_bridge.txt | 152 ++++++++---------
 .../log_ggttgg_mad_f_inl0_hrd0_common.txt | 160 +++++++++---------
 .../log_ggttgg_mad_f_inl0_hrd0_curhst.txt | 152 ++++++++---------
 .../log_ggttgg_mad_f_inl0_hrd0_rmbhst.txt | 152 ++++++++---------
 .../log_ggttgg_mad_f_inl0_hrd1.txt | 150 ++++++++--------
 .../log_ggttgg_mad_f_inl1_hrd0.txt | 156 ++++++++---------
 .../log_ggttgg_mad_f_inl1_hrd1.txt | 156 ++++++++---------
 .../log_ggttgg_mad_m_inl0_hrd0.txt | 136 +++++++--------
 .../log_ggttgg_mad_m_inl0_hrd1.txt | 136 +++++++--------
 .../log_ggttggg_mad_d_inl0_hrd0.txt | 124 +++++++-------
 .../log_ggttggg_mad_d_inl0_hrd0_bridge.txt | 124 +++++++-------
 .../log_ggttggg_mad_d_inl0_hrd1.txt | 124 +++++++-------
 .../log_ggttggg_mad_f_inl0_hrd0.txt | 154 ++++++++---------
 .../log_ggttggg_mad_f_inl0_hrd0_bridge.txt | 154 ++++++++---------
 .../log_ggttggg_mad_f_inl0_hrd1.txt | 154 ++++++++---------
 .../log_ggttggg_mad_m_inl0_hrd0.txt | 124 +++++++-------
 .../log_ggttggg_mad_m_inl0_hrd1.txt | 124 +++++++-------
 .../log_gqttq_mad_d_inl0_hrd0.txt | 132 +++++++--------
 .../log_gqttq_mad_d_inl0_hrd0_bridge.txt | 132 +++++++--------
.../log_gqttq_mad_d_inl0_hrd1.txt | 132 +++++++-------- .../log_gqttq_mad_f_inl0_hrd0.txt | 146 ++++++++-------- .../log_gqttq_mad_f_inl0_hrd0_bridge.txt | 146 ++++++++-------- .../log_gqttq_mad_f_inl0_hrd1.txt | 146 ++++++++-------- .../log_gqttq_mad_m_inl0_hrd0.txt | 124 +++++++------- .../log_gqttq_mad_m_inl0_hrd1.txt | 124 +++++++------- 78 files changed, 5067 insertions(+), 5067 deletions(-) diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt index 352d1c6fba..927b1eabba 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt @@ -41,23 +41,23 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2023-11-24_14:27:52 +DATE: 2024-01-25_23:00:38 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.424219e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.270852e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.136825e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.493535e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.336114e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.321735e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 0.828494 sec - 2,799,509,879 cycles # 2.917 GHz - 4,362,735,667 instructions # 1.56 insn per cycle - 1.170786520 seconds time elapsed +TOTAL : 0.844492 sec + 2,822,572,603 cycles # 3.011 GHz + 4,441,343,230 instructions # 1.57 insn per cycle + 1.182647833 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 166 @@ -72,20 +72,20 @@ OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.137542e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.334591e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.334591e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.030733e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.195877e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.195877e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 5.926738 sec - 18,326,467,169 cycles # 3.090 GHz - 43,971,697,990 instructions # 2.40 insn per cycle - 5.940094102 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 433) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.506266 sec + 19,500,432,361 cycles # 2.995 GHz + 46,932,851,007 instructions # 2.41 insn per cycle + 6.522692471 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 472) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -99,20 +99,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.696452e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.222211e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.222211e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.673180e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.188384e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.188384e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.102636 sec - 12,781,292,854 cycles # 3.112 GHz - 30,998,546,187 instructions # 2.43 insn per cycle - 4.125957125 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1644) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 4.150845 sec + 12,810,023,668 cycles # 3.082 GHz + 31,183,348,105 instructions # 2.43 insn per cycle + 4.171384273 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 1626) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
@@ -126,20 +126,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.105968e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.947022e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.947022e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.081752e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.922153e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.922153e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.375754 sec - 10,081,613,505 cycles # 2.982 GHz - 19,365,068,208 instructions # 1.92 insn per cycle - 3.393379455 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1966) (512y: 0) (512z: 0) +TOTAL : 3.412467 sec + 10,047,988,446 cycles # 2.940 GHz + 19,479,896,421 instructions # 1.94 insn per cycle + 3.432947741 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1964) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -147,26 +147,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039868165088E-002 -Relative difference = 1.0277089312025782e-08 +Avg ME (F77/C++) = 1.2828039868165090E-002 +Relative difference = 1.0277089176796747e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.197053e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.113568e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.113568e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.191230e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.159424e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.159424e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.253132 sec - 9,700,425,131 cycles # 2.977 GHz - 18,987,900,885 instructions # 1.96 insn per cycle - 3.272883868 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1690) (512y: 181) (512z: 0) +TOTAL : 3.265089 sec + 9,604,174,484 cycles # 2.937 GHz + 18,943,995,091 instructions # 1.97 insn per cycle + 3.287100859 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1655) (512y: 161) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -174,26 +174,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039868165088E-002 -Relative difference = 1.0277089312025782e-08 +Avg ME (F77/C++) = 1.2828039868165090E-002 +Relative difference = 1.0277089176796747e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.888824e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.524315e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.524315e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.012524e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.758193e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.758193e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.721051 sec - 8,612,012,158 cycles # 2.311 GHz - 15,727,858,115 instructions # 1.83 insn per cycle - 3.740799653 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 901) (512y: 154) (512z: 1258) +TOTAL : 3.517798 sec + 8,151,867,149 cycles # 2.314 GHz + 15,511,439,391 instructions # 1.90 insn per cycle + 3.537817962 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 920) (512y: 59) (512z: 1220) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -201,8 +201,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039868165088E-002 -Relative difference = 1.0277089312025782e-08 +Avg ME (F77/C++) = 1.2828039868165090E-002 +Relative difference = 1.0277089176796747e-08 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_bridge.txt index c9fd7402fe..dc73944f81 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_bridge.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2023-11-24_15:09:46 +DATE: 2024-01-25_23:52:16 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -51,17 +51,17 @@ WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.563802e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.504997e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.504997e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.672962e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.549889e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.549889e+07 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 2.276846 sec - 7,462,184,879 cycles # 2.960 GHz - 13,240,340,961 instructions # 1.77 insn per cycle - 2.578354941 seconds time elapsed +TOTAL : 2.223048 sec + 7,523,634,942 cycles # 3.040 GHz + 13,311,720,636 instructions # 1.77 insn per cycle + 2.532044795 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost @@ -81,20 +81,20 @@ OK (relative difference <= 5E-3) runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.060089e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.241158e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.241158e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.010764e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.168597e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.168597e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 6.541091 sec - 19,576,478,878 cycles # 2.990 GHz - 44,199,334,845 instructions # 2.26 insn per cycle - 6.547828573 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 433) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.821589 sec + 20,757,853,710 cycles # 3.040 GHz + 47,159,545,853 instructions # 2.27 insn per cycle + 6.829251008 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 472) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -109,20 +109,20 @@ OK (relative difference <= 5E-3) runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.520389e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.958155e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.958155e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.583280e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.036037e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.036037e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.763580 sec - 13,999,932,501 cycles # 2.936 GHz - 31,842,828,089 instructions # 2.27 insn per cycle - 4.771019993 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1644) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 4.566197 sec + 14,096,396,378 cycles # 3.083 GHz + 32,025,240,520 instructions # 2.27 insn per cycle + 4.573676254 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 1626) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -137,20 +137,20 @@ OK (relative difference <= 5E-3) runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.885630e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.569986e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.569986e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.927660e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.635525e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.635525e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.963354 sec - 11,320,178,526 cycles # 2.852 GHz - 20,728,383,013 instructions # 1.83 insn per cycle - 3.970490035 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1966) (512y: 0) (512z: 0) +TOTAL : 3.882075 sec + 11,344,329,644 cycles # 2.918 GHz + 20,845,338,644 instructions # 1.84 insn per cycle + 3.889536582 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1964) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
@@ -158,27 +158,27 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039868165088E-002 -Relative difference = 1.0277089312025782e-08 +Avg ME (F77/C++) = 1.2828039868165090E-002 +Relative difference = 1.0277089176796747e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.971664e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.715052e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.715052e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.057635e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.879040e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.879040e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.811146 sec - 10,942,360,524 cycles # 2.867 GHz - 20,336,264,592 instructions # 1.86 insn per cycle - 3.818008523 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1690) (512y: 181) (512z: 0) +TOTAL : 3.662402 sec + 10,895,943,390 cycles # 2.970 GHz + 20,302,146,711 instructions # 1.86 insn per cycle + 3.669985539 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1655) (512y: 161) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -186,27 +186,27 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039868165088E-002 -Relative difference = 1.0277089312025782e-08 +Avg ME (F77/C++) = 1.2828039868165090E-002 +Relative difference = 1.0277089176796747e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.699997e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.217210e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.217210e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.870482e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.510766e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.510766e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.325158 sec - 9,912,314,209 cycles # 2.289 GHz - 16,872,526,974 instructions # 1.70 insn per cycle - 4.332090544 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 901) (512y: 154) (512z: 1258) +TOTAL : 3.977984 sec + 9,525,707,007 cycles # 2.392 GHz + 16,663,195,108 instructions # 1.75 insn per cycle + 3.985464256 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 920) (512y: 59) (512z: 1220) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -214,8 +214,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039868165088E-002 -Relative difference = 1.0277089312025782e-08 +Avg ME (F77/C++) = 1.2828039868165090E-002 +Relative difference = 1.0277089176796747e-08 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_common.txt index 4e10a04e19..c0050262b5 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_common.txt @@ -41,23 +41,23 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2023-11-24_15:23:09 +DATE: 2024-01-26_00:05:33 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 12 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.494137e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.568219e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.090512e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.496580e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.589305e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.145863e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 1.328549 sec - 4,643,802,666 cycles # 2.984 GHz - 7,163,037,263 instructions # 1.54 insn per cycle - 1.615810184 seconds time elapsed +TOTAL : 1.312001 sec + 4,717,548,500 cycles # 3.046 GHz + 7,398,000,842 instructions # 1.57 insn per cycle + 1.605915075 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --common WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 166 @@ -72,20 +72,20 @@ OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.098849e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.289504e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.289504e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.045667e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.214537e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.214537e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 6.492604 sec - 19,420,681,078 cycles # 2.994 GHz - 44,081,202,964 instructions # 2.27 insn per cycle - 6.497991580 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 433) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.771874 sec + 20,598,272,389 cycles # 3.041 GHz + 47,038,103,422 instructions # 2.28 insn per cycle + 6.778069785 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 472) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -99,20 +99,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.642683e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.149460e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.149460e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.635596e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.140656e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.140656e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 4.592487 sec - 13,879,570,492 cycles # 3.021 GHz - 31,002,445,042 instructions # 2.23 insn per cycle - 4.597888713 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1644) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 4.598785 sec + 13,886,226,766 cycles # 3.016 GHz + 31,185,926,736 instructions # 2.25 insn per cycle + 4.605167621 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 1626) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -126,20 +126,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.041509e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.847095e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.847095e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.098367e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.936328e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.936328e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 3.835804 sec - 11,175,680,204 cycles # 2.910 GHz - 19,267,089,581 instructions # 1.72 insn per cycle - 3.841361823 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1966) (512y: 0) (512z: 0) +TOTAL : 3.742666 sec + 11,104,986,234 cycles # 2.963 GHz + 19,380,316,908 instructions # 1.75 insn per cycle + 3.748889817 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1964) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
@@ -147,26 +147,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039868165088E-002 -Relative difference = 1.0277089312025782e-08 +Avg ME (F77/C++) = 1.2828039868165090E-002 +Relative difference = 1.0277089176796747e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.097943e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.989766e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.989766e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.185227e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.148360e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.148360e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 3.768859 sec - 10,910,527,024 cycles # 2.891 GHz - 18,677,308,293 instructions # 1.71 insn per cycle - 3.774878679 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1690) (512y: 181) (512z: 0) +TOTAL : 3.636571 sec + 10,725,486,467 cycles # 2.946 GHz + 18,644,027,543 instructions # 1.74 insn per cycle + 3.642813824 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1655) (512y: 161) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -174,26 +174,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039868165088E-002 -Relative difference = 1.0277089312025782e-08 +Avg ME (F77/C++) = 1.2828039868165090E-002 +Relative difference = 1.0277089176796747e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.768127e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.357872e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.357872e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.992308e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.739374e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.739374e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 4.339198 sec - 9,800,415,487 cycles # 2.256 GHz - 15,427,741,546 instructions # 1.57 insn per cycle - 4.345173089 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 901) (512y: 154) (512z: 1258) +TOTAL : 3.918160 sec + 9,291,779,840 cycles # 2.370 GHz + 15,211,442,748 instructions # 1.64 insn per cycle + 3.924034280 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 920) (512y: 59) (512z: 1220) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -201,8 +201,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039868165088E-002 -Relative difference = 1.0277089312025782e-08 +Avg ME (F77/C++) = 1.2828039868165090E-002 +Relative difference = 1.0277089176796747e-08 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_curhst.txt index 6f403f72b4..c769e281e3 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_curhst.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_curhst.txt @@ -41,23 +41,23 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2023-11-24_15:19:52 +DATE: 2024-01-26_00:02:16 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 12 --curhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.511914e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.599963e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.148106e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.512464e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.620923e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.182081e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 0.984391 sec - 3,583,458,511 cycles # 2.950 GHz - 7,069,877,265 instructions # 1.97 insn per cycle - 1.271392022 seconds time elapsed +TOTAL : 0.962741 sec + 3,604,601,594 cycles # 3.032 GHz + 7,140,731,707 instructions # 1.98 insn per cycle + 1.248405434 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --curhst WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 166 @@ -72,20 +72,20 @@ OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.096082e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.286319e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.286319e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.059764e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.231700e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.231700e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 6.151369 sec - 18,323,138,260 cycles # 2.977 GHz - 43,972,152,337 instructions # 2.40 insn per cycle - 6.156982875 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 433) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.328275 sec + 19,517,485,230 cycles # 3.082 GHz + 46,932,064,028 instructions # 2.40 insn per cycle + 6.334453671 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 472) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -99,20 +99,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.603629e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.100551e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.100551e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.633059e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.135879e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.135879e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.336645 sec - 12,767,737,304 cycles # 2.941 GHz - 30,998,256,840 instructions # 2.43 insn per cycle - 4.342127866 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1644) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 4.247361 sec + 12,856,431,984 cycles # 3.029 GHz + 31,187,091,315 instructions # 2.43 insn per cycle + 4.253402484 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 1626) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -126,20 +126,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.019068e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.818538e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.818538e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.102059e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.944006e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.944006e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.519784 sec - 10,072,637,753 cycles # 2.858 GHz - 19,364,816,632 instructions # 1.92 insn per cycle - 3.525396454 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1966) (512y: 0) (512z: 0) +TOTAL : 3.383211 sec + 10,030,209,499 cycles # 2.961 GHz + 19,480,283,878 instructions # 1.94 insn per cycle + 3.389400574 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1964) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
@@ -147,26 +147,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039868165088E-002 -Relative difference = 1.0277089312025782e-08 +Avg ME (F77/C++) = 1.2828039868165090E-002 +Relative difference = 1.0277089176796747e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.119317e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.001533e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.001533e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.210848e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.176162e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.176162e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.367447 sec - 9,690,886,611 cycles # 2.874 GHz - 18,986,470,372 instructions # 1.96 insn per cycle - 3.372983732 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1690) (512y: 181) (512z: 0) +TOTAL : 3.235982 sec + 9,574,277,596 cycles # 2.955 GHz + 18,943,319,025 instructions # 1.98 insn per cycle + 3.242098938 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1655) (512y: 161) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -174,26 +174,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039868165088E-002 -Relative difference = 1.0277089312025782e-08 +Avg ME (F77/C++) = 1.2828039868165090E-002 +Relative difference = 1.0277089176796747e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.792369e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.378971e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.378971e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.933692e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.640454e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.640454e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.914886 sec - 8,594,430,788 cycles # 2.193 GHz - 15,726,409,723 instructions # 1.83 insn per cycle - 3.920405262 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 901) (512y: 154) (512z: 1258) +TOTAL : 3.654789 sec + 8,148,850,269 cycles # 2.227 GHz + 15,511,167,758 instructions # 1.90 insn per cycle + 3.660729879 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 920) (512y: 59) (512z: 1220) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -201,8 +201,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039868165088E-002 -Relative difference = 1.0277089312025782e-08 +Avg ME (F77/C++) = 1.2828039868165090E-002 +Relative difference = 1.0277089176796747e-08 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_rmbhst.txt index d1ecb99501..442e741920 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_rmbhst.txt @@ -41,24 +41,24 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2023-11-24_15:16:32 +DATE: 2024-01-25_23:58:58 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 12 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.945904e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.504449e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.028089e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.003420e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.530385e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.003006e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 1.911118 sec - 6,389,730,073 cycles # 2.981 GHz - 11,525,444,460 instructions # 1.80 insn per cycle - 2.200099911 seconds time elapsed +TOTAL : 1.888085 sec + 6,237,367,974 cycles # 2.946 GHz + 11,479,672,260 instructions # 1.84 insn per cycle + 2.175346114 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --rmbhst WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost @@ -74,20 +74,20 @@ OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.103959e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.297288e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.297288e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.067013e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.239536e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.239536e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 6.107673 sec - 18,355,628,587 cycles # 3.004 GHz - 43,972,892,726 instructions # 2.40 insn per cycle - 6.113444450 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 433) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.287610 sec + 19,515,724,381 cycles # 3.102 GHz + 46,934,256,653 instructions # 2.40 insn per cycle + 6.293745010 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 472) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -101,20 +101,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.634128e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.138204e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.138204e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.667300e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.175747e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.175747e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.254868 sec - 12,786,896,134 cycles # 3.002 GHz - 30,998,062,799 instructions # 2.42 insn per cycle - 4.260594817 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1644) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 4.160869 sec + 12,818,829,014 cycles # 3.077 GHz + 31,182,932,996 instructions # 2.43 insn per cycle + 4.167214283 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 1626) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -128,20 +128,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.027528e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.835087e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.835087e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.103000e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.940953e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.940953e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.504925 sec - 10,096,961,114 cycles # 2.877 GHz - 19,366,201,509 instructions # 1.92 insn per cycle - 3.510626038 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1966) (512y: 0) (512z: 0) +TOTAL : 3.381858 sec + 10,006,830,004 cycles # 2.955 GHz + 19,478,708,944 instructions # 1.95 insn per cycle + 3.387652439 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1964) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
@@ -149,26 +149,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039868165088E-002 -Relative difference = 1.0277089312025782e-08 +Avg ME (F77/C++) = 1.2828039868165090E-002 +Relative difference = 1.0277089176796747e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.053655e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.901972e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.901972e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.228776e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.203992e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.203992e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.470906 sec - 9,748,601,702 cycles # 2.805 GHz - 18,987,159,627 instructions # 1.95 insn per cycle - 3.476589218 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1690) (512y: 181) (512z: 0) +TOTAL : 3.213881 sec + 9,515,842,256 cycles # 2.956 GHz + 18,941,625,353 instructions # 1.99 insn per cycle + 3.220095449 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1655) (512y: 161) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -176,26 +176,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039868165088E-002 -Relative difference = 1.0277089312025782e-08 +Avg ME (F77/C++) = 1.2828039868165090E-002 +Relative difference = 1.0277089176796747e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.791558e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.378176e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.378176e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.018322e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.773276e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.773276e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.917961 sec - 8,615,779,869 cycles # 2.196 GHz - 15,727,012,410 instructions # 1.83 insn per cycle - 3.924530110 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 901) (512y: 154) (512z: 1258) +TOTAL : 3.510174 sec + 8,143,935,761 cycles # 2.317 GHz + 15,510,805,308 instructions # 1.90 insn per cycle + 3.516440454 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 920) (512y: 59) (512z: 1220) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -203,8 +203,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039868165088E-002 -Relative difference = 1.0277089312025782e-08 +Avg ME (F77/C++) = 1.2828039868165090E-002 +Relative difference = 1.0277089176796747e-08 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd1.txt index 0c748b5362..7e362de6ad 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd1.txt @@ -41,23 +41,23 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2023-11-24_14:28:26 +DATE: 2024-01-25_23:01:13 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/gcheck.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.428371e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.291622e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.195095e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.479753e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.302759e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.180666e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 0.698284 sec - 2,831,920,965 cycles # 3.030 GHz - 4,374,282,745 instructions # 1.54 insn per cycle - 1.019007238 seconds time elapsed +TOTAL : 0.692528 sec + 2,765,261,426 cycles # 2.975 GHz + 4,347,441,638 instructions # 1.57 insn per cycle + 1.009702955 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/gcheck.exe -p 2048 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 154 @@ -72,20 +72,20 @@ OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.203126e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.425801e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.425801e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.112704e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.304972e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.304972e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 5.626130 sec - 17,496,349,043 cycles # 3.107 GHz - 41,817,978,562 instructions # 2.39 insn per cycle - 5.639060633 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 392) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.053723 sec + 18,438,643,690 cycles # 3.044 GHz + 44,718,807,490 instructions # 2.43 insn per cycle + 6.067208412 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 486) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -99,20 +99,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd1/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.748150e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.309667e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.309667e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.732933e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.291043e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.291043e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.992952 sec - 12,464,968,903 cycles # 3.118 GHz - 30,161,347,268 instructions # 2.42 insn per cycle - 4.011343884 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1612) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 4.019211 sec + 12,403,068,676 cycles # 3.082 GHz + 30,106,222,055 instructions # 2.43 insn per cycle + 4.041294858 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 1569) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -126,20 +126,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd1/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.129918e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.998050e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.998050e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.073294e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.898621e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.898621e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.345656 sec - 9,985,760,194 cycles # 2.981 GHz - 19,098,128,141 instructions # 1.91 insn per cycle - 3.367255257 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1931) (512y: 0) (512z: 0) +TOTAL : 3.428696 sec + 10,138,654,204 cycles # 2.954 GHz + 19,114,972,544 instructions # 1.89 insn per cycle + 3.446242769 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1902) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
@@ -153,20 +153,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd1/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.214083e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.146132e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.146132e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.266572e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.279723e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.279723e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.227419 sec - 9,634,388,874 cycles # 2.980 GHz - 18,745,812,999 instructions # 1.95 insn per cycle - 3.242688774 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1662) (512y: 178) (512z: 0) +TOTAL : 3.167496 sec + 9,403,210,390 cycles # 2.964 GHz + 18,489,357,974 instructions # 1.97 insn per cycle + 3.185727133 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1576) (512y: 159) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -180,20 +180,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd1/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.942471e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.616599e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.616599e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.411079e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.590354e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.590354e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.630500 sec - 8,446,217,730 cycles # 2.323 GHz - 15,603,353,222 instructions # 1.85 insn per cycle - 3.648277451 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 887) (512y: 156) (512z: 1239) +TOTAL : 3.003146 sec + 7,196,870,284 cycles # 2.392 GHz + 13,864,300,606 instructions # 1.93 insn per cycle + 3.027427459 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 818) (512y: 57) (512z: 898) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd0.txt index 03ec5636b7..db90974050 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd0.txt @@ -41,23 +41,23 @@ CUDACPP_BUILDDIR='build.512z_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2023-11-24_14:58:42 +DATE: 2024-01-25_23:41:18 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/gcheck.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.462140e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.622718e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.171485e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.485574e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.629060e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.159335e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 0.680938 sec - 2,683,046,213 cycles # 2.937 GHz - 4,147,768,853 instructions # 1.55 insn per cycle - 0.974174885 seconds time elapsed +TOTAL : 0.668284 sec + 2,705,295,792 cycles # 3.008 GHz + 4,249,177,557 instructions # 1.57 insn per cycle + 0.961970052 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/gcheck.exe -p 2048 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 166 @@ -72,20 +72,20 @@ OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.640951e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.107305e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.107305e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.450369e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.787521e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.787521e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.242929 sec - 12,671,992,752 cycles # 2.983 GHz - 32,514,493,053 instructions # 2.57 insn per cycle - 4.248758803 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 296) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 4.729727 sec + 14,595,480,674 cycles # 3.083 GHz + 36,696,587,452 instructions # 2.51 insn per cycle + 4.736310903 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 707) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -99,20 +99,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd0/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.096760e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.006731e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.006731e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.113447e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.010835e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.010835e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.405789 sec - 10,268,358,622 cycles # 3.010 GHz - 24,473,723,768 instructions # 2.38 insn per cycle - 3.411753584 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1251) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.365946 sec + 10,359,930,578 cycles # 3.073 GHz + 24,752,972,702 instructions # 2.39 insn per cycle + 3.372469625 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 2334) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -126,20 +126,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.257873e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.333676e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.333676e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.409493e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.590679e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.590679e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.192822 sec - 9,136,252,150 cycles # 2.858 GHz - 16,922,200,539 instructions # 1.85 insn per cycle - 3.198278524 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1631) (512y: 0) (512z: 0) +TOTAL : 3.003707 sec + 8,879,004,825 cycles # 2.951 GHz + 16,955,050,761 instructions # 1.91 insn per cycle + 3.010173521 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1604) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. 
@@ -147,26 +147,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039868165088E-002 -Relative difference = 1.0277089312025782e-08 +Avg ME (F77/C++) = 1.2828039868165090E-002 +Relative difference = 1.0277089176796747e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd0/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.262662e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.358756e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.358756e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.598956e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.029022e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.029022e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.189189 sec - 8,979,407,762 cycles # 2.811 GHz - 16,334,341,152 instructions # 1.82 insn per cycle - 3.195303169 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1370) (512y: 139) (512z: 0) +TOTAL : 2.810771 sec + 8,370,695,203 cycles # 2.973 GHz + 16,298,281,090 instructions # 1.95 insn per cycle + 2.817168029 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2403) (512y: 292) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -174,26 +174,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039868165088E-002 -Relative difference = 1.0277089312025782e-08 +Avg ME (F77/C++) = 1.2828039868165090E-002 +Relative difference = 1.0277089176796747e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd0/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.031223e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.819033e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.819033e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.113771e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.134014e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.134014e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.500399 sec - 7,927,835,353 cycles # 2.263 GHz - 14,582,334,434 instructions # 1.84 insn per cycle - 3.505946160 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1015) (512y: 158) (512z: 955) +TOTAL : 3.368912 sec + 8,032,946,984 cycles # 2.381 GHz + 14,352,398,094 instructions # 1.79 insn per cycle + 3.375198940 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 892) (512y: 63) (512z: 975) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -201,8 +201,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039868165088E-002 -Relative difference = 1.0277089312025782e-08 +Avg ME (F77/C++) = 1.2828039868165090E-002 +Relative difference = 1.0277089176796747e-08 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd1.txt index ed59ed96d8..730795b745 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd1.txt @@ -41,23 +41,23 @@ CUDACPP_BUILDDIR='build.512z_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2023-11-24_14:59:12 +DATE: 2024-01-25_23:41:48 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/gcheck.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.466236e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.600756e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.174175e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.486952e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.636507e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.196895e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 0.677095 sec - 2,664,495,162 cycles # 2.925 GHz - 4,015,268,500 instructions # 1.51 insn per cycle - 0.968065572 seconds time elapsed +TOTAL : 0.663966 sec + 2,717,326,037 cycles # 3.017 GHz + 4,137,165,948 instructions # 1.52 insn per cycle + 0.963612691 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/gcheck.exe -p 2048 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 154 @@ -72,20 +72,20 @@ OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.188114e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.096436e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.096436e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.037231e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.765883e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.765883e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.282515 sec - 9,850,281,434 cycles # 2.996 GHz - 25,394,191,492 instructions # 2.58 insn per cycle - 3.288415370 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 249) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.484428 sec + 10,787,558,593 cycles # 3.093 GHz + 28,354,504,883 instructions # 2.63 insn per cycle + 3.490670595 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 600) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -99,20 +99,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd1/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.427078e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.740655e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.740655e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.402256e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.621682e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.621682e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.000969 sec - 8,962,872,354 cycles # 2.983 GHz - 21,484,742,690 instructions # 2.40 insn per cycle - 3.006873984 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1119) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.008147 sec + 9,259,698,743 cycles # 3.075 GHz + 21,586,269,761 instructions # 2.33 insn per cycle + 3.014484526 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 2117) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -126,20 +126,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd1/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.437736e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.701040e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.701040e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.585263e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.966687e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.966687e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 2.982872 sec - 8,650,300,725 cycles # 2.896 GHz - 15,811,446,227 instructions # 1.83 insn per cycle - 2.988774005 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1494) (512y: 0) (512z: 0) +TOTAL : 2.822869 sec + 8,394,492,587 cycles # 2.968 GHz + 15,943,662,560 instructions # 1.90 insn per cycle + 2.829238696 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1497) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. 
@@ -153,20 +153,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd1/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.506471e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.822748e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.822748e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.835468e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.573667e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.573667e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 2.907315 sec - 8,443,660,993 cycles # 2.899 GHz - 15,514,495,232 instructions # 1.84 insn per cycle - 2.913215721 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1268) (512y: 139) (512z: 0) +TOTAL : 2.606917 sec + 7,795,616,577 cycles # 2.984 GHz + 15,369,554,137 instructions # 1.97 insn per cycle + 2.613116887 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2179) (512y: 307) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -180,20 +180,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd1/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.094032e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.959241e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.959241e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.251100e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.274564e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.274564e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.415670 sec - 7,621,535,351 cycles # 2.228 GHz - 14,285,430,897 instructions # 1.87 insn per cycle - 3.421826468 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1041) (512y: 164) (512z: 874) +TOTAL : 3.195437 sec + 7,390,615,932 cycles # 2.316 GHz + 13,884,358,996 instructions # 1.88 insn per cycle + 3.201845767 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 853) (512y: 69) (512z: 905) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt index 5eeb6e403e..d2c9307113 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt @@ -41,23 +41,23 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2023-11-24_14:28:58 +DATE: 2024-01-25_23:01:46 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.089717e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.081644e+09 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.286206e+09 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.094789e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.088334e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.276470e+09 ) sec^-1 MeanMatrixElemValue = ( 1.371687e-02 +- 3.270220e-06 ) GeV^0 -TOTAL : 0.586552 sec - 2,420,869,304 cycles # 2.991 GHz - 3,828,073,747 instructions # 1.58 insn per cycle - 0.888297566 seconds time elapsed +TOTAL : 0.578518 sec + 2,407,118,979 cycles # 2.997 GHz + 3,725,510,179 instructions # 1.55 insn per cycle + 0.886415117 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 117 @@ -72,20 +72,20 @@ OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.174500e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.392497e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.392497e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.111589e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.308134e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.308134e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 5.713477 sec - 17,800,790,292 cycles # 3.113 GHz - 43,512,722,868 instructions # 2.44 insn per cycle - 5.725545534 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 431) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.012216 sec + 18,550,637,771 cycles # 3.083 GHz + 47,045,980,285 instructions # 2.54 insn per cycle + 6.027733550 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 542) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
@@ -93,26 +93,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039854866802E-002 -Relative difference = 1.1313746984080878e-08 +Avg ME (F77/C++) = 1.2828039441956207E-002 +Relative difference = 4.35018750695023e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.407559e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.694429e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.694429e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.328667e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.560006e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.560006e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 2.969731 sec - 9,247,230,421 cycles # 3.108 GHz - 21,907,456,003 instructions # 2.37 insn per cycle - 2.986090115 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1938) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.058903 sec + 9,248,498,925 cycles # 3.019 GHz + 22,092,783,027 instructions # 2.39 insn per cycle + 3.077764087 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 1883) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -126,20 +126,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.603940e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.990614e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.990614e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.639087e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.085268e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.085268e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.767987 sec - 8,299,418,318 cycles # 2.993 GHz - 15,591,106,391 instructions # 1.88 insn per cycle - 2.786468252 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2596) (512y: 0) (512z: 0) +TOTAL : 2.736763 sec + 8,158,081,223 cycles # 2.975 GHz + 15,625,425,474 instructions # 1.92 insn per cycle + 2.755087289 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2619) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -153,20 +153,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.627112e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.068065e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.068065e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.770493e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.424122e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.424122e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.741024 sec - 8,236,041,235 cycles # 2.999 GHz - 15,428,889,211 instructions # 1.87 insn per cycle - 2.759001089 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2496) (512y: 9) (512z: 0) +TOTAL : 2.620250 sec + 7,845,741,820 cycles # 2.988 GHz + 15,296,492,976 instructions # 1.95 insn per cycle + 2.642414775 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2414) (512y: 13) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
@@ -180,20 +180,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.613778e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.027225e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.027225e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.780455e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.411688e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.411688e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 -TOTAL : 2.766843 sec - 6,634,803,505 cycles # 2.394 GHz - 12,864,482,335 instructions # 1.94 insn per cycle - 2.786287293 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1736) (512y: 17) (512z: 1439) +TOTAL : 2.617695 sec + 6,398,515,491 cycles # 2.441 GHz + 12,623,800,106 instructions # 1.97 insn per cycle + 2.637090511 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1615) (512y: 12) (512z: 1404) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -201,8 +201,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828052585973637E-002 -Relative difference = 2.0158743040564767e-07 +Avg ME (F77/C++) = 1.2828052589611616E-002 +Relative difference = 2.0187102602673518e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_bridge.txt index 77f863e39f..311dfe7d07 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_bridge.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2023-11-24_15:10:24 +DATE: 2024-01-25_23:52:54 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -51,17 +51,17 @@ WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! 
Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.062157e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.459463e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.459463e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.271050e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.518439e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.518439e+07 ) sec^-1 MeanMatrixElemValue = ( 1.371710e-02 +- 3.270389e-06 ) GeV^0 -TOTAL : 1.708612 sec - 5,739,082,619 cycles # 2.966 GHz - 10,314,110,992 instructions # 1.80 insn per cycle - 1.994013421 seconds time elapsed +TOTAL : 1.663435 sec + 5,761,082,129 cycles # 3.046 GHz + 10,284,537,473 instructions # 1.79 insn per cycle + 1.948147743 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost @@ -81,20 +81,20 @@ OK (relative difference <= 5E-3) runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.106417e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.307176e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.307176e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.077035e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.262740e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.262740e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 6.159436 sec - 18,484,427,111 cycles # 2.999 GHz - 43,663,224,156 instructions # 2.36 insn per cycle - 6.165948155 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 431) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.304919 sec + 19,230,718,710 cycles # 3.049 GHz + 47,196,758,833 instructions # 2.45 insn per cycle + 6.311821481 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 542) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
@@ -102,27 +102,27 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039854866802E-002 -Relative difference = 1.1313746984080878e-08 +Avg ME (F77/C++) = 1.2828039441956207E-002 +Relative difference = 4.35018750695023e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.217277e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.320268e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.320268e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.274745e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.400943e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.400943e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 3.327491 sec - 9,999,841,208 cycles # 3.000 GHz - 23,242,551,966 instructions # 2.32 insn per cycle - 3.334092044 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1938) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.240921 sec + 10,027,883,999 cycles # 3.090 GHz + 23,431,081,239 instructions # 2.34 insn per cycle + 3.247768707 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 1883) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -137,20 +137,20 @@ OK (relative difference <= 5E-3) runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.386544e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.570204e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.570204e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.506974e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.804251e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.804251e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 3.123723 sec - 9,026,914,064 cycles # 2.885 GHz - 16,711,011,073 instructions # 1.85 insn per cycle - 3.130206343 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2596) (512y: 0) (512z: 0) +TOTAL : 2.990495 sec + 8,913,858,036 cycles # 2.975 GHz + 16,750,811,061 instructions # 1.88 insn per cycle + 2.997326374 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2619) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -165,20 +165,20 @@ OK (relative difference <= 5E-3) runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.374379e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.579230e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.579230e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.600687e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.056945e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.056945e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 3.141266 sec - 8,951,033,856 cycles # 2.844 GHz - 16,554,473,732 instructions # 1.85 insn per cycle - 3.147725016 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2496) (512y: 9) (512z: 0) +TOTAL : 2.895236 sec + 8,649,844,179 cycles # 2.982 GHz + 16,422,517,147 instructions # 1.90 insn per cycle + 2.902094725 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2414) (512y: 13) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -193,20 +193,20 @@ OK (relative difference <= 5E-3) runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.422417e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.623038e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.623038e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.596837e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.005170e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.005170e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 -TOTAL : 3.084070 sec - 7,371,897,160 cycles # 2.386 GHz - 14,070,343,022 instructions # 1.91 insn per cycle - 3.090398779 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1736) (512y: 17) (512z: 1439) +TOTAL : 2.901194 sec + 7,177,929,467 cycles # 2.469 GHz + 13,849,422,404 instructions # 1.93 insn per cycle + 2.908162890 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1615) (512y: 12) (512z: 1404) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -214,8 +214,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828052585973637E-002 -Relative difference = 2.0158743040564767e-07 +Avg ME (F77/C++) = 1.2828052589611616E-002 +Relative difference = 2.0187102602673518e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_common.txt index 94aeb34d20..5fd851a374 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_common.txt @@ -41,23 +41,23 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2023-11-24_15:23:46 +DATE: 2024-01-26_00:06:09 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 12 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.278888e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.156496e+09 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.253231e+09 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.306357e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.179100e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.261443e+09 ) sec^-1 MeanMatrixElemValue = ( 1.371863e-02 +- 3.269951e-06 ) GeV^0 -TOTAL : 1.221248 sec - 4,164,953,848 cycles # 2.869 GHz - 6,574,190,110 instructions # 1.58 insn per cycle - 1.509194129 seconds time elapsed +TOTAL : 1.147708 sec + 4,163,766,116 cycles # 3.042 GHz + 6,626,945,716 instructions # 1.59 insn per cycle + 1.426181632 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --common WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 117 @@ -72,20 +72,20 @@ OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.130054e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.340325e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.340325e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.110518e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.305248e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.305248e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371887e-02 +- 3.270267e-06 ) GeV^0 -TOTAL : 6.276129 sec - 18,853,916,186 cycles # 3.003 GHz - 43,696,977,700 instructions # 2.32 insn per cycle - 6.281870449 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 431) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.337609 sec + 19,555,432,676 cycles # 3.083 GHz + 47,227,213,648 instructions # 2.42 insn per cycle + 6.343559467 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 542) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
@@ -93,26 +93,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039854866802E-002 -Relative difference = 1.1313746984080878e-08 +Avg ME (F77/C++) = 1.2828039441956207E-002 +Relative difference = 4.35018750695023e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.294770e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.514019e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.514019e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.385516e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.648026e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.648026e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371887e-02 +- 3.270266e-06 ) GeV^0 -TOTAL : 3.444527 sec - 10,256,624,459 cycles # 2.974 GHz - 21,988,381,288 instructions # 2.14 insn per cycle - 3.450359347 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1938) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.298500 sec + 10,227,348,695 cycles # 3.096 GHz + 22,172,051,154 instructions # 2.17 insn per cycle + 3.304380352 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 1883) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -126,20 +126,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.493233e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.822049e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.822049e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.622613e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.085167e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.085167e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 -TOTAL : 3.224970 sec - 9,324,398,209 cycles # 2.887 GHz - 15,502,220,245 instructions # 1.66 insn per cycle - 3.230595711 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2596) (512y: 0) (512z: 0) +TOTAL : 3.075553 sec + 9,160,418,803 cycles # 2.975 GHz + 15,535,398,337 instructions # 1.70 insn per cycle + 3.081224998 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2619) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -153,20 +153,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.511667e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.890072e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.890072e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.735922e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.391720e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.391720e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 -TOTAL : 3.207482 sec - 9,275,437,455 cycles # 2.888 GHz - 15,144,147,371 instructions # 1.63 insn per cycle - 3.213077967 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2496) (512y: 9) (512z: 0) +TOTAL : 2.980114 sec + 8,899,453,850 cycles # 2.982 GHz + 15,005,353,105 instructions # 1.69 insn per cycle + 2.985888152 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2414) (512y: 13) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
@@ -180,20 +180,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.556471e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.938543e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.938543e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.766878e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.403077e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.403077e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 -TOTAL : 3.164113 sec - 7,669,667,119 cycles # 2.420 GHz - 12,572,972,519 instructions # 1.64 insn per cycle - 3.169922598 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1736) (512y: 17) (512z: 1439) +TOTAL : 2.949550 sec + 7,417,056,376 cycles # 2.511 GHz + 12,332,428,558 instructions # 1.66 insn per cycle + 2.955404265 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1615) (512y: 12) (512z: 1404) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -201,8 +201,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828052585973637E-002 -Relative difference = 2.0158743040564767e-07 +Avg ME (F77/C++) = 1.2828052589611616E-002 +Relative difference = 2.0187102602673518e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_curhst.txt index 62bb3359c2..b6c30db0cb 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_curhst.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_curhst.txt @@ -41,23 +41,23 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2023-11-24_15:20:27 +DATE: 2024-01-26_00:02:50 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 12 --curhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.313617e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.192806e+09 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.298454e+09 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.312381e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.184733e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.266952e+09 ) sec^-1 MeanMatrixElemValue = ( 1.371687e-02 +- 3.270220e-06 ) GeV^0 -TOTAL : 0.858273 sec - 3,147,417,725 cycles # 2.925 GHz - 6,398,055,131 instructions # 2.03 insn per cycle - 1.135101998 seconds time elapsed +TOTAL : 0.835122 sec + 3,197,729,804 cycles # 3.031 GHz + 6,511,347,801 instructions # 2.04 insn per cycle + 1.113725741 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --curhst WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 117 @@ -72,20 +72,20 @@ OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.125924e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.336940e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.336940e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.111142e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.305911e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.305911e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 5.957055 sec - 17,830,147,444 cycles # 2.991 GHz - 43,513,458,674 instructions # 2.44 insn per cycle - 5.962276638 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 431) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.017248 sec + 18,556,545,570 cycles # 3.084 GHz + 47,048,067,347 instructions # 2.54 insn per cycle + 6.023231625 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 542) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
@@ -93,26 +93,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039854866802E-002 -Relative difference = 1.1313746984080878e-08 +Avg ME (F77/C++) = 1.2828039441956207E-002 +Relative difference = 4.35018750695023e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.326147e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.566110e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.566110e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.376972e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.665333e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.665333e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 3.071304 sec - 9,223,399,519 cycles # 2.999 GHz - 21,906,628,505 instructions # 2.38 insn per cycle - 3.076527254 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1938) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 2.995990 sec + 9,272,557,506 cycles # 3.090 GHz + 22,091,542,289 instructions # 2.38 insn per cycle + 3.001967287 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 1883) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -126,20 +126,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.495233e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.826120e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.826120e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.607052e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.055871e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.055871e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.884227 sec - 8,304,746,805 cycles # 2.875 GHz - 15,590,862,530 instructions # 1.88 insn per cycle - 2.889527315 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2596) (512y: 0) (512z: 0) +TOTAL : 2.769502 sec + 8,191,039,582 cycles # 2.953 GHz + 15,625,047,168 instructions # 1.91 insn per cycle + 2.775572911 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2619) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -153,20 +153,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.543511e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.915375e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.915375e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.750343e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.394522e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.394522e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.832113 sec - 8,194,558,683 cycles # 2.889 GHz - 15,428,318,410 instructions # 1.88 insn per cycle - 2.837422327 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2496) (512y: 9) (512z: 0) +TOTAL : 2.637756 sec + 7,859,183,641 cycles # 2.974 GHz + 15,295,681,899 instructions # 1.95 insn per cycle + 2.643542638 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2414) (512y: 13) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
@@ -180,20 +180,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.541346e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.894800e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.894800e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.768753e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.394838e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.394838e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 -TOTAL : 2.839095 sec - 6,610,683,902 cycles # 2.326 GHz - 12,864,156,943 instructions # 1.95 insn per cycle - 2.844410626 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1736) (512y: 17) (512z: 1439) +TOTAL : 2.627373 sec + 6,404,305,674 cycles # 2.433 GHz + 12,622,889,798 instructions # 1.97 insn per cycle + 2.633130133 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1615) (512y: 12) (512z: 1404) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -201,8 +201,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828052585973637E-002 -Relative difference = 2.0158743040564767e-07 +Avg ME (F77/C++) = 1.2828052589611616E-002 +Relative difference = 2.0187102602673518e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_rmbhst.txt index 2afd300dfd..a5f5742ed1 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_rmbhst.txt @@ -41,24 +41,24 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2023-11-24_15:17:07 +DATE: 2024-01-25_23:59:33 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 12 --rmbhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.991991e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.144574e+09 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.162617e+09 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.289424e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.155625e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.176626e+09 ) sec^-1 MeanMatrixElemValue = ( 1.371710e-02 +- 3.270389e-06 ) GeV^0 -TOTAL : 1.488342 sec - 5,031,725,760 cycles # 2.945 GHz - 9,217,170,327 instructions # 1.83 insn per cycle - 1.765546553 seconds time elapsed +TOTAL : 1.442027 sec + 5,072,798,131 cycles # 3.044 GHz + 9,207,644,948 instructions # 1.82 insn per cycle + 1.725918459 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --rmbhst WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost @@ -74,20 +74,20 @@ OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.131966e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.341693e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.341693e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.100030e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.293407e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.293407e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 5.927034 sec - 17,813,349,259 cycles # 3.004 GHz - 43,513,436,757 instructions # 2.44 insn per cycle - 5.932279783 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 431) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.076797 sec + 18,544,861,337 cycles # 3.050 GHz + 47,047,973,845 instructions # 2.54 insn per cycle + 6.082802773 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 542) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
@@ -95,26 +95,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039854866802E-002 -Relative difference = 1.1313746984080878e-08 +Avg ME (F77/C++) = 1.2828039441956207E-002 +Relative difference = 4.35018750695023e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.333420e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.574664e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.574664e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.393613e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.639831e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.639831e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 3.064718 sec - 9,236,786,325 cycles # 3.009 GHz - 21,906,852,468 instructions # 2.37 insn per cycle - 3.070020479 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1938) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 2.972935 sec + 9,226,107,763 cycles # 3.099 GHz + 22,091,523,964 instructions # 2.39 insn per cycle + 2.978809408 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 1883) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -128,20 +128,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.506026e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.849776e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.849776e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.634367e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.094236e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.094236e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.874113 sec - 8,320,499,209 cycles # 2.891 GHz - 15,592,249,654 instructions # 1.87 insn per cycle - 2.879517495 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2596) (512y: 0) (512z: 0) +TOTAL : 2.744672 sec + 8,169,489,880 cycles # 2.974 GHz + 15,625,708,382 instructions # 1.91 insn per cycle + 2.750593449 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2619) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -155,20 +155,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.509013e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.864492e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.864492e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.770473e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.445882e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.445882e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.870737 sec - 8,241,489,655 cycles # 2.866 GHz - 15,430,280,733 instructions # 1.87 insn per cycle - 2.876296254 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2496) (512y: 9) (512z: 0) +TOTAL : 2.618890 sec + 7,856,839,853 cycles # 2.995 GHz + 15,295,667,717 instructions # 1.95 insn per cycle + 2.624973280 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2414) (512y: 13) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
@@ -182,20 +182,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.501831e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.838293e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.838293e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.737654e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.348071e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.348071e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 -TOTAL : 2.887603 sec - 6,626,308,640 cycles # 2.293 GHz - 12,864,606,095 instructions # 1.94 insn per cycle - 2.893234728 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1736) (512y: 17) (512z: 1439) +TOTAL : 2.652862 sec + 6,406,807,661 cycles # 2.412 GHz + 12,625,033,778 instructions # 1.97 insn per cycle + 2.658673017 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1615) (512y: 12) (512z: 1404) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -203,8 +203,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828052585973637E-002 -Relative difference = 2.0158743040564767e-07 +Avg ME (F77/C++) = 1.2828052589611616E-002 +Relative difference = 2.0187102602673518e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd1.txt index efc03ad0b2..c35cf61378 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd1.txt @@ -41,23 +41,23 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2023-11-24_14:29:28 +DATE: 2024-01-25_23:02:16 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/gcheck.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.089951e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.092950e+09 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.329798e+09 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.098241e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.096988e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.338544e+09 ) sec^-1 MeanMatrixElemValue = ( 1.371687e-02 +- 3.270220e-06 ) GeV^0 -TOTAL : 0.581630 sec - 2,429,919,567 cycles # 3.010 GHz - 3,742,142,294 instructions # 1.54 insn per cycle - 0.889170895 seconds time elapsed +TOTAL : 0.574179 sec + 2,430,883,292 cycles # 2.998 GHz + 3,761,995,439 instructions # 1.55 insn per cycle + 0.889968607 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/gcheck.exe -p 2048 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 95 @@ -72,20 +72,20 @@ OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.231336e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.478764e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.478764e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.166790e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.383742e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.383742e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 5.469982 sec - 16,730,831,486 cycles # 3.056 GHz - 41,270,909,044 instructions # 2.47 insn per cycle - 5.482091928 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 375) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 5.746400 sec + 17,725,056,212 cycles # 3.082 GHz + 43,885,367,704 instructions # 2.48 insn per cycle + 5.762368432 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 467) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
@@ -93,26 +93,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039854866802E-002 -Relative difference = 1.1313746984080878e-08 +Avg ME (F77/C++) = 1.2828039441956207E-002 +Relative difference = 4.35018750695023e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd1/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.472788e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.845298e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.845298e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.382827e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.685380e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.685380e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 2.902455 sec - 8,999,277,217 cycles # 3.096 GHz - 21,212,156,929 instructions # 2.36 insn per cycle - 2.920254424 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1843) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 2.997864 sec + 9,048,620,342 cycles # 3.013 GHz + 21,582,904,348 instructions # 2.39 insn per cycle + 3.019845696 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 1827) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -126,20 +126,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd1/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.606896e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.017656e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.017656e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.645891e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.121304e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.121304e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.765803 sec - 8,250,996,001 cycles # 2.978 GHz - 15,426,023,835 instructions # 1.87 insn per cycle - 2.784276443 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2537) (512y: 0) (512z: 0) +TOTAL : 2.727469 sec + 8,091,878,793 cycles # 2.961 GHz + 15,428,989,777 instructions # 1.91 insn per cycle + 2.750857146 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2542) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -153,20 +153,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd1/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.659785e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.147153e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.147153e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.751076e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.395522e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.395522e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.718129 sec - 8,116,789,845 cycles # 2.981 GHz - 15,238,834,705 instructions # 1.88 insn per cycle - 2.738443955 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2423) (512y: 8) (512z: 0) +TOTAL : 2.640357 sec + 7,849,064,666 cycles # 2.967 GHz + 15,086,994,011 instructions # 1.92 insn per cycle + 2.659695567 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2323) (512y: 15) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
@@ -180,20 +180,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd1/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.665676e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.125551e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.125551e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.940902e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.834293e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.834293e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 -TOTAL : 2.716332 sec - 6,606,196,349 cycles # 2.427 GHz - 12,842,263,926 instructions # 1.94 insn per cycle - 2.735947842 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1706) (512y: 18) (512z: 1427) +TOTAL : 2.491118 sec + 6,165,763,719 cycles # 2.470 GHz + 12,244,016,638 instructions # 1.99 insn per cycle + 2.510253485 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1538) (512y: 8) (512z: 1258) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -201,8 +201,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd1/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828052564145764E-002 -Relative difference = 1.9988585667912256e-07 +Avg ME (F77/C++) = 1.2828052431359538E-002 +Relative difference = 1.895346165094282e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd0.txt index 6aac347ebc..e90a5e24b2 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd0.txt @@ -41,23 +41,23 @@ CUDACPP_BUILDDIR='build.512z_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2023-11-24_14:59:40 +DATE: 2024-01-25_23:42:17 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/gcheck.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.268443e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.159820e+09 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.242861e+09 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.297942e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.191141e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.279372e+09 ) sec^-1 MeanMatrixElemValue = ( 1.371687e-02 +- 3.270220e-06 ) GeV^0 -TOTAL : 0.585151 sec - 2,376,619,364 cycles # 2.930 GHz - 3,674,969,576 instructions # 1.55 insn per cycle - 0.869620998 seconds time elapsed +TOTAL : 0.561987 sec + 2,359,900,397 cycles # 3.010 GHz + 3,659,136,851 instructions # 1.55 insn per cycle + 0.842862484 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/gcheck.exe -p 2048 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 117 @@ -72,20 +72,20 @@ OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.681014e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.190697e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.190697e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.483786e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.861169e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.861169e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 4.111208 sec - 12,184,760,501 cycles # 2.960 GHz - 32,432,932,194 instructions # 2.66 insn per cycle - 4.116807187 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 312) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 4.663846 sec + 13,962,650,647 cycles # 2.991 GHz + 37,848,484,392 instructions # 2.71 insn per cycle + 4.670004365 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 833) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. 
@@ -93,26 +93,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039840314887E-002 -Relative difference = 1.244813035273009e-08 +Avg ME (F77/C++) = 1.2828039414671366E-002 +Relative difference = 4.562884388571957e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.746012e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.644193e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.644193e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.866159e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.882284e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.882284e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 2.661942 sec - 8,002,880,266 cycles # 3.001 GHz - 18,657,307,287 instructions # 2.33 insn per cycle - 2.667590652 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1555) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 2.542363 sec + 7,896,880,722 cycles # 3.100 GHz + 18,602,696,675 instructions # 2.36 insn per cycle + 2.548302756 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 2808) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -120,26 +120,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039283704129E-002 -Relative difference = 5.583829420356249e-08 +Avg ME (F77/C++) = 1.2828039280066150E-002 +Relative difference = 5.612189004572479e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.839072e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.690688e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.690688e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.966287e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.951221e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.951221e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.575310 sec - 7,484,630,621 cycles # 2.901 GHz - 14,251,612,805 instructions # 1.90 insn per cycle - 2.581216132 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2237) (512y: 0) (512z: 0) +TOTAL : 2.468293 sec + 7,420,929,664 cycles # 3.001 GHz + 14,338,774,343 instructions # 1.93 insn per cycle + 2.474190802 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2251) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -147,26 +147,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828053244447801E-002 -Relative difference = 2.5291823782248813e-07 +Avg ME (F77/C++) = 1.2828053246266791E-002 +Relative difference = 2.5306003563303186e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd0/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.852469e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.787188e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.787188e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.030042e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.163232e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.163232e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.574795 sec - 7,331,036,803 cycles # 2.845 GHz - 13,950,107,124 instructions # 1.90 insn per cycle - 2.580411131 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2096) (512y: 3) (512z: 0) +TOTAL : 2.425778 sec + 7,309,271,422 cycles # 3.007 GHz + 13,954,124,612 instructions # 1.91 insn per cycle + 2.432103848 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3875) (512y: 9) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -174,26 +174,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828053244447801E-002 -Relative difference = 2.5291823782248813e-07 +Avg ME (F77/C++) = 1.2828053277189611E-002 +Relative difference = 2.5547059841227576e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd0/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.587347e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.044303e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.044303e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.883579e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.679418e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.679418e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 -TOTAL : 2.795609 sec - 6,537,128,178 cycles # 2.335 GHz - 13,422,893,916 instructions # 2.05 insn per cycle - 2.801293417 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2071) (512y: 1) (512z: 1198) +TOTAL : 2.534153 sec + 6,266,746,932 cycles # 2.468 GHz + 13,208,183,134 instructions # 2.11 insn per cycle + 2.540179172 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1734) (512y: 3) (512z: 1266) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -201,8 +201,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828052562326775E-002 -Relative difference = 1.997440588685788e-07 +Avg ME (F77/C++) = 1.2828052540498902E-002 +Relative difference = 1.980424851420537e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd1.txt index 320612f062..469aa8ffd2 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd1.txt @@ -41,23 +41,23 @@ CUDACPP_BUILDDIR='build.512z_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2023-11-24_15:00:08 +DATE: 2024-01-25_23:42:45 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/gcheck.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.301880e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.200050e+09 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.327102e+09 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.304170e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.209333e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.342098e+09 ) sec^-1 MeanMatrixElemValue = ( 1.371687e-02 +- 3.270220e-06 ) GeV^0 -TOTAL : 0.577502 sec - 2,367,605,467 cycles # 2.931 GHz - 3,686,475,541 instructions # 1.56 insn per cycle - 0.865341028 seconds time elapsed +TOTAL : 0.557799 sec + 2,362,597,179 cycles # 3.024 GHz + 3,707,659,448 instructions # 1.57 insn per cycle + 0.839215126 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/gcheck.exe -p 2048 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 95 @@ -72,20 +72,20 @@ OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.231230e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.232983e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.232983e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.132330e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.983314e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.983314e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 3.189007 sec - 9,438,833,361 cycles # 2.955 GHz - 25,269,338,727 instructions # 2.68 insn per cycle - 3.194786598 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 263) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.300471 sec + 10,112,740,962 cycles # 3.060 GHz + 28,398,804,011 instructions # 2.81 insn per cycle + 3.306398800 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 632) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. 
@@ -93,26 +93,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039838495897E-002 -Relative difference = 1.2589928273811243e-08 +Avg ME (F77/C++) = 1.2828039441956207E-002 +Relative difference = 4.35018750695023e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd1/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.111513e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.795769e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.795769e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.109998e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.743632e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.743632e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 2.388364 sec - 7,189,276,734 cycles # 3.004 GHz - 16,868,514,931 instructions # 2.35 insn per cycle - 2.393974942 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1360) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 2.369463 sec + 7,338,726,391 cycles # 3.091 GHz + 16,786,306,106 instructions # 2.29 insn per cycle + 2.375667430 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 2463) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -126,20 +126,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.018803e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.200647e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.200647e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.103514e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.383052e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.383052e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.444205 sec - 7,132,947,258 cycles # 2.912 GHz - 13,616,405,803 instructions # 1.91 insn per cycle - 2.449801989 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2060) (512y: 0) (512z: 0) +TOTAL : 2.379197 sec + 7,096,164,424 cycles # 2.976 GHz + 13,728,890,269 instructions # 1.93 insn per cycle + 2.385413338 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2082) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -147,26 +147,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828053220800939E-002 -Relative difference = 2.5107486628541925e-07 +Avg ME (F77/C++) = 1.2828053198973066E-002 +Relative difference = 2.4937329255889414e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd1/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.049548e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.286960e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.286960e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.095407e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.429923e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.429923e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.421039 sec - 7,049,543,144 cycles # 2.906 GHz - 13,426,454,258 instructions # 1.90 insn per cycle - 2.426713774 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1945) (512y: 4) (512z: 0) +TOTAL : 2.384377 sec + 7,073,164,896 cycles # 2.961 GHz + 13,460,571,389 instructions # 1.90 insn per cycle + 2.390322761 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3649) (512y: 12) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -174,26 +174,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd1/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828053220800939E-002 -Relative difference = 2.5107486628541925e-07 +Avg ME (F77/C++) = 1.2828053198973066E-002 +Relative difference = 2.4937329255889414e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd1/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.688665e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.273438e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.273438e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.020353e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.067711e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.067711e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 -TOTAL : 2.707645 sec - 6,353,817,019 cycles # 2.343 GHz - 13,154,875,496 instructions # 2.07 insn per cycle - 2.713469152 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2029) (512y: 1) (512z: 1083) +TOTAL : 2.437257 sec + 6,059,792,230 cycles # 2.481 GHz + 12,910,342,390 instructions # 2.13 insn per cycle + 2.443163164 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1671) (512y: 3) (512z: 1155) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -201,8 +201,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd1/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828052536860923E-002 -Relative difference = 1.977588895209662e-07 +Avg ME (F77/C++) = 1.2828052431359538E-002 +Relative difference = 1.895346165094282e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt index 675f8002f0..ca059fc445 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt @@ -41,23 +41,23 @@ CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2023-11-24_14:29:57 +DATE: 2024-01-25_23:02:46 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/gcheck.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.421403e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.263390e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.120736e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.466288e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.337541e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.171849e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 0.699628 sec - 2,791,107,487 cycles # 2.987 GHz - 4,371,723,888 instructions # 1.57 insn per cycle - 1.020217670 seconds time elapsed +TOTAL : 0.690524 sec + 2,792,013,447 cycles # 3.009 GHz + 4,251,755,468 instructions # 1.52 insn per cycle + 1.010200043 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/gcheck.exe -p 2048 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 166 @@ -72,20 +72,20 @@ OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.110744e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.296406e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.296406e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.051297e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.219822e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.219822e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 6.059605 sec - 18,785,537,380 cycles # 3.098 GHz - 44,223,451,515 instructions # 2.35 insn per cycle - 6.072025963 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 439) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.376369 sec + 19,701,785,180 cycles # 3.087 GHz + 46,970,070,471 instructions # 2.38 insn per cycle + 6.391151099 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 474) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -99,20 +99,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.746395e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.315536e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.315536e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.700597e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.241163e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.241163e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.998774 sec - 12,365,852,528 cycles # 3.089 GHz - 30,918,818,290 instructions # 2.50 insn per cycle - 4.019870048 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1685) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 4.092874 sec + 12,523,019,197 cycles # 3.056 GHz + 30,922,016,203 instructions # 2.47 insn per cycle + 4.113499563 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 1667) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -126,20 +126,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.045102e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.850310e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.850310e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.026909e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.810880e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.810880e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.473353 sec - 10,119,124,866 cycles # 2.909 GHz - 19,374,205,910 instructions # 1.91 insn per cycle - 3.489755488 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2130) (512y: 0) (512z: 0) +TOTAL : 3.497994 sec + 10,190,173,371 cycles # 2.908 GHz + 19,546,489,451 instructions # 1.92 insn per cycle + 3.518050330 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2119) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
@@ -153,20 +153,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd0/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.189661e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.096081e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.096081e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.184526e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.112783e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.112783e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.261039 sec - 9,715,236,596 cycles # 2.974 GHz - 18,955,064,271 instructions # 1.95 insn per cycle - 3.284788216 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1860) (512y: 188) (512z: 0) +TOTAL : 3.268706 sec + 9,701,390,537 cycles # 2.963 GHz + 18,858,558,794 instructions # 1.94 insn per cycle + 3.287817502 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1850) (512y: 174) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -180,20 +180,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd0/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.871346e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.512373e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.512373e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.037405e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.812464e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.812464e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.767041 sec - 8,387,450,309 cycles # 2.224 GHz - 15,057,184,265 instructions # 1.80 insn per cycle - 3.785820964 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1024) (512y: 155) (512z: 1316) +TOTAL : 3.479409 sec + 8,096,936,975 cycles # 2.323 GHz + 14,812,944,511 instructions # 1.83 insn per cycle + 3.497975564 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1023) (512y: 64) (512z: 1327) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd1.txt index 486522e284..4f31d9a367 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd1.txt @@ -41,23 +41,23 @@ CUDACPP_BUILDDIR='build.512z_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2023-11-24_14:30:31 +DATE: 2024-01-25_23:03:20 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/gcheck.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.608833e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.666599e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.153828e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.447758e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.281551e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.150988e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 0.689454 sec - 2,790,185,844 cycles # 3.012 GHz - 4,322,858,919 instructions # 1.55 insn per cycle - 1.008572761 seconds time elapsed +TOTAL : 0.687736 sec + 2,747,370,286 cycles # 2.982 GHz + 4,268,924,942 instructions # 1.55 insn per cycle + 1.008068709 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/gcheck.exe -p 2048 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 154 @@ -72,20 +72,20 @@ OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.152239e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.354641e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.354641e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.125377e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.320065e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.320065e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 5.858273 sec - 18,113,057,840 cycles # 3.089 GHz - 42,472,824,521 instructions # 2.34 insn per cycle - 5.871069159 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 421) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 5.983111 sec + 18,512,223,700 cycles # 3.092 GHz + 44,592,936,775 instructions # 2.41 insn per cycle + 5.995435786 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 498) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -99,20 +99,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd1/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.781270e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.374250e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.374250e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.759698e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.343343e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.343343e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.930434 sec - 12,144,033,984 cycles # 3.086 GHz - 30,226,379,703 instructions # 2.49 insn per cycle - 3.949543769 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1692) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.967957 sec + 12,217,647,849 cycles # 3.075 GHz + 30,216,214,748 instructions # 2.47 insn per cycle + 3.985701544 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 1650) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -126,20 +126,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd1/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.103753e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.937318e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.937318e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.022777e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.815435e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.815435e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.379502 sec - 10,060,863,868 cycles # 2.972 GHz - 19,256,812,816 instructions # 1.91 insn per cycle - 3.401404400 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2146) (512y: 0) (512z: 0) +TOTAL : 3.505034 sec + 10,165,427,376 cycles # 2.895 GHz + 19,036,303,321 instructions # 1.87 insn per cycle + 3.526699091 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2072) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
@@ -153,20 +153,20 @@ OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd1/check.exe -p 2048 256 12 OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.196753e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.119301e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.119301e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.205707e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.176082e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.176082e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 3.254342 sec
- 9,672,428,613 cycles # 2.968 GHz
- 18,746,035,376 instructions # 1.94 insn per cycle
- 3.271198287 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1834) (512y: 191) (512z: 0)
+TOTAL : 3.241628 sec
+ 9,598,562,455 cycles # 2.956 GHz
+ 18,451,811,313 instructions # 1.92 insn per cycle
+ 3.263864774 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1775) (512y: 174) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd1/runTest.exe
 [ PASSED ] 6 tests.
@@ -180,20 +180,20 @@ OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd1/check.exe -p 2048 256 12 OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.930945e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.613686e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.613686e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.352591e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.472374e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.472374e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 3.660732 sec
- 8,282,994,053 cycles # 2.260 GHz
- 14,980,698,691 instructions # 1.81 insn per cycle
- 3.677125812 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1021) (512y: 156) (512z: 1305)
+TOTAL : 3.070635 sec
+ 7,200,806,955 cycles # 2.341 GHz
+ 13,242,495,009 instructions # 1.84 insn per cycle
+ 3.093967831 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 911) (512y: 56) (512z: 993)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd1/runTest.exe
 [ PASSED ] 6 tests.
diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt
index 8af0b3625a..c48e4a575b 100644
--- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt
+++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt
@@ -41,23 +41,23 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
-DATE: 2023-11-24_14:31:04
+DATE: 2024-01-25_23:03:53
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 2 OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 4.273933e+07 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.156544e+08 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.270780e+08 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 4.029909e+07 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.136710e+08 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.272780e+08 ) sec^-1
 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0
-TOTAL : 0.525661 sec
- 2,285,041,869 cycles # 2.996 GHz
- 3,259,226,119 instructions # 1.43 insn per cycle
- 0.837415991 seconds time elapsed
+TOTAL : 0.530683 sec
+ 2,290,171,294 cycles # 2.993 GHz
+ 3,254,195,766 instructions # 1.42 insn per cycle
+ 0.851756610 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 214
@@ -72,20 +72,20 @@ OK (relative difference <= 5E-3)
 =========================================================================
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe -p 2048 256 2 OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.147376e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.208578e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.208578e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.196291e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.261557e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.261557e+05 ) sec^-1
 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0
-TOTAL : 4.988043 sec
- 15,215,107,476 cycles # 3.051 GHz
- 38,378,066,668 instructions # 2.52 insn per cycle
- 4.996465245 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 668) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 4.882033 sec
+ 14,995,294,473 cycles # 3.069 GHz
+ 38,723,990,063 instructions # 2.58 insn per cycle
+ 4.896002071 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 719) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe
 [ PASSED ] 6 tests.
@@ -93,26 +93,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck.exe 2 64 2
 Avg ME (C++/C++) = 2.028807e+00
-Avg ME (F77/C++) = 2.0288063388515645
-Relative difference = 3.258803994438787e-07
+Avg ME (F77/C++) = 2.0288063388515649
+Relative difference = 3.258803992249869e-07
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 2 OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 3.686108e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.886642e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.886642e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.650142e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.858052e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.858052e+05 ) sec^-1
 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0
-TOTAL : 2.950305 sec
- 9,139,887,426 cycles # 3.093 GHz
- 24,578,745,754 instructions # 2.69 insn per cycle
- 2.962852368 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 2159) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 2.980112 sec
+ 8,977,221,889 cycles # 3.014 GHz
+ 24,433,177,409 instructions # 2.72 insn per cycle
+ 2.999214008 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 2067) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe
 [ PASSED ] 6 tests.
@@ -126,20 +126,20 @@ OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 2 OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 5.966833e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.492744e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.492744e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 5.863210e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 6.372035e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.372035e+05 ) sec^-1
 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0
-TOTAL : 1.861721 sec
- 5,467,401,485 cycles # 2.927 GHz
- 11,252,306,729 instructions # 2.06 insn per cycle
- 1.878450325 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2376) (512y: 0) (512z: 0)
+TOTAL : 1.893453 sec
+ 5,544,307,869 cycles # 2.920 GHz
+ 11,562,341,019 instructions # 2.09 insn per cycle
+ 1.911685121 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2396) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe
 [ PASSED ] 6 tests.
@@ -153,20 +153,20 @@ OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe -p 2048 256 2 OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 6.607385e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.251361e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.251361e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 6.821801e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.520956e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.520956e+05 ) sec^-1
 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0
-TOTAL : 1.691034 sec
- 4,950,324,717 cycles # 2.917 GHz
- 10,558,681,133 instructions # 2.13 insn per cycle
- 1.704227686 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2074) (512y: 144) (512z: 0)
+TOTAL : 1.645226 sec
+ 4,816,104,732 cycles # 2.920 GHz
+ 10,339,259,007 instructions # 2.15 insn per cycle
+ 1.663772123 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1972) (512y: 131) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest.exe
 [ PASSED ] 6 tests.
@@ -180,20 +180,20 @@ OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe -p 2048 256 2 OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 3.997999e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.223536e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.223536e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 4.484238e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.775134e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.775134e+05 ) sec^-1
 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0
-TOTAL : 2.729108 sec
- 5,392,071,178 cycles # 1.972 GHz
- 7,793,346,651 instructions # 1.45 insn per cycle
- 2.742252965 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1445) (512y: 122) (512z: 1542)
+TOTAL : 2.443398 sec
+ 4,951,976,924 cycles # 2.022 GHz
+ 7,554,721,580 instructions # 1.53 insn per cycle
+ 2.461242743 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1212) (512y: 65) (512z: 1543)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest.exe
 [ PASSED ] 6 tests.
diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_bridge.txt
index abc9cb4db6..86ee3f4362 100644
--- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_bridge.txt
+++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_bridge.txt
@@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
-DATE: 2023-11-24_15:10:57
+DATE: 2024-01-25_23:53:26
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -51,17 +51,17 @@ WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost
 WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
 WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288)
 WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288)
-Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 4.408166e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.841952e+07 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.841952e+07 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 4.509605e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.892994e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.892994e+07 ) sec^-1
 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0
-TOTAL : 0.827206 sec
- 3,081,034,031 cycles # 2.876 GHz
- 4,780,745,395 instructions # 1.55 insn per cycle
- 1.128942393 seconds time elapsed
+TOTAL : 0.806428 sec
+ 3,162,222,779 cycles # 3.017 GHz
+ 4,862,998,171 instructions # 1.54 insn per cycle
+ 1.106788590 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost
@@ -81,20 +81,20 @@ OK (relative difference <= 5E-3)
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 WARNING! Instantiate host Bridge (nevt=524288)
-Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.108776e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.169543e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.169543e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.170652e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.233557e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.233557e+05 ) sec^-1
 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0
-TOTAL : 5.156626 sec
- 15,517,564,279 cycles # 3.006 GHz
- 38,434,109,341 instructions # 2.48 insn per cycle
- 5.163690036 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 668) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 5.012783 sec
+ 15,309,924,583 cycles # 3.051 GHz
+ 38,782,435,884 instructions # 2.53 insn per cycle
+ 5.020172650 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 719) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe
 [ PASSED ] 6 tests.
@@ -102,27 +102,27 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck.exe 2 64 2
 Avg ME (C++/C++) = 2.028807e+00
-Avg ME (F77/C++) = 2.0288063388515645
-Relative difference = 3.258803994438787e-07
+Avg ME (F77/C++) = 2.0288063388515649
+Relative difference = 3.258803992249869e-07
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 WARNING! Instantiate host Bridge (nevt=524288)
-Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 3.545882e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.738725e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.738725e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.598372e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.795378e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.795378e+05 ) sec^-1
 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0
-TOTAL : 3.141184 sec
- 9,467,320,162 cycles # 3.008 GHz
- 24,761,393,280 instructions # 2.62 insn per cycle
- 3.148330326 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 2159) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 3.097874 sec
+ 9,295,060,871 cycles # 2.994 GHz
+ 24,612,123,964 instructions # 2.65 insn per cycle
+ 3.105584985 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 2067) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe
 [ PASSED ] 6 tests.
@@ -137,20 +137,20 @@ OK (relative difference <= 5E-3)
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 WARNING! Instantiate host Bridge (nevt=524288)
-Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 5.651058e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.129257e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.129257e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 5.795739e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 6.286479e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.286479e+05 ) sec^-1
 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0
-TOTAL : 2.039696 sec
- 5,802,587,355 cycles # 2.836 GHz
- 11,538,014,597 instructions # 1.99 insn per cycle
- 2.046642189 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2376) (512y: 0) (512z: 0)
+TOTAL : 1.992875 sec
+ 5,864,013,930 cycles # 2.933 GHz
+ 11,848,692,419 instructions # 2.02 insn per cycle
+ 2.000318144 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2396) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe
 [ PASSED ] 6 tests.
@@ -165,20 +165,20 @@ OK (relative difference <= 5E-3)
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 WARNING! Instantiate host Bridge (nevt=524288)
-Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 6.265451e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.861082e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.861082e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 6.666318e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.334357e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.334357e+05 ) sec^-1
 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0
-TOTAL : 1.859965 sec
- 5,312,241,459 cycles # 2.848 GHz
- 10,843,960,204 instructions # 2.04 insn per cycle
- 1.867027334 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2074) (512y: 144) (512z: 0)
+TOTAL : 1.756250 sec
+ 5,157,360,305 cycles # 2.926 GHz
+ 10,625,342,990 instructions # 2.06 insn per cycle
+ 1.763857171 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1972) (512y: 131) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest.exe
 [ PASSED ] 6 tests.
@@ -193,20 +193,20 @@ OK (relative difference <= 5E-3)
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 WARNING! Instantiate host Bridge (nevt=524288)
-Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 3.763993e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.974534e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.974534e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 4.395537e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.676486e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.676486e+05 ) sec^-1
 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0
-TOTAL : 2.977042 sec
- 5,732,583,120 cycles # 1.922 GHz
- 8,038,189,975 instructions # 1.40 insn per cycle
- 2.984163935 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1445) (512y: 122) (512z: 1542)
+TOTAL : 2.574747 sec
+ 5,305,772,734 cycles # 2.057 GHz
+ 7,798,848,434 instructions # 1.47 insn per cycle
+ 2.582142171 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1212) (512y: 65) (512z: 1543)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest.exe
 [ PASSED ] 6 tests.
diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_common.txt
index 44f3cc8c52..8c7873c180 100644
--- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_common.txt
+++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_common.txt
@@ -41,23 +41,23 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
-DATE: 2023-11-24_15:24:19
+DATE: 2024-01-26_00:06:41
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 2 --common OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:DBL+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 4.501067e+07 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.156482e+08 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.276049e+08 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 4.579482e+07 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.159880e+08 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.276260e+08 ) sec^-1
 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0
-TOTAL : 0.626850 sec
- 2,536,021,563 cycles # 2.961 GHz
- 3,670,883,227 instructions # 1.45 insn per cycle
- 0.915863654 seconds time elapsed
+TOTAL : 0.612727 sec
+ 2,531,994,631 cycles # 3.010 GHz
+ 3,691,767,465 instructions # 1.46 insn per cycle
+ 0.900330496 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --common
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 214
@@ -72,20 +72,20 @@ OK (relative difference <= 5E-3)
 =========================================================================
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe -p 2048 256 2 --common OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.139594e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.203265e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.203265e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.169589e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.235370e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.235370e+05 ) sec^-1
 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0
-TOTAL : 5.067424 sec
- 15,399,016,558 cycles # 3.036 GHz
- 38,390,017,774 instructions # 2.49 insn per cycle
- 5.073191640 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 668) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 5.000244 sec
+ 15,161,594,214 cycles # 3.030 GHz
+ 38,738,913,418 instructions # 2.56 insn per cycle
+ 5.006606783 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 719) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe
 [ PASSED ] 6 tests.
@@ -93,26 +93,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck.exe 2 64 2
 Avg ME (C++/C++) = 2.028807e+00
-Avg ME (F77/C++) = 2.0288063388515645
-Relative difference = 3.258803994438787e-07
+Avg ME (F77/C++) = 2.0288063388515649
+Relative difference = 3.258803992249869e-07
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 2 --common OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 3.625228e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.825269e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.825269e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.765286e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.975710e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.975710e+05 ) sec^-1
 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0
-TOTAL : 3.057820 sec
- 9,317,797,118 cycles # 3.043 GHz
- 24,577,704,184 instructions # 2.64 insn per cycle
- 3.063889144 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 2159) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 2.947798 sec
+ 9,138,741,572 cycles # 3.095 GHz
+ 24,428,627,016 instructions # 2.67 insn per cycle
+ 2.953976947 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 2067) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe
 [ PASSED ] 6 tests.
@@ -126,20 +126,20 @@ OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 2 --common OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 5.671770e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.193648e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.193648e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 5.856433e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 6.373968e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.373968e+05 ) sec^-1
 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0
-TOTAL : 2.021111 sec
- 5,663,540,101 cycles # 2.803 GHz
- 11,235,902,290 instructions # 1.98 insn per cycle
- 2.027218591 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2376) (512y: 0) (512z: 0)
+TOTAL : 1.953801 sec
+ 5,726,373,065 cycles # 2.924 GHz
+ 11,543,837,156 instructions # 2.02 insn per cycle
+ 1.960034141 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2396) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe
 [ PASSED ] 6 tests.
@@ -153,20 +153,20 @@ OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe -p 2048 256 2 --common OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 6.440843e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.078509e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.078509e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 6.721784e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.406724e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.406724e+05 ) sec^-1
 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0
-TOTAL : 1.794383 sec
- 5,172,944,461 cycles # 2.875 GHz
- 10,505,659,530 instructions # 2.03 insn per cycle
- 1.800280603 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2074) (512y: 144) (512z: 0)
+TOTAL : 1.724743 sec
+ 5,007,833,498 cycles # 2.896 GHz
+ 10,287,819,259 instructions # 2.05 insn per cycle
+ 1.730885282 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1972) (512y: 131) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest.exe
 [ PASSED ] 6 tests.
@@ -180,20 +180,20 @@ OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe -p 2048 256 2 --common OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 3.980860e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.211163e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.211163e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 4.285189e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.557330e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.557330e+05 ) sec^-1
 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0
-TOTAL : 2.800543 sec
- 5,575,018,302 cycles # 1.987 GHz
- 7,742,183,831 instructions # 1.39 insn per cycle
- 2.806534120 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1445) (512y: 122) (512z: 1542)
+TOTAL : 2.612237 sec
+ 5,141,607,567 cycles # 1.965 GHz
+ 7,503,456,514 instructions # 1.46 insn per cycle
+ 2.618574526 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1212) (512y: 65) (512z: 1543)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest.exe
 [ PASSED ] 6 tests.
diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_curhst.txt
index af2440539e..26275483e7 100644
--- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_curhst.txt
+++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_curhst.txt
@@ -41,23 +41,23 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
-DATE: 2023-11-24_15:20:58
+DATE: 2024-01-26_00:03:20
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 2 --curhst OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:DBL+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 4.564058e+07 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.155046e+08 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.272805e+08 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 4.572661e+07 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.157522e+08 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.274817e+08 ) sec^-1
 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0
-TOTAL : 0.562322 sec
- 2,282,427,973 cycles # 2.881 GHz
- 3,556,665,675 instructions # 1.56 insn per cycle
- 0.849543273 seconds time elapsed
+TOTAL : 0.553883 sec
+ 2,339,299,389 cycles # 2.989 GHz
+ 3,658,021,436 instructions # 1.56 insn per cycle
+ 0.840596463 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --curhst
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 214
@@ -72,20 +72,20 @@ OK (relative difference <= 5E-3)
 =========================================================================
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.099614e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.160783e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.160783e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.215810e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.281977e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.281977e+05 ) sec^-1
 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0
-TOTAL : 5.101479 sec
- 15,194,748,744 cycles # 2.976 GHz
- 38,374,370,448 instructions # 2.53 insn per cycle
- 5.107016878 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 668) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 4.837164 sec
+ 14,973,378,568 cycles # 3.093 GHz
+ 38,722,169,138 instructions # 2.59 insn per cycle
+ 4.843282809 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 719) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe
 [ PASSED ] 6 tests.
@@ -93,26 +93,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck.exe 2 64 2
 Avg ME (C++/C++) = 2.028807e+00
-Avg ME (F77/C++) = 2.0288063388515645
-Relative difference = 3.258803994438787e-07
+Avg ME (F77/C++) = 2.0288063388515649
+Relative difference = 3.258803992249869e-07
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 3.574526e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.771653e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.771653e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.746637e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.962623e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.962623e+05 ) sec^-1
 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0
-TOTAL : 3.040842 sec
- 9,128,651,326 cycles # 2.997 GHz
- 24,578,551,986 instructions # 2.69 insn per cycle
- 3.046498874 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 2159) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 2.904163 sec
+ 8,949,658,023 cycles # 3.076 GHz
+ 24,428,241,097 instructions # 2.73 insn per cycle
+ 2.910306365 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 2067) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe
 [ PASSED ] 6 tests.
@@ -126,20 +126,20 @@ OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 5.714772e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.221583e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.221583e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 5.861320e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 6.368611e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.368611e+05 ) sec^-1
 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0
-TOTAL : 1.940536 sec
- 5,479,977,577 cycles # 2.817 GHz
- 11,251,107,887 instructions # 2.05 insn per cycle
- 1.946094154 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2376) (512y: 0) (512z: 0)
+TOTAL : 1.893700 sec
+ 5,539,396,489 cycles # 2.918 GHz
+ 11,561,772,559 instructions # 2.09 insn per cycle
+ 1.899998499 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2396) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe
 [ PASSED ] 6 tests.
@@ -153,20 +153,20 @@ OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 6.393849e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.021326e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.021326e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 6.785716e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.473840e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.473840e+05 ) sec^-1
 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0
-TOTAL : 1.745575 sec
- 4,952,345,278 cycles # 2.829 GHz
- 10,558,643,656 instructions # 2.13 insn per cycle
- 1.751295687 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2074) (512y: 144) (512z: 0)
+TOTAL : 1.648994 sec
+ 4,833,459,743 cycles # 2.922 GHz
+ 10,338,331,915 instructions # 2.14 insn per cycle
+ 1.655414957 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1972) (512y: 131) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest.exe
 [ PASSED ] 6 tests.
@@ -180,20 +180,20 @@ OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 3.913352e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.137894e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.137894e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 4.483832e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.775524e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.775524e+05 ) sec^-1
 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0
-TOTAL : 2.786489 sec
- 5,383,532,315 cycles # 1.930 GHz
- 7,793,484,712 instructions # 1.45 insn per cycle
- 2.792074852 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1445) (512y: 122) (512z: 1542)
+TOTAL : 2.443598 sec
+ 4,947,694,251 cycles # 2.021 GHz
+ 7,552,954,202 instructions # 1.53 insn per cycle
+ 2.449709143 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1212) (512y: 65) (512z: 1543)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest.exe
 [ PASSED ] 6 tests.
diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_rmbhst.txt
index e2193a8ad7..e47f2f66a0 100644
--- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_rmbhst.txt
+++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_rmbhst.txt
@@ -41,24 +41,24 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
-DATE: 2023-11-24_15:17:39
+DATE: 2024-01-26_00:00:04
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 2 --rmbhst OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
-Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:DBL+THX:CURHST+RMBHST+MESDEV/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 5.736010e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.153457e+08 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.268173e+08 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 5.896900e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.159088e+08 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.275420e+08 ) sec^-1
 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0
-TOTAL : 0.714625 sec
- 2,774,816,362 cycles # 2.935 GHz
- 4,339,252,047 instructions # 1.56 insn per cycle
- 1.002590963 seconds time elapsed
+TOTAL : 0.701788 sec
+ 2,806,947,514 cycles # 3.016 GHz
+ 4,362,797,301 instructions # 1.55 insn per cycle
+ 0.987803789 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --rmbhst
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
@@ -74,20 +74,20 @@ OK (relative difference <= 5E-3)
 =========================================================================
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.123090e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.185367e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.185367e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.183367e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.247112e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.247112e+05 ) sec^-1
 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0
-TOTAL : 5.045686 sec
- 15,189,366,713 cycles # 3.007 GHz
- 38,373,841,814 instructions # 2.53 insn per cycle
- 5.051730938 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 668) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 4.907373 sec
+ 14,988,749,160 cycles # 3.051 GHz
+ 38,722,833,257 instructions # 2.58 insn per cycle
+ 4.913467832 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 719) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe
 [ PASSED ] 6 tests.
@@ -95,26 +95,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck.exe 2 64 2
 Avg ME (C++/C++) = 2.028807e+00
-Avg ME (F77/C++) = 2.0288063388515645
-Relative difference = 3.258803994438787e-07
+Avg ME (F77/C++) = 2.0288063388515649
+Relative difference = 3.258803992249869e-07
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 3.586999e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.782954e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.782954e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.768588e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.978375e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.978375e+05 ) sec^-1
 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0
-TOTAL : 3.030197 sec
- 9,119,166,751 cycles # 3.005 GHz
- 24,577,924,210 instructions # 2.70 insn per cycle
- 3.035979194 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 2159) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 2.886514 sec
+ 8,955,670,287 cycles # 3.097 GHz
+ 24,428,197,131 instructions # 2.73 insn per cycle
+ 2.892532610 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 2067) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe
 [ PASSED ] 6 tests.
@@ -128,20 +128,20 @@ OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 5.732847e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.231342e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.231342e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 5.846816e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 6.353978e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.353978e+05 ) sec^-1
 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0
-TOTAL : 1.935219 sec
- 5,468,637,644 cycles # 2.819 GHz
- 11,251,012,735 instructions # 2.06 insn per cycle
- 1.941019031 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2376) (512y: 0) (512z: 0)
+TOTAL : 1.898303 sec
+ 5,531,164,928 cycles # 2.907 GHz
+ 11,561,301,371 instructions # 2.09 insn per cycle
+ 1.904555752 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2396) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe
 [ PASSED ] 6 tests.
@@ -155,20 +155,20 @@ OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 6.292655e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.897577e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.897577e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 6.795208e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.491804e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.491804e+05 ) sec^-1
 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0
-TOTAL : 1.772120 sec
- 4,969,624,681 cycles # 2.796 GHz
- 10,556,732,876 instructions # 2.12 insn per cycle
- 1.777837624 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2074) (512y: 144) (512z: 0)
+TOTAL : 1.648055 sec
+ 4,815,642,876 cycles # 2.913 GHz
+ 10,338,431,439 instructions # 2.15 insn per cycle
+ 1.654388782 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1972) (512y: 131) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest.exe
 [ PASSED ] 6 tests.
@@ -182,20 +182,20 @@ OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 3.866239e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.092519e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.092519e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 4.486209e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.779482e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.779482e+05 ) sec^-1
 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0
-TOTAL : 2.846913 sec
- 5,391,039,174 cycles # 1.904 GHz
- 7,794,494,031 instructions # 1.45 insn per cycle
- 2.856378481 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1445) (512y: 122) (512z: 1542)
+TOTAL : 2.442383 sec
+ 4,948,684,588 cycles # 2.023 GHz
+ 7,553,901,102 instructions # 1.53 insn per cycle
+ 2.448546666 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1212) (512y: 65) (512z: 1543)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest.exe
 [ PASSED ] 6 tests.
diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd1.txt
index 977d853d64..bf7c906eee 100644
--- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd1.txt
+++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd1.txt
@@ -41,23 +41,23 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd1'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
-DATE: 2023-11-24_14:31:31
+DATE: 2024-01-25_23:04:20
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/gcheck.exe -p 2048 256 2 OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1]
+Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 4.446963e+07 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.163595e+08 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.279925e+08 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 4.041566e+07 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.145424e+08 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.281266e+08 ) sec^-1
 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0
-TOTAL : 0.520650 sec
- 2,264,469,596 cycles # 3.001 GHz
- 3,268,925,935 instructions # 1.44 insn per cycle
- 0.824307613 seconds time elapsed
+TOTAL : 0.533190 sec
+ 2,275,159,250 cycles # 2.956 GHz
+ 3,198,582,493 instructions # 1.41 insn per cycle
+ 0.846238143 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/gcheck.exe -p 2048 256 1
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 208
@@ -72,20 +72,20 @@ OK (relative difference <= 5E-3)
 =========================================================================
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/check.exe -p 2048 256 2 OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
+Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.178244e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.243144e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.243144e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.250213e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.320060e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.320060e+05 ) sec^-1
 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0
-TOTAL : 4.920762 sec
- 15,120,754,616 cycles # 3.070 GHz
- 40,101,358,095 instructions # 2.65 insn per cycle
- 4.928885973 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 669) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 4.764896 sec
+ 14,707,028,108 cycles # 3.083 GHz
+ 39,543,998,084 instructions # 2.69 insn per cycle
+ 4.779287435 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 596) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/runTest.exe
 [ PASSED ] 6 tests.
@@ -99,20 +99,20 @@ OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/check.exe -p 2048 256 2 OMP=
 WARNING!
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.908591e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.133379e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.133379e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.824643e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.049537e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.049537e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.787534 sec - 8,670,203,341 cycles # 3.104 GHz - 23,670,756,212 instructions # 2.73 insn per cycle - 2.801209294 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2072) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 2.847845 sec + 8,591,903,714 cycles # 3.011 GHz + 23,575,874,645 instructions # 2.74 insn per cycle + 2.868458782 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 1948) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -126,20 +126,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.339479e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.760864e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.760864e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.388113e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.817928e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.817928e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.067737 sec - 6,095,582,988 cycles # 2.940 GHz - 13,061,573,034 instructions # 2.14 insn per cycle - 2.079447172 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2546) (512y: 0) (512z: 0) +TOTAL : 2.052030 sec + 5,980,210,912 cycles # 2.906 GHz + 13,193,706,510 instructions # 2.21 insn per cycle + 2.072456018 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2560) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -153,20 +153,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.617733e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.075014e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.075014e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.853930e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.365435e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.365435e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.970361 sec - 5,800,772,939 cycles # 2.935 GHz - 12,319,856,699 instructions # 2.12 insn per cycle - 1.986108854 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2093) (512y: 294) (512z: 0) +TOTAL : 1.897340 sec + 5,529,135,380 cycles # 2.906 GHz + 12,102,256,477 instructions # 2.19 insn per cycle + 1.916025222 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2030) (512y: 278) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -180,20 +180,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.793878e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.997441e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.997441e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.111951e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.359090e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.359090e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.868253 sec - 5,826,829,144 cycles # 2.028 GHz - 9,602,061,917 instructions # 1.65 insn per cycle - 2.882159739 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1509) (512y: 209) (512z: 1971) +TOTAL : 2.655651 sec + 5,371,905,800 cycles # 2.018 GHz + 9,380,836,259 instructions # 1.75 insn per cycle + 2.675871938 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1350) (512y: 88) (512z: 1989) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
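
[Editorial note between file hunks] The cycles/instructions lines in the hunks above are perf-stat style counters, and the quoted "insn per cycle" is simply instructions divided by cycles. A minimal sketch reproducing the figure from the sse4_d_inl0_hrd0 hunk above (input values copied from the log; the GHz normalisation is an assumption, since perf divides by task clock rather than elapsed time):

    // Sketch only: reproduce the "insn per cycle" ratio quoted in these logs.
    #include <cstdio>

    int main()
    {
      const double cycles = 8955670287.0;        // "8,955,670,287 cycles"
      const double instructions = 24428197131.0; // "24,428,197,131 instructions"
      std::printf( "%.2f insn per cycle\n", instructions / cycles ); // prints "2.73"
      // Dividing cycles by the elapsed 2.886514 sec gives ~3.10 GHz, close to the
      // logged "# 3.097 GHz" (perf presumably normalises by CPU time consumed).
      return 0;
    }
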
diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd0.txt index eaf53289b1..b9120fbb5d 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd0.txt @@ -41,23 +41,23 @@ CUDACPP_BUILDDIR='build.512z_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2023-11-24_15:00:34 +DATE: 2024-01-25_23:43:11 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/gcheck.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.552166e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.156300e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.271682e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.554592e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.155746e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.271514e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.524921 sec - 2,220,579,587 cycles # 2.921 GHz - 3,192,850,808 instructions # 1.44 insn per cycle - 0.818936986 seconds time elapsed +TOTAL : 0.519357 sec + 2,253,363,966 cycles # 3.002 GHz + 3,205,544,962 instructions # 1.42 insn per cycle + 0.810112146 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/gcheck.exe -p 2048 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 @@ -72,20 +72,20 @@ OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.433349e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.518013e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.518013e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.387914e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.465509e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.465509e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 4.419477 sec - 13,028,620,235 cycles # 2.950 GHz - 34,390,570,196 instructions # 2.64 insn per cycle - 4.425198715 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 686) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 4.495702 sec + 13,904,869,897 cycles # 3.089 GHz + 35,848,783,358 instructions # 2.58 insn per cycle + 4.502182318 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 1078) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -99,20 +99,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd0/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.068689e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.210680e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.210680e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.096318e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.348620e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.348620e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.525285 sec - 10,592,983,737 cycles # 3.000 GHz - 24,007,204,141 instructions # 2.27 insn per cycle - 3.531288478 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2582) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 2.664466 sec + 8,202,926,102 cycles # 3.072 GHz + 21,906,089,636 instructions # 2.67 insn per cycle + 2.670916957 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 2334) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -126,20 +126,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.591275e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.918287e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.918287e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.559059e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.026865e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.026865e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.392081 sec - 6,643,133,250 cycles # 2.771 GHz - 12,401,445,774 instructions # 1.87 insn per cycle - 2.398243451 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3154) (512y: 0) (512z: 0) +TOTAL : 1.993840 sec + 5,528,651,014 cycles # 2.765 GHz + 12,074,880,280 instructions # 2.18 insn per cycle + 2.000267421 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3062) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -147,26 +147,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063388516200 -Relative difference = 3.2588037208240405e-07 +Avg ME (F77/C++) = 2.0288063388516204 +Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd0/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.018329e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.394522e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.394522e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.391303e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.001905e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.001905e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.195662 sec - 6,234,944,949 cycles # 2.833 GHz - 11,573,018,455 instructions # 1.86 insn per cycle - 2.201415670 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2690) (512y: 239) (512z: 0) +TOTAL : 1.746221 sec + 5,115,562,208 cycles # 2.921 GHz + 11,141,356,804 instructions # 2.18 insn per cycle + 1.752659105 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2527) (512y: 224) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -174,26 +174,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063388516200 -Relative difference = 3.2588037208240405e-07 +Avg ME (F77/C++) = 2.0288063388516204 +Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd0/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.926324e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.149997e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.149997e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.660691e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.975604e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.975604e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.778213 sec - 5,362,353,364 cycles # 1.927 GHz - 9,296,610,904 instructions # 1.73 insn per cycle - 2.784120643 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2115) (512y: 282) (512z: 1958) +TOTAL : 2.354649 sec + 4,796,247,367 cycles # 2.033 GHz + 8,840,506,379 instructions # 1.84 insn per cycle + 2.361069291 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1821) (512y: 97) (512z: 2034) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd1.txt index 223a6bbd07..78525f174f 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd1.txt @@ -41,23 +41,23 @@ CUDACPP_BUILDDIR='build.512z_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2023-11-24_15:01:02 +DATE: 2024-01-25_23:43:37 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/gcheck.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.545579e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.156991e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.274409e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.552254e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.157318e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.273752e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.524046 sec - 2,220,143,747 cycles # 2.921 GHz - 3,161,876,071 instructions # 1.42 insn per cycle - 0.819384909 seconds time elapsed +TOTAL : 0.522178 sec + 2,237,011,467 cycles # 2.943 GHz + 3,197,331,844 instructions # 1.43 insn per cycle + 0.817931849 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/gcheck.exe -p 2048 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 208 @@ -72,20 +72,20 @@ OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.624216e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.719163e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.719163e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.662142e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.760003e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.760003e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 4.103081 sec - 12,357,400,963 cycles # 3.008 GHz - 35,038,305,221 instructions # 2.84 insn per cycle - 4.108989255 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 457) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 4.044119 sec + 12,507,685,859 cycles # 3.089 GHz + 35,729,677,729 instructions # 2.86 insn per cycle + 4.050458069 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 469) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -99,20 +99,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd1/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.981089e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.114332e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.114332e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.223471e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.492032e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.492032e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.624381 sec - 10,699,873,531 cycles # 2.953 GHz - 23,087,357,159 instructions # 2.16 insn per cycle - 3.630253133 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2363) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 2.586847 sec + 8,038,523,807 cycles # 3.101 GHz + 21,259,861,765 instructions # 2.64 insn per cycle + 2.593509811 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 2088) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -126,20 +126,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd1/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.093491e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.483832e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.483832e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.924128e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.452749e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.452749e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.165104 sec - 6,157,585,167 cycles # 2.837 GHz - 11,956,600,181 instructions # 1.94 insn per cycle - 2.170909922 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2509) (512y: 0) (512z: 0) +TOTAL : 1.876172 sec + 5,308,473,428 cycles # 2.821 GHz + 11,405,363,581 instructions # 2.15 insn per cycle + 1.882750968 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2370) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -153,20 +153,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd1/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.214844e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.619393e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.619393e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.605702e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.254707e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.254707e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.116517 sec - 6,017,617,016 cycles # 2.837 GHz - 11,127,934,643 instructions # 1.85 insn per cycle - 2.122364748 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2126) (512y: 174) (512z: 0) +TOTAL : 1.691812 sec + 4,996,879,417 cycles # 2.944 GHz + 10,598,375,872 instructions # 2.12 insn per cycle + 1.698475640 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1970) (512y: 162) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -180,20 +180,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd1/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.073709e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.313809e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.313809e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.756890e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.090329e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.090329e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.680169 sec - 5,188,375,015 cycles # 1.932 GHz - 9,020,642,676 instructions # 1.74 insn per cycle - 2.686120218 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1650) (512y: 208) (512z: 1567) +TOTAL : 2.313369 sec + 4,723,638,871 cycles # 2.039 GHz + 8,568,244,906 instructions # 1.81 insn per cycle + 2.319888543 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1392) (512y: 70) (512z: 1630) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. 
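
[Editorial note between file hunks] Each cmpExe block above ends with an "Avg ME" pair and a "Relative difference" that is checked against the 5E-3 tolerance. A minimal sketch, assuming reldiff = |a - b| / a (an assumption — the actual comparison helper is not shown in this patch), reproduces the value logged in the avx2_d_inl1_hrd0 and 512y_d_inl1_hrd0 hunks above:

    // Sketch only: the tolerance check behind "OK (relative difference <= 5E-3)".
    // ASSUMPTION: reldiff = |a - b| / a; this matches the logged value here,
    // but it is not taken from the actual cmpExe helper.
    #include <cmath>
    #include <cstdio>

    int main()
    {
      const double meCpp = 2.028807e+00;       // "Avg ME (C++/C++)"
      const double meF77 = 2.0288063388516204; // "Avg ME (F77/C++)"
      const double relDiff = std::fabs( meCpp - meF77 ) / meCpp; // ~3.2588e-07
      std::printf( "Relative difference = %e\n", relDiff );
      std::printf( "%s (relative difference <= 5E-3)\n", relDiff <= 5e-3 ? "OK" : "ERROR" );
      return 0;
    }
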
diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt index 5b41a4d066..b9b0cde3c0 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt @@ -41,23 +41,23 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2023-11-24_14:31:58 +DATE: 2024-01-25_23:04:47 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.016432e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.646338e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.972972e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.239567e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.582837e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.963220e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086718e+00 +- 3.413389e-03 ) GeV^0 -TOTAL : 0.480351 sec - 2,093,260,903 cycles # 2.971 GHz - 2,988,705,515 instructions # 1.43 insn per cycle - 0.774971081 seconds time elapsed +TOTAL : 0.478940 sec + 2,112,985,286 cycles # 2.999 GHz + 2,978,090,529 instructions # 1.41 insn per cycle + 0.778737361 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 128 @@ -72,20 +72,20 @@ OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.337300e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.415066e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.415066e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.379568e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.458926e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.458926e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 -TOTAL : 4.572712 sec - 14,005,022,680 cycles # 3.059 GHz - 38,340,620,868 instructions # 2.74 insn per cycle - 4.581007114 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 587) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 4.490589 sec + 13,909,019,522 cycles # 3.094 GHz + 37,078,836,915 instructions # 2.67 insn per cycle + 4.499210329 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 578) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -93,26 +93,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028820e+00 -Avg ME (F77/C++) = 2.0288199022179469 -Relative difference = 4.819651478256564e-08 +Avg ME (F77/C++) = 2.0288197983754799 +Relative difference = 9.938019153537065e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.261983e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.699946e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.699946e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.322182e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.773170e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.773170e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 -TOTAL : 2.076354 sec - 6,472,095,043 cycles # 3.109 GHz - 15,815,873,405 instructions # 2.44 insn per cycle - 2.093110720 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2690) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 2.053265 sec + 6,163,905,946 cycles # 2.994 GHz + 15,211,835,383 instructions # 2.47 insn per cycle + 2.068387035 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 2459) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -120,26 +120,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028819e+00 -Avg ME (F77/C++) = 2.0288193548331037 -Relative difference = 1.748963824709674e-07 +Avg ME (F77/C++) = 2.0288191968575120 +Relative difference = 9.703059369476286e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.625847e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.106546e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.106546e+06 ) sec^-1 -MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.173558 sec - 3,465,806,997 cycles # 2.939 GHz - 7,594,629,416 instructions # 2.19 insn per cycle - 1.191325878 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3049) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 9.346993e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.070180e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.070180e+06 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414230e-03 ) GeV^0 +TOTAL : 1.207020 sec + 3,445,603,929 cycles # 2.841 GHz + 7,715,625,143 instructions # 2.24 insn per cycle + 1.223685202 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3071) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -147,26 +147,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288181684445590 -Relative difference = 8.302595855806234e-08 +Avg ME (F77/C++) = 2.0288179996423423 +Relative difference = 1.7628858734720142e-10 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.036192e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.205283e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.205283e+06 ) sec^-1 -MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.095666 sec - 3,249,594,094 cycles # 2.950 GHz - 7,203,151,855 instructions # 2.22 insn per cycle - 1.112919323 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2849) (512y: 23) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.048691e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.225026e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.225026e+06 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414230e-03 ) GeV^0 +TOTAL : 1.085664 sec + 3,179,509,142 cycles # 2.914 GHz + 7,110,367,783 instructions # 2.24 insn per cycle + 1.099965100 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2733) (512y: 13) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -174,26 +174,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288181684445590 -Relative difference = 8.302595855806234e-08 +Avg ME (F77/C++) = 2.0288179996423423 +Relative difference = 1.7628858734720142e-10 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.709924e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.585781e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.585781e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.789161e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.697174e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.697174e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.443704 sec - 3,056,175,378 cycles # 2.109 GHz - 5,834,850,495 instructions # 1.91 insn per cycle - 1.456542629 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2363) (512y: 24) (512z: 1889) +TOTAL : 1.430626 sec + 2,983,504,587 cycles # 2.078 GHz + 5,763,932,231 instructions # 1.93 insn per cycle + 1.444194360 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2088) (512y: 20) (512z: 1914) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -201,8 +201,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288183349184692 -Relative difference = 1.6508058850146622e-07 +Avg ME (F77/C++) = 2.0288183195516467 +Relative difference = 1.5750631496822894e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_bridge.txt index e2569bda32..6981bfe44c 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_bridge.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2023-11-24_15:11:25 +DATE: 2024-01-25_23:53:54 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -51,17 +51,17 @@ WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.002389e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.469916e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.469916e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.095576e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.509874e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.509874e+07 ) sec^-1 MeanMatrixElemValue = ( 2.086805e+00 +- 3.414078e-03 ) GeV^0 -TOTAL : 0.673572 sec - 2,669,669,937 cycles # 2.942 GHz - 4,126,698,700 instructions # 1.55 insn per cycle - 0.966897204 seconds time elapsed +TOTAL : 0.662144 sec + 2,681,924,672 cycles # 3.020 GHz + 4,117,977,384 instructions # 1.54 insn per cycle + 0.946162262 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost @@ -81,20 +81,20 @@ OK (relative difference <= 5E-3) runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.271859e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.346202e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.346202e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.346899e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.425707e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.425707e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 -TOTAL : 4.744076 sec - 14,203,194,064 cycles # 2.991 GHz - 38,386,589,589 instructions # 2.70 insn per cycle - 4.750636160 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 587) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 4.594846 sec + 14,071,961,860 cycles # 3.059 GHz + 37,120,495,552 instructions # 2.64 insn per cycle + 4.601599482 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 578) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
@@ -102,27 +102,27 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028820e+00 -Avg ME (F77/C++) = 2.0288199022179469 -Relative difference = 4.819651478256564e-08 +Avg ME (F77/C++) = 2.0288197983754799 +Relative difference = 9.938019153537065e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.023528e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.430819e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.430819e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.433551e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.901399e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.901399e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 -TOTAL : 2.219353 sec - 6,668,132,532 cycles # 2.997 GHz - 16,095,906,246 instructions # 2.41 insn per cycle - 2.225914329 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2690) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 2.060355 sec + 6,363,040,481 cycles # 3.080 GHz + 15,491,502,720 instructions # 2.43 insn per cycle + 2.067385124 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 2459) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -130,27 +130,27 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028819e+00 -Avg ME (F77/C++) = 2.0288193548331037 -Relative difference = 1.748963824709674e-07 +Avg ME (F77/C++) = 2.0288191968575120 +Relative difference = 9.703059369476286e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.036698e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.035040e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.035040e+06 ) sec^-1 -MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.291419 sec - 3,685,819,556 cycles # 2.841 GHz - 7,831,220,798 instructions # 2.12 insn per cycle - 1.298093181 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3049) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 9.463492e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.087540e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.087540e+06 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414230e-03 ) GeV^0 +TOTAL : 1.236762 sec + 3,641,436,941 cycles # 2.930 GHz + 7,953,430,490 instructions # 2.18 insn per cycle + 1.243962608 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3071) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -158,27 +158,27 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288181684445590 -Relative difference = 8.302595855806234e-08 +Avg ME (F77/C++) = 2.0288179996423423 +Relative difference = 1.7628858734720142e-10 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.776675e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.132413e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.132413e+06 ) sec^-1 -MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.202817 sec - 3,439,533,821 cycles # 2.846 GHz - 7,440,100,221 instructions # 2.16 insn per cycle - 1.209376262 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2849) (512y: 23) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.030058e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.201172e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.201172e+06 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414230e-03 ) GeV^0 +TOTAL : 1.146983 sec + 3,383,857,761 cycles # 2.935 GHz + 7,347,779,825 instructions # 2.17 insn per cycle + 1.154126862 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2733) (512y: 13) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -186,27 +186,27 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288181684445590 -Relative difference = 8.302595855806234e-08 +Avg ME (F77/C++) = 2.0288179996423423 +Relative difference = 1.7628858734720142e-10 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.099551e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.865069e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.865069e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.701950e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.600616e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.600616e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.609144 sec - 3,270,738,224 cycles # 2.025 GHz - 6,088,987,367 instructions # 1.86 insn per cycle - 1.615635336 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2363) (512y: 24) (512z: 1889) +TOTAL : 1.492167 sec + 3,194,827,132 cycles # 2.133 GHz + 6,021,969,878 instructions # 1.88 insn per cycle + 1.499129213 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2088) (512y: 20) (512z: 1914) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -214,8 +214,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288183349184692 -Relative difference = 1.6508058850146622e-07 +Avg ME (F77/C++) = 2.0288183195516467 +Relative difference = 1.5750631496822894e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_common.txt index 1279e79222..b4d637a0a9 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_common.txt @@ -41,23 +41,23 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2023-11-24_15:24:46 +DATE: 2024-01-26_00:07:08 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 2 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.310797e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.615494e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.945034e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.419910e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.637559e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.958244e+08 ) sec^-1 MeanMatrixElemValue = ( 2.079446e+00 +- 3.403306e-03 ) GeV^0 -TOTAL : 0.567939 sec - 2,336,393,305 cycles # 2.964 GHz - 3,415,377,807 instructions # 1.46 insn per cycle - 0.847416380 seconds time elapsed +TOTAL : 0.560146 sec + 2,374,838,659 cycles # 3.002 GHz + 3,427,021,490 instructions # 1.44 insn per cycle + 0.848554044 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --common WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 128 @@ -72,20 +72,20 @@ OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.285858e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.359466e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.359466e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079573e+00 +- 3.404712e-03 ) GeV^0 -TOTAL : 4.728147 sec - 14,182,471,733 cycles # 2.997 GHz - 38,371,003,858 instructions # 2.71 insn per cycle - 4.733810751 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 587) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.365915e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.444126e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.444126e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079572e+00 +- 3.404712e-03 ) GeV^0 +TOTAL : 4.568004 sec + 14,161,375,602 cycles # 3.097 GHz + 37,106,664,478 instructions # 2.62 insn per cycle + 4.574110970 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 578) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
@@ -93,26 +93,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028820e+00 -Avg ME (F77/C++) = 2.0288199022179469 -Relative difference = 4.819651478256564e-08 +Avg ME (F77/C++) = 2.0288197983754799 +Relative difference = 9.938019153537065e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.888028e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.282207e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.282207e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079572e+00 +- 3.404712e-03 ) GeV^0 -TOTAL : 2.288230 sec - 6,645,756,806 cycles # 2.898 GHz - 15,828,500,466 instructions # 2.38 insn per cycle - 2.293948357 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2690) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 5.461790e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.938444e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.938444e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079572e+00 +- 3.404711e-03 ) GeV^0 +TOTAL : 2.056520 sec + 6,336,772,154 cycles # 3.074 GHz + 15,224,230,200 instructions # 2.40 insn per cycle + 2.062537942 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 2459) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -120,26 +120,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028819e+00 -Avg ME (F77/C++) = 2.0288193548331037 -Relative difference = 1.748963824709674e-07 +Avg ME (F77/C++) = 2.0288191968575120 +Relative difference = 9.703059369476286e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.344568e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.074101e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.074101e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.613041e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.107028e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.107028e+06 ) sec^-1 MeanMatrixElemValue = ( 2.079550e+00 +- 3.404207e-03 ) GeV^0 -TOTAL : 1.263326 sec - 3,638,417,018 cycles # 2.869 GHz - 7,579,208,332 instructions # 2.08 insn per cycle - 1.269045825 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3049) (512y: 0) (512z: 0) +TOTAL : 1.230312 sec + 3,608,595,652 cycles # 2.921 GHz + 7,698,744,843 instructions # 2.13 insn per cycle + 1.236387039 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3071) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -147,26 +147,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288181684445590 -Relative difference = 8.302595855806234e-08 +Avg ME (F77/C++) = 2.0288179996423423 +Relative difference = 1.7628858734720142e-10 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.006852e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.170884e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.170884e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.053770e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.232215e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.232215e+06 ) sec^-1 MeanMatrixElemValue = ( 2.079550e+00 +- 3.404207e-03 ) GeV^0 -TOTAL : 1.182767 sec - 3,425,925,806 cycles # 2.885 GHz - 7,153,954,116 instructions # 2.09 insn per cycle - 1.188401535 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2849) (512y: 23) (512z: 0) +TOTAL : 1.133298 sec + 3,342,033,785 cycles # 2.936 GHz + 7,059,045,782 instructions # 2.11 insn per cycle + 1.139229993 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2733) (512y: 13) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -174,26 +174,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288181684445590 -Relative difference = 8.302595855806234e-08 +Avg ME (F77/C++) = 2.0288179996423423 +Relative difference = 1.7628858734720142e-10 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.384551e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.223950e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.223950e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.847509e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.768714e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.768714e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079550e+00 +- 3.404208e-03 ) GeV^0 -TOTAL : 1.561958 sec - 3,241,666,935 cycles # 2.069 GHz - 5,785,669,906 instructions # 1.78 insn per cycle - 1.567670740 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2363) (512y: 24) (512z: 1889) +TOTAL : 1.474424 sec + 3,144,945,594 cycles # 2.126 GHz + 5,713,078,392 instructions # 1.82 insn per cycle + 1.480358562 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2088) (512y: 20) (512z: 1914) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -201,8 +201,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288183349184692 -Relative difference = 1.6508058850146622e-07 +Avg ME (F77/C++) = 2.0288183195516467 +Relative difference = 1.5750631496822894e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_curhst.txt index c75f243f6b..55dee8f4ac 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_curhst.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_curhst.txt @@ -41,23 +41,23 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2023-11-24_15:21:25 +DATE: 2024-01-26_00:03:47 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 2 --curhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.425944e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.638582e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.958027e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.403572e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.634845e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.961526e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086718e+00 +- 3.413389e-03 ) GeV^0 -TOTAL : 0.512465 sec - 2,128,740,635 cycles # 2.913 GHz - 3,349,486,673 instructions # 1.57 insn per cycle - 0.789824764 seconds time elapsed +TOTAL : 0.503705 sec + 2,164,758,906 cycles # 2.993 GHz + 3,340,655,205 instructions # 1.54 insn per cycle + 0.781352769 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --curhst WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 128 @@ -72,20 +72,20 @@ OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.258814e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.334265e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.334265e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.387687e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.466623e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.466623e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 -TOTAL : 4.729033 sec - 14,008,183,941 cycles # 2.959 GHz - 38,341,321,376 instructions # 2.74 insn per cycle - 4.734430894 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 587) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 4.477648 sec + 13,882,329,105 cycles # 3.098 GHz + 37,077,716,863 instructions # 2.67 insn per cycle + 4.483747727 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 578) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
@@ -93,26 +93,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028820e+00 -Avg ME (F77/C++) = 2.0288199022179469 -Relative difference = 4.819651478256564e-08 +Avg ME (F77/C++) = 2.0288197983754799 +Relative difference = 9.938019153537065e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.915242e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.303544e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.303544e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.497536e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.975863e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.975863e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 -TOTAL : 2.218605 sec - 6,466,211,293 cycles # 2.909 GHz - 15,815,641,765 instructions # 2.45 insn per cycle - 2.223961450 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2690) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 1.989964 sec + 6,164,501,349 cycles # 3.090 GHz + 15,212,436,330 instructions # 2.47 insn per cycle + 1.996006252 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 2459) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -120,26 +120,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028819e+00 -Avg ME (F77/C++) = 2.0288193548331037 -Relative difference = 1.748963824709674e-07 +Avg ME (F77/C++) = 2.0288191968575120 +Relative difference = 9.703059369476286e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.288130e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.067419e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.067419e+06 ) sec^-1 -MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.215145 sec - 3,455,872,602 cycles # 2.833 GHz - 7,593,852,686 instructions # 2.20 insn per cycle - 1.220604147 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3049) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 9.422535e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.084358e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.084358e+06 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414230e-03 ) GeV^0 +TOTAL : 1.197986 sec + 3,445,911,992 cycles # 2.865 GHz + 7,714,670,946 instructions # 2.24 insn per cycle + 1.203790737 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3071) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -147,26 +147,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288181684445590 -Relative difference = 8.302595855806234e-08 +Avg ME (F77/C++) = 2.0288179996423423 +Relative difference = 1.7628858734720142e-10 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.949365e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.155182e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.155182e+06 ) sec^-1 -MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.138566 sec - 3,249,226,385 cycles # 2.842 GHz - 7,201,814,152 instructions # 2.22 insn per cycle - 1.144215776 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2849) (512y: 23) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.050826e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.231385e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.231385e+06 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414230e-03 ) GeV^0 +TOTAL : 1.081579 sec + 3,176,085,947 cycles # 2.924 GHz + 7,108,143,250 instructions # 2.24 insn per cycle + 1.087328908 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2733) (512y: 13) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -174,26 +174,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288181684445590 -Relative difference = 8.302595855806234e-08 +Avg ME (F77/C++) = 2.0288179996423423 +Relative difference = 1.7628858734720142e-10 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.332937e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.137553e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.137553e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.813979e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.730921e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.730921e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.516868 sec - 3,052,773,549 cycles # 2.006 GHz - 5,834,286,886 instructions # 1.91 insn per cycle - 1.522319115 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2363) (512y: 24) (512z: 1889) +TOTAL : 1.425356 sec + 2,984,093,084 cycles # 2.087 GHz + 5,763,176,682 instructions # 1.93 insn per cycle + 1.431417075 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2088) (512y: 20) (512z: 1914) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -201,8 +201,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288183349184692 -Relative difference = 1.6508058850146622e-07 +Avg ME (F77/C++) = 2.0288183195516467 +Relative difference = 1.5750631496822894e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_rmbhst.txt index bfc5cc0709..3920589722 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_rmbhst.txt @@ -41,24 +41,24 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2023-11-24_15:18:07 +DATE: 2024-01-26_00:00:31 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 2 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.776083e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.627754e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.948386e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.971445e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.633028e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.951519e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086805e+00 +- 3.414078e-03 ) GeV^0 -TOTAL : 0.618119 sec - 2,443,289,573 cycles # 2.920 GHz - 3,806,362,954 instructions # 1.56 insn per cycle - 0.895384570 seconds time elapsed +TOTAL : 0.605308 sec + 2,490,356,687 cycles # 3.020 GHz + 3,879,865,044 instructions # 1.56 insn per cycle + 0.883165296 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --rmbhst WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost @@ -74,20 +74,20 @@ OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.285771e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.360708e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.360708e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.371842e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.450351e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.450351e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 -TOTAL : 4.672900 sec - 14,007,447,781 cycles # 2.995 GHz - 38,341,188,487 instructions # 2.74 insn per cycle - 4.678325774 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 587) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 4.505383 sec + 13,893,504,068 cycles # 3.081 GHz + 37,078,758,053 instructions # 2.67 insn per cycle + 4.511346462 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 578) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
@@ -95,26 +95,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028820e+00 -Avg ME (F77/C++) = 2.0288199022179469 -Relative difference = 4.819651478256564e-08 +Avg ME (F77/C++) = 2.0288197983754799 +Relative difference = 9.938019153537065e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.077746e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.495127e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.495127e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.454645e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.940983e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.940983e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 -TOTAL : 2.152723 sec - 6,468,101,195 cycles # 2.999 GHz - 15,815,386,942 instructions # 2.45 insn per cycle - 2.158120643 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2690) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 2.007427 sec + 6,155,242,843 cycles # 3.058 GHz + 15,210,679,421 instructions # 2.47 insn per cycle + 2.013794020 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 2459) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -122,26 +122,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028819e+00 -Avg ME (F77/C++) = 2.0288193548331037 -Relative difference = 1.748963824709674e-07 +Avg ME (F77/C++) = 2.0288191968575120 +Relative difference = 9.703059369476286e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.236653e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.062344e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.062344e+06 ) sec^-1 -MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.220471 sec - 3,470,157,914 cycles # 2.832 GHz - 7,593,755,062 instructions # 2.19 insn per cycle - 1.225848326 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3049) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 9.721991e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.117822e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.117822e+06 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414230e-03 ) GeV^0 +TOTAL : 1.161483 sec + 3,441,215,224 cycles # 2.950 GHz + 7,714,556,442 instructions # 2.24 insn per cycle + 1.167386391 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3071) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -149,26 +149,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288181684445590 -Relative difference = 8.302595855806234e-08 +Avg ME (F77/C++) = 2.0288179996423423 +Relative difference = 1.7628858734720142e-10 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.915047e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.151420e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.151420e+06 ) sec^-1 -MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.142912 sec - 3,256,437,189 cycles # 2.838 GHz - 7,202,634,682 instructions # 2.21 insn per cycle - 1.148660140 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2849) (512y: 23) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.065655e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.245864e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.245864e+06 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414230e-03 ) GeV^0 +TOTAL : 1.066868 sec + 3,168,142,767 cycles # 2.956 GHz + 7,108,351,170 instructions # 2.24 insn per cycle + 1.072831056 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2733) (512y: 13) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -176,26 +176,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288181684445590 -Relative difference = 8.302595855806234e-08 +Avg ME (F77/C++) = 2.0288179996423423 +Relative difference = 1.7628858734720142e-10 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.280239e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.088898e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.088898e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.818298e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.735774e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.735774e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.526066 sec - 3,058,452,115 cycles # 1.998 GHz - 5,834,337,483 instructions # 1.91 insn per cycle - 1.531519039 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2363) (512y: 24) (512z: 1889) +TOTAL : 1.424324 sec + 2,976,782,899 cycles # 2.083 GHz + 5,762,338,734 instructions # 1.94 insn per cycle + 1.430396985 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2088) (512y: 20) (512z: 1914) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -203,8 +203,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288183349184692 -Relative difference = 1.6508058850146622e-07 +Avg ME (F77/C++) = 2.0288183195516467 +Relative difference = 1.5750631496822894e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd1.txt index 5c8f13a099..43203bd7a4 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd1.txt @@ -41,23 +41,23 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2023-11-24_14:32:21 +DATE: 2024-01-25_23:05:11 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/gcheck.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.367052e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.685993e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.017698e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.282862e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.624392e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.015642e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086718e+00 +- 3.413389e-03 ) GeV^0 -TOTAL : 0.471621 sec - 2,109,591,981 cycles # 3.033 GHz - 3,025,510,144 instructions # 1.43 insn per cycle - 0.774479825 seconds time elapsed +TOTAL : 0.480823 sec + 2,102,693,968 cycles # 2.972 GHz + 2,999,740,623 instructions # 1.43 insn per cycle + 0.778832349 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/gcheck.exe -p 2048 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 127 @@ -72,20 +72,20 @@ OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.266296e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.337160e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.337160e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.386604e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.466058e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.466058e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 -TOTAL : 4.710609 sec - 14,404,518,200 cycles # 3.055 GHz - 39,833,821,591 instructions # 2.77 insn per cycle - 4.718963135 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 570) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 4.477679 sec + 13,798,927,191 cycles # 3.078 GHz + 37,479,360,958 instructions # 2.72 insn per cycle + 4.487052345 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 503) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
@@ -93,26 +93,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028820e+00 -Avg ME (F77/C++) = 2.0288199028000236 -Relative difference = 4.790961076489297e-08 +Avg ME (F77/C++) = 2.0288197983754799 +Relative difference = 9.938019153537065e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.070167e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.657007e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.657007e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.180823e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.797776e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.797776e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 -TOTAL : 1.811511 sec - 5,577,453,918 cycles # 3.070 GHz - 15,285,624,341 instructions # 2.74 insn per cycle - 1.824268128 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2474) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 1.779733 sec + 5,483,084,854 cycles # 3.071 GHz + 15,244,773,755 instructions # 2.78 insn per cycle + 1.794293430 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 2330) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -120,26 +120,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028819e+00 -Avg ME (F77/C++) = 2.0288193548331037 -Relative difference = 1.748963824709674e-07 +Avg ME (F77/C++) = 2.0288191968575120 +Relative difference = 9.703059369476286e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.861603e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.557183e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.557183e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.611040 sec - 4,742,441,153 cycles # 2.934 GHz - 9,735,233,054 instructions # 2.05 insn per cycle - 1.623131424 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3708) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 6.879556e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.587809e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.587809e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414230e-03 ) GeV^0 +TOTAL : 1.608086 sec + 4,709,912,951 cycles # 2.918 GHz + 9,849,751,420 instructions # 2.09 insn per cycle + 1.622779833 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3750) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -147,26 +147,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288182108197361 -Relative difference = 1.0391259163456515e-07 +Avg ME (F77/C++) = 2.0288180243223906 +Relative difference = 1.1988453753912676e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.988259e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.740532e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.740532e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.584956 sec - 4,623,129,145 cycles # 2.906 GHz - 9,326,921,854 instructions # 2.02 insn per cycle - 1.599836475 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3496) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 7.215676e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.011410e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.011410e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414230e-03 ) GeV^0 +TOTAL : 1.537749 sec + 4,488,857,938 cycles # 2.908 GHz + 9,201,595,147 instructions # 2.05 insn per cycle + 1.552559223 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3384) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -174,26 +174,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288182108197361 -Relative difference = 1.0391259163456515e-07 +Avg ME (F77/C++) = 2.0288180243223906 +Relative difference = 1.1988453753912676e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.277364e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.853368e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.853368e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.585161e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.226147e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.226147e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.753984 sec - 3,644,734,123 cycles # 2.072 GHz - 7,034,553,715 instructions # 1.93 insn per cycle - 1.766522849 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2605) (512y: 12) (512z: 2221) +TOTAL : 1.676187 sec + 3,452,216,154 cycles # 2.052 GHz + 6,874,474,606 instructions # 1.99 insn per cycle + 1.693691111 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2257) (512y: 8) (512z: 2261) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -201,8 +201,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288183459779248 -Relative difference = 1.7053177021099307e-07 +Avg ME (F77/C++) = 2.0288183217635378 +Relative difference = 1.5859655131013432e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd0.txt index 665123002a..f4ee00fe61 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd0.txt @@ -41,23 +41,23 @@ CUDACPP_BUILDDIR='build.512z_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2023-11-24_15:01:29 +DATE: 2024-01-25_23:44:02 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/gcheck.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.406259e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.654901e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.974381e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.367806e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.645974e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.965500e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086718e+00 +- 3.413389e-03 ) GeV^0 -TOTAL : 0.478015 sec - 2,052,116,202 cycles # 2.923 GHz - 2,932,366,143 instructions # 1.43 insn per cycle - 0.759588643 seconds time elapsed +TOTAL : 0.472161 sec + 2,109,738,277 cycles # 3.018 GHz + 2,952,634,070 instructions # 1.40 insn per cycle + 0.756674168 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/gcheck.exe -p 2048 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 128 @@ -72,20 +72,20 @@ OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.529261e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.621856e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.621856e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.689089e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.789055e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.789055e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 -TOTAL : 4.232103 sec - 12,595,483,741 cycles # 2.973 GHz - 34,372,550,033 instructions # 2.73 insn per cycle - 4.237616805 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 696) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.983379 sec + 12,403,709,488 cycles # 3.110 GHz + 34,216,718,460 instructions # 2.76 insn per cycle + 3.989754240 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 768) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. 
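
The "Relative difference" lines in these logs compare the Fortran-bridge and C++ average matrix elements against the 5E-3 tolerance; a minimal standalone sketch of that check follows (the helper name and exact formula are illustrative assumptions, not the repository's actual cmpExe code):

    #include <cmath>
    #include <cstdio>
    // Hypothetical tolerance check mirroring the "Avg ME" comparison in the
    // hunks below: relative difference of the F77 average ME with respect to
    // the C++ one, tested against the 5E-3 threshold printed in the logs.
    bool compareAvgME( double avgMeCpp, double avgMeF77, double tol = 5e-3 )
    {
      const double relDiff = std::fabs( avgMeF77 / avgMeCpp - 1. );
      std::printf( "Relative difference = %.16e\n", relDiff );
      return relDiff <= tol; // the logs print "OK" when this holds
    }
    int main() // values taken from the none_f_inl1_hrd0 hunk below
    {
      return compareAvgME( 2.028820e+00, 2.0288199088536203 ) ? 0 : 1;
    }

Plugging in the values from the hunk below reproduces the printed 4.49e-08, far inside the 5E-3 acceptance band.
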
@@ -93,26 +93,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028820e+00 -Avg ME (F77/C++) = 2.0288199094356969 -Relative difference = 4.463890496342449e-08 +Avg ME (F77/C++) = 2.0288199088536203 +Relative difference = 4.4925808981097166e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.409282e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.897832e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.897832e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.382450e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.036660e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.036660e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 -TOTAL : 2.025230 sec - 6,107,661,405 cycles # 3.009 GHz - 14,860,133,021 instructions # 2.43 insn per cycle - 2.031040801 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 3009) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 1.727265 sec + 5,355,354,456 cycles # 3.092 GHz + 14,586,803,552 instructions # 2.72 insn per cycle + 1.733316914 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 2947) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -120,26 +120,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028819e+00 -Avg ME (F77/C++) = 2.0288193803280592 -Relative difference = 1.8746278463897685e-07 +Avg ME (F77/C++) = 2.0288192580919713 +Relative difference = 1.2721291123071246e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.413017e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.265518e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.265518e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.499210 sec - 4,274,397,689 cycles # 2.842 GHz - 9,028,681,209 instructions # 2.11 insn per cycle - 1.505009031 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4443) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 8.057917e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.061528e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.061528e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414230e-03 ) GeV^0 +TOTAL : 1.385709 sec + 4,060,564,696 cycles # 2.919 GHz + 9,087,730,757 instructions # 2.24 insn per cycle + 1.391973829 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4501) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -147,26 +147,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288181999931112 -Relative difference = 9.857617164523888e-08 +Avg ME (F77/C++) = 2.0288180499337614 +Relative difference = 2.4612242975974814e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd0/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.545085e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.419779e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.419779e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.475217 sec - 4,206,901,352 cycles # 2.842 GHz - 8,664,206,183 instructions # 2.06 insn per cycle - 1.480947152 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4243) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 8.660623e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.845013e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.845013e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414230e-03 ) GeV^0 +TOTAL : 1.294701 sec + 3,804,324,661 cycles # 2.927 GHz + 8,440,322,737 instructions # 2.22 insn per cycle + 1.301108243 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4043) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -174,26 +174,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288181999931112 -Relative difference = 9.857617164523888e-08 +Avg ME (F77/C++) = 2.0288180499337614 +Relative difference = 2.4612242975974814e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd0/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.444457e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.903029e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.903029e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.034946e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.574893e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.574893e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 2.015920 sec - 3,844,199,069 cycles # 1.909 GHz - 7,810,135,811 instructions # 2.03 insn per cycle - 2.021868576 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4419) (512y: 0) (512z: 2556) +TOTAL : 1.822240 sec + 3,731,963,828 cycles # 2.043 GHz + 7,571,756,363 instructions # 2.03 insn per cycle + 1.828579723 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3646) (512y: 1) (512z: 2853) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -201,8 +201,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288183246739209 -Relative difference = 1.6003107281264138e-07 +Avg ME (F77/C++) = 2.0288183350348845 +Relative difference = 1.6513796936156652e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd1.txt index fa97bf17a1..4e1115173b 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd1.txt @@ -41,23 +41,23 @@ CUDACPP_BUILDDIR='build.512z_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2023-11-24_15:01:53 +DATE: 2024-01-25_23:44:27 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/gcheck.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.425807e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.669791e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.006545e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.456016e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.710724e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.042438e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086718e+00 +- 3.413389e-03 ) GeV^0 -TOTAL : 0.480700 sec - 2,057,638,891 cycles # 2.914 GHz - 2,939,721,478 instructions # 1.43 insn per cycle - 0.763589412 seconds time elapsed +TOTAL : 0.480867 sec + 2,014,288,944 cycles # 2.853 GHz + 2,823,952,881 instructions # 1.40 insn per cycle + 0.764919690 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/gcheck.exe -p 2048 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 127 @@ -72,20 +72,20 @@ OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.734181e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.846132e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.846132e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.800465e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.909661e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.909661e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 -TOTAL : 3.923431 sec - 11,754,442,627 cycles # 2.992 GHz - 35,109,278,353 instructions # 2.99 insn per cycle - 3.929236394 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 470) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.827874 sec + 11,937,612,158 cycles # 3.115 GHz + 35,406,088,073 instructions # 2.97 insn per cycle + 3.834430542 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 469) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. 
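
The EvtsPerSec throughputs quoted for the "-p 2048 256 2" runs are consistent with (blocks x threads x iterations) divided by the elapsed time; a back-of-the-envelope cross-check, assuming that reading of the -p arguments:

    #include <cstdio>
    int main()
    {
      const long long nBlocks = 2048, nThreads = 256, nIter = 2; // '-p 2048 256 2'
      const long long nEvents = nBlocks * nThreads * nIter;      // 1048576 events
      const double elapsed = 3.827874;                           // TOTAL from the hunk above
      std::printf( "~%.3e events/sec\n", nEvents / elapsed );    // ~2.7e+05
      return 0;
    }

This lands in the same ballpark as the EvtsPerSec[Rmb+ME] figure above (2.80e+05); the logged rates are somewhat higher because they exclude startup overhead included in TOTAL.
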
@@ -93,26 +93,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028820e+00 -Avg ME (F77/C++) = 2.0288199094356969 -Relative difference = 4.463890496342449e-08 +Avg ME (F77/C++) = 2.0288199088536203 +Relative difference = 4.4925808981097166e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.529892e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.033657e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.033657e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.762722e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.495247e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.495247e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 -TOTAL : 1.982127 sec - 5,950,643,888 cycles # 2.995 GHz - 14,469,850,336 instructions # 2.43 insn per cycle - 1.987737448 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2572) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 1.633478 sec + 5,067,221,535 cycles # 3.092 GHz + 14,044,637,271 instructions # 2.77 insn per cycle + 1.639618429 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 2487) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -120,26 +120,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028819e+00 -Avg ME (F77/C++) = 2.0288193583255634 -Relative difference = 1.7661780742548925e-07 +Avg ME (F77/C++) = 2.0288192554144189 +Relative difference = 1.2589315209891237e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.598308e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.516031e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.516031e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.464407 sec - 4,161,674,169 cycles # 2.832 GHz - 8,874,846,514 instructions # 2.13 insn per cycle - 1.469970354 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3574) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 8.247761e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.298779e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.298779e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414230e-03 ) GeV^0 +TOTAL : 1.353889 sec + 3,995,635,648 cycles # 2.940 GHz + 8,628,877,597 instructions # 2.16 insn per cycle + 1.360027429 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3422) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -147,26 +147,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288182107033208 -Relative difference = 1.0385521077446488e-07 +Avg ME (F77/C++) = 2.0288180815987289 +Relative difference = 4.021983692325164e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd1/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.657230e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.559591e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.559591e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.454835 sec - 4,142,234,613 cycles # 2.837 GHz - 8,411,473,385 instructions # 2.03 insn per cycle - 1.460527785 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3319) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 8.956843e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.022194e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.022194e+06 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414230e-03 ) GeV^0 +TOTAL : 1.253159 sec + 3,697,308,146 cycles # 2.938 GHz + 8,101,240,276 instructions # 2.19 insn per cycle + 1.259582611 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3105) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -174,26 +174,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd1/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288182107033208 -Relative difference = 1.0385521077446488e-07 +Avg ME (F77/C++) = 2.0288180815987289 +Relative difference = 4.021983692325164e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd1/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.747619e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.236667e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.236667e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.976795e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.519153e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.519153e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.909420 sec - 3,779,842,439 cycles # 1.975 GHz - 7,700,519,284 instructions # 2.04 insn per cycle - 1.914942288 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3435) (512y: 0) (512z: 2108) +TOTAL : 1.839809 sec + 3,582,305,411 cycles # 1.942 GHz + 7,373,472,849 instructions # 2.06 insn per cycle + 1.845929573 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2803) (512y: 1) (512z: 2230) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -201,8 +201,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd1/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288183204829693 -Relative difference = 1.5796536184903122e-07 +Avg ME (F77/C++) = 2.0288183569209650 +Relative difference = 1.7592557106041962e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt index f4ab44e796..62e73e56f9 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt @@ -41,23 +41,23 @@ CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2023-11-24_14:32:45 +DATE: 2024-01-25_23:05:35 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/gcheck.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.583584e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.155538e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.269480e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.042238e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.143081e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.277579e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.521409 sec - 2,266,982,107 cycles # 3.000 GHz - 3,245,259,859 instructions # 1.43 insn per cycle - 0.825828714 seconds time elapsed +TOTAL : 0.529983 sec + 2,284,372,825 cycles # 2.981 GHz + 3,253,251,346 instructions # 1.42 insn per cycle + 0.837040452 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/gcheck.exe -p 2048 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 @@ -72,20 +72,20 @@ OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.183966e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.247909e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.247909e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.177107e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.241482e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.241482e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 4.906531 sec - 15,256,207,680 cycles # 3.106 GHz - 38,576,110,907 instructions # 2.53 insn per cycle - 4.915147304 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 672) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 4.922213 sec + 15,202,220,905 cycles # 3.086 GHz + 39,294,318,145 instructions # 2.58 insn per cycle + 4.931576421 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 740) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -99,20 +99,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.714814e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.918909e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.918909e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.753886e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.970015e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.970015e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.927400 sec - 8,963,531,758 cycles # 3.056 GHz - 24,224,066,775 instructions # 2.70 insn per cycle - 2.945563789 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2188) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 2.899375 sec + 8,841,932,542 cycles # 3.043 GHz + 24,092,972,952 instructions # 2.72 insn per cycle + 2.913268497 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 2102) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -126,20 +126,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.052463e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.591282e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.591282e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.715445e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.218183e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.218183e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.836853 sec - 5,386,059,012 cycles # 2.923 GHz - 11,276,681,340 instructions # 2.09 insn per cycle - 1.853860147 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2480) (512y: 0) (512z: 0) +TOTAL : 1.941407 sec + 5,504,561,337 cycles # 2.827 GHz + 11,449,312,759 instructions # 2.08 insn per cycle + 1.955081745 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2467) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
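
As a reading aid, the "insn per cycle" figure in each perf summary is simply the instruction count divided by the cycle count: for the avx2_m_inl0_hrd0 run above, 11,449,312,759 / 5,504,561,337 gives approximately 2.08, matching the printed value.
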
@@ -147,26 +147,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288064057068964 -Relative difference = 2.9292737240031234e-07 +Avg ME (F77/C++) = 2.0288063930599014 +Relative difference = 2.9916108265801754e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.743199e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.418159e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.418159e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.882817e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.597475e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.597475e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.658034 sec - 4,871,841,569 cycles # 2.928 GHz - 10,526,597,351 instructions # 2.16 insn per cycle - 1.674830124 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2167) (512y: 148) (512z: 0) +TOTAL : 1.628926 sec + 4,780,392,865 cycles # 2.924 GHz + 10,317,148,509 instructions # 2.16 insn per cycle + 1.646633592 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2076) (512y: 133) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -174,26 +174,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288064057068964 -Relative difference = 2.9292737240031234e-07 +Avg ME (F77/C++) = 2.0288063930599014 +Relative difference = 2.9916108265801754e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.142731e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.384818e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.384818e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.601015e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.909270e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.909270e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.635826 sec - 5,341,588,845 cycles # 2.022 GHz - 7,603,619,006 instructions # 1.42 insn per cycle - 2.650860981 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1633) (512y: 126) (512z: 1608) +TOTAL : 2.383696 sec + 4,853,543,919 cycles # 2.031 GHz + 7,366,655,182 instructions # 1.52 insn per cycle + 2.397229409 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1366) (512y: 69) (512z: 1611) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -201,8 +201,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288064057068964 -Relative difference = 2.9292737240031234e-07 +Avg ME (F77/C++) = 2.0288063930599014 +Relative difference = 2.9916108265801754e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd1.txt index 261980075b..dbc018b658 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd1.txt @@ -41,23 +41,23 @@ CUDACPP_BUILDDIR='build.512z_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2023-11-24_14:33:12 +DATE: 2024-01-25_23:06:02 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/gcheck.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.569187e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.154655e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.270741e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.027996e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.136763e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.273770e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.519942 sec - 2,236,595,831 cycles # 2.972 GHz - 3,152,119,637 instructions # 1.41 insn per cycle - 0.821038019 seconds time elapsed +TOTAL : 0.529349 sec + 2,275,344,800 cycles # 2.980 GHz + 3,194,805,823 instructions # 1.40 insn per cycle + 0.834615249 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/gcheck.exe -p 2048 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 208 @@ -72,20 +72,20 @@ OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.146577e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.209722e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.209722e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.197645e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.262543e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.262543e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 4.990433 sec - 15,248,789,401 cycles # 3.053 GHz - 40,369,458,696 instructions # 2.65 insn per cycle - 4.998323590 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 669) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 4.876786 sec + 15,083,175,183 cycles # 3.091 GHz + 40,115,300,077 instructions # 2.66 insn per cycle + 4.885756664 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 630) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -99,20 +99,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.998701e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.233932e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.233932e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.836140e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.061195e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.061195e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.726132 sec - 8,491,093,801 cycles # 3.108 GHz - 23,253,788,222 instructions # 2.74 insn per cycle - 2.739959213 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2091) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 2.843009 sec + 8,673,476,327 cycles # 3.047 GHz + 23,533,368,418 instructions # 2.71 insn per cycle + 2.936629019 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 1993) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -126,20 +126,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.127833e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.510579e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.510579e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.262735e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.669813e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.669813e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.148617 sec - 6,247,078,338 cycles # 2.900 GHz - 12,963,549,380 instructions # 2.08 insn per cycle - 2.164643261 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2669) (512y: 0) (512z: 0) +TOTAL : 2.096332 sec + 6,163,295,525 cycles # 2.932 GHz + 13,102,492,963 instructions # 2.13 insn per cycle + 2.114571383 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2711) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
@@ -147,26 +147,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288064057068964 -Relative difference = 2.9292737240031234e-07 +Avg ME (F77/C++) = 2.0288063930599014 +Relative difference = 2.9916108265801754e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.424850e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.870009e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.870009e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.546010e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.002876e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.002876e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.038674 sec - 5,929,446,799 cycles # 2.902 GHz - 12,241,198,193 instructions # 2.06 insn per cycle - 2.052944380 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2209) (512y: 296) (512z: 0) +TOTAL : 1.996809 sec + 5,749,993,886 cycles # 2.871 GHz + 12,209,923,535 instructions # 2.12 insn per cycle + 2.011417030 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2201) (512y: 282) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -174,26 +174,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288064057068964 -Relative difference = 2.9292737240031234e-07 +Avg ME (F77/C++) = 2.0288063930599014 +Relative difference = 2.9916108265801754e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.928483e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.143845e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.143845e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.204857e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.459679e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.459679e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.772333 sec - 5,602,682,561 cycles # 2.017 GHz - 8,743,458,912 instructions # 1.56 insn per cycle - 2.786330437 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1490) (512y: 183) (512z: 1909) +TOTAL : 2.599675 sec + 5,257,659,489 cycles # 2.018 GHz + 8,448,298,827 instructions # 1.61 insn per cycle + 2.614773289 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1324) (512y: 84) (512z: 1919) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -201,8 +201,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288064057068964 -Relative difference = 2.9292737240031234e-07 +Avg ME (F77/C++) = 2.0288063930599014 +Relative difference = 2.9916108265801754e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt index 157dda07e9..fa8caa938e 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt @@ -41,23 +41,23 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2023-11-24_14:33:40 +DATE: 2024-01-25_23:06:29 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.750385e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.049934e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.064279e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.554064e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.054520e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.071264e+07 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 0.462904 sec - 2,042,714,082 cycles # 3.011 GHz - 2,902,296,650 instructions # 1.42 insn per cycle - 0.758295254 seconds time elapsed +TOTAL : 0.465030 sec + 2,048,991,704 cycles # 2.999 GHz + 2,945,132,376 instructions # 1.44 insn per cycle + 0.753228173 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -65,17 +65,17 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ......................................................................... runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.085304e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.323684e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.337692e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.042508e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.318801e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.335462e+07 ) sec^-1 MeanMatrixElemValue = ( 6.734461e+02 +- 4.775415e+02 ) GeV^-2 -TOTAL : 0.603567 sec - 2,537,612,301 cycles # 3.015 GHz - 3,865,051,021 instructions # 1.52 insn per cycle - 0.900037358 seconds time elapsed +TOTAL : 0.608387 sec + 2,548,936,715 cycles # 3.003 GHz + 3,858,101,121 instructions # 1.51 insn per cycle + 0.909984749 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 @@ -86,20 +86,20 @@ OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.562300e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.574662e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.574662e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.617017e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.629887e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.629887e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 6.416503 sec - 19,738,905,705 cycles # 3.075 GHz - 59,604,379,345 instructions # 3.02 insn per cycle - 6.423169029 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1466) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.283295 sec + 19,499,856,239 cycles # 3.102 GHz + 57,919,782,342 instructions # 2.97 insn per cycle + 6.290250953 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 1134) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -107,26 +107,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcess cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 -Avg ME (F77/C++) = 1.4131213684432433 -Relative difference = 4.46923023397472e-07 +Avg ME (F77/C++) = 1.4131213684432431 +Relative difference = 4.4692302355460254e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.963631e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.008800e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.008800e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.909036e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.954610e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.954610e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 3.322549 sec - 10,350,945,705 cycles # 3.112 GHz - 30,674,390,605 instructions # 2.96 insn per cycle - 3.334406966 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 5153) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.360323 sec + 10,199,661,444 cycles # 3.037 GHz + 29,947,464,065 instructions # 2.94 insn per cycle + 3.374239300 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 4742) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -140,20 +140,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.645905e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.824459e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.824459e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.833419e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.001345e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.001345e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.720961 sec - 4,880,621,681 cycles # 2.829 GHz - 11,019,918,413 instructions # 2.26 insn per cycle - 1.732752875 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4467) (512y: 0) (512z: 0) +TOTAL : 1.688645 sec + 4,909,126,113 cycles # 2.900 GHz + 11,211,204,678 instructions # 2.28 insn per cycle + 1.701281092 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4396) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -167,20 +167,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.100615e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.123065e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.123065e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.121742e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.145009e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.145009e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.510594 sec - 4,368,423,747 cycles # 2.884 GHz - 10,296,629,918 instructions # 2.36 insn per cycle - 1.523579096 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4137) (512y: 91) (512z: 0) +TOTAL : 1.483031 sec + 4,297,587,774 cycles # 2.889 GHz + 10,187,383,914 instructions # 2.37 insn per cycle + 1.498495737 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3895) (512y: 81) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -194,20 +194,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.837255e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.949435e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.949435e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.229760e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.353025e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.353025e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 2.114472 sec - 4,096,385,157 cycles # 1.934 GHz - 5,842,611,877 instructions # 1.43 insn per cycle - 2.127694185 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1540) (512y: 95) (512z: 3466) +TOTAL : 2.015215 sec + 3,913,493,676 cycles # 1.938 GHz + 5,708,818,352 instructions # 1.46 insn per cycle + 2.028873157 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1258) (512y: 74) (512z: 3396) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0_bridge.txt index 62c84d1195..096dd99876 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0_bridge.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2023-11-24_15:11:49 +DATE: 2024-01-25_23:54:18 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -51,17 +51,17 @@ WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.639935e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.816863e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.816863e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.637551e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.758962e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.758962e+06 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 0.492213 sec - 2,054,207,791 cycles # 2.926 GHz - 3,126,128,113 instructions # 1.52 insn per cycle - 0.760942013 seconds time elapsed +TOTAL : 0.489837 sec + 2,121,332,082 cycles # 3.002 GHz + 3,184,466,633 instructions # 1.50 insn per cycle + 0.765192095 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost @@ -77,17 +77,17 @@ WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.653176e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.472944e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.472944e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.708039e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.519148e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.519148e+06 ) sec^-1 MeanMatrixElemValue = ( 6.734461e+02 +- 4.775415e+02 ) GeV^-2 -TOTAL : 0.830881 sec - 3,123,864,419 cycles # 2.883 GHz - 4,825,503,220 instructions # 1.54 insn per cycle - 1.142577785 seconds time elapsed +TOTAL : 0.826215 sec + 3,226,959,594 cycles # 2.979 GHz + 5,155,103,256 instructions # 1.60 insn per cycle + 1.143617843 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 @@ -99,20 +99,20 @@ OK (relative difference <= 5E-3) runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.499287e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.511764e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.511764e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.596028e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.608949e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.608949e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 6.585609 sec - 19,745,466,264 cycles # 2.998 GHz - 59,612,197,800 instructions # 3.02 insn per cycle - 6.590135647 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1466) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.338921 sec + 19,540,873,038 cycles # 3.081 GHz + 57,925,079,271 instructions # 2.96 insn per cycle + 6.344190271 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 1134) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
@@ -120,27 +120,27 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcess cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 -Avg ME (F77/C++) = 1.4131213684432433 -Relative difference = 4.46923023397472e-07 +Avg ME (F77/C++) = 1.4131213684432431 +Relative difference = 4.4692302355460254e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.693234e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.737492e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.737492e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.992522e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.039048e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.039048e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 3.521832 sec - 10,407,813,275 cycles # 2.952 GHz - 30,725,415,932 instructions # 2.95 insn per cycle - 3.526742122 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 5153) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.310489 sec + 10,222,193,867 cycles # 3.084 GHz + 29,992,757,087 instructions # 2.93 insn per cycle + 3.315734573 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 4742) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -155,20 +155,20 @@ OK (relative difference <= 5E-3) runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.553194e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.735067e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.735067e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.748259e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.928510e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.928510e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.745455 sec - 4,924,157,638 cycles # 2.815 GHz - 11,068,084,114 instructions # 2.25 insn per cycle - 1.749966089 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4467) (512y: 0) (512z: 0) +TOTAL : 1.709995 sec + 4,943,239,952 cycles # 2.885 GHz + 11,259,850,029 instructions # 2.28 insn per cycle + 1.714904879 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4396) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -183,20 +183,20 @@ OK (relative difference <= 5E-3) runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.070624e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.092463e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.092463e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.066367e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.090143e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.090143e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.559525 sec - 4,401,889,175 cycles # 2.815 GHz - 10,345,025,900 instructions # 2.35 insn per cycle - 1.564094276 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4137) (512y: 91) (512z: 0) +TOTAL : 1.568862 sec + 4,349,115,858 cycles # 2.765 GHz + 10,237,621,087 instructions # 2.35 insn per cycle + 1.574339542 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3895) (512y: 81) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -211,20 +211,20 @@ OK (relative difference <= 5E-3) runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.394172e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.507838e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.507838e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.164835e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.291624e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.291624e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 2.248960 sec - 4,150,304,517 cycles # 1.842 GHz - 5,879,795,808 instructions # 1.42 insn per cycle - 2.253443263 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1540) (512y: 95) (512z: 3466) +TOTAL : 2.037518 sec + 3,945,267,341 cycles # 1.933 GHz + 5,747,633,925 instructions # 1.46 insn per cycle + 2.042653419 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1258) (512y: 74) (512z: 3396) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd1.txt index 739a090c03..25c18afcf9 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd1.txt @@ -41,23 +41,23 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2023-11-24_14:34:09 +DATE: 2024-01-25_23:06:58 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/gcheck.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.713263e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.038994e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.053292e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.433352e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.037630e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.053848e+07 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 0.459803 sec - 2,029,299,480 cycles # 3.004 GHz - 2,918,522,159 instructions # 1.44 insn per cycle - 0.740886124 seconds time elapsed +TOTAL : 0.463370 sec + 2,015,424,656 cycles # 2.959 GHz + 2,929,966,501 instructions # 1.45 insn per cycle + 0.751673536 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/gcheck.exe -p 64 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -65,17 +65,17 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ......................................................................... runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.076194e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.311479e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.325166e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.036681e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.308916e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.325873e+07 ) sec^-1 MeanMatrixElemValue = ( 6.734461e+02 +- 4.775415e+02 ) GeV^-2 -TOTAL : 0.596867 sec - 2,506,078,107 cycles # 3.006 GHz - 3,672,565,526 instructions # 1.47 insn per cycle - 0.892288871 seconds time elapsed +TOTAL : 0.603072 sec + 2,515,534,303 cycles # 2.981 GHz + 3,732,014,009 instructions # 1.48 insn per cycle + 0.904884829 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/fgcheck.exe 2 64 2 @@ -86,20 +86,20 @@ OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.614658e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.627547e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.627547e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.611397e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.624145e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.624145e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 6.288241 sec - 19,506,895,795 cycles # 3.101 GHz - 58,795,912,300 instructions # 3.01 insn per cycle - 6.294901652 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1313) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.297061 sec + 19,501,945,017 cycles # 3.096 GHz + 57,747,982,567 instructions # 2.96 insn per cycle + 6.304284821 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 1087) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -107,26 +107,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcess cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 -Avg ME (F77/C++) = 1.4131213684432433 -Relative difference = 4.46923023397472e-07 +Avg ME (F77/C++) = 1.4131213684432431 +Relative difference = 4.4692302355460254e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.998673e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.044453e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.044453e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.963950e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.009246e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.009246e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 3.299411 sec - 10,248,689,037 cycles # 3.103 GHz - 30,346,682,819 instructions # 2.96 insn per cycle - 3.310933910 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 4970) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.323112 sec + 10,252,760,514 cycles # 3.082 GHz + 30,333,563,638 instructions # 2.96 insn per cycle + 3.334682355 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 4806) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -140,20 +140,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.591359e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.761947e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.761947e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.543659e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.713856e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.713856e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.730287 sec - 5,042,983,525 cycles # 2.908 GHz - 11,483,783,187 instructions # 2.28 insn per cycle - 1.743862515 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4591) (512y: 0) (512z: 0) +TOTAL : 1.739150 sec + 5,054,703,175 cycles # 2.901 GHz + 11,664,593,433 instructions # 2.31 insn per cycle + 1.752926203 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4489) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -167,20 +167,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.041838e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.061977e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.061977e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.038420e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.058455e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.058455e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.594766 sec - 4,637,285,824 cycles # 2.900 GHz - 10,841,968,745 instructions # 2.34 insn per cycle - 1.604083789 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4183) (512y: 244) (512z: 0) +TOTAL : 1.600620 sec + 4,609,440,056 cycles # 2.871 GHz + 10,805,674,859 instructions # 2.34 insn per cycle + 1.612255696 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3988) (512y: 237) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -194,20 +194,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.531164e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.641315e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.641315e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.858559e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.976491e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.976491e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 2.200047 sec - 4,116,597,786 cycles # 1.868 GHz - 6,106,799,640 instructions # 1.48 insn per cycle - 2.213612018 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1457) (512y: 139) (512z: 3568) +TOTAL : 2.109622 sec + 3,949,478,547 cycles # 1.868 GHz + 5,998,434,731 instructions # 1.52 insn per cycle + 2.131507197 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1241) (512y: 81) (512z: 3500) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt index ee258b9eb8..0b74a76420 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt @@ -41,23 +41,23 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2023-11-24_14:34:38 +DATE: 2024-01-25_23:07:28 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.486813e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.354265e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.453893e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.397686e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.380368e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.494164e+07 ) sec^-1 MeanMatrixElemValue = ( 1.008472e+02 +- 5.002447e+01 ) GeV^-2 -TOTAL : 0.444260 sec - 1,976,391,201 cycles # 2.992 GHz - 2,802,325,698 instructions # 1.42 insn per cycle - 0.735914466 seconds time elapsed +TOTAL : 0.443485 sec + 1,988,977,095 cycles # 2.997 GHz + 2,794,054,594 instructions # 1.40 insn per cycle + 0.738229663 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 254 @@ -65,17 +65,17 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ......................................................................... runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.217656e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.394394e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.476515e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.039325e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.377418e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.475776e+07 ) sec^-1 MeanMatrixElemValue = ( 6.630099e+02 +- 4.770719e+02 ) GeV^-2 -TOTAL : 0.493371 sec - 2,193,458,897 cycles # 3.016 GHz - 3,142,646,768 instructions # 1.43 insn per cycle - 0.786643658 seconds time elapsed +TOTAL : 0.495765 sec + 2,163,616,138 cycles # 2.985 GHz + 3,081,644,651 instructions # 1.42 insn per cycle + 0.782078599 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 @@ -86,74 +86,74 @@ OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.645635e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.659236e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.659236e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.800506e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.815684e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.815684e+04 ) sec^-1 MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2 -TOTAL : 6.214195 sec - 19,058,711,796 cycles # 3.065 GHz - 58,958,354,991 instructions # 3.09 insn per cycle - 6.220591504 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1034) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 5.871499 sec + 18,162,391,724 cycles # 3.091 GHz + 55,237,317,005 instructions # 3.04 insn per cycle + 5.878377310 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 1229) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.412986e+00 -Avg ME (F77/C++) = 1.4129858051842916 -Relative difference = 1.3787518662898538e-07 +Avg ME (C++/C++) = 1.412998e+00 +Avg ME (F77/C++) = 1.4129977771372637 +Relative difference = 1.5772332039074602e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.795314e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.946425e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.946425e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.026560e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.187110e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.187110e+04 ) sec^-1 MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2 -TOTAL : 1.882613 sec - 5,847,309,673 cycles # 3.099 GHz - 16,694,109,682 instructions # 2.86 insn per cycle - 1.898696341 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 5765) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 1.835796 sec + 5,681,014,790 cycles # 3.088 GHz + 16,127,858,668 instructions # 2.84 insn per cycle + 1.846414018 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 5205) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.412987e+00 -Avg ME (F77/C++) = 1.4129865669244737 -Relative difference = 3.06496469061158e-07 +Avg ME (C++/C++) = 1.412986e+00 +Avg ME (F77/C++) = 1.4129864902818952 +Relative difference = 3.469828399449743e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.901790e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.970007e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.970007e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.894717e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.962920e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.962920e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008855e+02 +- 5.002467e+01 ) GeV^-2 -TOTAL : 0.881272 sec - 2,581,984,725 cycles # 2.916 GHz - 5,980,308,545 instructions # 2.32 insn per cycle - 0.895784511 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4917) (512y: 0) (512z: 0) +TOTAL : 0.885139 sec + 2,584,367,657 cycles # 2.906 GHz + 6,085,782,007 instructions # 2.35 insn per cycle + 0.899021976 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4878) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -161,26 +161,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcess cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413316e+00 -Avg ME (F77/C++) = 1.4133161655815059 -Relative difference = 1.1715816267550621e-07 +Avg ME (F77/C++) = 1.4133158486847037 +Relative difference = 1.0706402269051248e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.105287e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.189516e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.189516e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.135210e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.222813e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.222813e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008855e+02 +- 5.002467e+01 ) GeV^-2 -TOTAL : 0.797874 sec - 2,345,612,557 cycles # 2.925 GHz - 5,602,963,634 instructions # 2.39 insn per cycle - 0.810229657 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4645) (512y: 36) (512z: 0) +TOTAL : 0.788022 sec + 2,287,899,951 cycles # 2.888 GHz + 5,552,661,648 instructions # 2.43 insn per cycle + 0.799868307 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4415) (512y: 30) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -188,26 +188,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcess cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413316e+00 -Avg ME (F77/C++) = 1.4133161655815059 -Relative difference = 1.1715816267550621e-07 +Avg ME (F77/C++) = 1.4133158486847037 +Relative difference = 1.0706402269051248e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.549899e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.597374e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.597374e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.622321e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.674923e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.674923e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008856e+02 +- 5.002468e+01 ) GeV^-2 -TOTAL : 1.080161 sec - 2,054,065,161 cycles # 1.894 GHz - 3,334,275,922 instructions # 1.62 insn per cycle - 1.095623221 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2131) (512y: 39) (512z: 3668) +TOTAL : 1.032955 sec + 2,017,528,772 cycles # 1.944 GHz + 3,285,966,058 instructions # 1.63 insn per cycle + 1.048446410 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1905) (512y: 28) (512z: 3597) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -215,8 +215,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcess cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413316e+00 -Avg ME (F77/C++) = 1.4133164033579249 -Relative difference = 2.85398258307829e-07 +Avg ME (F77/C++) = 1.4133164031689205 +Relative difference = 2.852645271622733e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0_bridge.txt index d8dc2c3678..dc0596d15b 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0_bridge.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2023-11-24_15:12:19 +DATE: 2024-01-25_23:54:47 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -51,17 +51,17 @@ WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) WARNING! 
Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.001518e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.137858e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.137858e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.062167e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.079545e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.079545e+07 ) sec^-1 MeanMatrixElemValue = ( 1.009071e+02 +- 5.002295e+01 ) GeV^-2 -TOTAL : 0.455815 sec - 1,948,301,357 cycles # 2.927 GHz - 2,907,177,849 instructions # 1.49 insn per cycle - 0.723196787 seconds time elapsed +TOTAL : 0.452726 sec + 1,977,358,341 cycles # 2.985 GHz + 2,923,970,350 instructions # 1.48 insn per cycle + 0.720026423 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost @@ -77,17 +77,17 @@ WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.698909e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.564625e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.564625e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.811088e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.588064e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.588064e+07 ) sec^-1 MeanMatrixElemValue = ( 6.737500e+02 +- 4.776370e+02 ) GeV^-2 -TOTAL : 0.639136 sec - 2,545,540,064 cycles # 2.932 GHz - 3,908,776,443 instructions # 1.54 insn per cycle - 0.927730276 seconds time elapsed +TOTAL : 0.632525 sec + 2,597,542,173 cycles # 3.015 GHz + 3,997,383,655 instructions # 1.54 insn per cycle + 0.921189224 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 @@ -99,76 +99,76 @@ OK (relative difference <= 5E-3) runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.559629e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.572951e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.572951e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.787761e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.803517e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.803517e+04 ) sec^-1 MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2 -TOTAL : 6.425977 sec - 19,091,912,214 cycles # 2.972 GHz - 58,967,324,889 instructions # 3.09 insn per cycle - 6.430306421 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1034) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 5.900987 sec + 18,196,746,451 cycles # 3.082 GHz + 55,241,449,113 instructions # 3.04 insn per cycle + 5.905790765 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 1229) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.412986e+00 -Avg ME (F77/C++) = 1.4129858051842916 -Relative difference = 1.3787518662898538e-07 +Avg ME (C++/C++) = 1.412998e+00 +Avg ME (F77/C++) = 1.4129977771372637 +Relative difference = 1.5772332039074602e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.438378e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.588398e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.588398e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.748340e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.909538e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.909538e+04 ) sec^-1 MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2 -TOTAL : 1.967455 sec - 5,879,551,760 cycles # 2.983 GHz - 16,741,960,213 instructions # 2.85 insn per cycle - 1.971863675 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 5765) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 1.898315 sec + 5,708,476,501 cycles # 3.002 GHz + 16,175,955,794 instructions # 2.83 insn per cycle + 1.903472906 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 5205) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.412987e+00 -Avg ME (F77/C++) = 1.4129865669244737 -Relative difference = 3.06496469061158e-07 +Avg ME (C++/C++) = 1.412986e+00 +Avg ME (F77/C++) = 1.4129864902818952 +Relative difference = 3.469828399449743e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.817320e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.883646e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.883646e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.884738e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.952981e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.952981e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008855e+02 +- 5.002467e+01 ) GeV^-2 -TOTAL : 0.926249 sec - 2,607,056,313 cycles # 2.804 GHz - 6,016,570,943 instructions # 2.31 insn per cycle - 0.930566113 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4917) (512y: 0) (512z: 0) +TOTAL : 0.893745 sec + 2,602,515,487 cycles # 2.899 GHz + 6,121,386,159 instructions # 2.35 insn per cycle + 0.898505248 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4878) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -176,27 +176,27 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcess cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413316e+00 -Avg ME (F77/C++) = 1.4133161655815059 -Relative difference = 1.1715816267550621e-07 +Avg ME (F77/C++) = 1.4133158486847037 +Relative difference = 1.0706402269051248e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.015882e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.096022e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.096022e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.141965e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.229819e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.229819e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008855e+02 +- 5.002467e+01 ) GeV^-2 -TOTAL : 0.837462 sec - 2,364,091,843 cycles # 2.810 GHz - 5,638,984,109 instructions # 2.39 insn per cycle - 0.841852545 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4645) (512y: 36) (512z: 0) +TOTAL : 0.789242 sec + 2,305,479,442 cycles # 2.907 GHz + 5,588,939,300 instructions # 2.42 insn per cycle + 0.794159548 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4415) (512y: 30) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -204,27 +204,27 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcess cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413316e+00 -Avg ME (F77/C++) = 1.4133161655815059 -Relative difference = 1.1715816267550621e-07 +Avg ME (F77/C++) = 1.4133158486847037 +Relative difference = 1.0706402269051248e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.519186e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.565700e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.565700e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.633816e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.685769e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.685769e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008856e+02 +- 5.002468e+01 ) GeV^-2 -TOTAL : 1.105946 sec - 2,075,898,441 cycles # 1.871 GHz - 3,374,696,524 instructions # 1.63 insn per cycle - 1.110329864 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2131) (512y: 39) (512z: 3668) +TOTAL : 1.030236 sec + 2,031,108,987 cycles # 1.966 GHz + 3,327,076,453 instructions # 1.64 insn per cycle + 1.034960601 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1905) (512y: 28) (512z: 3597) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -232,8 +232,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcess cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413316e+00 -Avg ME (F77/C++) = 1.4133164033579249 -Relative difference = 2.85398258307829e-07 +Avg ME (F77/C++) = 1.4133164031689205 +Relative difference = 2.852645271622733e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd1.txt index 69970d8c55..71738afd73 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd1.txt @@ -41,23 +41,23 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2023-11-24_14:35:02 +DATE: 2024-01-25_23:07:52 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/gcheck.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.373914e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.228922e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.325879e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.345447e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.296112e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.402388e+07 ) sec^-1 MeanMatrixElemValue = ( 1.008472e+02 +- 5.002447e+01 ) GeV^-2 -TOTAL : 0.443366 sec - 2,003,938,604 cycles # 2.996 GHz - 2,792,184,039 instructions # 1.39 insn per cycle - 0.740353888 seconds time elapsed +TOTAL : 0.448536 sec + 1,925,765,115 cycles # 2.904 GHz + 2,794,221,799 instructions # 1.45 insn per cycle + 0.739665941 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/gcheck.exe -p 64 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 248 @@ -65,17 +65,17 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ......................................................................... runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.237434e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.431964e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.514879e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.063360e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.429159e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.528806e+07 ) sec^-1 MeanMatrixElemValue = ( 6.630099e+02 +- 4.770719e+02 ) GeV^-2 -TOTAL : 0.491294 sec - 2,181,970,196 cycles # 3.018 GHz - 3,127,779,956 instructions # 1.43 insn per cycle - 0.781703991 seconds time elapsed +TOTAL : 0.496316 sec + 2,170,960,632 cycles # 2.997 GHz + 3,070,673,937 instructions # 1.41 insn per cycle + 0.782694833 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/fgcheck.exe 2 64 2 @@ -86,74 +86,74 @@ OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.673738e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.687597e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.687597e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.784835e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.799936e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.799936e+04 ) sec^-1 MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2 -TOTAL : 6.148417 sec - 18,978,547,749 cycles # 3.085 GHz - 58,701,989,638 instructions # 3.09 insn per cycle - 6.154751887 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1029) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 5.904353 sec + 18,127,930,758 cycles # 3.068 GHz + 54,990,203,516 instructions # 3.03 insn per cycle + 5.911564510 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 1171) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.412986e+00 -Avg ME (F77/C++) = 1.4129858051842916 -Relative difference = 1.3787518662898538e-07 +Avg ME (C++/C++) = 1.412998e+00 +Avg ME (F77/C++) = 1.4129977771372637 +Relative difference = 1.5772332039074602e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.234838e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.399257e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.399257e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.265549e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.433505e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.433505e+04 ) sec^-1 MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2 -TOTAL : 1.793566 sec - 5,583,260,034 cycles # 3.106 GHz - 16,511,055,213 instructions # 2.96 insn per cycle - 1.805973554 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 5551) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 1.788383 sec + 5,530,075,500 cycles # 3.085 GHz + 16,222,672,133 instructions # 2.93 insn per cycle + 1.800758208 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 5136) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.412987e+00 -Avg ME (F77/C++) = 1.4129865669244737 -Relative difference = 3.06496469061158e-07 +Avg ME (C++/C++) = 1.412986e+00 +Avg ME (F77/C++) = 1.4129863487235070 +Relative difference = 2.4679898241023883e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.640420e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.690986e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.690986e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.641224e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.691486e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.691486e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008855e+02 +- 5.002467e+01 ) GeV^-2 -TOTAL : 1.018575 sec - 2,975,461,813 cycles # 2.909 GHz - 6,634,551,571 instructions # 2.23 insn per cycle - 1.032313323 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 5568) (512y: 0) (512z: 0) +TOTAL : 1.018368 sec + 2,972,289,589 cycles # 2.906 GHz + 6,708,049,819 instructions # 2.26 insn per cycle + 1.031379068 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 5430) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -161,26 +161,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcess cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413316e+00 -Avg ME (F77/C++) = 1.4133161655815059 -Relative difference = 1.1715816267550621e-07 +Avg ME (F77/C++) = 1.4133158486847037 +Relative difference = 1.0706402269051248e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.779474e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.839054e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.839054e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.801339e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.862952e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.862952e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008855e+02 +- 5.002467e+01 ) GeV^-2 -TOTAL : 0.940459 sec - 2,751,956,582 cycles # 2.914 GHz - 6,255,845,013 instructions # 2.27 insn per cycle - 0.955633764 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 5279) (512y: 25) (512z: 0) +TOTAL : 0.930483 sec + 2,705,842,308 cycles # 2.895 GHz + 6,222,888,546 instructions # 2.30 insn per cycle + 0.943738964 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 5056) (512y: 24) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -188,26 +188,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcess cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd1/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413316e+00 -Avg ME (F77/C++) = 1.4133161655815059 -Relative difference = 1.1715816267550621e-07 +Avg ME (F77/C++) = 1.4133158486847037 +Relative difference = 1.0706402269051248e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.472370e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.513955e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.513955e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.521231e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.565310e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.565310e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008856e+02 +- 5.002468e+01 ) GeV^-2 -TOTAL : 1.135039 sec - 2,225,305,054 cycles # 1.953 GHz - 3,698,319,936 instructions # 1.66 insn per cycle - 1.145678367 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2378) (512y: 29) (512z: 3963) +TOTAL : 1.099011 sec + 2,151,243,651 cycles # 1.950 GHz + 3,642,085,891 instructions # 1.69 insn per cycle + 1.111518081 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2070) (512y: 21) (512z: 3922) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -215,8 +215,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcess cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd1/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413316e+00 -Avg ME (F77/C++) = 1.4133164033579249 -Relative difference = 2.85398258307829e-07 +Avg ME (F77/C++) = 1.4133164031689205 +Relative difference = 2.852645271622733e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt index ecdaa6bbe5..763ed418db 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt @@ -41,23 +41,23 @@ CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2023-11-24_14:35:27 +DATE: 2024-01-25_23:08:17 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/gcheck.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.701296e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.042819e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.057341e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.452662e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.040665e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.056694e+07 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 0.460305 sec - 2,033,483,355 cycles # 3.015 GHz - 2,914,097,696 instructions # 1.43 insn per cycle - 0.748064785 seconds time elapsed +TOTAL : 0.465162 sec + 2,037,507,117 cycles # 2.995 GHz + 2,884,960,920 instructions # 1.42 insn per cycle + 0.752471370 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/gcheck.exe -p 64 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -65,17 +65,17 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ......................................................................... runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.081681e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.313999e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.327824e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.039801e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.309901e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.326601e+07 ) sec^-1 MeanMatrixElemValue = ( 6.734461e+02 +- 4.775415e+02 ) GeV^-2 -TOTAL : 0.605547 sec - 2,520,007,236 cycles # 2.997 GHz - 3,773,324,991 instructions # 1.50 insn per cycle - 0.902409459 seconds time elapsed +TOTAL : 0.609431 sec + 2,555,228,465 cycles # 3.005 GHz + 3,858,150,049 instructions # 1.51 insn per cycle + 0.910318180 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/fgcheck.exe 2 64 2 @@ -86,20 +86,20 @@ OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.516282e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.528433e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.528433e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.552046e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.564457e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.564457e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 6.534539 sec - 20,013,949,237 cycles # 3.061 GHz - 60,533,001,946 instructions # 3.02 insn per cycle - 6.541112859 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1399) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.443119 sec + 19,943,249,308 cycles # 3.094 GHz + 59,159,929,401 instructions # 2.97 insn per cycle + 6.450554622 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 1149) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -113,20 +113,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.912613e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.958884e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.958884e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.970824e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.018053e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.018053e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 3.357235 sec - 10,182,360,341 cycles # 3.029 GHz - 30,384,902,169 instructions # 2.98 insn per cycle - 3.367308670 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 5280) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.318511 sec + 10,090,639,333 cycles # 3.042 GHz + 29,766,389,850 instructions # 2.95 insn per cycle + 3.349269539 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 4873) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -140,20 +140,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.926374e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.010833e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.010833e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.888460e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.006978e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.006978e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.672181 sec - 4,864,081,957 cycles # 2.901 GHz - 10,979,232,419 instructions # 2.26 insn per cycle - 1.684140082 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4624) (512y: 0) (512z: 0) +TOTAL : 1.679372 sec + 4,872,134,956 cycles # 2.894 GHz + 11,200,375,088 instructions # 2.30 insn per cycle + 1.692620147 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4581) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -167,20 +167,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.137450e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.161103e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.161103e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.144061e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.168103e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.168103e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.462186 sec - 4,272,825,074 cycles # 2.914 GHz - 10,248,981,944 instructions # 2.40 insn per cycle - 1.474897364 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4280) (512y: 82) (512z: 0) +TOTAL : 1.455389 sec + 4,227,854,530 cycles # 2.898 GHz + 10,145,979,716 instructions # 2.40 insn per cycle + 1.472361150 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4064) (512y: 73) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -194,20 +194,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.677106e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.786537e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.786537e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.055174e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.172416e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.172416e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 2.158029 sec - 4,199,815,524 cycles # 1.943 GHz - 6,043,516,368 instructions # 1.44 insn per cycle - 2.169397232 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2066) (512y: 117) (512z: 3540) +TOTAL : 2.058389 sec + 3,995,721,738 cycles # 1.937 GHz + 5,838,323,837 instructions # 1.46 insn per cycle + 2.074873076 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1778) (512y: 97) (512z: 3502) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -215,8 +215,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcess cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 -Avg ME (F77/C++) = 1.4131213786174055 -Relative difference = 4.3972324717191576e-07 +Avg ME (F77/C++) = 1.4131213600217192 +Relative difference = 4.5288254008796884e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd1.txt index f9629437a1..d3b29c0fa7 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd1.txt @@ -41,23 +41,23 @@ CUDACPP_BUILDDIR='build.512z_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2023-11-24_14:35:56 +DATE: 2024-01-25_23:08:46 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/gcheck.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.718207e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.043711e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.058000e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.426102e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.039267e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.056306e+07 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 0.460190 sec - 2,020,498,162 cycles # 3.001 GHz - 2,893,283,708 instructions # 1.43 insn per cycle - 0.740481359 seconds time elapsed +TOTAL : 0.462722 sec + 2,075,753,099 cycles # 3.015 GHz + 2,960,136,278 instructions # 1.43 insn per cycle + 0.765500799 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/gcheck.exe -p 64 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -65,17 +65,17 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ......................................................................... runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.069988e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.302141e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.315703e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.031468e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.301962e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.318830e+07 ) sec^-1 MeanMatrixElemValue = ( 6.734461e+02 +- 4.775415e+02 ) GeV^-2 -TOTAL : 0.603031 sec - 2,496,488,169 cycles # 2.975 GHz - 3,745,992,997 instructions # 1.50 insn per cycle - 0.898963383 seconds time elapsed +TOTAL : 0.601863 sec + 2,564,202,637 cycles # 3.038 GHz + 3,829,571,666 instructions # 1.49 insn per cycle + 0.903658002 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/fgcheck.exe 2 64 2 @@ -86,20 +86,20 @@ OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.573744e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.586335e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.586335e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.538356e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.550809e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.550809e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 6.388339 sec - 19,817,061,675 cycles # 3.101 GHz - 59,934,412,865 instructions # 3.02 insn per cycle - 6.394993907 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1276) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.478032 sec + 19,734,523,883 cycles # 3.045 GHz + 58,710,623,892 instructions # 2.98 insn per cycle + 6.485434165 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 1026) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -113,20 +113,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.068721e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.116534e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.116534e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.902025e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.948991e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.948991e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 3.254189 sec - 10,074,392,918 cycles # 3.092 GHz - 30,097,970,506 instructions # 2.99 insn per cycle - 3.269644857 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 5082) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.364695 sec + 10,117,380,544 cycles # 3.006 GHz + 30,160,733,453 instructions # 2.98 insn per cycle + 3.381394083 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 4944) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -140,20 +140,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.551013e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.718637e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.718637e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.506132e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.674497e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.674497e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.737428 sec - 5,013,997,017 cycles # 2.879 GHz - 11,483,477,844 instructions # 2.29 insn per cycle - 1.752998506 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4723) (512y: 0) (512z: 0) +TOTAL : 1.745813 sec + 5,023,045,767 cycles # 2.870 GHz + 11,663,258,678 instructions # 2.32 insn per cycle + 1.757870215 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4685) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -167,20 +167,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.052618e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.072863e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.072863e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.061888e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.082266e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.082266e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.578372 sec - 4,590,458,207 cycles # 2.900 GHz - 10,810,507,291 instructions # 2.35 insn per cycle - 1.591783419 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4285) (512y: 234) (512z: 0) +TOTAL : 1.564942 sec + 4,539,640,714 cycles # 2.894 GHz + 10,787,225,686 instructions # 2.38 insn per cycle + 1.577778818 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4159) (512y: 233) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -194,20 +194,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.627027e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.731488e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.731488e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.937035e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.053909e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.053909e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 2.171750 sec - 4,214,575,835 cycles # 1.937 GHz - 6,273,425,611 instructions # 1.49 insn per cycle - 2.183570227 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1961) (512y: 163) (512z: 3617) +TOTAL : 2.088743 sec + 4,049,955,561 cycles # 1.935 GHz + 6,072,727,098 instructions # 1.50 insn per cycle + 2.103091451 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1725) (512y: 104) (512z: 3609) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -215,8 +215,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcess cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd1/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 -Avg ME (F77/C++) = 1.4131213786174055 -Relative difference = 4.3972324717191576e-07 +Avg ME (F77/C++) = 1.4131213600217192 +Relative difference = 4.5288254008796884e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt index 9b25288b8a..1b975999f0 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt @@ -41,23 +41,23 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2023-11-24_14:36:25 +DATE: 2024-01-25_23:09:15 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.502935e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.531164e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.533426e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.469665e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.503353e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.505822e+05 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.519828 sec - 2,267,325,261 cycles # 3.026 GHz - 3,564,664,052 instructions # 1.57 insn per cycle - 0.820257033 seconds time elapsed +TOTAL : 0.525454 sec + 2,304,049,489 cycles # 3.008 GHz + 3,585,827,144 instructions # 1.56 insn per cycle + 0.838808541 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -65,17 +65,17 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ......................................................................... runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.129197e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.163245e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.164643e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.130028e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.171392e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.173120e+05 ) sec^-1 MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.029980 sec - 10,118,272,870 cycles # 3.068 GHz - 21,084,266,771 instructions # 2.08 insn per cycle - 3.354068412 seconds time elapsed +TOTAL : 3.036213 sec + 10,025,778,991 cycles # 3.041 GHz + 20,025,933,583 instructions # 2.00 insn per cycle + 3.353883863 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 @@ -86,20 +86,20 @@ OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe -p 64 256 1 OMP= WARNING! 
WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.983194e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.984194e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.984194e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.903450e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.904314e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.904314e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 8.279983 sec
- 25,558,009,617 cycles # 3.086 GHz
- 78,936,663,909 instructions # 3.09 insn per cycle
- 8.286888065 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 4893) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 8.626486 sec
+ 26,438,113,457 cycles # 3.066 GHz
+ 81,756,024,237 instructions # 3.09 insn per cycle
+ 8.633880208 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 6614) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest.exe
 [ PASSED ] 6 tests.
@@ -113,20 +113,20 @@ OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 1 OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 3.793332e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.796868e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.796868e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.782602e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.786067e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.786067e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 4.332856 sec
- 12,910,305,211 cycles # 2.977 GHz
- 39,281,355,341 instructions # 3.04 insn per cycle
- 4.346559861 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4:13184) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 4.345078 sec
+ 12,949,115,165 cycles # 2.978 GHz
+ 39,243,323,450 instructions # 3.03 insn per cycle
+ 4.361137024 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4:12814) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest.exe
 [ PASSED ] 6 tests.
@@ -140,20 +140,20 @@ OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 1 OMP=
WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 8.457803e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.475319e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.475319e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 8.583787e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 8.601455e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.601455e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 1.950206 sec
- 5,559,743,200 cycles # 2.847 GHz
- 13,686,596,932 instructions # 2.46 insn per cycle
- 2.032901077 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11357) (512y: 0) (512z: 0)
+TOTAL : 1.920616 sec
+ 5,564,908,521 cycles # 2.891 GHz
+ 13,789,734,974 instructions # 2.48 insn per cycle
+ 1.936475869 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11059) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest.exe
 [ PASSED ] 6 tests.
@@ -161,26 +161,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2
 Avg ME (C++/C++) = 6.626675e-04
-Avg ME (F77/C++) = 6.6266731198157320E-004
-Relative difference = 2.837296634927675e-07
+Avg ME (F77/C++) = 6.6266731198157309E-004
+Relative difference = 2.837296636563793e-07
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe -p 64 256 1 OMP=
WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 9.800481e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 9.822076e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 9.822076e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 9.734883e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 9.757336e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 9.757336e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 1.683777 sec
- 4,896,997,214 cycles # 2.903 GHz
- 12,341,617,992 instructions # 2.52 insn per cycle
- 1.697194330 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10266) (512y: 88) (512z: 0)
+TOTAL : 1.694453 sec
+ 4,890,498,518 cycles # 2.879 GHz
+ 12,318,491,698 instructions # 2.52 insn per cycle
+ 1.710599834 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 9762) (512y: 94) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest.exe
 [ PASSED ] 6 tests.
@@ -188,26 +188,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2
 Avg ME (C++/C++) = 6.626675e-04
-Avg ME (F77/C++) = 6.6266731198157320E-004
-Relative difference = 2.837296634927675e-07
+Avg ME (F77/C++) = 6.6266731198157309E-004
+Relative difference = 2.837296636563793e-07
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe -p 64 256 1 OMP=
WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 7.669824e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.683748e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.683748e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 7.797734e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.812832e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.812832e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 2.147707 sec
- 4,120,375,944 cycles # 1.915 GHz
- 6,335,709,132 instructions # 1.54 insn per cycle
- 2.163638241 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1821) (512y: 102) (512z: 9375)
+TOTAL : 2.113063 sec
+ 4,049,846,106 cycles # 1.913 GHz
+ 6,286,795,753 instructions # 1.55 insn per cycle
+ 2.127195558 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1516) (512y: 94) (512z: 9019)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest.exe
 [ PASSED ] 6 tests.
@@ -215,8 +215,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2
 Avg ME (C++/C++) = 6.626675e-04
-Avg ME (F77/C++) = 6.6266731198157320E-004
-Relative difference = 2.837296634927675e-07
+Avg ME (F77/C++) = 6.6266731198157309E-004
+Relative difference = 2.837296636563793e-07
 OK (relative difference <= 5E-3)
 =========================================================================
diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_bridge.txt
index 3e466c9dbd..27f7e20ca2 100644
--- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_bridge.txt
+++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_bridge.txt
@@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
-DATE: 2023-11-24_15:13:18
+DATE: 2024-01-25_23:55:47
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -51,17 +51,17 @@ WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost
 WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
 WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384)
WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384)
-Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 3.151810e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.491547e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.491547e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.145488e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.497122e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.497122e+05 ) sec^-1
 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 0.513888 sec
- 2,172,731,633 cycles # 2.929 GHz
- 3,459,636,945 instructions # 1.59 insn per cycle
- 0.802070844 seconds time elapsed
+TOTAL : 0.509145 sec
+ 2,222,116,978 cycles # 3.014 GHz
+ 3,541,651,897 instructions # 1.59 insn per cycle
+ 0.797289210 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost
@@ -77,17 +77,17 @@ WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost
 WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
 WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288)
 WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288)
-Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 3.626462e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.109948e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.109948e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.636792e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.102264e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.102264e+05 ) sec^-1
 MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4
-TOTAL : 3.316369 sec
- 10,702,439,296 cycles # 2.980 GHz
- 23,311,698,840 instructions # 2.18 insn per cycle
- 3.648833219 seconds time elapsed
+TOTAL : 3.311683 sec
+ 10,993,373,303 cycles # 3.071 GHz
+ 22,632,188,932 instructions # 2.06 insn per cycle
+ 3.639416589 seconds time elapsed
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2
@@ -99,20 +99,20 @@ OK (relative difference <= 5E-3)
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
WARNING! Instantiate host Bridge (nevt=16384)
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.926732e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.927660e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.927660e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.907265e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.908117e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.908117e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 8.525962 sec
- 25,585,926,017 cycles # 3.000 GHz
- 78,942,573,288 instructions # 3.09 insn per cycle
- 8.530463803 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 4893) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 8.612943 sec
+ 26,477,322,279 cycles # 3.073 GHz
+ 81,758,457,566 instructions # 3.09 insn per cycle
+ 8.618373177 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 6614) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest.exe
 [ PASSED ] 6 tests.
@@ -127,20 +127,20 @@ OK (relative difference <= 5E-3)
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 WARNING! Instantiate host Bridge (nevt=16384)
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 3.644154e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.647807e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.647807e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.736047e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.739396e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.739396e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 4.514194 sec
- 12,913,516,064 cycles # 2.858 GHz
- 39,293,095,640 instructions # 3.04 insn per cycle
- 4.518821334 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4:13184) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 4.404587 sec
+ 12,949,218,628 cycles # 2.939 GHz
+ 39,253,017,453 instructions # 3.03 insn per cycle
+ 4.409630473 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4:12814) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest.exe
 [ PASSED ] 6 tests.
@@ -155,20 +155,20 @@ OK (relative difference <= 5E-3)
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
WARNING! Instantiate host Bridge (nevt=16384)
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 8.272317e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.289989e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.289989e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 8.544040e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 8.561904e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.561904e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 1.996165 sec
- 5,572,431,089 cycles # 2.786 GHz
- 13,696,683,094 instructions # 2.46 insn per cycle
- 2.000824809 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11357) (512y: 0) (512z: 0)
+TOTAL : 1.933255 sec
+ 5,589,610,637 cycles # 2.885 GHz
+ 13,799,041,950 instructions # 2.47 insn per cycle
+ 1.938578158 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11059) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest.exe
 [ PASSED ] 6 tests.
@@ -176,27 +176,27 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2
 Avg ME (C++/C++) = 6.626675e-04
-Avg ME (F77/C++) = 6.6266731198157320E-004
-Relative difference = 2.837296634927675e-07
+Avg ME (F77/C++) = 6.6266731198157309E-004
+Relative difference = 2.837296636563793e-07
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
WARNING! Instantiate host Bridge (nevt=16384)
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 9.416662e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 9.438491e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 9.438491e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 9.759915e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 9.783050e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 9.783050e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 1.755183 sec
- 4,929,995,053 cycles # 2.803 GHz
- 12,351,698,776 instructions # 2.51 insn per cycle
- 1.759689895 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10266) (512y: 88) (512z: 0)
+TOTAL : 1.694288 sec
+ 4,905,478,806 cycles # 2.889 GHz
+ 12,328,602,147 instructions # 2.51 insn per cycle
+ 1.699386329 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 9762) (512y: 94) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest.exe
 [ PASSED ] 6 tests.
@@ -204,27 +204,27 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2
 Avg ME (C++/C++) = 6.626675e-04
-Avg ME (F77/C++) = 6.6266731198157320E-004
-Relative difference = 2.837296634927675e-07
+Avg ME (F77/C++) = 6.6266731198157309E-004
+Relative difference = 2.837296636563793e-07
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
WARNING! Instantiate host Bridge (nevt=16384)
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 7.299463e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.313710e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.313710e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 7.754851e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.770279e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.770279e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 2.260606 sec
- 4,129,998,329 cycles # 1.824 GHz
- 6,345,886,658 instructions # 1.54 insn per cycle
- 2.265252390 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1821) (512y: 102) (512z: 9375)
+TOTAL : 2.128740 sec
+ 4,078,758,332 cycles # 1.912 GHz
+ 6,296,587,445 instructions # 1.54 insn per cycle
+ 2.133849808 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1516) (512y: 94) (512z: 9019)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest.exe
 [ PASSED ] 6 tests.
@@ -232,8 +232,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2
 Avg ME (C++/C++) = 6.626675e-04
-Avg ME (F77/C++) = 6.6266731198157320E-004
-Relative difference = 2.837296634927675e-07
+Avg ME (F77/C++) = 6.6266731198157309E-004
+Relative difference = 2.837296636563793e-07
 OK (relative difference <= 5E-3)
 =========================================================================
diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_common.txt
index 22d785bafe..cd95d96bbf 100644
--- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_common.txt
+++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_common.txt
@@ -41,23 +41,23 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
-DATE: 2023-11-24_15:25:10
+DATE: 2024-01-26_00:07:32
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --common OMP=
WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:DBL+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 3.505435e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.534142e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.536887e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.492316e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.521271e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.523609e+05 ) sec^-1
 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4
-TOTAL : 0.512729 sec
- 2,187,113,796 cycles # 2.953 GHz
- 3,321,864,960 instructions # 1.52 insn per cycle
- 0.803004683 seconds time elapsed
+TOTAL : 0.507003 sec
+ 2,206,717,561 cycles # 3.001 GHz
+ 3,448,725,330 instructions # 1.56 insn per cycle
+ 0.796732185 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --common
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
@@ -65,17 +65,17 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 .........................................................................
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --common OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:DBL+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 4.143311e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.178194e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.179712e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 4.143192e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.177337e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.178848e+05 ) sec^-1
 MeanMatrixElemValue = ( 1.252232e+02 +- 1.234346e+02 ) GeV^-4
-TOTAL : 3.127476 sec
- 10,173,806,494 cycles # 3.006 GHz
- 22,940,336,221 instructions # 2.25 insn per cycle
- 3.440988376 seconds time elapsed
+TOTAL : 3.127410 sec
+ 10,316,808,786 cycles # 3.054 GHz
+ 22,423,079,541 instructions # 2.17 insn per cycle
+ 3.438326791 seconds time elapsed
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2
@@ -86,20 +86,20 @@ OK (relative difference <= 5E-3)
 =========================================================================
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe -p 64 256 1 --common OMP=
WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.944681e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.945656e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.945656e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.906516e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.907436e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.907436e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4
-TOTAL : 8.444658 sec
- 25,580,623,755 cycles # 3.028 GHz
- 78,936,122,811 instructions # 3.09 insn per cycle
- 8.449050059 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 4893) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 8.613443 sec
+ 26,433,384,953 cycles # 3.068 GHz
+ 81,751,741,374 instructions # 3.09 insn per cycle
+ 8.618094360 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 6614) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest.exe
 [ PASSED ] 6 tests.
@@ -113,20 +113,20 @@ OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 1 --common OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 3.654154e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.657693e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.657693e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.802273e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.805901e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.805901e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4
-TOTAL : 4.499225 sec
- 12,919,732,335 cycles # 2.870 GHz
- 39,280,003,953 instructions # 3.04 insn per cycle
- 4.503503119 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4:13184) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 4.325072 sec
+ 12,937,290,700 cycles # 2.989 GHz
+ 39,240,011,387 instructions # 3.03 insn per cycle
+ 4.329794435 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4:12814) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest.exe
 [ PASSED ] 6 tests.
@@ -140,20 +140,20 @@ OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 1 --common OMP=
WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 8.335985e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.352449e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.352449e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 8.589458e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 8.608319e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.608319e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4
-TOTAL : 1.978535 sec
- 5,565,762,005 cycles # 2.808 GHz
- 13,684,790,936 instructions # 2.46 insn per cycle
- 1.982835811 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11357) (512y: 0) (512z: 0)
+TOTAL : 1.920532 sec
+ 5,571,487,226 cycles # 2.896 GHz
+ 13,788,001,887 instructions # 2.47 insn per cycle
+ 1.925207623 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11059) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest.exe
 [ PASSED ] 6 tests.
@@ -161,26 +161,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2
 Avg ME (C++/C++) = 6.626675e-04
-Avg ME (F77/C++) = 6.6266731198157320E-004
-Relative difference = 2.837296634927675e-07
+Avg ME (F77/C++) = 6.6266731198157309E-004
+Relative difference = 2.837296636563793e-07
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe -p 64 256 1 --common OMP=
WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 9.460859e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 9.482779e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 9.482779e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 9.699010e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 9.723517e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 9.723517e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4
-TOTAL : 1.744747 sec
- 4,901,917,953 cycles # 2.804 GHz
- 12,339,116,177 instructions # 2.52 insn per cycle
- 1.749045721 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10266) (512y: 88) (512z: 0)
+TOTAL : 1.701918 sec
+ 4,894,734,524 cycles # 2.870 GHz
+ 12,315,493,193 instructions # 2.52 insn per cycle
+ 1.706709534 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 9762) (512y: 94) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest.exe
 [ PASSED ] 6 tests.
@@ -188,26 +188,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2
 Avg ME (C++/C++) = 6.626675e-04
-Avg ME (F77/C++) = 6.6266731198157320E-004
-Relative difference = 2.837296634927675e-07
+Avg ME (F77/C++) = 6.6266731198157309E-004
+Relative difference = 2.837296636563793e-07
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe -p 64 256 1 --common OMP=
WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 7.321634e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.334985e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.334985e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 7.766135e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.780590e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.780590e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4
-TOTAL : 2.251368 sec
- 4,121,273,998 cycles # 1.828 GHz
- 6,332,363,558 instructions # 1.54 insn per cycle
- 2.255694764 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1821) (512y: 102) (512z: 9375)
+TOTAL : 2.122969 sec
+ 4,050,408,054 cycles # 1.905 GHz
+ 6,283,219,779 instructions # 1.55 insn per cycle
+ 2.127795422 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1516) (512y: 94) (512z: 9019)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest.exe
 [ PASSED ] 6 tests.
@@ -215,8 +215,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2
 Avg ME (C++/C++) = 6.626675e-04
-Avg ME (F77/C++) = 6.6266731198157320E-004
-Relative difference = 2.837296634927675e-07
+Avg ME (F77/C++) = 6.6266731198157309E-004
+Relative difference = 2.837296636563793e-07
 OK (relative difference <= 5E-3)
 =========================================================================
diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_curhst.txt
index 0edaf6e67f..e8c507cdee 100644
--- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_curhst.txt
+++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_curhst.txt
@@ -41,23 +41,23 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
-DATE: 2023-11-24_15:21:49
+DATE: 2024-01-26_00:04:11
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --curhst OMP=
WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:DBL+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 3.504189e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.533338e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.535687e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.478378e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.505566e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.508071e+05 ) sec^-1
 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 0.507565 sec
- 2,140,644,436 cycles # 2.903 GHz
- 3,294,278,423 instructions # 1.54 insn per cycle
- 0.797247733 seconds time elapsed
+TOTAL : 0.504525 sec
+ 2,215,801,883 cycles # 2.980 GHz
+ 3,455,759,890 instructions # 1.56 insn per cycle
+ 0.808082050 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --curhst
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
@@ -65,17 +65,17 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 .........................................................................
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --curhst OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:DBL+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 4.134584e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.168777e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.170193e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 4.137368e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.171679e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.173067e+05 ) sec^-1
 MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4
-TOTAL : 3.073697 sec
- 9,988,823,734 cycles # 2.997 GHz
- 22,688,491,791 instructions # 2.27 insn per cycle
- 3.391117560 seconds time elapsed
+TOTAL : 3.063157 sec
+ 10,144,716,887 cycles # 3.058 GHz
+ 21,058,700,369 instructions # 2.08 insn per cycle
+ 3.374375541 seconds time elapsed
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2
@@ -86,20 +86,20 @@ OK (relative difference <= 5E-3)
 =========================================================================
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP=
WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.920556e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.921467e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.921467e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.912677e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.913545e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.913545e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 8.549047 sec
- 25,566,043,933 cycles # 2.990 GHz
- 78,938,822,238 instructions # 3.09 insn per cycle
- 8.553277802 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 4893) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 8.585286 sec
+ 26,459,308,315 cycles # 3.081 GHz
+ 81,751,492,157 instructions # 3.09 insn per cycle
+ 8.590088729 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 6614) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest.exe
 [ PASSED ] 6 tests.
@@ -113,20 +113,20 @@ OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 3.702947e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.706522e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.706522e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.797442e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.801061e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.801061e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 4.438616 sec
- 12,919,298,910 cycles # 2.909 GHz
- 39,279,556,753 instructions # 3.04 insn per cycle
- 4.442989895 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4:13184) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 4.329041 sec
+ 12,934,718,239 cycles # 2.986 GHz
+ 39,240,868,450 instructions # 3.03 insn per cycle
+ 4.333889790 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4:12814) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest.exe
 [ PASSED ] 6 tests.
@@ -140,20 +140,20 @@ OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP=
WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 8.327524e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.345264e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.345264e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 8.580291e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 8.598151e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.598151e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 1.979008 sec
- 5,559,052,885 cycles # 2.804 GHz
- 13,685,718,134 instructions # 2.46 insn per cycle
- 1.983236201 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11357) (512y: 0) (512z: 0)
+TOTAL : 1.920867 sec
+ 5,567,876,605 cycles # 2.893 GHz
+ 13,788,026,591 instructions # 2.48 insn per cycle
+ 1.925509830 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11059) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest.exe
 [ PASSED ] 6 tests.
@@ -161,26 +161,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2
 Avg ME (C++/C++) = 6.626675e-04
-Avg ME (F77/C++) = 6.6266731198157320E-004
-Relative difference = 2.837296634927675e-07
+Avg ME (F77/C++) = 6.6266731198157309E-004
+Relative difference = 2.837296636563793e-07
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP=
WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 9.547712e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 9.570537e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 9.570537e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 9.717286e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 9.740439e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 9.740439e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 1.726945 sec
- 4,895,458,327 cycles # 2.829 GHz
- 12,340,699,922 instructions # 2.52 insn per cycle
- 1.731178551 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10266) (512y: 88) (512z: 0)
+TOTAL : 1.697305 sec
+ 4,890,738,669 cycles # 2.875 GHz
+ 12,317,303,230 instructions # 2.52 insn per cycle
+ 1.702130241 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 9762) (512y: 94) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest.exe
 [ PASSED ] 6 tests.
@@ -188,26 +188,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2
 Avg ME (C++/C++) = 6.626675e-04
-Avg ME (F77/C++) = 6.6266731198157320E-004
-Relative difference = 2.837296634927675e-07
+Avg ME (F77/C++) = 6.6266731198157309E-004
+Relative difference = 2.837296636563793e-07
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP=
WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 7.393984e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.407662e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.407662e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 7.800846e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.815791e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.815791e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 2.227409 sec
- 4,117,035,910 cycles # 1.846 GHz
- 6,334,322,153 instructions # 1.54 insn per cycle
- 2.231712529 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1821) (512y: 102) (512z: 9375)
+TOTAL : 2.112333 sec
+ 4,045,690,393 cycles # 1.912 GHz
+ 6,285,044,805 instructions # 1.55 insn per cycle
+ 2.116928860 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1516) (512y: 94) (512z: 9019)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest.exe
 [ PASSED ] 6 tests.
@@ -215,8 +215,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2
 Avg ME (C++/C++) = 6.626675e-04
-Avg ME (F77/C++) = 6.6266731198157320E-004
-Relative difference = 2.837296634927675e-07
+Avg ME (F77/C++) = 6.6266731198157309E-004
+Relative difference = 2.837296636563793e-07
 OK (relative difference <= 5E-3)
 =========================================================================
diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_rmbhst.txt
index 3fd3545f79..52fdd67f88 100644
--- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_rmbhst.txt
+++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_rmbhst.txt
@@ -41,24 +41,24 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
-DATE: 2023-11-24_15:18:31
+DATE: 2024-01-26_00:00:54
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --rmbhst OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
-Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:DBL+THX:CURHST+RMBHST+MESDEV/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 3.205483e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.519039e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.521349e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.217958e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.512930e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.515214e+05 ) sec^-1
 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 0.513661 sec
- 2,155,662,048 cycles # 2.911 GHz
- 3,403,110,868 instructions # 1.58 insn per cycle
- 0.802490530 seconds time elapsed
+TOTAL : 0.506889 sec
+ 2,233,510,708 cycles # 3.038 GHz
+ 3,515,215,151 instructions # 1.57 insn per cycle
+ 0.795257948 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --rmbhst
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
@@ -68,17 +68,17 @@ WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --rmbhst OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
-Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:DBL+THX:CURHST+RMBHST+MESDEV/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 3.731558e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.171074e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.172534e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.741390e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.169861e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.171330e+05 ) sec^-1
 MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4
-TOTAL : 3.204951 sec
- 10,303,132,790 cycles # 2.977 GHz
- 23,518,550,603 instructions # 2.28 insn per cycle
- 3.516665455 seconds time elapsed
+TOTAL : 3.194603 sec
+ 10,409,856,467 cycles # 3.018 GHz
+ 23,237,283,726 instructions # 2.23 insn per cycle
+ 3.506487413 seconds time elapsed
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2
@@ -89,20 +89,20 @@ OK (relative difference <= 5E-3)
 =========================================================================
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP=
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.916291e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.917249e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.917249e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.920097e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.920994e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.920994e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 8.568789 sec - 25,583,500,760 cycles # 2.987 GHz - 78,939,465,924 instructions # 3.09 insn per cycle - 8.573085981 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 4893) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 8.551552 sec + 26,447,790,271 cycles # 3.092 GHz + 81,753,854,285 instructions # 3.09 insn per cycle + 8.556393713 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 6614) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -116,20 +116,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.689818e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.693400e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.693400e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.752850e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.756405e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.756405e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 4.453935 sec - 12,901,075,488 cycles # 2.894 GHz - 39,279,441,065 instructions # 3.04 insn per cycle - 4.458436593 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:13184) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 4.380066 sec + 12,930,382,850 cycles # 2.950 GHz + 39,242,708,718 instructions # 3.03 insn per cycle + 4.384871242 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:12814) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -143,20 +143,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.313071e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.330635e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.330635e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.576151e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.593998e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.593998e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.982693 sec - 5,557,103,226 cycles # 2.798 GHz - 13,686,016,474 instructions # 2.46 insn per cycle - 1.987071584 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11357) (512y: 0) (512z: 0) +TOTAL : 1.922112 sec + 5,560,568,053 cycles # 2.887 GHz + 13,787,937,924 instructions # 2.48 insn per cycle + 1.926945796 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11059) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -164,26 +164,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198157320E-004 -Relative difference = 2.837296634927675e-07 +Avg ME (F77/C++) = 6.6266731198157309E-004 +Relative difference = 2.837296636563793e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.334052e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.355862e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.355862e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.812534e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.836405e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.836405e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.766820 sec - 4,905,253,884 cycles # 2.771 GHz - 12,341,018,898 instructions # 2.52 insn per cycle - 1.771164761 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10266) (512y: 88) (512z: 0) +TOTAL : 1.681289 sec + 4,888,148,053 cycles # 2.901 GHz + 12,317,061,785 instructions # 2.52 insn per cycle + 1.685971625 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 9762) (512y: 94) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -191,26 +191,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198157320E-004 -Relative difference = 2.837296634927675e-07 +Avg ME (F77/C++) = 6.6266731198157309E-004 +Relative difference = 2.837296636563793e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.271568e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.284665e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.284665e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.789889e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.805147e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.805147e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.265152 sec - 4,121,020,497 cycles # 1.817 GHz - 6,334,252,603 instructions # 1.54 insn per cycle - 2.269401421 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1821) (512y: 102) (512z: 9375) +TOTAL : 2.115068 sec + 4,046,960,669 cycles # 1.910 GHz + 6,285,467,635 instructions # 1.55 insn per cycle + 2.119996341 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1516) (512y: 94) (512z: 9019) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -218,8 +218,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198157320E-004 -Relative difference = 2.837296634927675e-07 +Avg ME (F77/C++) = 6.6266731198157309E-004 +Relative difference = 2.837296636563793e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd1.txt index 98140f2185..4f02b865b7 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd1.txt @@ -41,23 +41,23 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2023-11-24_14:37:01 +DATE: 2024-01-25_23:09:52 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/gcheck.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.470716e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.498661e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.501013e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.471206e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.504777e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.507351e+05 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.519808 sec - 2,262,549,095 cycles # 3.015 GHz - 3,507,447,767 instructions # 1.55 insn per cycle - 0.822215819 seconds time elapsed +TOTAL : 0.523241 sec + 2,294,977,837 cycles # 2.990 GHz + 3,355,752,268 instructions # 1.46 insn per cycle + 0.839300855 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/gcheck.exe -p 64 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -65,17 +65,17 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ......................................................................... runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.130084e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.164129e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.165510e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.146612e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.187782e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.189484e+05 ) sec^-1 MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.030552 sec - 10,073,054,045 cycles # 3.064 GHz - 22,850,649,130 instructions # 2.27 insn per cycle - 3.344535702 seconds time elapsed +TOTAL : 3.024342 sec + 10,116,132,856 cycles # 3.082 GHz + 21,831,790,705 instructions # 2.16 insn per cycle + 3.341530087 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/fgcheck.exe 2 64 2 @@ -86,20 +86,20 @@ OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.990080e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.991047e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.991047e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.904926e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.905851e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.905851e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 8.251874 sec - 25,580,916,000 cycles # 3.099 GHz - 78,707,540,592 instructions # 3.08 insn per cycle - 8.258581252 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 4264) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 8.620332 sec + 26,463,781,685 cycles # 3.069 GHz + 81,778,380,367 instructions # 3.09 insn per cycle + 8.627766180 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 6589) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -113,20 +113,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd1/check.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.671887e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.674997e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.674997e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.778600e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.781982e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.781982e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 4.475213 sec - 12,954,555,110 cycles # 2.897 GHz - 39,230,579,465 instructions # 3.03 insn per cycle - 4.487081742 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:12951) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 4.350007 sec + 12,914,528,246 cycles # 2.966 GHz + 39,248,322,763 instructions # 3.04 insn per cycle + 4.364307061 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:12771) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -140,20 +140,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.529282e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.546223e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.546223e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.584212e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.601630e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.601630e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.932123 sec - 5,603,928,357 cycles # 2.895 GHz - 13,800,807,908 instructions # 2.46 insn per cycle - 1.945155802 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11422) (512y: 0) (512z: 0) +TOTAL : 1.921217 sec + 5,553,864,604 cycles # 2.887 GHz + 13,804,830,516 instructions # 2.49 insn per cycle + 1.935543419 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11048) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -161,26 +161,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198157320E-004 -Relative difference = 2.837296634927675e-07 +Avg ME (F77/C++) = 6.6266731198157309E-004 +Relative difference = 2.837296636563793e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd1/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.635530e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.657625e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.657625e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.656517e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.679828e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.679828e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.712282 sec - 4,956,507,680 cycles # 2.889 GHz - 12,466,691,438 instructions # 2.52 insn per cycle - 1.725471807 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10258) (512y: 240) (512z: 0) +TOTAL : 1.708041 sec + 4,906,794,461 cycles # 2.867 GHz + 12,330,114,831 instructions # 2.51 insn per cycle + 1.722730362 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 9736) (512y: 94) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -188,26 +188,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd1/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198157320E-004 -Relative difference = 2.837296634927675e-07 +Avg ME (F77/C++) = 6.6266731198157309E-004 +Relative difference = 2.837296636563793e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd1/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.639925e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.653575e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.653575e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.785044e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.799232e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.799232e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.155986 sec - 4,118,224,744 cycles # 1.907 GHz - 6,458,752,156 instructions # 1.57 insn per cycle - 2.169032093 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1647) (512y: 192) (512z: 9375) +TOTAL : 2.116475 sec + 4,044,956,882 cycles # 1.907 GHz + 6,292,510,947 instructions # 1.56 insn per cycle + 2.132272886 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1497) (512y: 94) (512z: 9019) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -215,8 +215,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd1/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198157320E-004 -Relative difference = 2.837296634927675e-07 +Avg ME (F77/C++) = 6.6266731198157309E-004 +Relative difference = 2.837296636563793e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd0.txt index 9bedb53e70..a61b82479f 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd0.txt @@ -41,23 +41,23 @@ CUDACPP_BUILDDIR='build.512z_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2023-11-24_15:02:17 +DATE: 2024-01-25_23:44:49 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/gcheck.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.232352e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.258036e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.260266e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.222012e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.246324e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.248217e+05 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.533076 sec - 2,264,463,020 cycles # 2.941 GHz - 3,538,774,343 instructions # 1.56 insn per cycle - 0.826602256 seconds time elapsed +TOTAL : 0.530629 sec + 2,302,571,140 cycles # 3.026 GHz + 3,539,966,337 instructions # 1.54 insn per cycle + 0.818475146 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/gcheck.exe -p 64 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -65,17 +65,17 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ......................................................................... runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/gcheck.exe -p 2048 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.769231e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.796782e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.797908e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.766260e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.794821e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.795979e+05 ) sec^-1 MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.296720 sec - 10,445,857,222 cycles # 2.932 GHz - 21,607,313,168 instructions # 2.07 insn per cycle - 3.618781692 seconds time elapsed +TOTAL : 3.302210 sec + 10,996,851,827 cycles # 3.096 GHz + 24,989,138,728 instructions # 2.27 insn per cycle + 3.612162016 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/fgcheck.exe 2 64 2 @@ -86,20 +86,20 @@ OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.334345e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.334819e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.334819e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.459096e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.459599e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.459599e+02 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 37.848381 sec - 113,421,106,616 cycles # 2.997 GHz - 144,959,874,698 instructions # 1.28 insn per cycle - 37.852852480 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:21301) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 36.792656 sec + 113,070,573,577 cycles # 3.074 GHz + 141,524,951,428 instructions # 1.25 insn per cycle + 36.797608137 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:21365) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -107,26 +107,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198140450E-004 -Relative difference = 2.83729918072716e-07 +Avg ME (F77/C++) = 6.6266731198140461E-004 +Relative difference = 2.8372991790910424e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.164523e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.167012e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.167012e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.314974e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.317691e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.317691e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 5.191844 sec - 14,732,312,446 cycles # 2.836 GHz - 37,575,149,913 instructions # 2.55 insn per cycle - 5.196266852 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:68119) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 4.957393 sec + 14,921,471,789 cycles # 3.008 GHz + 37,531,537,779 instructions # 2.52 insn per cycle + 4.962426661 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:68052) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -134,26 +134,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198141209E-004 -Relative difference = 2.8372990661989057e-07 +Avg ME (F77/C++) = 6.6266731198141220E-004 +Relative difference = 2.837299064562788e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.486864e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.501204e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.501204e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.938204e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.954119e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.954119e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.200274 sec - 6,167,482,001 cycles # 2.799 GHz - 13,061,737,270 instructions # 2.12 insn per cycle - 2.204748277 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:46960) (512y: 0) (512z: 0) +TOTAL : 2.076037 sec + 6,032,491,286 cycles # 2.901 GHz + 12,947,967,770 instructions # 2.15 insn per cycle + 2.080892058 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:46593) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -161,26 +161,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198156789E-004 -Relative difference = 2.837296715097453e-07 +Avg ME (F77/C++) = 6.6266731198156778E-004 +Relative difference = 2.837296716733571e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd0/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.239505e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.261064e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.261064e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.541807e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.563199e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.563199e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.784853 sec - 5,062,751,449 cycles # 2.830 GHz - 11,440,329,026 instructions # 2.26 insn per cycle - 1.789370916 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:40434) (512y: 285) (512z: 0) +TOTAL : 1.728667 sec + 5,007,253,350 cycles # 2.890 GHz + 11,363,228,557 instructions # 2.27 insn per cycle + 1.733486835 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:40158) (512y: 279) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -188,26 +188,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198156789E-004 -Relative difference = 2.837296715097453e-07 +Avg ME (F77/C++) = 6.6266731198156778E-004 +Relative difference = 2.837296716733571e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd0/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.593982e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.608272e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.608272e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.060598e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.076855e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.076855e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.169483 sec - 4,001,360,531 cycles # 1.841 GHz - 5,942,704,084 instructions # 1.49 insn per cycle - 2.173913867 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2455) (512y: 337) (512z:39411) +TOTAL : 2.044567 sec + 3,894,310,656 cycles # 1.901 GHz + 5,853,526,554 instructions # 1.50 insn per cycle + 2.049442336 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2112) (512y: 142) (512z:39211) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd1.txt index 3babb1df02..3f81898e30 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd1.txt @@ -41,23 +41,23 @@ CUDACPP_BUILDDIR='build.512z_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2023-11-24_15:03:26 +DATE: 2024-01-25_23:45:57 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/gcheck.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.228708e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.253927e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.255914e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.225970e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.250939e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.252804e+05 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.533504 sec - 2,263,207,731 cycles # 2.940 GHz - 3,466,881,858 instructions # 1.53 insn per cycle - 0.826915946 seconds time elapsed +TOTAL : 0.527705 sec + 2,271,360,893 cycles # 3.011 GHz + 3,483,071,698 instructions # 1.53 insn per cycle + 0.811309280 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/gcheck.exe -p 64 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -65,17 +65,17 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ......................................................................... runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/gcheck.exe -p 2048 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.789640e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.817455e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.818645e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.787575e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.816403e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.817588e+05 ) sec^-1 MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.275326 sec - 10,593,520,447 cycles # 2.998 GHz - 22,594,076,221 instructions # 2.13 insn per cycle - 3.592953181 seconds time elapsed +TOTAL : 3.294681 sec + 10,793,582,886 cycles # 3.053 GHz + 24,009,551,599 instructions # 2.22 insn per cycle + 3.606172358 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/fgcheck.exe 2 64 2 @@ -86,20 +86,20 @@ OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.301382e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.301861e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.301861e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.434610e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.435103e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.435103e+02 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 38.148296 sec - 114,295,626,783 cycles # 2.997 GHz - 145,697,398,014 instructions # 1.27 insn per cycle - 38.152633004 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:22559) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 36.995548 sec + 113,902,374,231 cycles # 3.079 GHz + 141,701,166,601 instructions # 1.24 insn per cycle + 37.000316422 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:21615) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -107,26 +107,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198140450E-004 -Relative difference = 2.83729918072716e-07 +Avg ME (F77/C++) = 6.6266731198140461E-004 +Relative difference = 2.8372991790910424e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.078026e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.080354e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.080354e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.309979e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.312621e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.312621e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 5.337092 sec - 15,158,837,201 cycles # 2.838 GHz - 37,762,768,502 instructions # 2.49 insn per cycle - 5.341471006 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:68447) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 4.964493 sec + 14,906,936,554 cycles # 3.001 GHz + 37,593,495,183 instructions # 2.52 insn per cycle + 4.969424279 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:68056) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -134,26 +134,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198141209E-004 -Relative difference = 2.8372990661989057e-07 +Avg ME (F77/C++) = 6.6266731198141220E-004 +Relative difference = 2.837299064562788e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.680771e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.695420e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.695420e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.063731e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.079641e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.079641e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.144872 sec - 6,018,410,000 cycles # 2.801 GHz - 12,896,129,377 instructions # 2.14 insn per cycle - 2.149367936 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:45929) (512y: 0) (512z: 0) +TOTAL : 2.043547 sec + 5,936,293,527 cycles # 2.899 GHz + 12,831,131,379 instructions # 2.16 insn per cycle + 2.048529346 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:45663) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -161,26 +161,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198156789E-004 -Relative difference = 2.837296715097453e-07 +Avg ME (F77/C++) = 6.6266731198156778E-004 +Relative difference = 2.837296716733571e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd1/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.099517e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.119694e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.119694e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.611190e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.634726e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.634726e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.811911 sec - 5,092,083,515 cycles # 2.805 GHz - 11,446,810,123 instructions # 2.25 insn per cycle - 1.816340568 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:40123) (512y: 219) (512z: 0) +TOTAL : 1.715814 sec + 4,981,290,818 cycles # 2.897 GHz + 11,359,270,596 instructions # 2.28 insn per cycle + 1.720629844 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:39855) (512y: 212) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -188,26 +188,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd1/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198156789E-004 -Relative difference = 2.837296715097453e-07 +Avg ME (F77/C++) = 6.6266731198156778E-004 +Relative difference = 2.837296716733571e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd1/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.715726e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.730963e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.730963e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.987877e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.004071e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.004071e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.135403 sec - 3,949,503,348 cycles # 1.847 GHz - 5,897,146,057 instructions # 1.49 insn per cycle - 2.139763572 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1971) (512y: 259) (512z:38937) +TOTAL : 2.062896 sec + 3,892,536,672 cycles # 1.884 GHz + 5,843,276,595 instructions # 1.50 insn per cycle + 2.067877562 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1687) (512y: 116) (512z:38946) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt index 9acfc6188d..169ba41d04 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt @@ -41,23 +41,23 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2023-11-24_14:37:38 +DATE: 2024-01-25_23:10:29 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.349743e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.400970e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.406599e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.321241e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.384306e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.390636e+05 ) sec^-1 MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4 -TOTAL : 0.479034 sec - 2,079,658,422 cycles # 2.994 GHz - 3,036,293,369 instructions # 1.46 insn per cycle - 0.774778080 seconds time elapsed +TOTAL : 0.478569 sec + 2,081,351,055 cycles # 2.995 GHz + 3,104,629,593 instructions # 1.49 insn per cycle + 0.781758350 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -65,17 +65,17 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ......................................................................... runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.518158e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.591813e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.595087e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.504629e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.593370e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.597094e+05 ) sec^-1 MeanMatrixElemValue = ( 6.664703e+00 +- 5.072736e+00 ) GeV^-4 -TOTAL : 1.721697 sec - 5,967,810,947 cycles # 3.055 GHz - 12,572,582,180 instructions # 2.11 insn per cycle - 2.009764828 seconds time elapsed +TOTAL : 1.724357 sec + 6,014,180,976 cycles # 3.072 GHz + 12,790,134,988 instructions # 2.13 insn per cycle + 2.014643886 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 @@ -86,128 +86,128 @@ OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.064583e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.065690e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.065690e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.098989e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.100057e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.100057e+03 ) sec^-1 MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4 -TOTAL : 7.952623 sec - 24,634,175,599 cycles # 3.097 GHz - 78,128,580,676 instructions # 3.17 insn per cycle - 7.959267887 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 3603) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 7.822875 sec + 24,208,205,325 cycles # 3.093 GHz + 75,876,008,404 instructions # 3.13 insn per cycle + 7.829655847 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 3898) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627486e-04 -Avg ME (F77/C++) = 6.6274863266294753E-004 -Relative difference = 4.92840687132121e-08 +Avg ME (C++/C++) = 6.627487e-04 +Avg ME (F77/C++) = 6.6274870439686495E-004 +Relative difference = 6.634286759220428e-09 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.270538e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.283405e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.283405e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.608318e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.622050e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.622050e+03 ) sec^-1 MeanMatrixElemValue = ( 4.060119e+00 +- 2.367901e+00 ) GeV^-4 -TOTAL : 2.264433 sec - 6,463,017,562 cycles # 2.850 GHz - 20,121,780,291 instructions # 3.11 insn per cycle - 2.276060029 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:13763) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 2.165057 sec + 6,486,423,740 cycles # 2.992 GHz + 20,116,074,983 instructions # 3.10 insn per cycle + 2.179423883 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:13237) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627486e-04 -Avg ME (F77/C++) = 6.6274861460025036E-004 -Relative difference = 2.2029847170826283e-08 +Avg ME (C++/C++) = 6.627485e-04 +Avg ME (F77/C++) = 6.6274853360924479E-004 +Relative difference = 5.071191384964548e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.637995e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.644543e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.644543e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.707797e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.714735e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.714735e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 1.009271 sec - 2,836,657,749 cycles # 2.799 GHz - 6,989,190,074 instructions # 2.46 insn per cycle - 1.021190194 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11874) (512y: 0) (512z: 0) +TOTAL : 0.968913 sec + 2,827,321,383 cycles # 2.906 GHz + 7,038,162,946 instructions # 2.49 insn per cycle + 0.982725511 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11604) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627194e-04 -Avg ME (F77/C++) = 6.6271938174574524E-004 -Relative difference = 2.7544470208782633e-08 +Avg ME (C++/C++) = 6.627193e-04 +Avg ME (F77/C++) = 6.6271927529261421E-004 +Relative difference = 3.728182620967159e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.933245e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.942230e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.942230e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.948915e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.957872e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.957872e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 0.856055 sec - 2,489,722,611 cycles # 2.894 GHz - 6,296,795,301 instructions # 2.53 insn per cycle - 0.871292854 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10822) (512y: 43) (512z: 0) +TOTAL : 0.850636 sec + 2,475,963,051 cycles # 2.899 GHz + 6,280,012,775 instructions # 2.54 insn per cycle + 0.866712833 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10320) (512y: 50) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627194e-04 -Avg ME (F77/C++) = 6.6271938174574524E-004 -Relative difference = 2.7544470208782633e-08 +Avg ME (C++/C++) = 6.627193e-04 +Avg ME (F77/C++) = 6.6271927529261421E-004 +Relative difference = 3.728182620967159e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.556155e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.561846e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.561846e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.572704e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.578550e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.578550e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4 -TOTAL : 1.062224 sec - 2,048,930,683 cycles # 1.922 GHz - 3,267,038,899 instructions # 1.59 insn per cycle - 1.073737564 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2414) (512y: 46) (512z: 9571) +TOTAL : 1.053199 sec + 2,034,931,877 cycles # 1.927 GHz + 3,248,490,915 instructions # 1.60 insn per cycle + 1.063051290 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2165) (512y: 48) (512z: 9219) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -215,8 +215,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627195e-04 -Avg ME (F77/C++) = 6.6271952779373838E-004 -Relative difference = 4.193891735414155e-08 +Avg ME (F77/C++) = 6.6271952818273971E-004 +Relative difference = 4.252589469696448e-08 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_bridge.txt index fa5a863bc1..4a07905533 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_bridge.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2023-11-24_15:13:56 +DATE: 2024-01-25_23:56:24 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -51,17 +51,17 @@ WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) WARNING! 
Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.632667e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.322906e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.322906e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.650204e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.344236e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.344236e+05 ) sec^-1 MeanMatrixElemValue = ( 4.048178e+00 +- 2.364571e+00 ) GeV^-4 -TOTAL : 0.467438 sec - 2,006,072,261 cycles # 2.897 GHz - 2,979,194,260 instructions # 1.49 insn per cycle - 0.749996630 seconds time elapsed +TOTAL : 0.464619 sec + 2,032,299,547 cycles # 3.010 GHz + 3,040,740,721 instructions # 1.50 insn per cycle + 0.733214185 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost @@ -77,17 +77,17 @@ WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.176934e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.461971e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.461971e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.245584e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.458669e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.458669e+05 ) sec^-1 MeanMatrixElemValue = ( 6.641710e+00 +- 4.994249e+00 ) GeV^-4 -TOTAL : 1.909592 sec - 6,340,042,952 cycles # 2.952 GHz - 13,540,665,643 instructions # 2.14 insn per cycle - 2.204543754 seconds time elapsed +TOTAL : 1.895523 sec + 6,515,109,485 cycles # 3.057 GHz + 13,866,028,547 instructions # 2.13 insn per cycle + 2.190470399 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 @@ -99,132 +99,132 @@ OK (relative difference <= 5E-3) runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.018583e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.019594e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.019594e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.080299e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.081327e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.081327e+03 ) sec^-1 MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4 -TOTAL : 8.135657 sec - 24,647,759,447 cycles # 3.029 GHz - 78,134,979,688 instructions # 3.17 insn per cycle - 8.140016435 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 3603) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 7.894694 sec + 24,207,279,883 cycles # 3.068 GHz + 75,883,942,743 instructions # 3.13 insn per cycle + 7.899865359 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 3898) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627486e-04 -Avg ME (F77/C++) = 6.6274863266294753E-004 -Relative difference = 4.92840687132121e-08 +Avg ME (C++/C++) = 6.627487e-04 +Avg ME (F77/C++) = 6.6274870439686495E-004 +Relative difference = 6.634286759220428e-09 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.102824e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.116231e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.116231e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.558107e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.572924e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.572924e+03 ) sec^-1 MeanMatrixElemValue = ( 4.060119e+00 +- 2.367901e+00 ) GeV^-4 -TOTAL : 2.319512 sec - 6,477,527,484 cycles # 2.794 GHz - 20,133,025,469 instructions # 3.11 insn per cycle - 2.323961953 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:13763) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 2.181134 sec + 6,508,175,197 cycles # 2.979 GHz + 20,123,398,381 instructions # 3.09 insn per cycle + 2.186113667 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:13237) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627486e-04 -Avg ME (F77/C++) = 6.6274861460025036E-004 -Relative difference = 2.2029847170826283e-08 +Avg ME (C++/C++) = 6.627485e-04 +Avg ME (F77/C++) = 6.6274853360924479E-004 +Relative difference = 5.071191384964548e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.662031e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.669314e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.669314e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.689000e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.696260e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.696260e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 0.997479 sec - 2,842,471,628 cycles # 2.839 GHz - 6,997,876,677 instructions # 2.46 insn per cycle - 1.001720557 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11874) (512y: 0) (512z: 0) +TOTAL : 0.981991 sec + 2,821,624,681 cycles # 2.862 GHz + 7,046,855,732 instructions # 2.50 insn per cycle + 0.986872343 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11604) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627194e-04 -Avg ME (F77/C++) = 6.6271938174574524E-004 -Relative difference = 2.7544470208782633e-08 +Avg ME (C++/C++) = 6.627193e-04 +Avg ME (F77/C++) = 6.6271927529261421E-004 +Relative difference = 3.728182620967159e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.899662e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.908650e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.908650e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.866719e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.875603e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.875603e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 0.874017 sec - 2,497,188,263 cycles # 2.845 GHz - 6,305,318,806 instructions # 2.52 insn per cycle - 0.878481296 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10822) (512y: 43) (512z: 0) +TOTAL : 0.889374 sec + 2,585,713,262 cycles # 2.895 GHz + 6,289,546,575 instructions # 2.43 insn per cycle + 0.894321875 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10320) (512y: 50) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627194e-04 -Avg ME (F77/C++) = 6.6271938174574524E-004 -Relative difference = 2.7544470208782633e-08 +Avg ME (C++/C++) = 6.627193e-04 +Avg ME (F77/C++) = 6.6271927529261421E-004 +Relative difference = 3.728182620967159e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.510965e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.516631e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.516631e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.551570e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.557637e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.557637e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4 -TOTAL : 1.096117 sec - 2,057,326,282 cycles # 1.871 GHz - 3,276,156,329 instructions # 1.59 insn per cycle - 1.100460049 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2414) (512y: 46) (512z: 9571) +TOTAL : 1.068223 sec + 2,044,230,196 cycles # 1.906 GHz + 3,257,615,420 instructions # 1.59 insn per cycle + 1.073232409 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2165) (512y: 48) (512z: 9219) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -232,8 +232,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627195e-04 -Avg ME (F77/C++) = 6.6271952779373838E-004 -Relative difference = 4.193891735414155e-08 +Avg ME (F77/C++) = 6.6271952818273971E-004 +Relative difference = 4.252589469696448e-08 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_common.txt index cde090527d..ffde6a98bb 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_common.txt @@ -41,23 +41,23 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2023-11-24_15:25:47 +DATE: 2024-01-26_00:08:08 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.341986e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.393716e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.399601e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.325797e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.377030e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.382308e+05 ) sec^-1 MeanMatrixElemValue = ( 4.159397e-01 +- 3.238804e-01 ) GeV^-4 -TOTAL : 0.465286 sec - 1,996,862,139 cycles # 2.938 GHz - 2,998,262,282 instructions # 1.50 insn per cycle - 0.737241303 seconds time elapsed +TOTAL : 0.463350 sec + 2,011,391,423 cycles # 2.993 GHz + 3,025,832,528 instructions # 1.50 insn per cycle + 0.731209834 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --common WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -65,17 +65,17 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ......................................................................... runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --common OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.562727e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.635286e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.638678e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.551198e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.624622e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.627836e+05 ) sec^-1 MeanMatrixElemValue = ( 1.094367e+02 +- 1.071509e+02 ) GeV^-4 -TOTAL : 1.800198 sec - 6,027,791,673 cycles # 2.972 GHz - 12,131,730,184 instructions # 2.01 insn per cycle - 2.091221198 seconds time elapsed +TOTAL : 1.798766 sec + 6,178,989,401 cycles # 3.055 GHz + 12,447,116,980 instructions # 2.01 insn per cycle + 2.088422920 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 @@ -86,128 +86,128 @@ OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe -p 64 256 1 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.988613e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.989604e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.989604e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.208459e-01 +- 3.253446e-01 ) GeV^-4 -TOTAL : 8.257430 sec - 24,655,829,370 cycles # 2.985 GHz - 78,128,720,285 instructions # 3.17 insn per cycle - 8.261783085 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 3603) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.079376e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.080406e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.080406e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.208458e-01 +- 3.253446e-01 ) GeV^-4 +TOTAL : 7.897135 sec + 24,196,179,399 cycles # 3.063 GHz + 75,879,578,983 instructions # 3.14 insn per cycle + 7.902000758 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 3898) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627486e-04 -Avg ME (F77/C++) = 6.6274863266294753E-004 -Relative difference = 4.92840687132121e-08 +Avg ME (C++/C++) = 6.627487e-04 +Avg ME (F77/C++) = 6.6274870439686495E-004 +Relative difference = 6.634286759220428e-09 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 1 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.090825e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.103638e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.103638e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.208457e-01 +- 3.253445e-01 ) GeV^-4 -TOTAL : 2.321920 sec - 6,463,785,266 cycles # 2.780 GHz - 20,119,008,709 instructions # 3.11 insn per cycle - 2.326112409 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:13763) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 7.608061e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.622592e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.622592e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.208458e-01 +- 3.253446e-01 ) GeV^-4 +TOTAL : 2.164790 sec + 6,497,805,412 cycles # 2.997 GHz + 20,112,493,788 instructions # 3.10 insn per cycle + 2.169281994 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:13237) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627486e-04 -Avg ME (F77/C++) = 6.6274861460025036E-004 -Relative difference = 2.2029847170826283e-08 +Avg ME (C++/C++) = 6.627485e-04 +Avg ME (F77/C++) = 6.6274853360924479E-004 +Relative difference = 5.071191384964548e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 1 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.643399e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.650724e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.650724e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.214978e-01 +- 3.255521e-01 ) GeV^-4 -TOTAL : 1.007824 sec - 2,846,106,580 cycles # 2.814 GHz - 6,988,034,221 instructions # 2.46 insn per cycle - 1.012119516 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11874) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.717181e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.724647e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.724647e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.214979e-01 +- 3.255522e-01 ) GeV^-4 +TOTAL : 0.963993 sec + 2,817,463,010 cycles # 2.912 GHz + 7,034,615,846 instructions # 2.50 insn per cycle + 0.968756498 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11604) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627194e-04 -Avg ME (F77/C++) = 6.6271938174574524E-004 -Relative difference = 2.7544470208782633e-08 +Avg ME (C++/C++) = 6.627193e-04 +Avg ME (F77/C++) = 6.6271927529261421E-004 +Relative difference = 3.728182620967159e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe -p 64 256 1 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.842696e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.851591e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.851591e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.214978e-01 +- 3.255521e-01 ) GeV^-4 -TOTAL : 0.900263 sec - 2,494,778,530 cycles # 2.760 GHz - 6,293,767,609 instructions # 2.52 insn per cycle - 0.904816892 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10822) (512y: 43) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.938377e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.947760e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.947760e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.214979e-01 +- 3.255522e-01 ) GeV^-4 +TOTAL : 0.856285 sec + 2,482,222,517 cycles # 2.887 GHz + 6,277,367,771 instructions # 2.53 insn per cycle + 0.860711348 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10320) (512y: 50) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627194e-04 -Avg ME (F77/C++) = 6.6271938174574524E-004 -Relative difference = 2.7544470208782633e-08 +Avg ME (C++/C++) = 6.627193e-04 +Avg ME (F77/C++) = 6.6271927529261421E-004 +Relative difference = 3.728182620967159e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe -p 64 256 1 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.422519e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.427987e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.427987e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.566131e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.572297e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.572297e+04 ) sec^-1 MeanMatrixElemValue = ( 4.214981e-01 +- 3.255523e-01 ) GeV^-4 -TOTAL : 1.162978 sec - 2,054,015,790 cycles # 1.761 GHz - 3,264,160,600 instructions # 1.59 insn per cycle - 1.167204725 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2414) (512y: 46) (512z: 9571) +TOTAL : 1.056318 sec + 2,035,165,266 cycles # 1.920 GHz + 3,243,579,392 instructions # 1.59 insn per cycle + 1.060894396 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2165) (512y: 48) (512z: 9219) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -215,8 +215,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627195e-04 -Avg ME (F77/C++) = 6.6271952779373838E-004 -Relative difference = 4.193891735414155e-08 +Avg ME (F77/C++) = 6.6271952818273971E-004 +Relative difference = 4.252589469696448e-08 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_curhst.txt index 3a3eb3caf7..b15d42b6e4 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_curhst.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_curhst.txt @@ -41,23 +41,23 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2023-11-24_15:22:26 +DATE: 2024-01-26_00:04:48 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --curhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.336723e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.390423e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.395725e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.331075e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.388445e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.393715e+05 ) sec^-1 MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4 -TOTAL : 0.463745 sec - 1,978,530,916 cycles # 2.937 GHz - 2,900,092,077 instructions # 1.47 insn per cycle - 0.732414298 seconds time elapsed +TOTAL : 0.460547 sec + 2,003,713,802 cycles # 2.992 GHz + 2,989,875,393 instructions # 1.49 insn per cycle + 0.728329211 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --curhst WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -65,17 +65,17 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ......................................................................... runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --curhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.551039e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.623038e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.626383e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.563961e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.637851e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.641070e+05 ) sec^-1 MeanMatrixElemValue = ( 6.664703e+00 +- 5.072736e+00 ) GeV^-4 -TOTAL : 1.755684 sec - 5,914,588,617 cycles # 2.977 GHz - 12,946,533,243 instructions # 2.19 insn per cycle - 2.043185741 seconds time elapsed +TOTAL : 1.744057 sec + 6,014,413,547 cycles # 3.054 GHz + 13,039,651,018 instructions # 2.17 insn per cycle + 2.026578738 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 @@ -86,128 +86,128 @@ OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.015505e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.016502e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.016502e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.079528e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.080547e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.080547e+03 ) sec^-1 MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4 -TOTAL : 8.145261 sec - 24,654,200,089 cycles # 3.026 GHz - 78,126,847,348 instructions # 3.17 insn per cycle - 8.149417178 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 3603) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 7.898001 sec + 24,218,623,224 cycles # 3.066 GHz + 75,876,649,489 instructions # 3.13 insn per cycle + 7.902833708 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 3898) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627486e-04 -Avg ME (F77/C++) = 6.6274863266294753E-004 -Relative difference = 4.92840687132121e-08 +Avg ME (C++/C++) = 6.627487e-04 +Avg ME (F77/C++) = 6.6274870439686495E-004 +Relative difference = 6.634286759220428e-09 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.056381e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.069788e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.069788e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.357606e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.371431e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.371431e+03 ) sec^-1 MeanMatrixElemValue = ( 4.060119e+00 +- 2.367901e+00 ) GeV^-4 -TOTAL : 2.332327 sec - 6,459,567,003 cycles # 2.766 GHz - 20,120,739,754 instructions # 3.11 insn per cycle - 2.336953548 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:13763) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 2.237158 sec + 6,497,866,331 cycles # 2.900 GHz + 20,114,366,949 instructions # 3.10 insn per cycle + 2.241831657 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:13237) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627486e-04 -Avg ME (F77/C++) = 6.6274861460025036E-004 -Relative difference = 2.2029847170826283e-08 +Avg ME (C++/C++) = 6.627485e-04 +Avg ME (F77/C++) = 6.6274853360924479E-004 +Relative difference = 5.071191384964548e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.634359e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.641184e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.641184e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.707623e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.714979e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.714979e+04 ) sec^-1
 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4
-TOTAL : 1.011258 sec
- 2,838,489,472 cycles # 2.797 GHz
- 6,988,461,226 instructions # 2.46 insn per cycle
- 1.015381469 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11874) (512y: 0) (512z: 0)
+TOTAL : 0.968582 sec
+ 2,812,838,354 cycles # 2.893 GHz
+ 7,037,078,857 instructions # 2.50 insn per cycle
+ 0.973180280 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11604) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest.exe
 [ PASSED ] 6 tests.
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 6.627194e-04
-Avg ME (F77/C++) = 6.6271938174574524E-004
-Relative difference = 2.7544470208782633e-08
+Avg ME (C++/C++) = 6.627193e-04
+Avg ME (F77/C++) = 6.6271927529261421E-004
+Relative difference = 3.728182620967159e-08
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.722994e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.730625e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.730625e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.914327e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.923385e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.923385e+04 ) sec^-1
 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4
-TOTAL : 0.960124 sec
- 2,489,397,951 cycles # 2.584 GHz
- 6,296,004,783 instructions # 2.53 insn per cycle
- 0.964386434 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10822) (512y: 43) (512z: 0)
+TOTAL : 0.865359 sec
+ 2,481,582,764 cycles # 2.857 GHz
+ 6,280,640,305 instructions # 2.53 insn per cycle
+ 0.870009069 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10320) (512y: 50) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest.exe
 [ PASSED ] 6 tests.
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 6.627194e-04
-Avg ME (F77/C++) = 6.6271938174574524E-004
-Relative difference = 2.7544470208782633e-08
+Avg ME (C++/C++) = 6.627193e-04
+Avg ME (F77/C++) = 6.6271927529261421E-004
+Relative difference = 3.728182620967159e-08
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.483765e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.489132e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.489132e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.556930e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.563015e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.563015e+04 ) sec^-1
 MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4
-TOTAL : 1.113558 sec
- 2,047,316,573 cycles # 1.833 GHz
- 3,265,815,382 instructions # 1.60 insn per cycle
- 1.117668257 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2414) (512y: 46) (512z: 9571)
+TOTAL : 1.061662 sec
+ 2,034,090,458 cycles # 1.910 GHz
+ 3,247,290,037 instructions # 1.60 insn per cycle
+ 1.066231690 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2165) (512y: 48) (512z: 9219)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest.exe
 [ PASSED ] 6 tests.
@@ -215,8 +215,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2
 Avg ME (C++/C++) = 6.627195e-04
-Avg ME (F77/C++) = 6.6271952779373838E-004
-Relative difference = 4.193891735414155e-08
+Avg ME (F77/C++) = 6.6271952818273971E-004
+Relative difference = 4.252589469696448e-08
 OK (relative difference <= 5E-3)
 =========================================================================
diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_rmbhst.txt
index 474faabf1c..c1d0387cb2 100644
--- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_rmbhst.txt
+++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_rmbhst.txt
@@ -41,24 +41,24 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
-DATE: 2023-11-24_15:19:08
+DATE: 2024-01-26_00:01:31
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --rmbhst OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
-Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:FLT+THX:CURHST+RMBHST+MESDEV/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 5.719331e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.371583e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.377724e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 5.756369e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 6.376369e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.382275e+05 ) sec^-1
 MeanMatrixElemValue = ( 4.048178e+00 +- 2.364571e+00 ) GeV^-4
-TOTAL : 0.465990 sec
- 1,978,008,980 cycles # 2.919 GHz
- 2,922,355,478 instructions # 1.48 insn per cycle
- 0.734927742 seconds time elapsed
+TOTAL : 0.463063 sec
+ 2,022,094,477 cycles # 3.012 GHz
+ 3,000,110,742 instructions # 1.48 insn per cycle
+ 0.730659620 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --rmbhst
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
@@ -68,17 +68,17 @@ WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --rmbhst OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
-Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:FLT+THX:CURHST+RMBHST+MESDEV/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 7.466647e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.638185e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.641583e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 7.489937e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 8.624029e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.627434e+05 ) sec^-1
 MeanMatrixElemValue = ( 6.641710e+00 +- 4.994249e+00 ) GeV^-4
-TOTAL : 1.827648 sec
- 6,079,488,382 cycles # 2.960 GHz
- 11,657,968,792 instructions # 1.92 insn per cycle
- 2.117399258 seconds time elapsed
+TOTAL : 1.821843 sec
+ 6,284,899,485 cycles # 3.070 GHz
+ 12,329,882,702 instructions # 1.96 insn per cycle
+ 2.114503584 seconds time elapsed
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2
@@ -89,128 +89,128 @@ OK (relative difference <= 5E-3)
 =========================================================================
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.995150e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.996157e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.996157e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.076757e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.077790e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.077790e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4
-TOTAL : 8.229536 sec
- 24,636,759,314 cycles # 2.993 GHz
- 78,130,437,860 instructions # 3.17 insn per cycle
- 8.233776212 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 3603) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 7.905540 sec
+ 24,208,958,642 cycles # 3.061 GHz
+ 75,876,487,136 instructions # 3.13 insn per cycle
+ 7.910104097 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 3898) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest.exe
 [ PASSED ] 6 tests.
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 6.627486e-04
-Avg ME (F77/C++) = 6.6274863266294753E-004
-Relative difference = 4.92840687132121e-08
+Avg ME (C++/C++) = 6.627487e-04
+Avg ME (F77/C++) = 6.6274870439686495E-004
+Relative difference = 6.634286759220428e-09
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 7.265246e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.278746e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.278746e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 7.331082e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.344446e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.344446e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.060119e+00 +- 2.367901e+00 ) GeV^-4
-TOTAL : 2.265483 sec
- 6,468,059,761 cycles # 2.851 GHz
- 20,120,655,203 instructions # 3.11 insn per cycle
- 2.269729053 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4:13763) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 2.244902 sec
+ 6,498,402,067 cycles # 2.890 GHz
+ 20,114,181,970 instructions # 3.10 insn per cycle
+ 2.249807684 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4:13237) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest.exe
 [ PASSED ] 6 tests.
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 6.627486e-04
-Avg ME (F77/C++) = 6.6274861460025036E-004
-Relative difference = 2.2029847170826283e-08
+Avg ME (C++/C++) = 6.627485e-04
+Avg ME (F77/C++) = 6.6274853360924479E-004
+Relative difference = 5.071191384964548e-08
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.626140e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.632839e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.632839e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.700366e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.707524e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.707524e+04 ) sec^-1
 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4
-TOTAL : 1.016729 sec
- 2,837,492,476 cycles # 2.781 GHz
- 6,988,268,106 instructions # 2.46 insn per cycle
- 1.020920647 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11874) (512y: 0) (512z: 0)
+TOTAL : 0.972557 sec
+ 2,812,510,471 cycles # 2.881 GHz
+ 7,036,988,126 instructions # 2.50 insn per cycle
+ 0.977084151 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11604) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest.exe
 [ PASSED ] 6 tests.
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 6.627194e-04
-Avg ME (F77/C++) = 6.6271938174574524E-004
-Relative difference = 2.7544470208782633e-08
+Avg ME (C++/C++) = 6.627193e-04
+Avg ME (F77/C++) = 6.6271927529261421E-004
+Relative difference = 3.728182620967159e-08
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.871366e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.880395e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.880395e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.940115e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.949570e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.949570e+04 ) sec^-1
 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4
-TOTAL : 0.884395 sec
- 2,487,612,796 cycles # 2.801 GHz
- 6,295,453,513 instructions # 2.53 insn per cycle
- 0.888549870 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10822) (512y: 43) (512z: 0)
+TOTAL : 0.853172 sec
+ 2,477,642,132 cycles # 2.892 GHz
+ 6,279,053,352 instructions # 2.53 insn per cycle
+ 0.857874216 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10320) (512y: 50) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest.exe
 [ PASSED ] 6 tests.
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 6.627194e-04
-Avg ME (F77/C++) = 6.6271938174574524E-004
-Relative difference = 2.7544470208782633e-08
+Avg ME (C++/C++) = 6.627193e-04
+Avg ME (F77/C++) = 6.6271927529261421E-004
+Relative difference = 3.728182620967159e-08
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.476534e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.482059e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.482059e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.554146e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.560290e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.560290e+04 ) sec^-1
 MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4
-TOTAL : 1.119059 sec
- 2,048,523,522 cycles # 1.825 GHz
- 3,265,858,129 instructions # 1.59 insn per cycle
- 1.123246679 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2414) (512y: 46) (512z: 9571)
+TOTAL : 1.063738 sec
+ 2,032,569,225 cycles # 1.904 GHz
+ 3,247,369,313 instructions # 1.60 insn per cycle
+ 1.068352560 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2165) (512y: 48) (512z: 9219)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest.exe
 [ PASSED ] 6 tests.
@@ -218,8 +218,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2
 Avg ME (C++/C++) = 6.627195e-04
-Avg ME (F77/C++) = 6.6271952779373838E-004
-Relative difference = 4.193891735414155e-08
+Avg ME (F77/C++) = 6.6271952818273971E-004
+Relative difference = 4.252589469696448e-08
 OK (relative difference <= 5E-3)
 =========================================================================
diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd1.txt
index 8db08c40cb..343059fb0d 100644
--- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd1.txt
+++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd1.txt
@@ -41,23 +41,23 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd1'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
-DATE: 2023-11-24_14:38:07
+DATE: 2024-01-25_23:10:59
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/gcheck.exe -p 64 256 1 OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1]
+Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 6.342222e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.395573e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.401305e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 6.293388e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 6.353811e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.360577e+05 ) sec^-1
 MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4
-TOTAL : 0.475377 sec
- 2,071,036,372 cycles # 2.995 GHz
- 3,032,826,911 instructions # 1.46 insn per cycle
- 0.762514427 seconds time elapsed
+TOTAL : 0.482578 sec
+ 2,126,384,282 cycles # 2.996 GHz
+ 3,122,872,646 instructions # 1.47 insn per cycle
+ 0.797709837 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/gcheck.exe -p 64 256 1
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
@@ -65,17 +65,17 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 .........................................................................
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1]
+Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 8.503453e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.577079e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.580179e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 8.535930e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 8.625556e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.629357e+05 ) sec^-1
 MeanMatrixElemValue = ( 6.664703e+00 +- 5.072736e+00 ) GeV^-4
-TOTAL : 1.721929 sec
- 5,978,281,541 cycles # 3.061 GHz
- 12,363,091,331 instructions # 2.07 insn per cycle
- 2.009961043 seconds time elapsed
+TOTAL : 1.724570 sec
+ 5,989,683,103 cycles # 3.065 GHz
+ 11,335,601,102 instructions # 1.89 insn per cycle
+ 2.013935385 seconds time elapsed
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/gcheck.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/fgcheck.exe 2 64 2
@@ -86,20 +86,20 @@ OK (relative difference <= 5E-3)
 =========================================================================
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/check.exe -p 64 256 1 OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.071210e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.072246e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.072246e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.101149e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.102221e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.102221e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4
-TOTAL : 7.926211 sec
- 24,590,863,046 cycles # 3.102 GHz
- 77,854,642,024 instructions # 3.17 insn per cycle
- 7.932789655 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 3114) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 7.814813 sec
+ 24,218,095,116 cycles # 3.098 GHz
+ 75,804,256,539 instructions # 3.13 insn per cycle
+ 7.821824558 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 3848) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/runTest.exe
 [ PASSED ] 6 tests.
@@ -107,107 +107,107 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/check.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/fcheck.exe 2 64 2
 Avg ME (C++/C++) = 6.627487e-04
-Avg ME (F77/C++) = 6.6274866268634797E-004
-Relative difference = 5.630135835748959e-08
+Avg ME (F77/C++) = 6.6274870430095556E-004
+Relative difference = 6.489572191632735e-09
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/check.exe -p 64 256 1 OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 7.597544e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.611358e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.611358e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 7.575201e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.590213e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.590213e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.060119e+00 +- 2.367901e+00 ) GeV^-4
-TOTAL : 2.166211 sec
- 6,421,975,036 cycles # 2.959 GHz
- 20,086,123,706 instructions # 3.13 insn per cycle
- 2.181924706 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4:13452) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 2.176296 sec
+ 6,498,546,102 cycles # 2.983 GHz
+ 20,110,977,060 instructions # 3.09 insn per cycle
+ 2.186978505 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4:13231) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/runTest.exe
 [ PASSED ] 6 tests.
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/check.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 6.627486e-04
-Avg ME (F77/C++) = 6.6274861465384638E-004
-Relative difference = 2.211071647257023e-08
+Avg ME (C++/C++) = 6.627485e-04
+Avg ME (F77/C++) = 6.6274853360924479E-004
+Relative difference = 5.071191384964548e-08
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/check.exe -p 64 256 1 OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.634914e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.641813e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.641813e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.705454e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.712850e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.712850e+04 ) sec^-1
 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4
-TOTAL : 1.012157 sec
- 2,920,524,129 cycles # 2.875 GHz
- 7,130,900,907 instructions # 2.44 insn per cycle
- 1.024052491 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2:12261) (512y: 0) (512z: 0)
+TOTAL : 0.969758 sec
+ 2,811,746,943 cycles # 2.886 GHz
+ 7,037,673,997 instructions # 2.50 insn per cycle
+ 0.981392691 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11587) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/runTest.exe
 [ PASSED ] 6 tests.
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/check.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 6.627194e-04
-Avg ME (F77/C++) = 6.6271939668077068E-004
-Relative difference = 5.008498817890231e-09
+Avg ME (C++/C++) = 6.627193e-04
+Avg ME (F77/C++) = 6.6271927529261421E-004
+Relative difference = 3.728182620967159e-08
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd1/check.exe -p 64 256 1 OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.776439e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.783925e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.783925e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.938719e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.947561e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.947561e+04 ) sec^-1
 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4
-TOTAL : 0.931369 sec
- 2,592,272,120 cycles # 2.772 GHz
- 6,439,655,252 instructions # 2.48 insn per cycle
- 0.940797030 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11276) (512y: 27) (512z: 0)
+TOTAL : 0.855569 sec
+ 2,473,527,129 cycles # 2.880 GHz
+ 6,280,128,164 instructions # 2.54 insn per cycle
+ 0.868118493 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10302) (512y: 50) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd1/runTest.exe
 [ PASSED ] 6 tests.
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd1/check.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd1/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 6.627194e-04
-Avg ME (F77/C++) = 6.6271939668077068E-004
-Relative difference = 5.008498817890231e-09
+Avg ME (C++/C++) = 6.627193e-04
+Avg ME (F77/C++) = 6.6271927529261421E-004
+Relative difference = 3.728182620967159e-08
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd1/check.exe -p 64 256 1 OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1]
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.511530e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.517127e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.517127e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.561075e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.567076e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.567076e+04 ) sec^-1
 MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4
-TOTAL : 1.092797 sec
- 2,119,842,752 cycles # 1.933 GHz
- 3,428,582,326 instructions # 1.62 insn per cycle
- 1.103781056 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2911) (512y: 22) (512z: 9647)
+TOTAL : 1.058909 sec
+ 2,035,847,836 cycles # 1.915 GHz
+ 3,247,475,409 instructions # 1.60 insn per cycle
+ 1.072714857 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2140) (512y: 48) (512z: 9219)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd1/runTest.exe
 [ PASSED ] 6 tests.
@@ -215,8 +215,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd1/check.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd1/fcheck.exe 2 64 2
 Avg ME (C++/C++) = 6.627195e-04
-Avg ME (F77/C++) = 6.6271952032322112E-004
-Relative difference = 3.066639970473621e-08
+Avg ME (F77/C++) = 6.6271952818273971E-004
+Relative difference = 4.252589469696448e-08
 OK (relative difference <= 5E-3)
 =========================================================================
diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd0.txt
index 7e38ef2c7b..89d748e060 100644
--- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd0.txt
+++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd0.txt
@@ -41,23 +41,23 @@ CUDACPP_BUILDDIR='build.512z_f_inl1_hrd0'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
-DATE: 2023-11-24_15:04:36
+DATE: 2024-01-25_23:47:05
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/gcheck.exe -p 64 256 1 OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=1] [hardcodePARAM=0]
 Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 5.586198e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.628840e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 5.633485e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 5.574102e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.615260e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.619584e+05 ) sec^-1
 MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4
-TOTAL : 0.485828 sec
- 2,082,184,633 cycles # 2.938 GHz
- 3,131,974,897 instructions # 1.50 insn per cycle
- 0.768933691 seconds time elapsed
+TOTAL : 0.484525 sec
+ 2,126,112,707 cycles # 3.011 GHz
+ 3,203,427,705 instructions # 1.51 insn per cycle
+ 0.766056742 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/gcheck.exe -p 64 256 1
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
@@ -65,17 +65,17 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 .........................................................................
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/gcheck.exe -p 2048 256 1 OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=1] [hardcodePARAM=0]
 Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 7.739364e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.800371e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.802973e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 7.692622e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.752719e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.755365e+05 ) sec^-1
 MeanMatrixElemValue = ( 6.664703e+00 +- 5.072736e+00 ) GeV^-4
-TOTAL : 1.856072 sec
- 6,264,858,693 cycles # 2.996 GHz
- 12,861,475,785 instructions # 2.05 insn per cycle
- 2.150989164 seconds time elapsed
+TOTAL : 1.852475 sec
+ 6,402,505,000 cycles # 3.066 GHz
+ 13,202,162,532 instructions # 2.06 insn per cycle
+ 2.145533834 seconds time elapsed
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/gcheck.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/fgcheck.exe 2 64 2
@@ -86,47 +86,47 @@ OK (relative difference <= 5E-3)
 =========================================================================
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/check.exe -p 64 256 1 OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 5.664072e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.664878e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 5.664878e+02 ) sec^-1
-MeanMatrixElemValue = ( 4.059969e+00 +- 2.367799e+00 ) GeV^-4
-TOTAL : 28.963182 sec
- 87,150,309,684 cycles # 3.009 GHz
- 135,629,504,187 instructions # 1.56 insn per cycle
- 28.967350767 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4:15563) (avx2: 0) (512y: 0) (512z: 0)
+EvtsPerSec[Rmb+ME] (23) = ( 5.858562e+02 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.859395e+02 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.859395e+02 ) sec^-1
+MeanMatrixElemValue = ( 4.059968e+00 +- 2.367799e+00 ) GeV^-4
+TOTAL : 28.002108 sec
+ 86,058,643,411 cycles # 3.074 GHz
+ 133,995,703,913 instructions # 1.56 insn per cycle
+ 28.007044175 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4:16123) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/runTest.exe
 [ PASSED ] 6 tests.
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/check.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 6.627534e-04
-Avg ME (F77/C++) = 6.6275340277317796E-004
-Relative difference = 4.184328521943034e-09
+Avg ME (C++/C++) = 6.627535e-04
+Avg ME (F77/C++) = 6.6275354356437610E-004
+Relative difference = 6.573239683366044e-08
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/check.exe -p 64 256 1 OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 7.045716e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.058481e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.058481e+03 ) sec^-1
-MeanMatrixElemValue = ( 4.059962e+00 +- 2.367792e+00 ) GeV^-4
-TOTAL : 2.336070 sec
- 6,777,659,542 cycles # 2.897 GHz
- 19,385,758,717 instructions # 2.86 insn per cycle
- 2.340385269 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4:69681) (avx2: 0) (512y: 0) (512z: 0)
+EvtsPerSec[Rmb+ME] (23) = ( 7.350588e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.364586e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.364586e+03 ) sec^-1
+MeanMatrixElemValue = ( 4.059961e+00 +- 2.367791e+00 ) GeV^-4
+TOTAL : 2.243608 sec
+ 6,715,678,750 cycles # 2.991 GHz
+ 19,163,488,382 instructions # 2.85 insn per cycle
+ 2.248422734 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4:68898) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/runTest.exe
 [ PASSED ] 6 tests.
@@ -134,80 +134,80 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/check.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/fcheck.exe 2 64 2
 Avg ME (C++/C++) = 6.627486e-04
-Avg ME (F77/C++) = 6.6274862707273868E-004
-Relative difference = 4.0849182767952624e-08
+Avg ME (F77/C++) = 6.6274859783433532E-004
+Relative difference = 3.2677016209485094e-09
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/check.exe -p 64 256 1 OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.472442e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.477968e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.477968e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.522127e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.527994e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.527994e+04 ) sec^-1
 MeanMatrixElemValue = ( 4.060903e+00 +- 2.367376e+00 ) GeV^-4
-TOTAL : 1.122029 sec
- 3,176,939,727 cycles # 2.822 GHz
- 6,808,012,735 instructions # 2.14 insn per cycle
- 1.126367927 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2:49077) (512y: 0) (512z: 0)
+TOTAL : 1.086187 sec
+ 3,143,488,591 cycles # 2.883 GHz
+ 6,746,651,283 instructions # 2.15 insn per cycle
+ 1.091333600 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2:48625) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/runTest.exe
 [ PASSED ] 6 tests.
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/check.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 6.627273e-04
-Avg ME (F77/C++) = 6.6272731558747466E-004
-Relative difference = 2.3520194007978538e-08
+Avg ME (C++/C++) = 6.627272e-04
+Avg ME (F77/C++) = 6.6272724143469353E-004
+Relative difference = 6.252149235286529e-08
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd0/check.exe -p 64 256 1 OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.771353e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.779044e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.779044e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.848844e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.857285e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.857285e+04 ) sec^-1
 MeanMatrixElemValue = ( 4.060903e+00 +- 2.367376e+00 ) GeV^-4
-TOTAL : 0.933958 sec
- 2,648,978,339 cycles # 2.826 GHz
- 5,986,332,919 instructions # 2.26 insn per cycle
- 0.938142599 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2:42677) (512y: 11) (512z: 0)
+TOTAL : 0.895639 sec
+ 2,605,116,969 cycles # 2.896 GHz
+ 5,931,025,046 instructions # 2.28 insn per cycle
+ 0.900480284 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2:42219) (512y: 24) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd0/runTest.exe
 [ PASSED ] 6 tests.
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd0/check.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd0/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 6.627273e-04
-Avg ME (F77/C++) = 6.6272731558747466E-004
-Relative difference = 2.3520194007978538e-08
+Avg ME (C++/C++) = 6.627272e-04
+Avg ME (F77/C++) = 6.6272724143469353E-004
+Relative difference = 6.252149235286529e-08
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd0/check.exe -p 64 256 1 OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.477195e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.482900e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.482900e+04 ) sec^-1
-MeanMatrixElemValue = ( 4.060904e+00 +- 2.367377e+00 ) GeV^-4
-TOTAL : 1.118598 sec
- 2,077,867,589 cycles # 1.852 GHz
- 3,500,933,885 instructions # 1.68 insn per cycle
- 1.122888926 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 5197) (512y: 3) (512z:44822)
+EvtsPerSec[Rmb+ME] (23) = ( 1.544175e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.550163e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.550163e+04 ) sec^-1
+MeanMatrixElemValue = ( 4.060905e+00 +- 2.367377e+00 ) GeV^-4
+TOTAL : 1.070662 sec
+ 2,050,751,478 cycles # 1.909 GHz
+ 3,435,534,795 instructions # 1.68 insn per cycle
+ 1.075337528 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4188) (512y: 9) (512z:44489)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd0/runTest.exe
 [ PASSED ] 6 tests.
@@ -215,8 +215,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd0/check.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd0/fcheck.exe 2 64 2
 Avg ME (C++/C++) = 6.627275e-04
-Avg ME (F77/C++) = 6.6272750363879224E-004
-Relative difference = 5.490631193034436e-09
+Avg ME (F77/C++) = 6.6272748295826550E-004
+Relative difference = 2.5714542480216212e-08
 OK (relative difference <= 5E-3)
 =========================================================================
diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd1.txt
index be288b051a..2cddbacf89 100644
--- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd1.txt
+++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd1.txt
@@ -41,23 +41,23 @@ CUDACPP_BUILDDIR='build.512z_f_inl1_hrd1'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
-DATE: 2023-11-24_15:05:29
+DATE: 2024-01-25_23:47:57
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/gcheck.exe -p 64 256 1 OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1]
+Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=1] [hardcodePARAM=1]
 Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 5.522303e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.562037e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 5.566663e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 5.556980e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.597321e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.601884e+05 ) sec^-1
 MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4
-TOTAL : 0.487689 sec
- 2,083,511,350 cycles # 2.934 GHz
- 3,089,371,073 instructions # 1.48 insn per cycle
- 0.770457246 seconds time elapsed
+TOTAL : 0.484078 sec
+ 2,121,900,012 cycles # 3.005 GHz
+ 3,151,336,456 instructions # 1.49 insn per cycle
+ 0.765054667 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/gcheck.exe -p 64 256 1
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
@@ -65,17 +65,17 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 .........................................................................
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/gcheck.exe -p 2048 256 1 OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1]
+Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=1] [hardcodePARAM=1]
 Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 7.692823e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.753155e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.755861e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 7.644272e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.702490e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.705102e+05 ) sec^-1
 MeanMatrixElemValue = ( 6.664703e+00 +- 5.072736e+00 ) GeV^-4
-TOTAL : 1.857687 sec
- 6,225,343,404 cycles # 2.971 GHz
- 13,475,402,986 instructions # 2.16 insn per cycle
- 2.151878134 seconds time elapsed
+TOTAL : 1.859096 sec
+ 6,341,012,252 cycles # 3.030 GHz
+ 12,731,535,061 instructions # 2.01 insn per cycle
+ 2.152397231 seconds time elapsed
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/gcheck.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/fgcheck.exe 2 64 2
@@ -86,47 +86,47 @@ OK (relative difference <= 5E-3)
 =========================================================================
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/check.exe -p 64 256 1 OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1]
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 5.702624e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.703436e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 5.703436e+02 ) sec^-1
-MeanMatrixElemValue = ( 4.059969e+00 +- 2.367799e+00 ) GeV^-4
-TOTAL : 28.768124 sec
- 86,390,205,800 cycles # 3.004 GHz
- 135,908,144,933 instructions # 1.57 insn per cycle
- 28.772428929 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4:15910) (avx2: 0) (512y: 0) (512z: 0)
+EvtsPerSec[Rmb+ME] (23) = ( 5.881844e+02 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.882693e+02 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.882693e+02 ) sec^-1
+MeanMatrixElemValue = ( 4.059968e+00 +- 2.367799e+00 ) GeV^-4
+TOTAL : 27.890812 sec
+ 86,057,677,015 cycles # 3.086 GHz
+ 134,115,301,052 instructions # 1.56 insn per cycle
+ 27.895583882 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4:16109) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/runTest.exe
 [ PASSED ] 6 tests.
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/check.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 6.627535e-04
-Avg ME (F77/C++) = 6.6275352674967369E-004
-Relative difference = 4.0361421941458736e-08
+Avg ME (C++/C++) = 6.627536e-04
+Avg ME (F77/C++) = 6.6275357377482830E-004
+Relative difference = 3.95700176737784e-08
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/check.exe -p 64 256 1 OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1]
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 7.022155e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.035059e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.035059e+03 ) sec^-1
-MeanMatrixElemValue = ( 4.059962e+00 +- 2.367792e+00 ) GeV^-4
-TOTAL : 2.343932 sec
- 6,861,828,247 cycles # 2.923 GHz
- 19,438,837,198 instructions # 2.83 insn per cycle
- 2.348614161 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4:69723) (avx2: 0) (512y: 0) (512z: 0)
+EvtsPerSec[Rmb+ME] (23) = ( 7.177991e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.191361e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.191361e+03 ) sec^-1
+MeanMatrixElemValue = ( 4.059961e+00 +- 2.367791e+00 ) GeV^-4
+TOTAL : 2.293306 sec
+ 6,709,038,006 cycles # 2.921 GHz
+ 19,223,663,179 instructions # 2.87 insn per cycle
+ 2.298517207 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4:68882) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/runTest.exe
 [ PASSED ] 6 tests.
@@ -134,80 +134,80 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/check.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/fcheck.exe 2 64 2
 Avg ME (C++/C++) = 6.627486e-04
-Avg ME (F77/C++) = 6.6274862764021530E-004
-Relative difference = 4.170542995014107e-08
+Avg ME (F77/C++) = 6.6274859765498573E-004
+Relative difference = 3.538316437387639e-09
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/check.exe -p 64 256 1 OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1]
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.496126e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.501849e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.501849e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.557084e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.563309e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.563309e+04 ) sec^-1
 MeanMatrixElemValue = ( 4.060903e+00 +- 2.367376e+00 ) GeV^-4
-TOTAL : 1.104423 sec
- 3,132,664,979 cycles # 2.827 GHz
- 6,718,780,084 instructions # 2.14 insn per cycle
- 1.108980849 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2:47667) (512y: 0) (512z: 0)
+TOTAL : 1.061780 sec
+ 3,077,526,851 cycles # 2.889 GHz
+ 6,686,174,538 instructions # 2.17 insn per cycle
+ 1.066609744 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2:47416) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/runTest.exe
 [ PASSED ] 6 tests.
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/check.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 6.627273e-04
-Avg ME (F77/C++) = 6.6272731651051409E-004
-Relative difference = 2.4912983202981302e-08
+Avg ME (C++/C++) = 6.627272e-04
+Avg ME (F77/C++) = 6.6272724133897148E-004
+Relative difference = 6.237705578619894e-08
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd1/check.exe -p 64 256 1 OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1]
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.776930e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.784946e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.784946e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.838034e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.846640e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.846640e+04 ) sec^-1
 MeanMatrixElemValue = ( 4.060903e+00 +- 2.367376e+00 ) GeV^-4
-TOTAL : 0.932427 sec
- 2,628,346,991 cycles # 2.809 GHz
- 5,969,517,035 instructions # 2.27 insn per cycle
- 0.937063186 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2:41842) (512y: 13) (512z: 0)
+TOTAL : 0.900730 sec
+ 2,602,110,456 cycles # 2.877 GHz
+ 5,935,535,699 instructions # 2.28 insn per cycle
+ 0.905541318 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2:41564) (512y: 18) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd1/runTest.exe
 [ PASSED ] 6 tests.
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd1/check.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd1/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 6.627273e-04
-Avg ME (F77/C++) = 6.6272731651051409E-004
-Relative difference = 2.4912983202981302e-08
+Avg ME (C++/C++) = 6.627272e-04
+Avg ME (F77/C++) = 6.6272724133897148E-004
+Relative difference = 6.237705578619894e-08
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd1/check.exe -p 64 256 1 OMP=
 WARNING!
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.476207e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.481770e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.481770e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.060904e+00 +- 2.367377e+00 ) GeV^-4 -TOTAL : 1.119825 sec - 2,078,177,152 cycles # 1.850 GHz - 3,494,307,916 instructions # 1.68 insn per cycle - 1.124539671 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4161) (512y: 4) (512z:44465) +EvtsPerSec[Rmb+ME] (23) = ( 1.543440e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.549672e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.549672e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060905e+00 +- 2.367377e+00 ) GeV^-4 +TOTAL : 1.071317 sec + 2,054,664,150 cycles # 1.911 GHz + 3,422,654,664 instructions # 1.67 insn per cycle + 1.076197262 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3375) (512y: 11) (512z:43966) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -215,8 +215,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd1/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627275e-04 -Avg ME (F77/C++) = 6.6272750384530066E-004 -Relative difference = 5.80223501432476e-09 +Avg ME (F77/C++) = 6.6272749650985591E-004 +Relative difference = 5.26633351741962e-09 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt index 5f7b3a9875..1ce895c12a 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt @@ -41,23 +41,23 @@ CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2023-11-24_14:38:36 +DATE: 2024-01-25_23:11:28 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/gcheck.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.457884e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.485326e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.487559e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.511599e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.546369e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.548884e+05 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.520964 sec - 2,264,629,245 cycles # 3.014 GHz - 3,506,769,468 instructions # 1.55 insn per cycle - 0.829035526 seconds time elapsed +TOTAL : 0.520763 sec + 2,268,786,464 cycles # 3.011 GHz + 3,443,527,857 instructions # 1.52 insn per cycle + 0.825127284 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/gcheck.exe -p 64 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -65,17 +65,17 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ......................................................................... runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.121692e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.155469e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.156924e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.111766e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.145403e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.146782e+05 ) sec^-1 MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.040169 sec - 10,113,736,424 cycles # 3.067 GHz - 21,679,010,671 instructions # 2.14 insn per cycle - 3.353783770 seconds time elapsed +TOTAL : 3.044517 sec + 9,929,651,043 cycles # 3.007 GHz + 21,357,445,786 instructions # 2.15 insn per cycle + 3.358909765 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/fgcheck.exe 2 64 2 @@ -86,20 +86,20 @@ OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.970949e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.971967e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.971967e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.891705e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.892598e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.892598e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 8.331676 sec - 25,844,448,081 cycles # 3.101 GHz - 79,435,783,122 instructions # 3.07 insn per cycle - 8.337986155 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 4858) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 8.680046 sec + 26,811,670,006 cycles # 3.088 GHz + 82,457,346,695 instructions # 3.08 insn per cycle + 8.687362012 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 6623) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -113,20 +113,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/check.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.738000e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.741428e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.741428e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.768742e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.772112e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.772112e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 4.396976 sec - 12,662,050,046 cycles # 2.877 GHz - 38,549,909,567 instructions # 3.04 insn per cycle - 4.410716297 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:13163) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 4.361579 sec + 12,624,387,353 cycles # 2.892 GHz + 38,536,661,841 instructions # 3.05 insn per cycle + 4.376083454 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:12755) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -140,20 +140,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.572381e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.589867e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.589867e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.633153e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.651561e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.651561e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.922338 sec - 5,514,460,304 cycles # 2.862 GHz - 13,481,024,869 instructions # 2.44 insn per cycle - 1.938160725 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11242) (512y: 0) (512z: 0) +TOTAL : 1.909280 sec + 5,542,414,967 cycles # 2.896 GHz + 13,582,418,288 instructions # 2.45 insn per cycle + 1.923129371 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10944) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -161,26 +161,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266730409276857E-004 -Relative difference = 2.956342832710188e-07 +Avg ME (F77/C++) = 6.6266730409276836E-004 +Relative difference = 2.9563428359824236e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.241393e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.262450e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.262450e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.711541e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.734298e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.734298e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.784180 sec - 4,876,372,587 cycles # 2.727 GHz - 12,135,890,910 instructions # 2.49 insn per cycle - 1.800822098 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10154) (512y: 79) (512z: 0) +TOTAL : 1.699762 sec + 4,836,144,559 cycles # 2.839 GHz + 12,109,325,012 instructions # 2.50 insn per cycle + 1.712753610 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 9682) (512y: 76) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -188,26 +188,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266730409276857E-004 -Relative difference = 2.956342832710188e-07 +Avg ME (F77/C++) = 6.6266730409276836E-004 +Relative difference = 2.9563428359824236e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.118322e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.130873e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.130873e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.559990e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.574675e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.574675e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.314160 sec - 4,150,527,960 cycles # 1.791 GHz - 6,337,492,716 instructions # 1.53 insn per cycle - 2.327321030 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1803) (512y: 93) (512z: 9358) +TOTAL : 2.179561 sec + 4,095,330,827 cycles # 1.875 GHz + 6,282,635,897 instructions # 1.53 insn per cycle + 2.192150470 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1528) (512y: 76) (512z: 9010) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -215,8 +215,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266730409276857E-004 -Relative difference = 2.956342832710188e-07 +Avg ME (F77/C++) = 6.6266730409276836E-004 +Relative difference = 2.9563428359824236e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd1.txt index d8b5e539f7..0c49affce5 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd1.txt @@ -41,23 +41,23 @@ CUDACPP_BUILDDIR='build.512z_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2023-11-24_14:39:13 +DATE: 2024-01-25_23:12:05 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/gcheck.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.493159e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.521560e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.523711e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.483687e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.517313e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.520337e+05 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.520820 sec - 2,242,158,202 cycles # 2.985 GHz - 3,540,678,032 instructions # 1.58 insn per cycle - 0.821178917 seconds time elapsed +TOTAL : 0.518686 sec + 2,258,260,247 cycles # 3.007 GHz + 3,497,479,611 instructions # 1.55 insn per cycle + 0.820930555 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/gcheck.exe -p 64 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -65,17 +65,17 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ......................................................................... runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.120115e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.154154e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.155531e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.145573e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.179849e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.181256e+05 ) sec^-1 MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.032729 sec - 10,102,528,341 cycles # 3.074 GHz - 22,682,645,762 instructions # 2.25 insn per cycle - 3.344844132 seconds time elapsed +TOTAL : 3.021058 sec + 10,039,919,108 cycles # 3.066 GHz + 22,491,054,377 instructions # 2.24 insn per cycle + 3.334343751 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/fgcheck.exe 2 64 2 @@ -86,20 +86,20 @@ OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.963020e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.963993e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.963993e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.894529e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.895406e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.895406e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 8.365135 sec - 25,986,900,256 cycles # 3.106 GHz - 79,446,545,297 instructions # 3.06 insn per cycle - 8.371739931 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 4505) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 8.670860 sec + 26,766,644,461 cycles # 3.087 GHz + 82,359,315,759 instructions # 3.08 insn per cycle + 8.678080922 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 6491) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -113,20 +113,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/check.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.587282e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.590429e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.590429e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.760878e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.764183e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.764183e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 4.581482 sec - 12,652,204,532 cycles # 2.760 GHz - 38,520,894,837 instructions # 3.04 insn per cycle - 4.597176677 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:12930) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 4.371292 sec + 12,657,966,246 cycles # 2.894 GHz + 38,556,895,723 instructions # 3.05 insn per cycle + 4.383442789 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:12729) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -140,20 +140,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.252221e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.267695e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.267695e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.688710e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.706749e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.706749e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.997543 sec - 5,563,118,159 cycles # 2.781 GHz - 13,606,463,107 instructions # 2.45 insn per cycle - 2.009289653 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11327) (512y: 0) (512z: 0) +TOTAL : 1.897561 sec + 5,506,035,803 cycles # 2.895 GHz + 13,595,753,802 instructions # 2.47 insn per cycle + 1.914923405 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10926) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -161,26 +161,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266730409276857E-004 -Relative difference = 2.956342832710188e-07 +Avg ME (F77/C++) = 6.6266730409276836E-004 +Relative difference = 2.9563428359824236e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd1/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.355631e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.376642e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.376642e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.892225e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.917853e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.917853e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.762069 sec - 4,918,440,362 cycles # 2.785 GHz - 12,270,891,194 instructions # 2.49 insn per cycle - 1.776725149 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10143) (512y: 239) (512z: 0) +TOTAL : 1.667514 sec + 4,829,490,829 cycles # 2.889 GHz + 12,121,531,237 instructions # 2.51 insn per cycle + 1.679762918 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 9659) (512y: 76) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -188,26 +188,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd1/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266730409276857E-004 -Relative difference = 2.956342832710188e-07 +Avg ME (F77/C++) = 6.6266730409276836E-004 +Relative difference = 2.9563428359824236e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd1/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.144291e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.157521e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.157521e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.714105e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.727872e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.727872e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.305102 sec - 4,151,951,816 cycles # 1.798 GHz - 6,443,598,853 instructions # 1.55 insn per cycle - 2.315841771 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1628) (512y: 191) (512z: 9356) +TOTAL : 2.135418 sec + 4,088,243,987 cycles # 1.911 GHz + 6,289,128,463 instructions # 1.54 insn per cycle + 2.153209406 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1508) (512y: 76) (512z: 9009) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -215,8 +215,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd1/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266730409276857E-004 -Relative difference = 2.956342832710188e-07 +Avg ME (F77/C++) = 6.6266730409276836E-004 +Relative difference = 2.9563428359824236e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt index 8be70208e9..9bd9f58817 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt @@ -41,23 +41,23 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2023-11-24_14:41:33 +DATE: 2024-01-25_23:14:28 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.061812e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.062202e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.062394e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.065787e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.066184e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.066303e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 2.449920 sec - 8,136,633,317 cycles # 2.967 GHz - 17,299,512,986 instructions # 2.13 insn per cycle - 2.841850939 seconds time elapsed +TOTAL : 2.452875 sec + 8,393,388,566 cycles # 3.065 GHz + 18,463,733,578 instructions # 2.20 insn per cycle + 2.847470239 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe -p 1 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -65,17 +65,17 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ......................................................................... runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.240506e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.242687e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.242958e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.253046e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.255279e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.255484e+03 ) sec^-1 MeanMatrixElemValue = ( 1.856249e-04 +- 8.329951e-05 ) GeV^-6 -TOTAL : 3.988007 sec - 13,311,291,301 cycles # 3.086 GHz - 31,141,162,586 instructions # 2.34 insn per cycle - 4.368862794 seconds time elapsed +TOTAL : 3.992694 sec + 13,259,807,293 cycles # 3.074 GHz + 28,358,770,018 instructions # 2.14 insn per cycle + 4.372097905 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 @@ -86,20 +86,20 @@ OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.074588e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.074809e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.074809e+01 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.459684e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.459918e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.459918e+01 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 6.548778 sec - 18,889,945,267 cycles # 2.886 GHz - 53,916,181,025 instructions # 2.85 insn per cycle - 6.559444585 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:32448) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.248663 sec + 18,988,584,595 cycles # 3.040 GHz + 55,181,486,093 instructions # 2.91 insn per cycle + 6.255177410 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:44874) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -113,20 +113,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/check.exe -p 1 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.664916e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.665007e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.665007e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.644797e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.644886e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.644886e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 3.179365 sec - 9,817,296,962 cycles # 3.087 GHz - 27,093,187,078 instructions # 2.76 insn per cycle - 3.190791457 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:96443) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.219465 sec + 9,800,820,480 cycles # 3.043 GHz + 27,056,469,438 instructions # 2.76 insn per cycle + 3.238554302 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:97234) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -140,20 +140,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.619576e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.620013e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.620013e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.604560e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.605011e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.605011e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.467898 sec - 4,254,332,720 cycles # 2.899 GHz - 9,562,072,100 instructions # 2.25 insn per cycle - 1.478868560 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:84390) (512y: 0) (512z: 0) +TOTAL : 1.472881 sec + 4,237,416,434 cycles # 2.874 GHz + 9,565,677,228 instructions # 2.26 insn per cycle + 1.486737544 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:84279) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -167,20 +167,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/check.exe -p 1 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.155197e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.155766e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.155766e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.144612e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.145182e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.145182e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.277897 sec - 3,721,805,443 cycles # 2.907 GHz - 8,486,279,618 instructions # 2.28 insn per cycle - 1.292716754 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:79991) (512y: 91) (512z: 0) +TOTAL : 1.281750 sec + 3,691,573,169 cycles # 2.879 GHz + 8,451,507,089 instructions # 2.29 insn per cycle + 1.295790607 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:79441) (512y: 90) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -194,20 +194,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.734112e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.734785e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.734785e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.753048e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.753610e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.753610e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.420502 sec - 2,698,172,498 cycles # 1.898 GHz - 4,274,439,269 instructions # 1.58 insn per cycle - 1.431694585 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2284) (512y: 105) (512z:79105) +TOTAL : 1.417368 sec + 2,687,064,932 cycles # 1.895 GHz + 4,249,631,483 instructions # 1.58 insn per cycle + 1.428587401 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2166) (512y: 90) (512z:78318) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0_bridge.txt index cc408356f3..d14df13526 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0_bridge.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2023-11-24_15:14:25 +DATE: 2024-01-25_23:56:53 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -51,17 +51,17 @@ WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=256, gpublocks=1, gputhreads=256, gpublocks*gputhreads=256) WARNING! 
Set grid in Bridge (nevt=256, gpublocks=1, gputhreads=256, gpublocks*gputhreads=256) -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.059911e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.060821e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.060821e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.066232e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.067169e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.067169e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 2.369780 sec - 8,074,775,034 cycles # 3.005 GHz - 17,556,507,233 instructions # 2.17 insn per cycle - 2.745666195 seconds time elapsed +TOTAL : 2.360921 sec + 8,190,424,340 cycles # 3.057 GHz + 18,278,004,261 instructions # 2.23 insn per cycle + 2.736136962 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe -p 1 256 1 --bridge WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost @@ -77,17 +77,17 @@ WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.206968e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.239855e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.239855e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.223953e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.258483e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.258483e+03 ) sec^-1 MeanMatrixElemValue = ( 1.856249e-04 +- 8.329951e-05 ) GeV^-6 -TOTAL : 3.981224 sec - 12,936,091,202 cycles # 3.008 GHz - 29,929,520,175 instructions # 2.31 insn per cycle - 4.359803213 seconds time elapsed +TOTAL : 3.978912 sec + 13,124,273,934 cycles # 3.053 GHz + 28,270,976,701 instructions # 2.15 insn per cycle + 4.357372054 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 @@ -99,20 +99,20 @@ OK (relative difference <= 5E-3) runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=256) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.880447e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.880664e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.880664e+01 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.439754e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.439992e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.439992e+01 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 6.704716 sec - 18,908,384,031 cycles # 2.819 GHz - 53,916,851,861 instructions # 2.85 insn per cycle - 6.708873310 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:32448) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.260228 sec + 18,994,991,555 cycles # 3.033 GHz + 55,179,721,591 instructions # 2.90 insn per cycle + 6.264831927 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:44874) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -127,20 +127,20 @@ OK (relative difference <= 5E-3) runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Instantiate host Bridge (nevt=256) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.623567e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.623659e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.623659e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.667946e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.668036e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.668036e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 3.257235 sec - 9,753,060,796 cycles # 2.991 GHz - 27,093,339,492 instructions # 2.78 insn per cycle - 3.261809851 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:96443) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.170506 sec + 9,795,795,334 cycles # 3.086 GHz + 27,055,731,512 instructions # 2.76 insn per cycle + 3.175238264 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:97234) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -155,20 +155,20 @@ OK (relative difference <= 5E-3) runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=256) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.503641e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.504075e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.504075e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.606417e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.606863e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.606863e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.512954 sec - 4,263,837,997 cycles # 2.812 GHz - 9,562,348,938 instructions # 2.24 insn per cycle - 1.517455505 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:84390) (512y: 0) (512z: 0) +TOTAL : 1.470037 sec + 4,243,836,214 cycles # 2.880 GHz + 9,565,087,053 instructions # 2.25 insn per cycle + 1.474653489 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:84279) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -183,20 +183,20 @@ OK (relative difference <= 5E-3) runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Instantiate host Bridge (nevt=256) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.030228e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.030821e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.030821e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.145885e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.146454e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.146454e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.317705 sec - 3,745,717,606 cycles # 2.836 GHz - 8,485,950,351 instructions # 2.27 insn per cycle - 1.322163213 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:79991) (512y: 91) (512z: 0) +TOTAL : 1.280022 sec + 3,687,476,774 cycles # 2.873 GHz + 8,450,693,643 instructions # 2.29 insn per cycle + 1.284626579 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:79441) (512y: 90) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -211,20 +211,20 @@ OK (relative difference <= 5E-3) runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=256) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.640278e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.640943e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.640943e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.747174e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.747808e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.747808e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.457064 sec - 2,701,126,575 cycles # 1.849 GHz - 4,273,675,572 instructions # 1.58 insn per cycle - 1.461485660 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2284) (512y: 105) (512z:79105) +TOTAL : 1.414790 sec + 2,682,388,196 cycles # 1.891 GHz + 4,248,704,675 instructions # 1.58 insn per cycle + 1.419412677 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2166) (512y: 90) (512z:78318) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd1.txt index 5875d438d6..b9e0b80718 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd1.txt @@ -41,23 +41,23 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2023-11-24_14:42:37 +DATE: 2024-01-25_23:15:32 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/gcheck.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.059999e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.060392e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.060490e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.069962e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.070348e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.070466e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 2.458037 sec - 8,400,321,652 cycles # 3.063 GHz - 17,432,347,823 instructions # 2.08 insn per cycle - 2.856914697 seconds time elapsed +TOTAL : 2.452966 sec + 8,392,911,107 cycles # 3.064 GHz + 17,663,923,974 instructions # 2.10 insn per cycle + 2.847032927 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/gcheck.exe -p 1 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -65,17 +65,17 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ......................................................................... runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/gcheck.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.242262e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.244491e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.244679e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.276690e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.279063e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.279281e+03 ) sec^-1 MeanMatrixElemValue = ( 1.856249e-04 +- 8.329951e-05 ) GeV^-6 -TOTAL : 4.000355 sec - 13,346,085,971 cycles # 3.087 GHz - 30,375,563,442 instructions # 2.28 insn per cycle - 4.381330754 seconds time elapsed +TOTAL : 3.985081 sec + 13,287,476,031 cycles # 3.079 GHz + 30,163,648,592 instructions # 2.27 insn per cycle + 4.374483515 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/fgcheck.exe 2 64 2 @@ -86,20 +86,20 @@ OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.428708e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.428938e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.428938e+01 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.571320e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.571572e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.571572e+01 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 6.266392 sec - 18,760,509,477 cycles # 2.994 GHz - 53,924,653,614 instructions # 2.87 insn per cycle - 6.272821275 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:32063) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.171909 sec + 18,889,408,426 cycles # 3.061 GHz + 55,158,086,845 instructions # 2.92 insn per cycle + 6.178617455 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:44747) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -113,20 +113,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd1/check.exe -p 1 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.667610e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.667700e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.667700e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.672519e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.672641e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.672641e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 3.175383 sec - 9,846,531,791 cycles # 3.102 GHz - 27,090,667,624 instructions # 2.75 insn per cycle - 3.186565682 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:96286) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.167587 sec + 9,791,706,585 cycles # 3.092 GHz + 27,064,728,203 instructions # 2.76 insn per cycle + 3.180382611 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:97230) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -140,20 +140,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd1/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.613188e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.613622e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.613622e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.614879e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.615320e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.615320e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.470967 sec - 4,269,806,021 cycles # 2.905 GHz - 9,562,364,627 instructions # 2.24 insn per cycle - 1.483852740 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:84478) (512y: 0) (512z: 0) +TOTAL : 1.471483 sec + 4,258,635,939 cycles # 2.892 GHz + 9,569,764,673 instructions # 2.25 insn per cycle + 1.483574248 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:84249) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -167,20 +167,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd1/check.exe -p 1 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.136889e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.137474e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.137474e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.103182e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.103803e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.103803e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.283162 sec - 3,736,243,344 cycles # 2.908 GHz - 8,485,942,736 instructions # 2.27 insn per cycle - 1.303492183 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:80014) (512y: 241) (512z: 0) +TOTAL : 1.296986 sec + 3,734,622,853 cycles # 2.878 GHz + 8,454,833,333 instructions # 2.26 insn per cycle + 1.308768086 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:79386) (512y: 90) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -194,20 +194,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd1/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.757031e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.757649e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.757649e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.743925e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.744483e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.744483e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.412617 sec - 2,696,840,598 cycles # 1.907 GHz - 4,277,582,192 instructions # 1.59 insn per cycle - 1.424037126 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2169) (512y: 187) (512z:79110) +TOTAL : 1.417381 sec + 2,690,766,593 cycles # 1.895 GHz + 4,250,770,661 instructions # 1.58 insn per cycle + 1.429533524 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2130) (512y: 90) (512z:78289) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt index bdc444302c..8f44531fc7 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt @@ -41,23 +41,23 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2023-11-24_14:43:40 +DATE: 2024-01-25_23:16:35 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.755368e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.756413e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.756816e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.764615e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.765451e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.765700e+02 ) sec^-1 MeanMatrixElemValue = ( 1.186984e-05 +- 9.824899e-06 ) GeV^-6 -TOTAL : 1.683293 sec - 5,751,994,986 cycles # 2.961 GHz - 11,825,851,354 instructions # 2.06 insn per cycle - 2.048483718 seconds time elapsed +TOTAL : 1.693480 sec + 5,887,378,168 cycles # 3.045 GHz + 12,371,650,873 instructions # 2.10 insn per cycle + 2.055112195 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe -p 1 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -65,17 +65,17 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ......................................................................... runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.349504e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.350297e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.350390e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.347276e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.348063e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.348165e+04 ) sec^-1 MeanMatrixElemValue = ( 1.856829e-04 +- 8.333435e-05 ) GeV^-6 -TOTAL : 1.907896 sec - 6,721,068,256 cycles # 3.077 GHz - 14,203,232,269 instructions # 2.11 insn per cycle - 2.241134413 seconds time elapsed +TOTAL : 1.911661 sec + 6,533,877,574 cycles # 2.984 GHz + 12,896,572,372 instructions # 1.97 insn per cycle + 2.245605774 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 @@ -86,20 +86,20 @@ OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.138313e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.138596e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.138596e+01 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.267727e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.268016e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.268016e+01 ) sec^-1 MeanMatrixElemValue = ( 1.187013e-05 +- 9.825040e-06 ) GeV^-6 -TOTAL : 5.786618 sec - 17,878,554,532 cycles # 3.089 GHz - 53,588,537,371 instructions # 3.00 insn per cycle - 5.793567200 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:20208) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 5.704766 sec + 17,594,191,683 cycles # 3.083 GHz + 51,786,449,538 instructions # 2.94 insn per cycle + 5.712781183 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:27812) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -107,26 +107,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProce cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.847961e-03 -Avg ME (F77/C++) = 9.8479612087541066E-003 -Relative difference = 2.1197698286506752e-08 +Avg ME (F77/C++) = 9.8479612087330436E-003 +Relative difference = 2.119555946686223e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.595673e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.596126e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.596126e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187013e-05 +- 9.825037e-06 ) GeV^-6 -TOTAL : 1.481350 sec - 4,569,631,683 cycles # 3.086 GHz - 13,762,990,034 instructions # 3.01 insn per cycle - 1.491903170 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:96986) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 3.581700e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.582144e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.582144e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187013e-05 +- 9.825038e-06 ) GeV^-6 +TOTAL : 1.482036 sec + 4,536,858,867 cycles # 3.058 GHz + 13,760,139,415 instructions # 3.03 insn per cycle + 1.496097222 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:97762) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -134,26 +134,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProce cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.847955e-03 -Avg ME (F77/C++) = 9.8479546896527003E-003 -Relative difference = 3.151388282563952e-08 +Avg ME (F77/C++) = 9.8479546894727158E-003 +Relative difference = 3.1532159158088894e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.220064e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.221950e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.221950e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187188e-05 +- 9.826767e-06 ) GeV^-6 -TOTAL : 0.739476 sec - 2,140,174,105 cycles # 2.889 GHz - 4,817,678,891 instructions # 2.25 insn per cycle - 0.765666592 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:84904) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 7.282359e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.284137e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.284137e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187187e-05 +- 9.826763e-06 ) GeV^-6 +TOTAL : 0.733254 sec + 2,135,927,022 cycles # 2.909 GHz + 4,827,375,326 instructions # 2.26 insn per cycle + 0.746092791 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:84831) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -161,26 +161,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProce cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.892973e-03 -Avg ME (F77/C++) = 9.8929728161070551E-003 -Relative difference = 1.858823877057982e-08 +Avg ME (F77/C++) = 9.8929728159608508E-003 +Relative difference = 1.8603017364363385e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.208843e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.211063e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.211063e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187188e-05 +- 9.826767e-06 ) GeV^-6 -TOTAL : 0.650167 sec - 1,891,036,782 cycles # 2.902 GHz - 4,274,768,942 instructions # 2.26 insn per cycle - 0.663284347 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:80610) (512y: 46) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 8.194354e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.196559e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.196559e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187187e-05 +- 9.826763e-06 ) GeV^-6 +TOTAL : 0.653785 sec + 1,887,673,749 cycles # 2.887 GHz + 4,259,987,023 instructions # 2.26 insn per cycle + 0.666506032 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:80038) (512y: 46) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -188,35 +188,35 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProce cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.892973e-03 -Avg ME (F77/C++) = 9.8929728161070551E-003 -Relative difference = 1.858823877057982e-08 +Avg ME (F77/C++) = 9.8929728159608508E-003 +Relative difference = 1.8603017364363385e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.569007e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.571564e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.571564e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187188e-05 +- 9.826771e-06 ) GeV^-6 -TOTAL : 0.710143 sec - 1,360,534,299 cycles # 1.926 GHz - 2,159,690,774 instructions # 1.59 insn per cycle - 0.724501882 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2877) (512y: 49) (512z:79298) +EvtsPerSec[Rmb+ME] (23) = ( 7.532413e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.534737e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.534737e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187188e-05 +- 9.826770e-06 ) GeV^-6 +TOTAL : 0.709782 sec + 1,351,081,905 cycles # 1.902 GHz + 2,148,573,752 instructions # 1.59 insn per cycle + 0.723580289 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2820) (512y: 44) (512z:78510) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 9.892981e-03 -Avg ME (F77/C++) = 9.8929811982958280E-003 -Relative difference = 2.0044092642523172e-08 +Avg ME (C++/C++) = 9.892980e-03 +Avg ME (F77/C++) = 9.8929802670331551E-003 +Relative difference = 2.699218597469717e-08 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0_bridge.txt index b005f4754c..d30e3f5e8c 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0_bridge.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2023-11-24_15:15:29 +DATE: 2024-01-25_23:57:56 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -51,17 +51,17 @@ WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=256, gpublocks=1, gputhreads=256, gpublocks*gputhreads=256) WARNING! 
Set grid in Bridge (nevt=256, gpublocks=1, gputhreads=256, gpublocks*gputhreads=256) -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.768306e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.770186e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.770186e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.804220e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.806054e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.806054e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187094e-05 +- 9.825664e-06 ) GeV^-6 -TOTAL : 1.618405 sec - 5,667,284,995 cycles # 2.995 GHz - 11,929,859,554 instructions # 2.11 insn per cycle - 1.951873227 seconds time elapsed +TOTAL : 1.599978 sec + 5,690,698,341 cycles # 3.038 GHz + 11,869,605,294 instructions # 2.09 insn per cycle + 1.930855145 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe -p 1 256 1 --bridge WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost @@ -77,17 +77,17 @@ WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.327456e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.341794e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.341794e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.357820e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.371717e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.371717e+04 ) sec^-1 MeanMatrixElemValue = ( 1.856441e-04 +- 8.331096e-05 ) GeV^-6 -TOTAL : 1.910757 sec - 6,253,041,183 cycles # 2.847 GHz - 13,520,927,317 instructions # 2.16 insn per cycle - 2.256378130 seconds time elapsed +TOTAL : 1.854511 sec + 6,501,890,517 cycles # 3.055 GHz + 13,932,068,655 instructions # 2.14 insn per cycle + 2.184917483 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 @@ -99,20 +99,20 @@ OK (relative difference <= 5E-3) runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=256) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.875191e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.875509e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.875509e+01 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.275864e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.276185e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.276185e+01 ) sec^-1 MeanMatrixElemValue = ( 1.187013e-05 +- 9.825040e-06 ) GeV^-6 -TOTAL : 5.953864 sec - 17,981,915,231 cycles # 3.018 GHz - 53,589,180,868 instructions # 2.98 insn per cycle - 5.958154287 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:20208) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 5.697274 sec + 17,573,288,349 cycles # 3.083 GHz + 51,786,308,864 instructions # 2.95 insn per cycle + 5.702038900 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:27812) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -120,27 +120,27 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProce cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.847961e-03 -Avg ME (F77/C++) = 9.8479612087541066E-003 -Relative difference = 2.1197698286506752e-08 +Avg ME (F77/C++) = 9.8479612087330436E-003 +Relative difference = 2.119555946686223e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=256) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.525468e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.525921e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.525921e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187013e-05 +- 9.825037e-06 ) GeV^-6 -TOTAL : 1.503844 sec - 4,572,076,788 cycles # 3.033 GHz - 13,762,724,678 instructions # 3.01 insn per cycle - 1.508341488 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:96986) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 3.423474e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.423884e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.423884e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187013e-05 +- 9.825038e-06 ) GeV^-6 +TOTAL : 1.547633 sec + 4,547,457,091 cycles # 2.942 GHz + 13,762,480,283 instructions # 3.03 insn per cycle + 1.552657618 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:97762) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -148,27 +148,27 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProce cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.847955e-03 -Avg ME (F77/C++) = 9.8479546896527003E-003 -Relative difference = 3.151388282563952e-08 +Avg ME (F77/C++) = 9.8479546894727158E-003 +Relative difference = 3.1532159158088894e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=256) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.089959e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.091818e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.091818e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187188e-05 +- 9.826767e-06 ) GeV^-6 -TOTAL : 0.750792 sec - 2,139,360,697 cycles # 2.835 GHz - 4,817,322,213 instructions # 2.25 insn per cycle - 0.755255088 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:84904) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 7.199242e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.200979e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.200979e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187187e-05 +- 9.826763e-06 ) GeV^-6 +TOTAL : 0.738658 sec + 2,137,172,645 cycles # 2.879 GHz + 4,826,755,817 instructions # 2.26 insn per cycle + 0.743342702 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:84831) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -176,27 +176,27 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProce cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.892973e-03 -Avg ME (F77/C++) = 9.8929728161070551E-003 -Relative difference = 1.858823877057982e-08 +Avg ME (F77/C++) = 9.8929728159608508E-003 +Relative difference = 1.8603017364363385e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=256) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.083345e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.085639e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.085639e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187188e-05 +- 9.826767e-06 ) GeV^-6 -TOTAL : 0.658894 sec - 1,877,804,985 cycles # 2.834 GHz - 4,274,294,161 instructions # 2.28 insn per cycle - 0.663396758 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:80610) (512y: 46) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 8.219482e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.221665e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.221665e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187187e-05 +- 9.826763e-06 ) GeV^-6 +TOTAL : 0.647740 sec + 1,878,162,742 cycles # 2.884 GHz + 4,259,215,943 instructions # 2.27 insn per cycle + 0.652438590 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:80038) (512y: 46) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -204,36 +204,36 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProce cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.892973e-03 -Avg ME (F77/C++) = 9.8929728161070551E-003 -Relative difference = 1.858823877057982e-08 +Avg ME (F77/C++) = 9.8929728159608508E-003 +Relative difference = 1.8603017364363385e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=256) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.030887e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.033160e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.033160e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187188e-05 +- 9.826771e-06 ) GeV^-6 -TOTAL : 0.757777 sec - 1,359,709,959 cycles # 1.785 GHz - 2,159,410,769 instructions # 1.59 insn per cycle - 0.762416842 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2877) (512y: 49) (512z:79298) +EvtsPerSec[Rmb+ME] (23) = ( 7.494885e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.497437e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.497437e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187188e-05 +- 9.826770e-06 ) GeV^-6 +TOTAL : 0.709876 sec + 1,351,727,984 cycles # 1.894 GHz + 2,148,012,718 instructions # 1.59 insn per cycle + 0.714595281 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2820) (512y: 44) (512z:78510) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 9.892981e-03 -Avg ME (F77/C++) = 9.8929811982958280E-003 -Relative difference = 2.0044092642523172e-08 +Avg ME (C++/C++) = 9.892980e-03 +Avg ME (F77/C++) = 9.8929802670331551E-003 +Relative difference = 2.699218597469717e-08 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd1.txt index a6a593e43c..b474950e2a 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd1.txt @@ -41,23 +41,23 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2023-11-24_14:44:27 +DATE: 2024-01-25_23:17:22 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/gcheck.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.747119e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.748060e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.748340e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.767740e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.768599e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.768840e+02 ) sec^-1 MeanMatrixElemValue = ( 1.186984e-05 +- 9.824899e-06 ) GeV^-6 -TOTAL : 1.659570 sec - 5,920,808,418 cycles # 3.063 GHz - 12,044,538,664 instructions # 2.03 insn per cycle - 1.991771483 seconds time elapsed +TOTAL : 1.689539 sec + 5,904,330,680 cycles # 3.057 GHz + 12,131,099,993 instructions # 2.05 insn per cycle + 2.040870008 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/gcheck.exe -p 1 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -65,17 +65,17 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ......................................................................... runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/gcheck.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.325940e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.326739e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.326842e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.326271e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.327058e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.327155e+04 ) sec^-1 MeanMatrixElemValue = ( 1.856829e-04 +- 8.333435e-05 ) GeV^-6 -TOTAL : 1.907449 sec - 6,693,274,890 cycles # 3.061 GHz - 12,994,667,842 instructions # 1.94 insn per cycle - 2.242820084 seconds time elapsed +TOTAL : 1.919073 sec + 6,721,403,827 cycles # 3.061 GHz + 13,909,706,030 instructions # 2.07 insn per cycle + 2.252811447 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/fgcheck.exe 2 64 2 @@ -86,20 +86,20 @@ OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.023898e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.024201e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.024201e+01 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.194831e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.195117e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.195117e+01 ) sec^-1 MeanMatrixElemValue = ( 1.187013e-05 +- 9.825040e-06 ) GeV^-6 -TOTAL : 5.861507 sec - 17,865,192,522 cycles # 3.046 GHz - 53,579,122,611 instructions # 3.00 insn per cycle - 5.865565703 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:20207) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 5.760749 sec + 17,569,361,335 cycles # 3.054 GHz + 51,761,329,976 instructions # 2.95 insn per cycle + 5.767466141 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:27678) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -107,26 +107,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProce cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.847961e-03 -Avg ME (F77/C++) = 9.8479612087582491E-003 -Relative difference = 2.1198118933954545e-08 +Avg ME (F77/C++) = 9.8479612087313262E-003 +Relative difference = 2.1195385077844924e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.621823e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.622258e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.622258e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187013e-05 +- 9.825037e-06 ) GeV^-6 -TOTAL : 1.463287 sec - 4,558,245,342 cycles # 3.108 GHz - 13,754,988,539 instructions # 3.02 insn per cycle - 1.467375257 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:96606) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 3.611039e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.611537e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.611537e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187013e-05 +- 9.825038e-06 ) GeV^-6 +TOTAL : 1.472926 sec + 4,539,290,938 cycles # 3.083 GHz + 13,757,920,525 instructions # 3.03 insn per cycle + 1.488362352 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:97728) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -134,26 +134,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProce cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.847955e-03 -Avg ME (F77/C++) = 9.8479546896225560E-003 -Relative difference = 3.151694379513441e-08 +Avg ME (F77/C++) = 9.8479546894727158E-003 +Relative difference = 3.1532159158088894e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.105620e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.107481e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.107481e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187188e-05 +- 9.826767e-06 ) GeV^-6 -TOTAL : 0.747732 sec - 2,126,324,616 cycles # 2.830 GHz - 4,818,424,886 instructions # 2.27 insn per cycle - 0.751960174 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:85359) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 7.269368e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.271114e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.271114e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187187e-05 +- 9.826763e-06 ) GeV^-6 +TOTAL : 0.735827 sec + 2,125,786,473 cycles # 2.890 GHz + 4,826,595,787 instructions # 2.27 insn per cycle + 0.748208117 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:84793) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -161,26 +161,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProce cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.892973e-03 -Avg ME (F77/C++) = 9.8929728161070967E-003 -Relative difference = 1.8588234562202478e-08 +Avg ME (F77/C++) = 9.8929728159608508E-003 +Relative difference = 1.8603017364363385e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd1/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.303282e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.305517e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.305517e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187188e-05 +- 9.826767e-06 ) GeV^-6 -TOTAL : 0.641081 sec - 1,865,331,641 cycles # 2.893 GHz - 4,275,221,232 instructions # 2.29 insn per cycle - 0.645264707 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:81075) (512y: 26) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 7.835657e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.837931e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.837931e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187187e-05 +- 9.826763e-06 ) GeV^-6 +TOTAL : 0.682597 sec + 1,877,053,974 cycles # 2.749 GHz + 4,259,114,516 instructions # 2.27 insn per cycle + 0.697649288 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:79978) (512y: 46) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -188,35 +188,35 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProce cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd1/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.892973e-03 -Avg ME (F77/C++) = 9.8929728161070967E-003 -Relative difference = 1.8588234562202478e-08 +Avg ME (F77/C++) = 9.8929728159608508E-003 +Relative difference = 1.8603017364363385e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd1/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.530995e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.533488e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.533488e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187188e-05 +- 9.826771e-06 ) GeV^-6 -TOTAL : 0.706631 sec - 1,357,570,001 cycles # 1.912 GHz - 2,164,337,331 instructions # 1.59 insn per cycle - 0.710707193 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3474) (512y: 34) (512z:79492) +EvtsPerSec[Rmb+ME] (23) = ( 7.527647e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.530226e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.530226e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187188e-05 +- 9.826770e-06 ) GeV^-6 +TOTAL : 0.709194 sec + 1,355,297,540 cycles # 1.909 GHz + 2,148,050,912 instructions # 1.58 insn per cycle + 0.723531266 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2776) (512y: 44) (512z:78501) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd1/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 9.892981e-03 -Avg ME (F77/C++) = 9.8929811982955140E-003 -Relative difference = 2.0044060904369713e-08 +Avg ME (C++/C++) = 9.892980e-03 +Avg ME (F77/C++) = 9.8929802670331551E-003 +Relative difference = 2.699218597469717e-08 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt index 73f6f01a0a..8d2448ca7c 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt @@ -41,23 +41,23 @@ CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2023-11-24_14:45:14 +DATE: 2024-01-25_23:18:09 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/gcheck.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.691784e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.692281e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.692424e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.696404e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.696917e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.697059e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 2.167507 sec - 7,673,265,360 cycles # 3.083 GHz - 16,847,311,173 instructions # 2.20 insn per cycle - 2.544843357 seconds time elapsed +TOTAL : 2.203037 sec + 7,518,198,042 cycles # 3.016 GHz + 16,792,821,670 instructions # 2.23 insn per cycle + 2.605923217 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/gcheck.exe -p 1 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -65,17 +65,17 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ......................................................................... runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/gcheck.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.109611e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.109929e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.109960e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.109293e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.109610e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.109640e+04 ) sec^-1 MeanMatrixElemValue = ( 1.856249e-04 +- 8.329951e-05 ) GeV^-6 -TOTAL : 3.399118 sec - 11,512,402,509 cycles # 3.091 GHz - 25,362,588,706 instructions # 2.20 insn per cycle - 3.780006761 seconds time elapsed +TOTAL : 3.398117 sec + 11,465,669,090 cycles # 3.081 GHz + 24,712,533,321 instructions # 2.16 insn per cycle + 3.778631492 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/fgcheck.exe 2 64 2 @@ -86,20 +86,20 @@ OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.998480e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.998694e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.998694e+01 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.507104e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.507413e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.507413e+01 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 6.606791 sec - 19,173,459,975 cycles # 2.901 GHz - 54,151,769,179 instructions # 2.82 insn per cycle - 6.611030036 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:32067) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.220283 sec + 19,249,910,006 cycles # 3.096 GHz + 55,389,631,331 instructions # 2.88 insn per cycle + 6.227350768 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:44898) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -113,20 +113,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/check.exe -p 1 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.627384e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.627470e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.627470e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.636577e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.636668e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.636668e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 3.249018 sec - 9,449,828,715 cycles # 2.906 GHz - 26,158,243,948 instructions # 2.77 insn per cycle - 3.253176983 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:96007) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.235386 sec + 9,345,666,755 cycles # 2.888 GHz + 25,874,626,472 instructions # 2.77 insn per cycle + 3.246263387 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:96804) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -140,20 +140,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.818750e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.819217e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.819217e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.840519e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.841057e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.841057e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 1.387571 sec - 4,051,003,122 cycles # 2.912 GHz - 9,227,088,631 instructions # 2.28 insn per cycle - 1.391733244 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:84155) (512y: 0) (512z: 0) +TOTAL : 1.382711 sec + 3,999,150,321 cycles # 2.888 GHz + 9,120,253,087 instructions # 2.28 insn per cycle + 1.394370004 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:83820) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -167,20 +167,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd0/check.exe -p 1 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.365519e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.366141e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.366141e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.372720e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.373367e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.373367e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 1.214524 sec - 3,540,300,515 cycles # 2.907 GHz - 8,174,532,807 instructions # 2.31 insn per cycle - 1.218602300 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:79844) (512y: 79) (512z: 0) +TOTAL : 1.214906 sec + 3,512,659,413 cycles # 2.887 GHz + 8,030,227,038 instructions # 2.29 insn per cycle + 1.227544699 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:79028) (512y: 70) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -194,20 +194,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd0/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.824767e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.825436e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.825436e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.939019e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.939632e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.939632e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 1.386311 sec - 2,658,660,002 cycles # 1.913 GHz - 4,154,162,009 instructions # 1.56 insn per cycle - 1.390371345 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2046) (512y: 93) (512z:78760) +TOTAL : 1.349746 sec + 2,597,586,119 cycles # 1.922 GHz + 4,075,915,870 instructions # 1.57 insn per cycle + 1.364260117 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1903) (512y: 70) (512z:78042) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd1.txt index d4fe0b979f..0ea566946e 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd1.txt @@ -41,23 +41,23 @@ CUDACPP_BUILDDIR='build.512z_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2023-11-24_14:46:15 +DATE: 2024-01-25_23:19:10 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/gcheck.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.682422e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.682989e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.683143e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.694468e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.695070e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.695213e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 2.171969 sec - 7,673,087,653 cycles # 3.078 GHz - 17,236,869,851 instructions # 2.25 insn per cycle - 2.550325638 seconds time elapsed +TOTAL : 2.171307 sec + 7,653,950,084 cycles # 3.075 GHz + 15,514,206,763 instructions # 2.03 insn per cycle + 2.552702695 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/gcheck.exe -p 1 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -65,17 +65,17 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ......................................................................... runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/gcheck.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.106682e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.106997e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.107029e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.105262e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.105578e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.105614e+04 ) sec^-1 MeanMatrixElemValue = ( 1.856249e-04 +- 8.329951e-05 ) GeV^-6 -TOTAL : 3.401616 sec - 11,548,737,077 cycles # 3.091 GHz - 25,085,297,801 instructions # 2.17 insn per cycle - 3.792911760 seconds time elapsed +TOTAL : 3.405437 sec + 11,415,976,705 cycles # 3.063 GHz + 24,971,465,951 instructions # 2.19 insn per cycle + 3.786382890 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/fgcheck.exe 2 64 2 @@ -86,20 +86,20 @@ OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.474149e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.474398e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.474398e+01 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.498313e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.498549e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.498549e+01 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 6.238723 sec - 19,082,506,717 cycles # 3.058 GHz - 54,152,929,714 instructions # 2.84 insn per cycle - 6.242654091 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:32244) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.221756 sec + 19,207,086,158 cycles # 3.086 GHz + 55,417,722,841 instructions # 2.89 insn per cycle + 6.226328389 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:44806) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -113,20 +113,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/check.exe -p 1 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.640779e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.640877e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.640877e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.599487e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.599575e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.599575e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 3.222880 sec - 9,391,551,742 cycles # 2.911 GHz - 26,077,333,462 instructions # 2.78 insn per cycle - 3.227016085 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:95901) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.303650 sec + 9,295,885,471 cycles # 2.810 GHz + 25,822,717,776 instructions # 2.78 insn per cycle + 3.314418026 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:96765) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -140,20 +140,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.773302e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.773766e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.773766e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.826926e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.827419e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.827419e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 1.403808 sec - 4,073,842,959 cycles # 2.895 GHz - 9,213,198,457 instructions # 2.26 insn per cycle - 1.407971002 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:83776) (512y: 0) (512z: 0) +TOTAL : 1.384736 sec + 3,999,912,467 cycles # 2.881 GHz + 9,099,410,094 instructions # 2.27 insn per cycle + 1.394578954 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:83378) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -167,20 +167,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd1/check.exe -p 1 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.338283e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.338969e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.338969e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.408309e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.408938e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.408938e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 1.222714 sec - 3,541,596,074 cycles # 2.889 GHz - 8,167,599,536 instructions # 2.31 insn per cycle - 1.226930952 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:79373) (512y: 229) (512z: 0) +TOTAL : 1.203548 sec + 3,478,041,980 cycles # 2.880 GHz + 8,009,965,268 instructions # 2.30 insn per cycle + 1.214113762 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:78540) (512y: 70) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -194,20 +194,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd1/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.888294e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.888912e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.888912e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.878573e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.879207e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.879207e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 1.363883 sec - 2,618,199,272 cycles # 1.915 GHz - 4,152,923,340 instructions # 1.59 insn per cycle - 1.368029549 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1493) (512y: 175) (512z:78776) +TOTAL : 1.367393 sec + 2,595,773,085 cycles # 1.893 GHz + 4,065,040,917 instructions # 1.57 insn per cycle + 1.378860552 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1420) (512y: 70) (512z:78026) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt index 293e7e906a..190eac2ebb 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt @@ -41,23 +41,23 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2023-11-24_14:39:50 +DATE: 2024-01-25_23:12:42 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.679868e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.317378e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.685862e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.660405e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.252171e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.620562e+07 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.442326 sec - 1,985,114,674 cycles # 3.012 GHz - 2,801,190,832 instructions # 1.41 insn per cycle - 0.731797225 seconds time elapsed +TOTAL : 0.445028 sec + 1,996,271,131 cycles # 3.002 GHz + 2,802,769,990 instructions # 1.40 insn per cycle + 0.740314258 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -65,17 +65,17 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ......................................................................... runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.245024e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.095249e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.522546e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.248695e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.090608e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.502306e+07 ) sec^-1 MeanMatrixElemValue = ( 2.602505e+02 +- 2.116328e+02 ) GeV^-2 -TOTAL : 0.523645 sec - 2,298,323,986 cycles # 3.021 GHz - 3,281,368,879 instructions # 1.43 insn per cycle - 0.818048829 seconds time elapsed +TOTAL : 0.528546 sec + 2,313,689,489 cycles # 3.026 GHz + 3,291,661,091 instructions # 1.42 insn per cycle + 0.824068617 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 @@ -86,20 +86,20 @@ OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.100295e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.122827e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.122827e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.058020e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.079223e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.079223e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 1.510825 sec - 4,703,874,541 cycles # 3.106 GHz - 13,462,408,396 instructions # 2.86 insn per cycle - 1.520640560 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 860) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 1.570332 sec + 4,875,303,116 cycles # 3.097 GHz + 13,800,130,515 instructions # 2.83 insn per cycle + 1.577794250 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 1166) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -107,26 +107,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcess cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482467499484 -Relative difference = 5.286896509487005e-07 +Avg ME (F77/C++) = 0.14247482467499481 +Relative difference = 5.286896511435107e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.958580e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.032826e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.032826e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.033034e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.112454e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.112454e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.858107 sec - 2,618,495,660 cycles # 3.035 GHz - 7,553,259,233 instructions # 2.88 insn per cycle - 0.870670449 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 3099) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 0.827517 sec + 2,561,602,187 cycles # 3.079 GHz + 7,401,027,646 instructions # 2.89 insn per cycle + 0.842056783 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 2895) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -134,26 +134,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcess cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482467499478 -Relative difference = 5.28689651338321e-07 +Avg ME (F77/C++) = 0.14247482467499475 +Relative difference = 5.286896515331313e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.413914e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.637783e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.637783e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.412853e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.634825e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.634825e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.501429 sec - 1,476,425,844 cycles # 2.921 GHz - 3,120,626,738 instructions # 2.11 insn per cycle - 0.516624124 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2917) (512y: 0) (512z: 0) +TOTAL : 0.501571 sec + 1,470,508,906 cycles # 2.907 GHz + 3,136,855,793 instructions # 2.13 insn per cycle + 0.517188475 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2890) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -167,20 +167,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.779052e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.050087e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.050087e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.868351e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.162767e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.162767e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.454903 sec - 1,341,062,737 cycles # 2.922 GHz - 2,982,346,892 instructions # 2.22 insn per cycle - 0.471560702 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2694) (512y: 104) (512z: 0) +TOTAL : 0.445204 sec + 1,306,234,962 cycles # 2.906 GHz + 2,923,536,957 instructions # 2.24 insn per cycle + 0.456330410 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2543) (512y: 93) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -194,20 +194,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.570252e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.695598e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.695598e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.674321e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.813205e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.813205e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.661079 sec - 1,328,110,043 cycles # 1.997 GHz - 1,954,412,178 instructions # 1.47 insn per cycle - 0.674908188 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1372) (512y: 106) (512z: 2173) +TOTAL : 0.636437 sec + 1,265,525,690 cycles # 1.974 GHz + 1,899,766,661 instructions # 1.50 insn per cycle + 0.650714621 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1135) (512y: 62) (512z: 2165) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0_bridge.txt index d85e5ad544..dcc832b3ed 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0_bridge.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2023-11-24_15:12:44 +DATE: 2024-01-25_23:55:12 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -51,17 +51,17 @@ WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) WARNING! 
Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.546178e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.153075e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.153075e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.666168e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.155134e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.155134e+07 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.474399 sec - 1,996,543,423 cycles # 2.914 GHz - 2,976,710,186 instructions # 1.49 insn per cycle - 0.743100653 seconds time elapsed +TOTAL : 0.470776 sec + 2,054,044,928 cycles # 2.998 GHz + 3,046,852,590 instructions # 1.48 insn per cycle + 0.744657602 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost @@ -77,17 +77,17 @@ WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.225202e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.276966e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.276966e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.291770e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.278677e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.278677e+07 ) sec^-1 MeanMatrixElemValue = ( 2.602505e+02 +- 2.116328e+02 ) GeV^-2 -TOTAL : 0.757945 sec - 2,951,779,199 cycles # 2.926 GHz - 4,535,994,570 instructions # 1.54 insn per cycle - 1.067442306 seconds time elapsed +TOTAL : 0.751856 sec + 3,008,653,088 cycles # 3.008 GHz + 4,577,018,873 instructions # 1.52 insn per cycle + 1.059330773 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 @@ -99,20 +99,20 @@ OK (relative difference <= 5E-3) runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.063685e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.086099e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.086099e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.050038e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.071441e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.071441e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 1.568160 sec - 4,739,058,277 cycles # 3.015 GHz - 13,467,544,042 instructions # 2.84 insn per cycle - 1.572567564 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 860) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 1.587680 sec + 4,905,788,321 cycles # 3.082 GHz + 13,807,026,481 instructions # 2.81 insn per cycle + 1.592794582 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 1166) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -120,27 +120,27 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcess cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482467499484 -Relative difference = 5.286896509487005e-07 +Avg ME (F77/C++) = 0.14247482467499481 +Relative difference = 5.286896511435107e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.915883e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.989942e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.989942e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.015012e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.094006e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.094006e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.884700 sec - 2,669,440,783 cycles # 3.003 GHz - 7,602,375,116 instructions # 2.85 insn per cycle - 0.889666740 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 3099) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 0.842101 sec + 2,593,834,262 cycles # 3.065 GHz + 7,449,872,135 instructions # 2.87 insn per cycle + 0.847204065 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 2895) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -148,27 +148,27 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcess cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482467499478 -Relative difference = 5.28689651338321e-07 +Avg ME (F77/C++) = 0.14247482467499475 +Relative difference = 5.286896515331313e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.224315e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.444185e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.444185e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.388938e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.615019e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.615019e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.539421 sec - 1,532,896,663 cycles # 2.819 GHz - 3,168,696,255 instructions # 2.07 insn per cycle - 0.544640536 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2917) (512y: 0) (512z: 0) +TOTAL : 0.512487 sec + 1,503,564,380 cycles # 2.909 GHz + 3,186,663,979 instructions # 2.12 insn per cycle + 0.517541584 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2890) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -183,20 +183,20 @@ OK (relative difference <= 5E-3) runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.619993e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.881959e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.881959e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.812650e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.096476e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.096476e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.481010 sec - 1,371,011,914 cycles # 2.828 GHz - 3,030,736,025 instructions # 2.21 insn per cycle - 0.485465230 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2694) (512y: 104) (512z: 0) +TOTAL : 0.458891 sec + 1,341,696,556 cycles # 2.900 GHz + 2,973,096,768 instructions # 2.22 insn per cycle + 0.464005147 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2543) (512y: 93) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -211,20 +211,20 @@ OK (relative difference <= 5E-3) runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.415824e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.530854e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.530854e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.676739e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.814197e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.814197e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.708748 sec - 1,355,688,416 cycles # 1.903 GHz - 1,991,539,116 instructions # 1.47 insn per cycle - 0.713346181 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1372) (512y: 106) (512z: 2173) +TOTAL : 0.642150 sec + 1,294,550,001 cycles # 2.004 GHz + 1,936,842,258 instructions # 1.50 insn per cycle + 0.647156528 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1135) (512y: 62) (512z: 2165) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd1.txt index 46adce5ba7..c7ee729841 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd1.txt @@ -41,23 +41,23 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2023-11-24_14:40:07 +DATE: 2024-01-25_23:13:00 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/gcheck.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.664367e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.214901e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.575772e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.661110e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.191439e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.542241e+07 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.441294 sec - 1,975,560,580 cycles # 3.010 GHz - 2,812,494,086 instructions # 1.42 insn per cycle - 0.721240835 seconds time elapsed +TOTAL : 0.447354 sec + 1,962,226,451 cycles # 2.919 GHz + 2,807,157,696 instructions # 1.43 insn per cycle + 0.740877930 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/gcheck.exe -p 64 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -65,17 +65,17 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ......................................................................... runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.215656e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.999841e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.407602e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.268430e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.027340e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.439288e+07 ) sec^-1 MeanMatrixElemValue = ( 2.602505e+02 +- 2.116328e+02 ) GeV^-2 -TOTAL : 0.523997 sec - 2,293,718,680 cycles # 3.009 GHz - 3,268,283,620 instructions # 1.42 insn per cycle - 0.819195532 seconds time elapsed +TOTAL : 0.527954 sec + 2,295,849,167 cycles # 2.992 GHz + 3,276,865,701 instructions # 1.43 insn per cycle + 0.824712620 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/fgcheck.exe 2 64 2 @@ -86,20 +86,20 @@ OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.074061e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.096252e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.096252e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.058942e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.080298e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.080298e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 1.547162 sec - 4,714,230,759 cycles # 3.040 GHz - 13,457,176,123 instructions # 2.85 insn per cycle - 1.553898636 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 849) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 1.568275 sec + 4,868,810,341 cycles # 3.097 GHz + 13,807,135,682 instructions # 2.84 insn per cycle + 1.575280536 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 1161) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -107,26 +107,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcess cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482467499484 -Relative difference = 5.286896509487005e-07 +Avg ME (F77/C++) = 0.14247482467499481 +Relative difference = 5.286896511435107e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.993758e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.068724e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.068724e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.039912e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.118197e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.118197e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.842619 sec - 2,622,041,010 cycles # 3.096 GHz - 7,552,388,899 instructions # 2.88 insn per cycle - 0.855874832 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 3092) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 0.824293 sec + 2,558,754,803 cycles # 3.088 GHz + 7,406,533,816 instructions # 2.89 insn per cycle + 0.837018986 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 2892) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -134,26 +134,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcess cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd1/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482467499478 -Relative difference = 5.28689651338321e-07 +Avg ME (F77/C++) = 0.14247482467499475 +Relative difference = 5.286896515331313e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.386860e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.608787e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.608787e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.376928e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.600038e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.600038e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.505002 sec - 1,482,553,157 cycles # 2.917 GHz - 3,119,022,546 instructions # 2.10 insn per cycle - 0.516023532 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2900) (512y: 0) (512z: 0) +TOTAL : 0.507142 sec + 1,477,740,773 cycles # 2.889 GHz + 3,137,433,888 instructions # 2.12 insn per cycle + 0.518098986 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2875) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -167,20 +167,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.786208e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.060798e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.060798e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.840459e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.127687e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.127687e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.453959 sec - 1,338,340,378 cycles # 2.921 GHz - 2,979,731,487 instructions # 2.23 insn per cycle - 0.465473907 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2670) (512y: 104) (512z: 0) +TOTAL : 0.448212 sec + 1,306,019,831 cycles # 2.885 GHz + 2,925,357,996 instructions # 2.24 insn per cycle + 0.463433458 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2527) (512y: 93) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -194,20 +194,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.592500e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.720014e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.720014e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.679673e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.821382e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.821382e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.655376 sec - 1,326,766,373 cycles # 2.012 GHz - 1,952,310,196 instructions # 1.47 insn per cycle - 0.669844706 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1348) (512y: 106) (512z: 2173) +TOTAL : 0.635102 sec + 1,265,111,451 cycles # 1.978 GHz + 1,899,601,810 instructions # 1.50 insn per cycle + 0.647364895 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1118) (512y: 62) (512z: 2165) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt index 271711c5ee..c7d9af7104 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt @@ -41,23 +41,23 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2023-11-24_14:40:25 +DATE: 2024-01-25_23:13:18 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.342241e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.209752e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.345410e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.458768e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.221749e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.363529e+08 ) sec^-1 MeanMatrixElemValue = ( 2.018174e+01 +- 1.429492e+01 ) GeV^-2 -TOTAL : 0.439415 sec - 1,959,210,302 cycles # 2.998 GHz - 2,778,383,612 instructions # 1.42 insn per cycle - 0.742350357 seconds time elapsed +TOTAL : 0.437838 sec + 1,957,049,768 cycles # 2.996 GHz + 2,769,299,997 instructions # 1.42 insn per cycle + 0.724675379 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 167 @@ -65,17 +65,17 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ......................................................................... runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.198954e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.802596e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.953593e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.255427e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.817568e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.965420e+08 ) sec^-1 MeanMatrixElemValue = ( 2.571361e+02 +- 2.114021e+02 ) GeV^-2 -TOTAL : 0.476253 sec - 2,095,785,186 cycles # 2.994 GHz - 2,918,085,144 instructions # 1.39 insn per cycle - 0.759290235 seconds time elapsed +TOTAL : 0.480738 sec + 2,090,911,516 cycles # 2.954 GHz + 2,970,490,097 instructions # 1.42 insn per cycle + 0.765001287 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 @@ -86,20 +86,20 @@ OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.156035e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.182076e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.182076e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018563e+01 +- 1.429903e+01 ) GeV^-2 -TOTAL : 1.437672 sec - 4,454,268,487 cycles # 3.090 GHz - 13,048,356,445 instructions # 2.93 insn per cycle - 1.444257610 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 745) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.184289e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.211974e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.211974e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018563e+01 +- 1.429902e+01 ) GeV^-2 +TOTAL : 1.403480 sec + 4,337,769,666 cycles # 3.083 GHz + 12,596,317,616 instructions # 2.90 insn per cycle + 1.410558942 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 773) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -107,26 +107,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcess cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424686e-01 -Avg ME (F77/C++) = 0.14246857540270419 -Relative difference = 1.7265064590569047e-07 +Avg ME (F77/C++) = 0.14246860569653919 +Relative difference = 3.998452420257791e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.036696e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.233511e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.233511e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.321037e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.548912e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.548912e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018563e+01 +- 1.429902e+01 ) GeV^-2 -TOTAL : 0.559663 sec - 1,702,113,504 cycles # 3.019 GHz - 4,513,068,480 instructions # 2.65 insn per cycle - 0.574904754 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 3600) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 0.513015 sec + 1,589,233,522 cycles # 3.072 GHz + 4,246,299,525 instructions # 2.67 insn per cycle + 0.528221565 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 3265) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -134,26 +134,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcess cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424686e-01 -Avg ME (F77/C++) = 0.14246859631675157 -Relative difference = 2.5853054135974944e-08 +Avg ME (F77/C++) = 0.14246860808920836 +Relative difference = 5.677888572434963e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.119769e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.899660e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.899660e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.129047e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.905203e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.905203e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2 -TOTAL : 0.287512 sec - 851,994,706 cycles # 2.923 GHz - 1,897,184,726 instructions # 2.23 insn per cycle - 0.299631508 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3491) (512y: 0) (512z: 0) +TOTAL : 0.287410 sec + 849,542,990 cycles # 2.910 GHz + 1,915,700,197 instructions # 2.25 insn per cycle + 0.298653599 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3488) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -161,26 +161,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcess cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247489318272599 -Relative difference = 4.784894739577799e-08 +Avg ME (F77/C++) = 0.14247490815036912 +Relative difference = 5.7205649062398515e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.144841e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.980131e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.980131e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.711177e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.657175e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.657175e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2 -TOTAL : 0.287863 sec - 803,737,999 cycles # 2.749 GHz - 1,820,191,781 instructions # 2.26 insn per cycle - 0.309536059 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3335) (512y: 22) (512z: 0) +TOTAL : 0.264085 sec + 779,082,359 cycles # 2.903 GHz + 1,797,571,945 instructions # 2.31 insn per cycle + 0.277627025 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3186) (512y: 15) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -188,26 +188,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcess cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247489318272599 -Relative difference = 4.784894739577799e-08 +Avg ME (F77/C++) = 0.14247490815036912 +Relative difference = 5.7205649062398515e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.909283e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.426697e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.426697e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.019451e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.552029e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.552029e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018829e+01 +- 1.429922e+01 ) GeV^-2 -TOTAL : 0.356091 sec - 735,639,321 cycles # 2.041 GHz - 1,305,394,838 instructions # 1.77 insn per cycle - 0.370547154 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1968) (512y: 32) (512z: 2383) +TOTAL : 0.348741 sec + 718,379,002 cycles # 2.035 GHz + 1,287,825,813 instructions # 1.79 insn per cycle + 0.361391527 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1730) (512y: 24) (512z: 2387) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -215,8 +215,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcess cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247489383243206 -Relative difference = 4.32888033512879e-08 +Avg ME (F77/C++) = 0.14247490450137867 +Relative difference = 3.159418737238044e-08 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0_bridge.txt index 9afdfb410c..b3562e3bb2 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0_bridge.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2023-11-24_15:13:02 +DATE: 2024-01-25_23:55:30 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -51,17 +51,17 @@ WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) WARNING! 
Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.454543e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.985559e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.985559e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.713802e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.058372e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.058372e+07 ) sec^-1 MeanMatrixElemValue = ( 2.017654e+01 +- 1.429184e+01 ) GeV^-2 -TOTAL : 0.454607 sec - 1,923,330,877 cycles # 2.893 GHz - 2,869,826,397 instructions # 1.49 insn per cycle - 0.721979990 seconds time elapsed +TOTAL : 0.450219 sec + 1,998,063,586 cycles # 3.001 GHz + 2,936,698,123 instructions # 1.47 insn per cycle + 0.725182400 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost @@ -77,17 +77,17 @@ WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.025147e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.561466e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.561466e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.246442e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.591827e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.591827e+07 ) sec^-1 MeanMatrixElemValue = ( 2.609942e+02 +- 2.115590e+02 ) GeV^-2 -TOTAL : 0.624986 sec - 2,513,731,167 cycles # 2.923 GHz - 3,800,964,888 instructions # 1.51 insn per cycle - 0.917368993 seconds time elapsed +TOTAL : 0.614457 sec + 2,546,512,744 cycles # 3.012 GHz + 3,859,457,418 instructions # 1.52 insn per cycle + 0.903348815 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 @@ -99,20 +99,20 @@ OK (relative difference <= 5E-3) runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.088101e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.112561e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.112561e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018563e+01 +- 1.429903e+01 ) GeV^-2 -TOTAL : 1.529985 sec - 4,473,562,903 cycles # 2.917 GHz - 13,053,120,792 instructions # 2.92 insn per cycle - 1.534351278 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 745) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.185133e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.213037e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.213037e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018563e+01 +- 1.429902e+01 ) GeV^-2 +TOTAL : 1.405427 sec + 4,352,574,944 cycles # 3.089 GHz + 12,601,058,264 instructions # 2.90 insn per cycle + 1.410092171 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 773) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -120,27 +120,27 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcess cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424686e-01 -Avg ME (F77/C++) = 0.14246857540270419 -Relative difference = 1.7265064590569047e-07 +Avg ME (F77/C++) = 0.14246860569653919 +Relative difference = 3.998452420257791e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.988941e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.180615e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.180615e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.310454e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.536044e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.536044e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018563e+01 +- 1.429902e+01 ) GeV^-2 -TOTAL : 0.573010 sec - 1,718,819,301 cycles # 2.980 GHz - 4,560,439,401 instructions # 2.65 insn per cycle - 0.577426659 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 3600) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 0.518604 sec + 1,608,829,031 cycles # 3.079 GHz + 4,293,781,748 instructions # 2.67 insn per cycle + 0.523462288 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 3265) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -148,27 +148,27 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcess cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424686e-01 -Avg ME (F77/C++) = 0.14246859631675157 -Relative difference = 2.5853054135974944e-08 +Avg ME (F77/C++) = 0.14246860808920836 +Relative difference = 5.677888572434963e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.851561e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.587029e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.587029e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.559224e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.260897e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.260897e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2 -TOTAL : 0.304354 sec - 870,497,984 cycles # 2.826 GHz - 1,933,292,389 instructions # 2.22 insn per cycle - 0.308728297 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3491) (512y: 0) (512z: 0) +TOTAL : 0.321312 sec + 876,133,160 cycles # 2.693 GHz + 1,951,948,663 instructions # 2.23 insn per cycle + 0.326491770 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3488) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -176,27 +176,27 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcess cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247489318272599 -Relative difference = 4.784894739577799e-08 +Avg ME (F77/C++) = 0.14247490815036912 +Relative difference = 5.7205649062398515e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.204206e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.044292e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.044292e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.592091e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.515294e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.515294e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2 -TOTAL : 0.288758 sec - 819,562,722 cycles # 2.802 GHz - 1,856,394,583 instructions # 2.27 insn per cycle - 0.293079680 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3335) (512y: 22) (512z: 0) +TOTAL : 0.272487 sec + 796,404,987 cycles # 2.881 GHz + 1,834,071,112 instructions # 2.30 insn per cycle + 0.277371242 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3186) (512y: 15) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -204,27 +204,27 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcess cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247489318272599 -Relative difference = 4.784894739577799e-08 +Avg ME (F77/C++) = 0.14247490815036912 +Relative difference = 5.7205649062398515e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.631074e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.091279e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.091279e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.062069e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.591594e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.591594e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018829e+01 +- 1.429922e+01 ) GeV^-2 -TOTAL : 0.380781 sec - 752,756,759 cycles # 1.958 GHz - 1,346,012,282 instructions # 1.79 insn per cycle - 0.385182072 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1968) (512y: 32) (512z: 2383) +TOTAL : 0.349762 sec + 735,881,550 cycles # 2.080 GHz + 1,329,099,019 instructions # 1.81 insn per cycle + 0.354618508 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1730) (512y: 24) (512z: 2387) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -232,8 +232,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcess cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247489383243206 -Relative difference = 4.32888033512879e-08 +Avg ME (F77/C++) = 0.14247490450137867 +Relative difference = 3.159418737238044e-08 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd1.txt index 8032039f3c..6336756753 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd1.txt @@ -41,23 +41,23 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2023-11-24_14:40:42 +DATE: 2024-01-25_23:13:35 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/gcheck.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.261838e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.178248e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.310022e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.370025e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.209260e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.344655e+08 ) sec^-1 MeanMatrixElemValue = ( 2.018174e+01 +- 1.429492e+01 ) GeV^-2 -TOTAL : 0.437409 sec - 1,963,259,812 cycles # 3.015 GHz - 2,775,580,273 instructions # 1.41 insn per cycle - 0.717773083 seconds time elapsed +TOTAL : 0.438443 sec + 1,955,337,784 cycles # 2.997 GHz + 2,770,402,702 instructions # 1.42 insn per cycle + 0.721946226 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/gcheck.exe -p 64 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 167 @@ -65,17 +65,17 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ......................................................................... runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.175724e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.774857e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.916629e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.164313e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.773172e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.912434e+08 ) sec^-1 MeanMatrixElemValue = ( 2.571361e+02 +- 2.114021e+02 ) GeV^-2 -TOTAL : 0.475371 sec - 2,078,846,085 cycles # 2.960 GHz - 2,876,831,626 instructions # 1.38 insn per cycle - 0.759711014 seconds time elapsed +TOTAL : 0.474687 sec + 2,104,150,807 cycles # 2.998 GHz + 2,986,785,400 instructions # 1.42 insn per cycle + 0.760875003 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/fgcheck.exe 2 64 2 @@ -86,20 +86,20 @@ OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.164325e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.190669e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.190669e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018563e+01 +- 1.429903e+01 ) GeV^-2 -TOTAL : 1.426803 sec - 4,442,867,476 cycles # 3.106 GHz - 13,028,535,311 instructions # 2.93 insn per cycle - 1.433195231 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 727) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.186035e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.213662e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.213662e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018563e+01 +- 1.429902e+01 ) GeV^-2 +TOTAL : 1.401162 sec + 4,337,750,504 cycles # 3.088 GHz + 12,587,552,850 instructions # 2.90 insn per cycle + 1.407863482 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 759) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -107,26 +107,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcess cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424686e-01 -Avg ME (F77/C++) = 0.14246857540270419 -Relative difference = 1.7265064590569047e-07 +Avg ME (F77/C++) = 0.14246860569653919 +Relative difference = 3.998452420257791e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.121532e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.320092e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.320092e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.324278e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.556123e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.556123e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018563e+01 +- 1.429902e+01 ) GeV^-2 -TOTAL : 0.543798 sec - 1,695,653,127 cycles # 3.095 GHz - 4,509,334,487 instructions # 2.66 insn per cycle - 0.556325987 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 3588) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 0.512613 sec + 1,585,533,498 cycles # 3.068 GHz + 4,241,255,263 instructions # 2.67 insn per cycle + 0.528427282 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 3248) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -134,26 +134,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcess cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd1/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424686e-01 -Avg ME (F77/C++) = 0.14246859631675157 -Relative difference = 2.5853054135974944e-08 +Avg ME (F77/C++) = 0.14246860808920836 +Relative difference = 5.677888572434963e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.122817e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.890273e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.890273e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.124568e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.922590e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.922590e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2 -TOTAL : 0.287226 sec - 852,393,433 cycles # 2.926 GHz - 1,893,938,143 instructions # 2.22 insn per cycle - 0.301768010 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3461) (512y: 0) (512z: 0) +TOTAL : 0.287324 sec + 845,792,185 cycles # 2.901 GHz + 1,913,732,633 instructions # 2.26 insn per cycle + 0.297586352 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3463) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -161,26 +161,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcess cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd1/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247489318272599 -Relative difference = 4.784894739577799e-08 +Avg ME (F77/C++) = 0.14247490815036912 +Relative difference = 5.7205649062398515e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.582926e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.478799e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.478799e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.758792e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.702266e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.702266e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2 -TOTAL : 0.268312 sec - 800,191,089 cycles # 2.937 GHz - 1,816,120,065 instructions # 2.27 insn per cycle - 0.281438188 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3298) (512y: 22) (512z: 0) +TOTAL : 0.261995 sec + 775,948,492 cycles # 2.911 GHz + 1,795,856,064 instructions # 2.31 insn per cycle + 0.275868058 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3164) (512y: 15) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -188,26 +188,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcess cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd1/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247489318272599 -Relative difference = 4.784894739577799e-08 +Avg ME (F77/C++) = 0.14247490815036912 +Relative difference = 5.7205649062398515e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.957580e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.456912e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.456912e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.998817e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.512870e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.512870e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018829e+01 +- 1.429922e+01 ) GeV^-2 -TOTAL : 0.351938 sec - 734,061,618 cycles # 2.062 GHz - 1,302,951,487 instructions # 1.77 insn per cycle - 0.362652515 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1931) (512y: 32) (512z: 2383) +TOTAL : 0.349663 sec + 716,382,396 cycles # 2.024 GHz + 1,286,477,839 instructions # 1.80 insn per cycle + 0.363209331 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1709) (512y: 24) (512z: 2387) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -215,8 +215,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcess cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd1/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247489383243206 -Relative difference = 4.32888033512879e-08 +Avg ME (F77/C++) = 0.14247490450137867 +Relative difference = 3.159418737238044e-08 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt index 1f5cde87e5..ab23ae8079 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt @@ -41,23 +41,23 @@ CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2023-11-24_14:40:58 +DATE: 2024-01-25_23:13:52 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/gcheck.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.593637e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.233261e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.638999e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.673057e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.328050e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.712475e+07 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.449003 sec - 1,949,950,502 cycles # 2.943 GHz - 2,732,368,699 instructions # 1.40 insn per cycle - 0.743857131 seconds time elapsed +TOTAL : 0.442690 sec + 1,984,783,264 cycles # 3.005 GHz + 2,793,428,342 instructions # 1.41 insn per cycle + 0.734598388 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/gcheck.exe -p 64 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -65,17 +65,17 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ......................................................................... runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.250012e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.124195e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.543411e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.268703e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.159984e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.588880e+07 ) sec^-1 MeanMatrixElemValue = ( 2.602505e+02 +- 2.116328e+02 ) GeV^-2 -TOTAL : 0.532370 sec - 2,306,545,413 cycles # 3.002 GHz - 3,257,931,207 instructions # 1.41 insn per cycle - 0.827800812 seconds time elapsed +TOTAL : 0.529813 sec + 2,295,133,443 cycles # 3.001 GHz + 3,278,190,514 instructions # 1.43 insn per cycle + 0.824550304 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/fgcheck.exe 2 64 2 @@ -86,20 +86,20 @@ OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.090450e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.112908e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.112908e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.925762e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.013036e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.013036e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 1.524359 sec - 4,736,940,349 cycles # 3.101 GHz - 13,465,176,292 instructions # 2.84 insn per cycle - 1.531673516 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 840) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 1.674491 sec + 4,902,858,967 cycles # 2.922 GHz + 13,824,814,632 instructions # 2.82 insn per cycle + 1.682421523 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 1135) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -113,20 +113,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.007098e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.083721e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.083721e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.985930e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.061989e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.061989e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.837584 sec - 2,604,823,117 cycles # 3.095 GHz - 7,385,606,203 instructions # 2.84 insn per cycle - 0.848813750 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 3073) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 0.846811 sec + 2,593,448,297 cycles # 3.047 GHz + 7,349,153,746 instructions # 2.83 insn per cycle + 0.862989336 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 2967) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -140,20 +140,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.452149e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.684416e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.684416e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.415408e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.638451e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.638451e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.496103 sec - 1,465,319,733 cycles # 2.929 GHz - 3,056,071,451 instructions # 2.09 insn per cycle - 0.510799326 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3013) (512y: 0) (512z: 0) +TOTAL : 0.501362 sec + 1,467,201,166 cycles # 2.901 GHz + 3,084,117,020 instructions # 2.10 insn per cycle + 0.513174865 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3008) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -167,20 +167,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.858923e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.141930e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.141930e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.707680e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.990043e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.990043e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.446088 sec - 1,306,981,940 cycles # 2.902 GHz - 2,931,113,574 instructions # 2.24 insn per cycle - 0.461243420 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2799) (512y: 110) (512z: 0) +TOTAL : 0.464354 sec + 1,278,148,137 cycles # 2.726 GHz + 2,873,006,643 instructions # 2.25 insn per cycle + 0.476021326 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2653) (512y: 96) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -194,20 +194,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.504343e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.619127e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.619127e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.601061e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.728349e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.728349e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.677792 sec - 1,365,651,157 cycles # 2.003 GHz - 1,970,379,541 instructions # 1.44 insn per cycle - 0.689595363 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1700) (512y: 114) (512z: 2171) +TOTAL : 0.653676 sec + 1,303,858,267 cycles # 1.981 GHz + 1,914,918,168 instructions # 1.47 insn per cycle + 0.665372239 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1493) (512y: 70) (512z: 2164) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd1.txt index a8d85dd2f3..a6c3b7ce72 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd1.txt @@ -41,23 +41,23 @@ CUDACPP_BUILDDIR='build.512z_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2023-11-24_14:41:16 +DATE: 2024-01-25_23:14:10 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/gcheck.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.654184e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.189821e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.540640e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.648457e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.178614e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.521473e+07 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.444065 sec - 2,018,867,268 cycles # 3.015 GHz - 2,835,456,943 instructions # 1.40 insn per cycle - 0.738016347 seconds time elapsed +TOTAL : 0.444086 sec + 1,991,914,568 cycles # 2.998 GHz + 2,768,343,966 instructions # 1.39 insn per cycle + 0.739601436 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/gcheck.exe -p 64 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -65,17 +65,17 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ......................................................................... runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.223829e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.984831e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.384004e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.227832e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.974511e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.384988e+07 ) sec^-1 MeanMatrixElemValue = ( 2.602505e+02 +- 2.116328e+02 ) GeV^-2 -TOTAL : 0.536961 sec - 2,246,742,038 cycles # 2.906 GHz - 3,169,759,663 instructions # 1.41 insn per cycle - 0.832088924 seconds time elapsed +TOTAL : 0.526758 sec + 2,298,196,221 cycles # 2.993 GHz + 3,278,717,607 instructions # 1.43 insn per cycle + 0.825350264 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/fgcheck.exe 2 64 2 @@ -86,20 +86,20 @@ OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.089936e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.112348e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.112348e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.051083e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.072032e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.072032e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 1.524608 sec - 4,732,046,885 cycles # 3.096 GHz - 13,451,186,768 instructions # 2.84 insn per cycle - 1.531974912 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 827) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 1.580057 sec + 4,897,834,610 cycles # 3.092 GHz + 13,831,057,972 instructions # 2.82 insn per cycle + 1.587486400 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 1130) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -113,20 +113,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.003818e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.078884e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.078884e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.991090e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.066032e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.066032e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.838343 sec - 2,601,051,226 cycles # 3.087 GHz - 7,389,258,616 instructions # 2.84 insn per cycle - 0.853180948 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 3062) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 0.844299 sec + 2,597,084,036 cycles # 3.060 GHz + 7,352,270,683 instructions # 2.83 insn per cycle + 0.857215046 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 2957) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -140,20 +140,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.440885e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.665503e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.665503e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.410530e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.636744e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.636744e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.497494 sec - 1,467,074,103 cycles # 2.924 GHz - 3,056,319,919 instructions # 2.08 insn per cycle - 0.512504484 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2990) (512y: 0) (512z: 0) +TOTAL : 0.502125 sec + 1,464,778,292 cycles # 2.890 GHz + 3,085,005,315 instructions # 2.11 insn per cycle + 0.516998713 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2986) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -167,20 +167,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.870867e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.157659e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.157659e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.928508e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.233519e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.233519e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.444644 sec - 1,306,078,783 cycles # 2.911 GHz - 2,931,883,300 instructions # 2.24 insn per cycle - 0.459998118 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2775) (512y: 110) (512z: 0) +TOTAL : 0.438651 sec + 1,279,923,095 cycles # 2.887 GHz + 2,874,967,565 instructions # 2.25 insn per cycle + 0.449214976 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2636) (512y: 96) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -194,20 +194,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.456308e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.570811e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.570811e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.608751e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.738501e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.738501e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.690659 sec - 1,366,599,933 cycles # 1.966 GHz - 1,970,195,836 instructions # 1.44 insn per cycle - 0.702450567 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1676) (512y: 114) (512z: 2171) +TOTAL : 0.651858 sec + 1,302,154,592 cycles # 1.984 GHz + 1,915,377,243 instructions # 1.47 insn per cycle + 0.663461748 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1476) (512y: 70) (512z: 2164) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. From 39c63725667b3bdb331e1108cdbc3f77af63e1bd Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Fri, 26 Jan 2024 11:48:16 +0100 Subject: [PATCH 45/96] [jt744] rerun 18 tmad tests, all ok STARTED AT Fri Jan 26 12:08:35 AM CET 2024 ENDED AT Fri Jan 26 04:27:21 AM CET 2024 Status=0 24 /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt 24 /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt 24 /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt 24 /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt 24 /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt 24 /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt 24 /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt 24 /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt 24 /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt 24 /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt 24 /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt 24 /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt 24 /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt 24 /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt 24 /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt 24 /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt 24 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt 24 /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt --- .../log_eemumu_mad_d_inl0_hrd0.txt | 208 ++++++++-------- .../log_eemumu_mad_f_inl0_hrd0.txt | 194 +++++++-------- .../log_eemumu_mad_m_inl0_hrd0.txt | 198 +++++++-------- .../log_ggtt_mad_d_inl0_hrd0.txt | 200 +++++++-------- .../log_ggtt_mad_f_inl0_hrd0.txt | 220 ++++++++--------- .../log_ggtt_mad_m_inl0_hrd0.txt | 218 ++++++++--------- .../log_ggttg_mad_d_inl0_hrd0.txt | 196 +++++++-------- .../log_ggttg_mad_f_inl0_hrd0.txt | 228 +++++++++--------- .../log_ggttg_mad_m_inl0_hrd0.txt | 212 ++++++++-------- .../log_ggttgg_mad_d_inl0_hrd0.txt | 214 ++++++++-------- .../log_ggttgg_mad_f_inl0_hrd0.txt | 226 ++++++++--------- .../log_ggttgg_mad_m_inl0_hrd0.txt | 208 ++++++++-------- .../log_ggttggg_mad_d_inl0_hrd0.txt | 218 ++++++++--------- .../log_ggttggg_mad_f_inl0_hrd0.txt | 226 ++++++++--------- .../log_ggttggg_mad_m_inl0_hrd0.txt | 218 ++++++++--------- .../log_gqttq_mad_d_inl0_hrd0.txt | 214 ++++++++-------- .../log_gqttq_mad_f_inl0_hrd0.txt | 212 ++++++++-------- .../log_gqttq_mad_m_inl0_hrd0.txt | 200 +++++++-------- 18 files changed, 1905 insertions(+), 1905 deletions(-) diff --git a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt index 67b7aa5182..44df599739 100644 --- a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt @@ -4,9 +4,9 @@ CUDACPP_BUILDDIR='.' make USEBUILDDIR=1 AVX=none -make USEBUILDDIR=1 AVX=sse4 - make USEBUILDDIR=1 AVX=avx2 + +make USEBUILDDIR=1 AVX=sse4 make USEBUILDDIR=1 AVX=512y make USEBUILDDIR=1 AVX=512z @@ -15,17 +15,17 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. 
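(Note on the xsec cross-checks in the tmad log hunks below: a cudacpp cross section is accepted only if it agrees with the Fortran reference to a relative tolerance of 2E-14. A minimal Python sketch of such a check follows, assuming the relative difference is computed as |cpp/fortran - 1|; the values printed in these logs for one-ULP discrepancies, 2.220446049250313e-16 and 1.1102230246251565e-16, are consistent with that form, but the function name below is hypothetical and not the actual tmad script API.)

def xsecs_match(xsec_fortran, xsec_cpp, tol=2e-14):
    # Hypothetical sketch of the tmad cross-check: the two cross sections
    # "differ by less than tol" if their relative difference is within tol.
    rel = abs(xsec_cpp / xsec_fortran - 1.0) if xsec_fortran != xsec_cpp else 0.0
    print(f"xsec from fortran ({xsec_fortran!r}) and cpp ({xsec_cpp!r}) "
          f"differ by {'less' if rel <= tol else 'more'} than 2E-14 ({rel})")
    return rel <= tol

# One-ULP difference in double precision, as in the log below: well within 2E-14.
assert xsecs_match(0.21747169064681776, 0.21747169064681779)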
@@ -33,7 +33,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2023-11-24_15:35:02 +DATE: 2024-01-26_00:15:24 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169064681776] fbridge_mode=0 [UNWEIGHT] Wrote 3893 events (found 7395 events) - [COUNTERS] PROGRAM TOTAL : 0.6482s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6399s - [COUNTERS] Fortran MEs ( 1 ) : 0.0083s for 8192 events => throughput is 9.86E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.6077s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5995s + [COUNTERS] Fortran MEs ( 1 ) : 0.0082s for 8192 events => throughput is 1.00E+06 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169064681776] fbridge_mode=0 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1914s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1828s - [COUNTERS] Fortran MEs ( 1 ) : 0.0087s for 8192 events => throughput is 9.47E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.1723s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1639s + [COUNTERS] Fortran MEs ( 1 ) : 0.0084s for 8192 events => throughput is 9.81E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -107,11 +107,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x10_fortran > /tmp/a [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.0915 [9.1501919904813656E-002] fbridge_mode=0 + [XSECTION] Cross section = 0.0915 [9.1501919904813669E-002] fbridge_mode=0 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.4504s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3577s - [COUNTERS] Fortran MEs ( 1 ) : 0.0927s for 90112 events => throughput is 9.72E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4110s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3235s + [COUNTERS] Fortran MEs ( 1 ) : 0.0875s for 90112 events => throughput is 1.03E+06 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -132,15 +132,15 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2175 [0.21747169064681776] fbridge_mode=1 + [XSECTION] Cross section = 0.2175 [0.21747169064681779] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.2021s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1952s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0069s for 8192 events => throughput is 1.19E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1830s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1762s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0069s for 8192 events => throughput is 1.20E+06 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.21747169064681776) and cpp (0.21747169064681776) differ by less than 2E-14 (0.0) +OK! 
xsec from fortran (0.21747169064681776) and cpp (0.21747169064681779) differ by less than 2E-14 (2.220446049250313e-16) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -165,29 +165,29 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.0915 [9.1501919904813656E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.0915 [9.1501919904813669E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.4477s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3711s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0766s for 90112 events => throughput is 1.18E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.4111s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3357s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0755s for 90112 events => throughput is 1.19E+06 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.1501919904813656E-002) and cpp (9.1501919904813656E-002) differ by less than 2E-14 (0.0) +OK! xsec from fortran (9.1501919904813669E-002) and cpp (9.1501919904813669E-002) differ by less than 2E-14 (0.0) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.222047e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.137053e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.231835e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.138245e+06 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -208,15 +208,15 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2175 [0.21747169064681776] fbridge_mode=1 + [XSECTION] Cross section = 0.2175 [0.21747169064681779] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1890s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1847s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0042s for 8192 events => throughput is 1.93E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1762s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1720s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0041s for 8192 events => throughput is 1.98E+06 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.21747169064681776) and cpp (0.21747169064681776) differ by less than 2E-14 (0.0) +OK! 
xsec from fortran (0.21747169064681776) and cpp (0.21747169064681779) differ by less than 2E-14 (2.220446049250313e-16) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -241,29 +241,29 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.0915 [9.1501919904813628E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.0915 [9.1501919904813656E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.3996s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3542s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0454s for 90112 events => throughput is 1.98E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3745s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3311s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0434s for 90112 events => throughput is 2.08E+06 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.1501919904813656E-002) and cpp (9.1501919904813628E-002) differ by less than 2E-14 (3.3306690738754696e-16) +OK! xsec from fortran (9.1501919904813669E-002) and cpp (9.1501919904813656E-002) differ by less than 2E-14 (1.1102230246251565e-16) *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.953284e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.954813e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.976393e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.021256e+06 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -286,9 +286,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169064681776] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1867s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1835s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0032s for 8192 events => throughput is 2.58E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1716s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1687s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0029s for 8192 events => throughput is 2.82E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -317,29 +317,29 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.0915 [9.1501919904813656E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.0915 [9.1501919904813669E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.3874s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3527s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0347s for 90112 
events => throughput is 2.60E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3639s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3313s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0326s for 90112 events => throughput is 2.77E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.1501919904813656E-002) and cpp (9.1501919904813656E-002) differ by less than 2E-14 (0.0) +OK! xsec from fortran (9.1501919904813669E-002) and cpp (9.1501919904813669E-002) differ by less than 2E-14 (0.0) *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.554304e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.643708e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.675247e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.780965e+06 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -362,9 +362,9 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169064681776] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1863s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1833s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0030s for 8192 events => throughput is 2.70E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1714s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1687s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0027s for 8192 events => throughput is 3.01E+06 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -393,29 +393,29 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.0915 [9.1501919904813656E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.0915 [9.1501919904813669E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.3840s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3511s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0329s for 90112 events => throughput is 2.74E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3612s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3308s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0304s for 90112 events => throughput is 2.96E+06 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.1501919904813656E-002) and cpp (9.1501919904813656E-002) differ by less than 2E-14 (0.0) +OK! xsec from fortran (9.1501919904813669E-002) and cpp (9.1501919904813669E-002) differ by less than 2E-14 (0.0) *** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.686020e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.919552e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.793515e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.939142e+06 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -438,9 +438,9 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169064681776] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1887s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1850s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0037s for 8192 events => throughput is 2.20E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1763s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1732s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0032s for 8192 events => throughput is 2.59E+06 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -469,29 +469,29 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.0915 [9.1501919904813656E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.0915 [9.1501919904813669E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.3954s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3543s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0411s for 90112 events => throughput is 2.19E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3723s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3369s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0354s for 90112 events => throughput is 2.54E+06 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.1501919904813656E-002) and cpp (9.1501919904813656E-002) differ by less than 2E-14 (0.0) +OK! xsec from fortran (9.1501919904813669E-002) and cpp (9.1501919904813669E-002) differ by less than 2E-14 (0.0) *** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.045622e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.350967e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.176762e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.506114e+06 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -514,9 +514,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_eemumu_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169064681776] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.6011s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6006s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0005s for 8192 events => throughput is 1.61E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.5873s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5868s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0005s for 8192 events => throughput is 1.64E+07 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -545,58 +545,58 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_eemumu_ [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.0915 [9.1501919904813628E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.0915 [9.1501919904813656E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.7869s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7816s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0052s for 90112 events => throughput is 1.73E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.7561s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7513s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0049s for 90112 events => throughput is 1.85E+07 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.1501919904813656E-002) and cpp (9.1501919904813628E-002) differ by less than 2E-14 (3.3306690738754696e-16) +OK! xsec from fortran (9.1501919904813669E-002) and cpp (9.1501919904813656E-002) differ by less than 2E-14 (1.1102230246251565e-16) *** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.145579e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.149558e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.960559e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.952056e+08 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.712501e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.699705e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.408619e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.444629e+08 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.718171e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.729024e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.071930e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.979970e+08 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.709644e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.739033e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.143512e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.144973e+08 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt 
b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt index 79af27bb3b..bbf79e30e3 100644 --- a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt @@ -2,10 +2,10 @@ Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/e CUDACPP_BUILDDIR='.' - make USEBUILDDIR=1 AVX=none - make USEBUILDDIR=1 AVX=sse4 + + make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512y @@ -15,11 +15,11 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. @@ -33,7 +33,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2023-11-24_15:35:20 +DATE: 2024-01-26_00:15:41 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum @@ -59,8 +59,8 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169064681776] fbridge_mode=0 [UNWEIGHT] Wrote 3893 events (found 7395 events) - [COUNTERS] PROGRAM TOTAL : 0.6532s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6452s + [COUNTERS] PROGRAM TOTAL : 0.6033s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5953s [COUNTERS] Fortran MEs ( 1 ) : 0.0080s for 8192 events => throughput is 1.02E+06 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169064681776] fbridge_mode=0 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1876s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1795s - [COUNTERS] Fortran MEs ( 1 ) : 0.0081s for 8192 events => throughput is 1.01E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1722s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1642s + [COUNTERS] Fortran MEs ( 1 ) : 0.0080s for 8192 events => throughput is 1.02E+06 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -107,11 +107,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x10_fortran > /tmp/a [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.0915 [9.1501919904813656E-002] fbridge_mode=0 + [XSECTION] Cross section = 0.0915 [9.1501919904813669E-002] fbridge_mode=0 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.4462s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3548s - 
[COUNTERS] Fortran MEs ( 1 ) : 0.0914s for 90112 events => throughput is 9.85E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4127s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3253s + [COUNTERS] Fortran MEs ( 1 ) : 0.0873s for 90112 events => throughput is 1.03E+06 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -132,15 +132,15 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2175 [0.21747166087172673] fbridge_mode=1 + [XSECTION] Cross section = 0.2175 [0.21747165492032638] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1980s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1915s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0065s for 8192 events => throughput is 1.26E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1860s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1794s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0066s for 8192 events => throughput is 1.24E+06 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.21747169064681776) and cpp (0.21747166087172673) differ by less than 4E-4 (1.369147908381052e-07) +OK! xsec from fortran (0.21747169064681776) and cpp (0.21747165492032638) differ by less than 4E-4 (1.6428111293542713e-07) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -165,29 +165,29 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.0915 [9.1501907796603360E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.0915 [9.1501905274264717E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.4361s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3625s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0735s for 90112 events => throughput is 1.23E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.4069s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3355s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0714s for 90112 events => throughput is 1.26E+06 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.1501919904813656E-002) and cpp (9.1501907796603360E-002) differ by less than 4E-4 (1.3232739060065057e-07) +OK! xsec from fortran (9.1501919904813669E-002) and cpp (9.1501905274264717E-002) differ by less than 4E-4 (1.5989335488963974e-07) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.253065e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.219752e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.252116e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.242681e+06 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -210,9 +210,9 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747165570339780] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1859s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1834s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0025s for 8192 events => throughput is 3.23E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1729s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1704s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0026s for 8192 events => throughput is 3.16E+06 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -243,27 +243,27 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501905322826635E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.3765s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3483s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0282s for 90112 events => throughput is 3.20E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3582s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3310s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0272s for 90112 events => throughput is 3.31E+06 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.1501919904813656E-002) and cpp (9.1501905322826635E-002) differ by less than 4E-4 (1.5936263453308896e-07) +OK! xsec from fortran (9.1501919904813669E-002) and cpp (9.1501905322826635E-002) differ by less than 4E-4 (1.5936263464411127e-07) *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.183638e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.111730e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.347573e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.395043e+06 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -286,9 +286,9 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747165593922979] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1896s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1874s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0022s for 8192 events => throughput is 3.64E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1717s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1695s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0023s for 8192 events => throughput is 3.61E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -319,27 +319,27 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501905316084181E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.3808s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3552s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0256s for 90112 events => throughput is 3.52E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3554s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3312s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0242s for 90112 events => throughput is 3.73E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.1501919904813656E-002) and cpp (9.1501905316084181E-002) differ by less than 4E-4 (1.5943632103443406e-07) +OK! xsec from fortran (9.1501919904813669E-002) and cpp (9.1501905316084181E-002) differ by less than 4E-4 (1.5943632114545636e-07) *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.438480e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.590394e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.639168e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.759019e+06 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -362,9 +362,9 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747165593922979] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1913s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1889s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0023s for 8192 events => throughput is 3.50E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1720s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1699s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0020s for 8192 events => throughput is 4.02E+06 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -395,27 +395,27 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501905316084181E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.3804s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3553s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0250s for 90112 events => throughput is 3.60E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3589s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3361s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0228s for 90112 events => throughput is 3.95E+06 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.1501919904813656E-002) and cpp (9.1501905316084181E-002) differ by less than 4E-4 (1.5943632103443406e-07) +OK! xsec from fortran (9.1501919904813669E-002) and cpp (9.1501905316084181E-002) differ by less than 4E-4 (1.5943632114545636e-07) *** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.503584e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.954153e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.715865e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.260132e+06 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -436,15 +436,15 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2175 [0.21747166440400542] fbridge_mode=1 + [XSECTION] Cross section = 0.2175 [0.21747166446533123] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1893s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1871s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0023s for 8192 events => throughput is 3.61E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1746s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1725s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0021s for 8192 events => throughput is 3.90E+06 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.21747169064681776) and cpp (0.21747166440400542) differ by less than 4E-4 (1.20672314918302e-07) +OK! xsec from fortran (0.21747169064681776) and cpp (0.21747166446533123) differ by less than 4E-4 (1.2039032049049325e-07) *** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -469,29 +469,29 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.0915 [9.1501908978565555E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.0915 [9.1501908990866423E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.3848s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3584s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0264s for 90112 events => throughput is 3.42E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3577s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3338s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0239s for 90112 events => throughput is 3.77E+06 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.1501919904813656E-002) and cpp (9.1501908978565555E-002) differ by less than 4E-4 (1.194100419654731e-07) +OK! xsec from fortran (9.1501919904813669E-002) and cpp (9.1501908990866423E-002) differ by less than 4E-4 (1.1927560927826875e-07) *** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.231359e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.760450e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.611631e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.994942e+06 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -514,8 +514,8 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_eemumu_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747166823487174] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.6022s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6017s + [COUNTERS] PROGRAM TOTAL : 0.5868s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5864s [COUNTERS] CudaCpp MEs ( 2 ) : 0.0005s for 8192 events => throughput is 1.67E+07 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -547,56 +547,56 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_eemumu_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501910542849674E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.7749s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7703s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0046s for 90112 events => throughput is 1.95E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.7531s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7484s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0047s for 90112 events => throughput is 1.93E+07 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.1501919904813656E-002) and cpp (9.1501910542849674E-002) differ by less than 4E-4 (1.0231439939722975e-07) +OK! xsec from fortran (9.1501919904813669E-002) and cpp (9.1501910542849674E-002) differ by less than 4E-4 (1.0231439961927435e-07) *** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.534234e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.617084e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.793140e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.934097e+08 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.972505e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.941825e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.054646e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.057960e+09 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.870406e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.888538e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.242224e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.240921e+09 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.358520e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.397983e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.400117e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.459127e+08 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt 
b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt index 52cbc87cca..7bcac1c93d 100644 --- a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt @@ -1,11 +1,11 @@ Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum CUDACPP_BUILDDIR='.' -make USEBUILDDIR=1 AVX=none -make USEBUILDDIR=1 AVX=sse4 +make USEBUILDDIR=1 AVX=none +make USEBUILDDIR=1 AVX=sse4 make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512y @@ -15,25 +15,25 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' -CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' -CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' OMP_NUM_THREADS= -DATE: 2023-11-24_15:35:37 +DATE: 2024-01-26_00:15:58 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169064681776] fbridge_mode=0 [UNWEIGHT] Wrote 3893 events (found 7395 events) - [COUNTERS] PROGRAM TOTAL : 0.6492s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6412s - [COUNTERS] Fortran MEs ( 1 ) : 0.0081s for 8192 events => throughput is 1.02E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.6192s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6109s + [COUNTERS] Fortran MEs ( 1 ) : 0.0082s for 8192 events => throughput is 9.95E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169064681776] fbridge_mode=0 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1853s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1770s - [COUNTERS] Fortran MEs ( 1 ) : 0.0083s for 8192 events => throughput is 9.84E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.1748s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1660s + [COUNTERS] Fortran MEs ( 1 ) : 0.0088s for 8192 events => throughput is 9.34E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -107,11 +107,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x10_fortran > /tmp/a [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.0915 [9.1501919904813656E-002] fbridge_mode=0 + [XSECTION] Cross section = 0.0915 [9.1501919904813669E-002] fbridge_mode=0 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.4357s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3463s - [COUNTERS] Fortran MEs ( 1 ) : 0.0895s for 90112 events => throughput is 1.01E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.4153s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3282s + [COUNTERS] Fortran MEs ( 1 ) : 0.0871s for 90112 events => throughput is 1.03E+06 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -132,15 +132,15 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2175 [0.21747169074211734] fbridge_mode=1 + [XSECTION] Cross section = 0.2175 [0.21747169074211736] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1959s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1891s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0069s for 8192 events => throughput is 1.19E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1832s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1762s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0069s for 8192 events => throughput is 1.18E+06 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.21747169064681776) and cpp (0.21747169074211734) differ by less than 2E-4 (4.382159080051906e-10) +OK! 
xsec from fortran (0.21747169064681776) and cpp (0.21747169074211736) differ by less than 2E-4 (4.3821613004979554e-10) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -167,27 +167,27 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501919915927155E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.4287s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3535s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0751s for 90112 events => throughput is 1.20E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.4303s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3512s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0791s for 90112 events => throughput is 1.14E+06 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.1501919904813656E-002) and cpp (9.1501919915927155E-002) differ by less than 2E-4 (1.214564004925478e-10) +OK! xsec from fortran (9.1501919904813669E-002) and cpp (9.1501919915927155E-002) differ by less than 2E-4 (1.214564004925478e-10) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.181180e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.152102e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.190793e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.149985e+06 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -208,15 +208,15 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2175 [0.21747169074211728] fbridge_mode=1 + [XSECTION] Cross section = 0.2175 [0.21747169074211734] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1904s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1863s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0041s for 8192 events => throughput is 1.99E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1761s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1721s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0039s for 8192 events => throughput is 2.09E+06 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.21747169064681776) and cpp (0.21747169074211728) differ by less than 2E-4 (4.382156859605857e-10) +OK! 
xsec from fortran (0.21747169064681776) and cpp (0.21747169074211734) differ by less than 2E-4 (4.382159080051906e-10) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -243,27 +243,27 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501919915927155E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.3979s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3536s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0443s for 90112 events => throughput is 2.03E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3747s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3322s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0425s for 90112 events => throughput is 2.12E+06 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.1501919904813656E-002) and cpp (9.1501919915927155E-002) differ by less than 2E-4 (1.214564004925478e-10) +OK! xsec from fortran (9.1501919904813669E-002) and cpp (9.1501919915927155E-002) differ by less than 2E-4 (1.214564004925478e-10) *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.044048e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.061680e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.969679e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.075365e+06 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -286,9 +286,9 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169063975949] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1875s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1844s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0031s for 8192 events => throughput is 2.61E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1751s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1719s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0033s for 8192 events => throughput is 2.50E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -319,27 +319,27 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501919908700741E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.3864s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3521s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0343s for 90112 events => throughput is 2.63E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3650s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3310s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0340s for 90112 events => throughput is 2.65E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec 
*** -OK! xsec from fortran (9.1501919904813656E-002) and cpp (9.1501919908700741E-002) differ by less than 2E-4 (4.2480907680442215e-11) +OK! xsec from fortran (9.1501919904813669E-002) and cpp (9.1501919908700741E-002) differ by less than 2E-4 (4.248068563583729e-11) *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.597342e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.542279e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.692233e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.701939e+06 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -362,9 +362,9 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169063975949] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1873s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1842s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0030s for 8192 events => throughput is 2.69E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1739s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1710s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0029s for 8192 events => throughput is 2.80E+06 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -395,27 +395,27 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501919908700741E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.3865s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3532s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0333s for 90112 events => throughput is 2.71E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3634s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3322s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0312s for 90112 events => throughput is 2.89E+06 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.1501919904813656E-002) and cpp (9.1501919908700741E-002) differ by less than 2E-4 (4.2480907680442215e-11) +OK! xsec from fortran (9.1501919904813669E-002) and cpp (9.1501919908700741E-002) differ by less than 2E-4 (4.248068563583729e-11) *** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.726998e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.678475e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.851400e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.985776e+06 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -438,9 +438,9 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169063975949] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1875s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1837s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0037s for 8192 events => throughput is 2.19E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1747s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1715s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0032s for 8192 events => throughput is 2.60E+06 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -471,27 +471,27 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501919908700741E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.3941s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3550s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0391s for 90112 events => throughput is 2.31E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3706s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3353s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0353s for 90112 events => throughput is 2.55E+06 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.1501919904813656E-002) and cpp (9.1501919908700741E-002) differ by less than 2E-4 (4.2480907680442215e-11) +OK! xsec from fortran (9.1501919904813669E-002) and cpp (9.1501919908700741E-002) differ by less than 2E-4 (4.248068563583729e-11) *** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.197385e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.336735e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.341038e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.551256e+06 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -514,9 +514,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_eemumu_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169066587257] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.6091s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6086s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0005s for 8192 events => throughput is 1.65E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.5877s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5872s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0005s for 8192 events => throughput is 1.61E+07 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -547,56 +547,56 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_eemumu_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501919911173610E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.7703s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7654s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0049s for 90112 events => throughput is 1.84E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.7586s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7537s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0049s for 90112 events => throughput is 1.83E+07 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.1501919904813656E-002) and cpp (9.1501919911173610E-002) differ by less than 2E-4 (6.95061785904727e-11) +OK! xsec from fortran (9.1501919904813669E-002) and cpp (9.1501919911173610E-002) differ by less than 2E-4 (6.95061785904727e-11) *** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.964035e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.202928e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.893973e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.960653e+08 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.636276e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.733664e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.478323e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.450045e+08 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.697383e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.728190e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.585042e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.031495e+08 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.701325e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.704272e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.163459e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.151802e+08 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt 
b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt index 99bc7401b1..56f6811ac2 100644 --- a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt @@ -15,25 +15,25 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' OMP_NUM_THREADS= -DATE: 2023-11-24_15:35:54 +DATE: 2024-01-26_00:16:16 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/aval [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690708277600116] fbridge_mode=0 [UNWEIGHT] Wrote 420 events (found 1577 events) - [COUNTERS] PROGRAM TOTAL : 0.3685s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3268s - [COUNTERS] Fortran MEs ( 1 ) : 0.0417s for 8192 events => throughput is 1.97E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3633s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3222s + [COUNTERS] Fortran MEs ( 1 ) : 0.0411s for 8192 events => throughput is 1.99E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/aval [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690708277600116] fbridge_mode=0 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.3181s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2769s - [COUNTERS] Fortran MEs ( 1 ) : 0.0412s for 8192 events => throughput is 1.99E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3109s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2699s + [COUNTERS] Fortran MEs ( 1 ) : 0.0410s for 8192 events => throughput is 2.00E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -107,11 +107,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x10_fortran > 
/tmp/ava [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 46.22 [46.223782291775365] fbridge_mode=0 + [XSECTION] Cross section = 46.22 [46.223782291775372] fbridge_mode=0 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.7190s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2578s - [COUNTERS] Fortran MEs ( 1 ) : 0.4612s for 90112 events => throughput is 1.95E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.7399s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2919s + [COUNTERS] Fortran MEs ( 1 ) : 0.4481s for 90112 events => throughput is 2.01E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -134,9 +134,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690708277600102] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.3623s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3233s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0391s for 8192 events => throughput is 2.10E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3452s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3086s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0366s for 8192 events => throughput is 2.24E+05 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -165,29 +165,29 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 46.22 [46.223782291775372] fbridge_mode=1 + [XSECTION] Cross section = 46.22 [46.223782291775379] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.7507s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3257s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.4250s for 90112 events => throughput is 2.12E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.7204s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3188s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.4016s for 90112 events => throughput is 2.24E+05 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (46.223782291775365) and cpp (46.223782291775372) differ by less than 2E-14 (2.220446049250313e-16) +OK! xsec from fortran (46.223782291775372) and cpp (46.223782291775379) differ by less than 2E-14 (2.220446049250313e-16) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.132082e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.108786e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.162141e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.114532e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -208,15 +208,15 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.69 [47.690708277600102] fbridge_mode=1 + [XSECTION] Cross section = 47.69 [47.690708277600109] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.3276s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3055s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0221s for 8192 events => throughput is 3.70E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3157s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2946s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0211s for 8192 events => throughput is 3.88E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.690708277600116) and cpp (47.690708277600102) differ by less than 2E-14 (3.3306690738754696e-16) +OK! xsec from fortran (47.690708277600116) and cpp (47.690708277600109) differ by less than 2E-14 (1.1102230246251565e-16) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -243,27 +243,27 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 46.22 [46.223782291775379] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.5423s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2971s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2451s for 90112 events => throughput is 3.68E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.5307s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2994s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2313s for 90112 events => throughput is 3.90E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (46.223782291775365) and cpp (46.223782291775379) differ by less than 2E-14 (2.220446049250313e-16) +OK! xsec from fortran (46.223782291775372) and cpp (46.223782291775379) differ by less than 2E-14 (2.220446049250313e-16) *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.538888e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.834431e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.694418e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.792482e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -286,9 +286,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690708277600109] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.3041s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2904s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0136s for 8192 events => throughput is 6.02E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3007s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2877s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0130s for 8192 events => throughput is 6.31E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -317,29 +317,29 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 46.22 [46.223782291775386] fbridge_mode=1 + [XSECTION] Cross section = 46.22 [46.223782291775393] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.4320s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2832s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1488s for 90112 events => throughput is 6.05E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.4410s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2952s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1458s for 90112 events => throughput is 6.18E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (46.223782291775365) and cpp (46.223782291775386) differ by less than 2E-14 (4.440892098500626e-16) +OK! xsec from fortran (46.223782291775372) and cpp (46.223782291775393) differ by less than 2E-14 (4.440892098500626e-16) *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.020066e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.990625e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.978626e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.045420e+05 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -362,9 +362,9 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690708277600109] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.3050s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2930s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0119s for 8192 events => throughput is 6.86E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2945s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2834s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0112s for 8192 events => throughput is 7.32E+05 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -393,29 +393,29 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 46.22 [46.223782291775386] fbridge_mode=1 + [XSECTION] Cross section = 46.22 [46.223782291775393] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.4144s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2795s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1349s for 90112 events => throughput is 6.68E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.4155s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2915s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1240s for 90112 events => throughput is 7.27E+05 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (46.223782291775365) and cpp (46.223782291775386) differ by less than 2E-14 (4.440892098500626e-16) +OK! xsec from fortran (46.223782291775372) and cpp (46.223782291775393) differ by less than 2E-14 (4.440892098500626e-16) *** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.660341e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.054793e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.696167e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.092964e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -438,9 +438,9 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690708277600109] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.3346s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3127s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0220s for 8192 events => throughput is 3.73E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3090s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2913s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0177s for 8192 events => throughput is 4.63E+05 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -469,29 +469,29 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 46.22 [46.223782291775386] fbridge_mode=1 + [XSECTION] Cross section = 46.22 [46.223782291775393] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.5228s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2980s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2249s for 90112 events => throughput is 4.01E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.4912s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2990s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1922s for 90112 events => throughput is 4.69E+05 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (46.223782291775365) and cpp (46.223782291775386) differ by less than 2E-14 (4.440892098500626e-16) +OK! xsec from fortran (46.223782291775372) and cpp (46.223782291775393) differ by less than 2E-14 (4.440892098500626e-16) *** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.916207e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.673944e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.910036e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.586634e+05 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -514,9 +514,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690708277600109] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.7194s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7188s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0006s for 8192 events => throughput is 1.41E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.6952s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6946s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0006s for 8192 events => throughput is 1.46E+07 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -545,58 +545,58 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 46.22 [46.223782291775386] fbridge_mode=1 + [XSECTION] Cross section = 46.22 [46.223782291775393] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.6992s - [COUNTERS] Fortran Overhead ( 0 ) : 1.6928s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0064s for 90112 events => throughput is 1.41E+07 events/s + [COUNTERS] PROGRAM TOTAL : 1.7071s + [COUNTERS] Fortran Overhead ( 0 ) : 1.7008s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0064s for 90112 events => throughput is 1.42E+07 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (46.223782291775365) and cpp (46.223782291775386) differ by less than 2E-14 (4.440892098500626e-16) +OK! xsec from fortran (46.223782291775372) and cpp (46.223782291775393) differ by less than 2E-14 (4.440892098500626e-16) *** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.057278e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.042409e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.698426e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.702115e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.013072e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.012892e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.074632e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.068226e+08 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.996049e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.025873e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.152448e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.153095e+08 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.011908e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.985581e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.029442e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.067889e+07 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt index 61d36152df..9213f67fa2 100644 --- 
a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt @@ -1,11 +1,11 @@ Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx CUDACPP_BUILDDIR='.' -make USEBUILDDIR=1 AVX=none +make USEBUILDDIR=1 AVX=none +make USEBUILDDIR=1 AVX=sse4 -make USEBUILDDIR=1 AVX=sse4 make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512y @@ -15,12 +15,12 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. +CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' @@ -33,7 +33,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2023-11-24_15:36:21 +DATE: 2024-01-26_00:16:42 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/aval [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690708277600116] fbridge_mode=0 [UNWEIGHT] Wrote 420 events (found 1577 events) - [COUNTERS] PROGRAM TOTAL : 0.3714s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3296s - [COUNTERS] Fortran MEs ( 1 ) : 0.0418s for 8192 events => throughput is 1.96E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3598s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3172s + [COUNTERS] Fortran MEs ( 1 ) : 0.0426s for 8192 events => throughput is 1.92E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/aval [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690708277600116] fbridge_mode=0 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.3233s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2806s - [COUNTERS] Fortran MEs ( 1 ) : 0.0427s for 8192 events => throughput is 1.92E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3118s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2712s + [COUNTERS] Fortran MEs ( 1 ) : 0.0406s for 8192 events => throughput is 2.02E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -107,11 +107,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x10_fortran > /tmp/ava [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 46.22 [46.223782291775365] fbridge_mode=0 + [XSECTION] Cross section = 46.22 [46.223782291775372] fbridge_mode=0 
[UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.7540s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2823s - [COUNTERS] Fortran MEs ( 1 ) : 0.4717s for 90112 events => throughput is 1.91E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.7504s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2981s + [COUNTERS] Fortran MEs ( 1 ) : 0.4524s for 90112 events => throughput is 1.99E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -132,15 +132,15 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.69 [47.690706767555099] fbridge_mode=1 + [XSECTION] Cross section = 47.69 [47.690703999052587] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.3524s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3167s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0356s for 8192 events => throughput is 2.30E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3540s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3175s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0365s for 8192 events => throughput is 2.24E+05 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.690708277600116) and cpp (47.690706767555099) differ by less than 4E-4 (3.1663296096162696e-08) +OK! xsec from fortran (47.690708277600116) and cpp (47.690703999052587) differ by less than 4E-4 (8.971448917094449e-08) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -165,29 +165,29 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 46.22 [46.223782605295497] fbridge_mode=1 + [XSECTION] Cross section = 46.22 [46.223780103711483] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.6953s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3044s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3909s for 90112 events => throughput is 2.31E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.7108s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3324s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3784s for 90112 events => throughput is 2.38E+05 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (46.223782291775365) and cpp (46.223782605295497) differ by less than 4E-4 (6.782658656945273e-09) +OK! xsec from fortran (46.223782291775372) and cpp (46.223780103711483) differ by less than 4E-4 (4.733632297249102e-08) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.343511e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.393568e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.350968e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.399982e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -208,15 +208,15 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.69 [47.690702885183541] fbridge_mode=1 + [XSECTION] Cross section = 47.69 [47.690699958440689] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.3092s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2943s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0149s for 8192 events => throughput is 5.49E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3034s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2896s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0138s for 8192 events => throughput is 5.92E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.690708277600116) and cpp (47.690702885183541) differ by less than 4E-4 (1.1307059111231865e-07) +OK! xsec from fortran (47.690708277600116) and cpp (47.690699958440689) differ by less than 4E-4 (1.744398380187917e-07) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -241,29 +241,29 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 46.22 [46.223778858016772] fbridge_mode=1 + [XSECTION] Cross section = 46.22 [46.223776162337749] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.4474s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2803s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1671s for 90112 events => throughput is 5.39E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.4532s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2989s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1542s for 90112 events => throughput is 5.84E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (46.223782291775365) and cpp (46.223778858016772) differ by less than 4E-4 (7.428553927546488e-08) +OK! xsec from fortran (46.223782291775372) and cpp (46.223776162337749) differ by less than 4E-4 (1.326035499182865e-07) *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.223130e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.691481e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.234366e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.737589e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -284,15 +284,15 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.69 [47.690694374060818] fbridge_mode=1 + [XSECTION] Cross section = 47.69 [47.690691653203835] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.2951s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2868s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0083s for 8192 events => throughput is 9.86E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2902s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2825s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0076s for 8192 events => throughput is 1.08E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.690708277600116) and cpp (47.690694374060818) differ by less than 4E-4 (2.9153560099359765e-07) +OK! xsec from fortran (47.690708277600116) and cpp (47.690691653203835) differ by less than 4E-4 (3.48587741338946e-07) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -317,29 +317,29 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 46.22 [46.223775951815753] fbridge_mode=1 + [XSECTION] Cross section = 46.22 [46.223773576247488] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.3687s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2793s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0894s for 90112 events => throughput is 1.01E+06 events/s + [COUNTERS] PROGRAM TOTAL : 1.3754s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2908s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0846s for 90112 events => throughput is 1.07E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (46.223782291775365) and cpp (46.223775951815753) differ by less than 4E-4 (1.3715795843527445e-07) +OK! xsec from fortran (46.223782291775372) and cpp (46.223773576247488) differ by less than 4E-4 (1.885507298071687e-07) *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.987231e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.029810e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.011425e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.032448e+06 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -360,15 +360,15 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.69 [47.690694374060818] fbridge_mode=1 + [XSECTION] Cross section = 47.69 [47.690691653203835] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.2935s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2860s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0075s for 8192 events => throughput is 1.10E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.2868s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2800s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0068s for 8192 events => throughput is 1.20E+06 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.690708277600116) and cpp (47.690694374060818) differ by less than 4E-4 (2.9153560099359765e-07) +OK! xsec from fortran (47.690708277600116) and cpp (47.690691653203835) differ by less than 4E-4 (3.48587741338946e-07) *** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -393,29 +393,29 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 46.22 [46.223775951815753] fbridge_mode=1 + [XSECTION] Cross section = 46.22 [46.223773576247488] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.3676s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2860s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0816s for 90112 events => throughput is 1.10E+06 events/s + [COUNTERS] PROGRAM TOTAL : 1.4669s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3835s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0835s for 90112 events => throughput is 1.08E+06 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (46.223782291775365) and cpp (46.223775951815753) differ by less than 4E-4 (1.3715795843527445e-07) +OK! xsec from fortran (46.223782291775372) and cpp (46.223773576247488) differ by less than 4E-4 (1.885507298071687e-07) *** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.067146e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.125495e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.108316e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.131924e+06 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -436,15 +436,15 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.69 [47.690698914467276] fbridge_mode=1 + [XSECTION] Cross section = 47.69 [47.690698822141186] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.3028s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2920s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0108s for 8192 events => throughput is 7.58E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2972s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2876s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0096s for 8192 events => throughput is 8.51E+05 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.690708277600116) and cpp (47.690698914467276) differ by less than 4E-4 (1.9633033720989346e-07) +OK! xsec from fortran (47.690708277600116) and cpp (47.690698822141186) differ by less than 4E-4 (1.982662718447159e-07) *** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -469,29 +469,29 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 46.22 [46.223780273983500] fbridge_mode=1 + [XSECTION] Cross section = 46.22 [46.223780266165058] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.4045s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2883s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1163s for 90112 events => throughput is 7.75E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.3978s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2918s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1060s for 90112 events => throughput is 8.50E+05 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (46.223782291775365) and cpp (46.223780273983500) differ by less than 4E-4 (4.3652677583772004e-08) +OK! xsec from fortran (46.223782291775372) and cpp (46.223780266165058) differ by less than 4E-4 (4.382182106077437e-08) *** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.363750e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.016243e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.388485e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.906782e+05 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -512,15 +512,15 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.69 [47.690703397697980] fbridge_mode=1 + [XSECTION] Cross section = 47.69 [47.690703397697987] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.7047s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7042s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0005s for 8192 events => throughput is 1.50E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.6945s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6940s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0005s for 8192 events => throughput is 1.49E+07 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.690708277600116) and cpp (47.690703397697980) differ by less than 4E-4 (1.0232396019382861e-07) +OK! xsec from fortran (47.690708277600116) and cpp (47.690703397697987) differ by less than 4E-4 (1.0232396008280631e-07) *** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -547,56 +547,56 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 46.22 [46.223786763175951] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.7059s - [COUNTERS] Fortran Overhead ( 0 ) : 1.7003s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0055s for 90112 events => throughput is 1.63E+07 events/s + [COUNTERS] PROGRAM TOTAL : 1.6975s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6921s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0055s for 90112 events => throughput is 1.65E+07 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (46.223782291775365) and cpp (46.223786763175951) differ by less than 4E-4 (9.673376699659286e-08) +OK! xsec from fortran (46.223782291775372) and cpp (46.223786763175951) differ by less than 4E-4 (9.673376677454826e-08) *** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.180729e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.316183e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.252831e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.020050e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.767543e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.761145e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.772780e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.782582e+08 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.790178e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.827139e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.842637e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.877247e+08 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.336409e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.384483e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.382414e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.425158e+07 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt index 0673f7e59b..9f96fafdb5 100644 --- 
a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt @@ -1,12 +1,12 @@ Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx CUDACPP_BUILDDIR='.' -make USEBUILDDIR=1 AVX=none +make USEBUILDDIR=1 AVX=none +make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=sse4 -make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512y make USEBUILDDIR=1 AVX=512z @@ -16,14 +16,14 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' -CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. @@ -33,7 +33,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2023-11-24_15:36:46 +DATE: 2024-01-26_00:17:08 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/aval [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690708277600116] fbridge_mode=0 [UNWEIGHT] Wrote 420 events (found 1577 events) - [COUNTERS] PROGRAM TOTAL : 0.3604s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3188s - [COUNTERS] Fortran MEs ( 1 ) : 0.0416s for 8192 events => throughput is 1.97E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3516s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3106s + [COUNTERS] Fortran MEs ( 1 ) : 0.0411s for 8192 events => throughput is 1.99E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/aval [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690708277600116] fbridge_mode=0 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.3181s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2759s - [COUNTERS] Fortran MEs ( 1 ) : 0.0422s for 8192 events => throughput is 1.94E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3117s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2709s + [COUNTERS] Fortran MEs ( 1 ) : 0.0409s for 8192 events => throughput is 2.01E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -107,11 +107,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x10_fortran > /tmp/ava [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] 
ChannelId = 1 - [XSECTION] Cross section = 46.22 [46.223782291775365] fbridge_mode=0 + [XSECTION] Cross section = 46.22 [46.223782291775372] fbridge_mode=0 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.7030s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2436s - [COUNTERS] Fortran MEs ( 1 ) : 0.4593s for 90112 events => throughput is 1.96E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.7545s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3018s + [COUNTERS] Fortran MEs ( 1 ) : 0.4527s for 90112 events => throughput is 1.99E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -132,15 +132,15 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.69 [47.690709601032026] fbridge_mode=1 + [XSECTION] Cross section = 47.69 [47.690709601032019] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.3545s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3161s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0384s for 8192 events => throughput is 2.13E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3463s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3097s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0366s for 8192 events => throughput is 2.24E+05 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.690708277600116) and cpp (47.690709601032026) differ by less than 2E-4 (2.7750309383733907e-08) +OK! xsec from fortran (47.690708277600116) and cpp (47.690709601032019) differ by less than 2E-4 (2.77503091616893e-08) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -165,29 +165,29 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 46.22 [46.223783635280988] fbridge_mode=1 + [XSECTION] Cross section = 46.22 [46.223783635280974] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.7327s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3085s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.4242s for 90112 events => throughput is 2.12E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.7234s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3159s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.4075s for 90112 events => throughput is 2.21E+05 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (46.223782291775365) and cpp (46.223783635280988) differ by less than 2E-4 (2.9065246431869696e-08) +OK! xsec from fortran (46.223782291775372) and cpp (46.223783635280974) differ by less than 2E-4 (2.9065245987780486e-08) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.135635e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.206337e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.125893e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.189630e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -210,9 +210,9 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690709601032026] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.3220s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3000s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0221s for 8192 events => throughput is 3.71E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3187s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2965s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0222s for 8192 events => throughput is 3.70E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -241,29 +241,29 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 46.22 [46.223783635280988] fbridge_mode=1 + [XSECTION] Cross section = 46.22 [46.223783635280974] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.5189s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2816s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2374s for 90112 events => throughput is 3.80E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.5255s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2996s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2260s for 90112 events => throughput is 3.99E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (46.223782291775365) and cpp (46.223783635280988) differ by less than 2E-4 (2.9065246431869696e-08) +OK! xsec from fortran (46.223782291775372) and cpp (46.223783635280974) differ by less than 2E-4 (2.9065245987780486e-08) *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.761231e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.918959e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.742706e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.840099e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -284,15 +284,15 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.69 [47.690709681138244] fbridge_mode=1 + [XSECTION] Cross section = 47.69 [47.690709643441508] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.3057s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2927s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0130s for 8192 events => throughput is 6.28E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2968s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2837s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0131s for 8192 events => throughput is 6.24E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.690708277600116) and cpp (47.690709681138244) differ by less than 2E-4 (2.9430012205011735e-08) +OK! xsec from fortran (47.690708277600116) and cpp (47.690709643441508) differ by less than 2E-4 (2.863957027088304e-08) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -317,29 +317,29 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 46.22 [46.223783652032040] fbridge_mode=1 + [XSECTION] Cross section = 46.22 [46.223783660238851] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.4221s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2763s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1458s for 90112 events => throughput is 6.18E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.4387s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2942s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1446s for 90112 events => throughput is 6.23E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (46.223782291775365) and cpp (46.223783652032040) differ by less than 2E-4 (2.9427636771828247e-08) +OK! xsec from fortran (46.223782291775372) and cpp (46.223783660238851) differ by less than 2E-4 (2.9605181861569463e-08) *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.050162e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.099865e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.113895e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.108305e+05 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -360,15 +360,15 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.69 [47.690709681138244] fbridge_mode=1 + [XSECTION] Cross section = 47.69 [47.690709643441508] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.3016s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2899s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0118s for 8192 events => throughput is 6.95E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2943s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2829s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0114s for 8192 events => throughput is 7.21E+05 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.690708277600116) and cpp (47.690709681138244) differ by less than 2E-4 (2.9430012205011735e-08) +OK! xsec from fortran (47.690708277600116) and cpp (47.690709643441508) differ by less than 2E-4 (2.863957027088304e-08) *** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -393,29 +393,29 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 46.22 [46.223783652032040] fbridge_mode=1 + [XSECTION] Cross section = 46.22 [46.223783660238851] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.4299s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2973s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1326s for 90112 events => throughput is 6.80E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.4171s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2934s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1237s for 90112 events => throughput is 7.28E+05 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (46.223782291775365) and cpp (46.223783652032040) differ by less than 2E-4 (2.9427636771828247e-08) +OK! xsec from fortran (46.223782291775372) and cpp (46.223783660238851) differ by less than 2E-4 (2.9605181861569463e-08) *** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.791940e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.031927e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.001376e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.243269e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -436,15 +436,15 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.69 [47.690709681138244] fbridge_mode=1 + [XSECTION] Cross section = 47.69 [47.690709643441508] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.3188s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2994s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0195s for 8192 events => throughput is 4.21E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3125s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2951s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0174s for 8192 events => throughput is 4.70E+05 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.690708277600116) and cpp (47.690709681138244) differ by less than 2E-4 (2.9430012205011735e-08) +OK! xsec from fortran (47.690708277600116) and cpp (47.690709643441508) differ by less than 2E-4 (2.863957027088304e-08) *** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -469,29 +469,29 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 46.22 [46.223783652032040] fbridge_mode=1 + [XSECTION] Cross section = 46.22 [46.223783660238851] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.5212s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3007s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2205s for 90112 events => throughput is 4.09E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.5025s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3112s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1913s for 90112 events => throughput is 4.71E+05 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (46.223782291775365) and cpp (46.223783652032040) differ by less than 2E-4 (2.9427636771828247e-08) +OK! xsec from fortran (46.223782291775372) and cpp (46.223783660238851) differ by less than 2E-4 (2.9605181861569463e-08) *** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.985174e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.533015e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.068643e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.733679e+05 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -512,15 +512,15 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.69 [47.690708266690699] fbridge_mode=1 + [XSECTION] Cross section = 47.69 [47.690708266690706] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.7071s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7066s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0006s for 8192 events => throughput is 1.40E+07 events/s + [COUNTERS] PROGRAM TOTAL : 1.0435s + [COUNTERS] Fortran Overhead ( 0 ) : 1.0430s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0006s for 8192 events => throughput is 1.45E+07 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.690708277600116) and cpp (47.690708266690699) differ by less than 2E-4 (2.2875357164053867e-10) +OK! xsec from fortran (47.690708277600116) and cpp (47.690708266690706) differ by less than 2E-4 (2.2875334959593374e-10) *** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -547,56 +547,56 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 46.22 [46.223782303744791] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.7018s - [COUNTERS] Fortran Overhead ( 0 ) : 1.6953s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0065s for 90112 events => throughput is 1.39E+07 events/s + [COUNTERS] PROGRAM TOTAL : 1.7112s + [COUNTERS] Fortran Overhead ( 0 ) : 1.7048s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0064s for 90112 events => throughput is 1.42E+07 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (46.223782291775365) and cpp (46.223782303744791) differ by less than 2E-4 (2.5894508759449764e-10) +OK! xsec from fortran (46.223782291775372) and cpp (46.223782303744791) differ by less than 2E-4 (2.5894508759449764e-10) *** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.014529e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.076384e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.676393e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.666577e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.016903e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.000653e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.060987e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.068544e+08 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.006671e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.006931e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.140193e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.138201e+08 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.002803e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.024897e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.981810e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.036764e+07 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt index 1e8a82a6de..7b087a9357 100644 --- 
a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt @@ -1,9 +1,9 @@ Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg CUDACPP_BUILDDIR='.' -make USEBUILDDIR=1 AVX=none +make USEBUILDDIR=1 AVX=none make USEBUILDDIR=1 AVX=sse4 make USEBUILDDIR=1 AVX=avx2 @@ -16,14 +16,14 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. @@ -33,7 +33,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2023-11-24_15:37:12 +DATE: 2024-01-26_00:17:34 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0972 [9.7196357922470764E-002] fbridge_mode=0 [UNWEIGHT] Wrote 42 events (found 469 events) - [COUNTERS] PROGRAM TOTAL : 0.5593s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2322s - [COUNTERS] Fortran MEs ( 1 ) : 0.3271s for 8192 events => throughput is 2.50E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.5536s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2268s + [COUNTERS] Fortran MEs ( 1 ) : 0.3268s for 8192 events => throughput is 2.51E+04 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0972 [9.7196357922470764E-002] fbridge_mode=0 [UNWEIGHT] Wrote 41 events (found 467 events) - [COUNTERS] PROGRAM TOTAL : 0.5556s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2267s - [COUNTERS] Fortran MEs ( 1 ) : 0.3290s for 8192 events => throughput is 2.49E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.5482s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2237s + [COUNTERS] Fortran MEs ( 1 ) : 0.3244s for 8192 events => throughput is 2.52E+04 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -107,11 +107,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x10_fortran > /tmp/av [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.08131 
[8.1310872077655555E-002] fbridge_mode=0 + [XSECTION] Cross section = 0.08131 [8.1310872077655569E-002] fbridge_mode=0 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 5.0132s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4373s - [COUNTERS] Fortran MEs ( 1 ) : 3.5759s for 90112 events => throughput is 2.52E+04 events/s + [COUNTERS] PROGRAM TOTAL : 5.0349s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4462s + [COUNTERS] Fortran MEs ( 1 ) : 3.5888s for 90112 events => throughput is 2.51E+04 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -132,15 +132,15 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.0972 [9.7196357922470791E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.0972 [9.7196357922470777E-002] fbridge_mode=1 [UNWEIGHT] Wrote 41 events (found 467 events) - [COUNTERS] PROGRAM TOTAL : 0.8869s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5534s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3335s for 8192 events => throughput is 2.46E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.8619s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5404s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3215s for 8192 events => throughput is 2.55E+04 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.7196357922470764E-002) and cpp (9.7196357922470791E-002) differ by less than 2E-14 (2.220446049250313e-16) +OK! xsec from fortran (9.7196357922470764E-002) and cpp (9.7196357922470777E-002) differ by less than 2E-14 (2.220446049250313e-16) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -167,27 +167,27 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.08131 [8.1310872077655597E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 5.5033s - [COUNTERS] Fortran Overhead ( 0 ) : 1.7676s - [COUNTERS] CudaCpp MEs ( 2 ) : 3.7357s for 90112 events => throughput is 2.41E+04 events/s + [COUNTERS] PROGRAM TOTAL : 5.3489s + [COUNTERS] Fortran Overhead ( 0 ) : 1.8053s + [COUNTERS] CudaCpp MEs ( 2 ) : 3.5436s for 90112 events => throughput is 2.54E+04 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (8.1310872077655555E-002) and cpp (8.1310872077655597E-002) differ by less than 2E-14 (4.440892098500626e-16) +OK! xsec from fortran (8.1310872077655569E-002) and cpp (8.1310872077655597E-002) differ by less than 2E-14 (4.440892098500626e-16) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.519174e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.641138e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.514864e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.627463e+04 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -210,9 +210,9 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0972 [9.7196357922470777E-002] fbridge_mode=1 [UNWEIGHT] Wrote 41 events (found 467 events) - [COUNTERS] PROGRAM TOTAL : 0.5689s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3958s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1731s for 8192 events => throughput is 4.73E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.5508s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3861s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1647s for 8192 events => throughput is 4.97E+04 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -243,27 +243,27 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.08131 [8.1310872077655555E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 3.5161s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5979s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.9182s for 90112 events => throughput is 4.70E+04 events/s + [COUNTERS] PROGRAM TOTAL : 3.4428s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6330s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.8098s for 90112 events => throughput is 4.98E+04 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (8.1310872077655555E-002) and cpp (8.1310872077655555E-002) differ by less than 2E-14 (0.0) +OK! xsec from fortran (8.1310872077655569E-002) and cpp (8.1310872077655555E-002) differ by less than 2E-14 (2.220446049250313e-16) *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.818399e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.058143e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.781347e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.038892e+04 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -286,9 +286,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0972 [9.7196357922470750E-002] fbridge_mode=1 [UNWEIGHT] Wrote 41 events (found 467 events) - [COUNTERS] PROGRAM TOTAL : 0.3971s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3094s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0877s for 8192 events => throughput is 9.34E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.3925s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3083s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0842s for 8192 events => throughput is 9.73E+04 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -317,29 +317,29 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.08131 [8.1310872077655541E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.08131 [8.1310872077655555E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 2.4605s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5052s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.9553s for 90112 events => throughput is 9.43E+04 events/s + [COUNTERS] PROGRAM TOTAL : 2.6132s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6361s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.9771s for 90112 events => throughput is 9.22E+04 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (8.1310872077655555E-002) and cpp (8.1310872077655541E-002) differ by less than 2E-14 (2.220446049250313e-16) +OK! xsec from fortran (8.1310872077655569E-002) and cpp (8.1310872077655555E-002) differ by less than 2E-14 (2.220446049250313e-16) *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.701055e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.785412e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.597059e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.879871e+04 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -362,9 +362,9 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0972 [9.7196357922470750E-002] fbridge_mode=1 [UNWEIGHT] Wrote 41 events (found 467 events) - [COUNTERS] PROGRAM TOTAL : 0.3796s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3015s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0781s for 8192 events => throughput is 1.05E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3859s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3121s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0738s for 8192 events => throughput is 1.11E+05 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -393,29 +393,29 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.08131 [8.1310872077655541E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.08131 [8.1310872077655555E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 2.3975s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5231s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.8744s for 90112 events => throughput is 1.03E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.3674s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5551s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.8123s for 90112 events => throughput is 1.11E+05 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (8.1310872077655555E-002) and cpp (8.1310872077655541E-002) differ by less than 2E-14 (2.220446049250313e-16) +OK! xsec from fortran (8.1310872077655569E-002) and cpp (8.1310872077655555E-002) differ by less than 2E-14 (2.220446049250313e-16) *** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.088451e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.103525e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.080860e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.136073e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -438,9 +438,9 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0972 [9.7196357922470750E-002] fbridge_mode=1 [UNWEIGHT] Wrote 41 events (found 467 events) - [COUNTERS] PROGRAM TOTAL : 0.4429s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3340s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1088s for 8192 events => throughput is 7.53E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.4279s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3289s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0990s for 8192 events => throughput is 8.28E+04 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -469,29 +469,29 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.08131 [8.1310872077655541E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.08131 [8.1310872077655555E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 2.7475s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5382s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.2094s for 90112 events => throughput is 7.45E+04 events/s + [COUNTERS] PROGRAM TOTAL : 2.6797s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5881s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.0916s for 90112 events => throughput is 8.26E+04 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (8.1310872077655555E-002) and cpp (8.1310872077655541E-002) differ by less than 2E-14 (2.220446049250313e-16) +OK! xsec from fortran (8.1310872077655569E-002) and cpp (8.1310872077655555E-002) differ by less than 2E-14 (2.220446049250313e-16) *** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.536802e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.387308e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.505191e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.236833e+04 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -514,9 +514,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttg_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0972 [9.7196357922470764E-002] fbridge_mode=1 [UNWEIGHT] Wrote 41 events (found 467 events) - [COUNTERS] PROGRAM TOTAL : 0.6621s + [COUNTERS] PROGRAM TOTAL : 0.6620s [COUNTERS] Fortran Overhead ( 0 ) : 0.6566s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0055s for 8192 events => throughput is 1.50E+06 events/s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0054s for 8192 events => throughput is 1.51E+06 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -545,58 +545,58 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttg_x [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.08131 [8.1310872077655597E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.08131 [8.1310872077655610E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 1.8669s - [COUNTERS] Fortran Overhead ( 0 ) : 1.8440s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0228s for 90112 events => throughput is 3.94E+06 events/s + [COUNTERS] PROGRAM TOTAL : 1.9222s + [COUNTERS] Fortran Overhead ( 0 ) : 1.8995s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0227s for 90112 events => throughput is 3.97E+06 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (8.1310872077655555E-002) and cpp (8.1310872077655597E-002) differ by less than 2E-14 (4.440892098500626e-16) +OK! xsec from fortran (8.1310872077655569E-002) and cpp (8.1310872077655610E-002) differ by less than 2E-14 (4.440892098500626e-16) *** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.613696e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.601715e+06 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.322649e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.887746e+06 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.665904e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.653087e+06 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.242154e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.245673e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.685072e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.662678e+06 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.252410e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.253047e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.680051e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.646319e+06 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.765757e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.762775e+06 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt index c73e05bddd..e703809b3b 100644 --- 
a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt @@ -1,29 +1,29 @@ Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg CUDACPP_BUILDDIR='.' +make USEBUILDDIR=1 AVX=none - -make USEBUILDDIR=1 AVX=none make USEBUILDDIR=1 AVX=sse4 -make USEBUILDDIR=1 AVX=avx2 +make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512y + make USEBUILDDIR=1 AVX=512z make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. 
@@ -33,7 +33,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2023-11-24_15:37:54 +DATE: 2024-01-26_00:18:17 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0972 [9.7196357922470764E-002] fbridge_mode=0 [UNWEIGHT] Wrote 42 events (found 469 events) - [COUNTERS] PROGRAM TOTAL : 0.5542s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2274s - [COUNTERS] Fortran MEs ( 1 ) : 0.3268s for 8192 events => throughput is 2.51E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.5495s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2234s + [COUNTERS] Fortran MEs ( 1 ) : 0.3261s for 8192 events => throughput is 2.51E+04 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0972 [9.7196357922470764E-002] fbridge_mode=0 [UNWEIGHT] Wrote 41 events (found 467 events) - [COUNTERS] PROGRAM TOTAL : 0.5511s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2267s - [COUNTERS] Fortran MEs ( 1 ) : 0.3244s for 8192 events => throughput is 2.53E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.5449s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2206s + [COUNTERS] Fortran MEs ( 1 ) : 0.3243s for 8192 events => throughput is 2.53E+04 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -107,11 +107,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x10_fortran > /tmp/av [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.08131 [8.1310872077655555E-002] fbridge_mode=0 + [XSECTION] Cross section = 0.08131 [8.1310872077655569E-002] fbridge_mode=0 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 5.0510s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4446s - [COUNTERS] Fortran MEs ( 1 ) : 3.6064s for 90112 events => throughput is 2.50E+04 events/s + [COUNTERS] PROGRAM TOTAL : 5.0316s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4517s + [COUNTERS] Fortran MEs ( 1 ) : 3.5800s for 90112 events => throughput is 2.52E+04 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -132,15 +132,15 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.0972 [9.7196349765248158E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.0972 [9.7196347758884971E-002] fbridge_mode=1 [UNWEIGHT] Wrote 41 events (found 467 events) - [COUNTERS] PROGRAM TOTAL : 0.8665s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5402s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3263s for 8192 events => throughput is 2.51E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.8182s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5191s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2991s for 8192 events => throughput is 2.74E+04 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.7196357922470764E-002) and cpp (9.7196349765248158E-002) differ by less than 4E-4 (8.392518791033865e-08) +OK! 
xsec from fortran (9.7196357922470764E-002) and cpp (9.7196347758884971E-002) differ by less than 4E-4 (1.0456755794585604e-07) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -165,29 +165,29 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.08131 [8.1310860767768514E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.08131 [8.1310858119443913E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 5.2983s - [COUNTERS] Fortran Overhead ( 0 ) : 1.7336s - [COUNTERS] CudaCpp MEs ( 2 ) : 3.5648s for 90112 events => throughput is 2.53E+04 events/s + [COUNTERS] PROGRAM TOTAL : 5.0612s + [COUNTERS] Fortran Overhead ( 0 ) : 1.7703s + [COUNTERS] CudaCpp MEs ( 2 ) : 3.2909s for 90112 events => throughput is 2.74E+04 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (8.1310872077655555E-002) and cpp (8.1310860767768514E-002) differ by less than 4E-4 (1.3909440088610836e-07) +OK! xsec from fortran (8.1310872077655569E-002) and cpp (8.1310858119443913E-002) differ by less than 4E-4 (1.7166476384833373e-07) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.499402e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.800786e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.491631e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.824381e+04 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -208,15 +208,15 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.0972 [9.7196334183509370E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.0972 [9.7196323434217816E-002] fbridge_mode=1 [UNWEIGHT] Wrote 41 events (found 467 events) - [COUNTERS] PROGRAM TOTAL : 0.4217s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3247s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0970s for 8192 events => throughput is 8.45E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.4071s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3164s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0907s for 8192 events => throughput is 9.03E+04 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.7196357922470764E-002) and cpp (9.7196334183509370E-002) differ by less than 4E-4 (2.4423714939381114e-07) +OK! 
xsec from fortran (9.7196357922470764E-002) and cpp (9.7196323434217816E-002) differ by less than 4E-4 (3.548307125900152e-07) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -241,29 +241,29 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.08131 [8.1310847547651041E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.08131 [8.1310842598054087E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 2.6020s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5094s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.0926s for 90112 events => throughput is 8.25E+04 events/s + [COUNTERS] PROGRAM TOTAL : 2.6363s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6051s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.0312s for 90112 events => throughput is 8.74E+04 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (8.1310872077655555E-002) and cpp (8.1310847547651041E-002) differ by less than 4E-4 (3.0168172948652483e-07) +OK! xsec from fortran (8.1310872077655569E-002) and cpp (8.1310842598054087E-002) differ by less than 4E-4 (3.625542406293647e-07) *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.590308e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.146202e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.556597e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.112833e+04 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -284,15 +284,15 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.0972 [9.7196330801117323E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.0972 [9.7196325695161859E-002] fbridge_mode=1 [UNWEIGHT] Wrote 41 events (found 467 events) - [COUNTERS] PROGRAM TOTAL : 0.3155s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2706s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0449s for 8192 events => throughput is 1.82E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3139s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2705s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0434s for 8192 events => throughput is 1.89E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.7196357922470764E-002) and cpp (9.7196330801117323E-002) differ by less than 4E-4 (2.790367255034454e-07) +OK! 
xsec from fortran (9.7196357922470764E-002) and cpp (9.7196325695161859E-002) differ by less than 4E-4 (3.3156909984288774e-07) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -317,29 +317,29 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.08131 [8.1310847326088065E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.08131 [8.1310842393515825E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 1.9588s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4633s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.4955s for 90112 events => throughput is 1.82E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.9925s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5174s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.4751s for 90112 events => throughput is 1.90E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (8.1310872077655555E-002) and cpp (8.1310847326088065E-002) differ by less than 4E-4 (3.0440661691333304e-07) +OK! xsec from fortran (8.1310872077655569E-002) and cpp (8.1310842393515825E-002) differ by less than 4E-4 (3.650697499857358e-07) *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.839831e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.939208e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.863334e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.905454e+05 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -360,15 +360,15 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.0972 [9.7196330801117323E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.0972 [9.7196325695161859E-002] fbridge_mode=1 [UNWEIGHT] Wrote 41 events (found 467 events) - [COUNTERS] PROGRAM TOTAL : 0.3060s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2651s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0409s for 8192 events => throughput is 2.00E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3016s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2639s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0378s for 8192 events => throughput is 2.17E+05 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.7196357922470764E-002) and cpp (9.7196330801117323E-002) differ by less than 4E-4 (2.790367255034454e-07) +OK! 
xsec from fortran (9.7196357922470764E-002) and cpp (9.7196325695161859E-002) differ by less than 4E-4 (3.3156909984288774e-07) *** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -393,29 +393,29 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.08131 [8.1310847326088065E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.08131 [8.1310842393515825E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 1.8963s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4517s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.4446s for 90112 events => throughput is 2.03E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.9295s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5118s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.4177s for 90112 events => throughput is 2.16E+05 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (8.1310872077655555E-002) and cpp (8.1310847326088065E-002) differ by less than 4E-4 (3.0440661691333304e-07) +OK! xsec from fortran (8.1310872077655569E-002) and cpp (8.1310842393515825E-002) differ by less than 4E-4 (3.650697499857358e-07) *** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.073522e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.197972e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.038945e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.117265e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -436,15 +436,15 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.0972 [9.7196344079460428E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.0972 [9.7196344080460087E-002] fbridge_mode=1 [UNWEIGHT] Wrote 41 events (found 467 events) - [COUNTERS] PROGRAM TOTAL : 0.3327s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2795s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0532s for 8192 events => throughput is 1.54E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3271s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2773s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0499s for 8192 events => throughput is 1.64E+05 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.7196357922470764E-002) and cpp (9.7196344079460428E-002) differ by less than 4E-4 (1.424231383939656e-07) +OK! 
xsec from fortran (9.7196357922470764E-002) and cpp (9.7196344080460087E-002) differ by less than 4E-4 (1.4241285339888776e-07) *** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -469,29 +469,29 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.08131 [8.1310857804286998E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.08131 [8.1310857813116089E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 2.0553s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4670s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.5882s for 90112 events => throughput is 1.53E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.0855s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5387s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.5468s for 90112 events => throughput is 1.65E+05 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (8.1310872077655555E-002) and cpp (8.1310857804286998E-002) differ by less than 4E-4 (1.7554071418679484e-07) +OK! xsec from fortran (8.1310872077655569E-002) and cpp (8.1310857813116089E-002) differ by less than 4E-4 (1.754321300451167e-07) *** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.535514e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.649940e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.548771e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.622322e+05 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -512,15 +512,15 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttg_x [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.0972 [9.7196349366365994E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.0972 [9.7196349366366022E-002] fbridge_mode=1 [UNWEIGHT] Wrote 41 events (found 467 events) - [COUNTERS] PROGRAM TOTAL : 0.6530s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6521s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0008s for 8192 events => throughput is 9.69E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.6521s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6513s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0008s for 8192 events => throughput is 9.75E+06 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.7196357922470764E-002) and cpp (9.7196349366365994E-002) differ by less than 4E-4 (8.802906770188912e-08) +OK! 
xsec from fortran (9.7196357922470764E-002) and cpp (9.7196349366366022E-002) differ by less than 4E-4 (8.802906736882221e-08) *** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -545,58 +545,58 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttg_x [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.08131 [8.1310864949473968E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.08131 [8.1310864949473954E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 1.8988s - [COUNTERS] Fortran Overhead ( 0 ) : 1.8888s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0101s for 90112 events => throughput is 8.95E+06 events/s + [COUNTERS] PROGRAM TOTAL : 1.9153s + [COUNTERS] Fortran Overhead ( 0 ) : 1.9058s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0095s for 90112 events => throughput is 9.47E+06 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (8.1310872077655555E-002) and cpp (8.1310864949473968E-002) differ by less than 4E-4 (8.766578696306482e-08) +OK! xsec from fortran (8.1310872077655569E-002) and cpp (8.1310864949473954E-002) differ by less than 4E-4 (8.766578729613173e-08) *** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.308714e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.322135e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.861624e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.860681e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.626812e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.641206e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.413554e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.474666e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.631638e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.648721e+07 ) 
sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.496116e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.520248e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.503568e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.509475e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.624454e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.623003e+07 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt index b4daea9308..26bc9b342a 100644 --- a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt @@ -16,9 +16,9 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' -CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' @@ -33,7 +33,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2023-11-24_15:38:32 +DATE: 2024-01-26_00:18:54 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0972 [9.7196357922470764E-002] fbridge_mode=0 [UNWEIGHT] Wrote 42 events (found 469 events) - [COUNTERS] PROGRAM TOTAL : 0.5629s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2337s - [COUNTERS] Fortran MEs ( 1 ) : 0.3292s for 8192 events => throughput is 2.49E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.5858s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2389s + [COUNTERS] Fortran MEs ( 1 ) : 0.3469s for 8192 events => throughput is 2.36E+04 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0972 [9.7196357922470764E-002] fbridge_mode=0 [UNWEIGHT] Wrote 41 events (found 467 events) - [COUNTERS] PROGRAM TOTAL : 0.5518s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2262s - [COUNTERS] Fortran MEs ( 1 ) : 0.3256s for 8192 events => throughput is 2.52E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.5858s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2386s + [COUNTERS] Fortran MEs ( 1 ) : 0.3472s for 8192 events => throughput is 2.36E+04 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -107,11 +107,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x10_fortran > /tmp/av [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.08131 [8.1310872077655555E-002] fbridge_mode=0 + [XSECTION] Cross section = 0.08131 [8.1310872077655569E-002] fbridge_mode=0 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 5.0097s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4326s - [COUNTERS] Fortran MEs ( 1 ) : 3.5771s for 90112 events => throughput is 2.52E+04 events/s + [COUNTERS] PROGRAM TOTAL : 5.0544s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4643s + [COUNTERS] Fortran MEs ( 1 ) : 3.5901s for 90112 events => throughput is 2.51E+04 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -132,15 +132,15 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.0972 [9.7196358763382007E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.0972 [9.7196358763382021E-002] fbridge_mode=1 [UNWEIGHT] Wrote 41 events (found 467 events) - [COUNTERS] PROGRAM TOTAL : 0.8974s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5586s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3388s for 8192 events => throughput is 2.42E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.8767s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5495s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3272s for 8192 events => throughput is 2.50E+04 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! 
xsec from fortran (9.7196357922470764E-002) and cpp (9.7196358763382007E-002) differ by less than 2E-4 (8.651674487936134e-09) +OK! xsec from fortran (9.7196357922470764E-002) and cpp (9.7196358763382021E-002) differ by less than 2E-4 (8.651674487936134e-09) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -167,27 +167,27 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.08131 [8.1310872835011053E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 5.5261s - [COUNTERS] Fortran Overhead ( 0 ) : 1.7672s - [COUNTERS] CudaCpp MEs ( 2 ) : 3.7589s for 90112 events => throughput is 2.40E+04 events/s + [COUNTERS] PROGRAM TOTAL : 5.3981s + [COUNTERS] Fortran Overhead ( 0 ) : 1.7975s + [COUNTERS] CudaCpp MEs ( 2 ) : 3.6006s for 90112 events => throughput is 2.50E+04 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (8.1310872077655555E-002) and cpp (8.1310872835011053E-002) differ by less than 2E-4 (9.31432020401246e-09) +OK! xsec from fortran (8.1310872077655569E-002) and cpp (8.1310872835011053E-002) differ by less than 2E-4 (9.314319981967856e-09) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.465437e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.554031e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.473065e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.558500e+04 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -208,15 +208,15 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.0972 [9.7196358804670396E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.0972 [9.7196358804670424E-002] fbridge_mode=1 [UNWEIGHT] Wrote 41 events (found 467 events) - [COUNTERS] PROGRAM TOTAL : 0.5634s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3926s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1708s for 8192 events => throughput is 4.80E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.5755s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4002s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1753s for 8192 events => throughput is 4.67E+04 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.7196357922470764E-002) and cpp (9.7196358804670396E-002) differ by less than 2E-4 (9.076468021618211e-09) +OK! 
xsec from fortran (9.7196357922470764E-002) and cpp (9.7196358804670424E-002) differ by less than 2E-4 (9.076468243662816e-09) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -243,27 +243,27 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.08131 [8.1310872836789727E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 3.4683s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5927s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.8757s for 90112 events => throughput is 4.80E+04 events/s + [COUNTERS] PROGRAM TOTAL : 3.4487s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6335s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.8152s for 90112 events => throughput is 4.96E+04 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (8.1310872077655555E-002) and cpp (8.1310872836789727E-002) differ by less than 2E-4 (9.336195150311255e-09) +OK! xsec from fortran (8.1310872077655569E-002) and cpp (8.1310872836789727E-002) differ by less than 2E-4 (9.33619492826665e-09) *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.905170e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.133506e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.906590e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.159016e+04 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -284,15 +284,15 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.0972 [9.7196358586501358E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.0972 [9.7196358586501386E-002] fbridge_mode=1 [UNWEIGHT] Wrote 41 events (found 467 events) - [COUNTERS] PROGRAM TOTAL : 0.3954s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3088s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0866s for 8192 events => throughput is 9.45E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.3987s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3156s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0831s for 8192 events => throughput is 9.85E+04 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.7196357922470764E-002) and cpp (9.7196358586501358E-002) differ by less than 2E-4 (6.831846421917476e-09) +OK! 
xsec from fortran (9.7196357922470764E-002) and cpp (9.7196358586501386E-002) differ by less than 2E-4 (6.831846643962081e-09) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -317,29 +317,29 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.08131 [8.1310872708918333E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.08131 [8.1310872708918305E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 2.4529s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4970s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.9559s for 90112 events => throughput is 9.43E+04 events/s + [COUNTERS] PROGRAM TOTAL : 2.4721s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5556s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.9166s for 90112 events => throughput is 9.83E+04 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (8.1310872077655555E-002) and cpp (8.1310872708918333E-002) differ by less than 2E-4 (7.763571563401683e-09) +OK! xsec from fortran (8.1310872077655569E-002) and cpp (8.1310872708918305E-002) differ by less than 2E-4 (7.763571119312473e-09) *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.722917e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.006516e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.639767e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.013910e+05 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -360,15 +360,15 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.0972 [9.7196358586501358E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.0972 [9.7196358586501386E-002] fbridge_mode=1 [UNWEIGHT] Wrote 41 events (found 467 events) - [COUNTERS] PROGRAM TOTAL : 0.3758s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3002s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0756s for 8192 events => throughput is 1.08E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3689s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2955s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0733s for 8192 events => throughput is 1.12E+05 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.7196357922470764E-002) and cpp (9.7196358586501358E-002) differ by less than 2E-4 (6.831846421917476e-09) +OK! 
xsec from fortran (9.7196357922470764E-002) and cpp (9.7196358586501386E-002) differ by less than 2E-4 (6.831846643962081e-09) *** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -393,29 +393,29 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.08131 [8.1310872708918333E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.08131 [8.1310872708918305E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 2.3291s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4914s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.8377s for 90112 events => throughput is 1.08E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.3524s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5518s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.8005s for 90112 events => throughput is 1.13E+05 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (8.1310872077655555E-002) and cpp (8.1310872708918333E-002) differ by less than 2E-4 (7.763571563401683e-09) +OK! xsec from fortran (8.1310872077655569E-002) and cpp (8.1310872708918305E-002) differ by less than 2E-4 (7.763571119312473e-09) *** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.100825e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.156908e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.120448e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.161672e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -436,15 +436,15 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.0972 [9.7196358757578441E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.0972 [9.7196358586501386E-002] fbridge_mode=1 [UNWEIGHT] Wrote 41 events (found 467 events) - [COUNTERS] PROGRAM TOTAL : 0.4736s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3530s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1206s for 8192 events => throughput is 6.79E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.4316s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3308s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1008s for 8192 events => throughput is 8.13E+04 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.7196357922470764E-002) and cpp (9.7196358757578441E-002) differ by less than 2E-4 (8.591964695270349e-09) +OK! 
xsec from fortran (9.7196357922470764E-002) and cpp (9.7196358586501386E-002) differ by less than 2E-4 (6.831846643962081e-09) *** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -469,29 +469,29 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.08131 [8.1310872803699391E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.08131 [8.1310872708918305E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 2.8047s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5601s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.2446s for 90112 events => throughput is 7.24E+04 events/s + [COUNTERS] PROGRAM TOTAL : 2.7119s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5851s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.1268s for 90112 events => throughput is 8.00E+04 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (8.1310872077655555E-002) and cpp (8.1310872803699391E-002) differ by less than 2E-4 (8.929234462939917e-09) +OK! xsec from fortran (8.1310872077655569E-002) and cpp (8.1310872708918305E-002) differ by less than 2E-4 (7.763571119312473e-09) *** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.211134e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.071079e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.316317e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.063980e+04 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -512,15 +512,15 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttg_x [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.0972 [9.7196358102981245E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.0972 [9.7196358102981231E-002] fbridge_mode=1 [UNWEIGHT] Wrote 41 events (found 467 events) - [COUNTERS] PROGRAM TOTAL : 0.6693s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6639s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0054s for 8192 events => throughput is 1.53E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.6566s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6512s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0054s for 8192 events => throughput is 1.50E+06 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.7196357922470764E-002) and cpp (9.7196358102981245E-002) differ by less than 2E-4 (1.8571733040317895e-09) +OK! 
xsec from fortran (9.7196357922470764E-002) and cpp (9.7196358102981231E-002) differ by less than 2E-4 (1.8571730819871846e-09) *** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -545,58 +545,58 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttg_x [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.08131 [8.1310872068634174E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.08131 [8.1310872068634160E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 1.8539s - [COUNTERS] Fortran Overhead ( 0 ) : 1.8310s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0229s for 90112 events => throughput is 3.94E+06 events/s + [COUNTERS] PROGRAM TOTAL : 1.9224s + [COUNTERS] Fortran Overhead ( 0 ) : 1.8996s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0227s for 90112 events => throughput is 3.96E+06 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (8.1310872077655555E-002) and cpp (8.1310872068634174E-002) differ by less than 2E-4 (1.1094924978749532e-10) +OK! xsec from fortran (8.1310872077655569E-002) and cpp (8.1310872068634160E-002) differ by less than 2E-4 (1.109495828544027e-10) *** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.622730e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.623843e+06 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.249441e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.126033e+06 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.646288e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.628988e+06 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.232584e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.232507e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.632648e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.635974e+06 ) 
sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.244267e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.243999e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.616374e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.613847e+06 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.720039e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.712232e+06 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt index 2c14264714..305f74d40c 100644 --- a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt @@ -1,11 +1,11 @@ Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg CUDACPP_BUILDDIR='.' - make USEBUILDDIR=1 AVX=none -make USEBUILDDIR=1 AVX=sse4 + +make USEBUILDDIR=1 AVX=sse4 make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512y @@ -17,15 +17,15 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. 
@@ -33,7 +33,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2023-11-24_15:39:14 +DATE: 2024-01-26_00:19:37 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg @@ -57,11 +57,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/av [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0003628 [3.6277277311352988E-004] fbridge_mode=0 + [XSECTION] Cross section = 0.0003628 [3.6277277311352982E-004] fbridge_mode=0 [UNWEIGHT] Wrote 48 events (found 439 events) - [COUNTERS] PROGRAM TOTAL : 4.5067s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2862s - [COUNTERS] Fortran MEs ( 1 ) : 4.2205s for 8192 events => throughput is 1.94E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.4231s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2847s + [COUNTERS] Fortran MEs ( 1 ) : 4.1385s for 8192 events => throughput is 1.98E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -82,11 +82,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/av [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0003628 [3.6277277311352988E-004] fbridge_mode=0 + [XSECTION] Cross section = 0.0003628 [3.6277277311352982E-004] fbridge_mode=0 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 4.5163s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2798s - [COUNTERS] Fortran MEs ( 1 ) : 4.2365s for 8192 events => throughput is 1.93E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.4823s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2803s + [COUNTERS] Fortran MEs ( 1 ) : 4.2021s for 8192 events => throughput is 1.95E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -107,11 +107,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x10_fortran > /tmp/a [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000158 [1.5803725748421164E-004] fbridge_mode=0 + [XSECTION] Cross section = 0.000158 [1.5803725748421158E-004] fbridge_mode=0 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 48.4845s - [COUNTERS] Fortran Overhead ( 0 ) : 1.9328s - [COUNTERS] Fortran MEs ( 1 ) : 46.5516s for 90112 events => throughput is 1.94E+03 events/s + [COUNTERS] PROGRAM TOTAL : 47.7198s + [COUNTERS] Fortran Overhead ( 0 ) : 1.9740s + [COUNTERS] Fortran MEs ( 1 ) : 45.7459s for 90112 events => throughput is 1.97E+03 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -132,15 +132,15 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0003628 [3.6277277311352998E-004] fbridge_mode=1 + [XSECTION] Cross section = 0.0003628 [3.6277277311352993E-004] fbridge_mode=1 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 8.8031s - [COUNTERS] Fortran Overhead ( 0 ) : 4.5022s - [COUNTERS] CudaCpp MEs ( 2 ) : 4.3008s for 8192 events => throughput is 1.90E+03 events/s + [COUNTERS] PROGRAM TOTAL : 8.8779s + [COUNTERS] Fortran Overhead ( 0 ) : 4.5273s + [COUNTERS] CudaCpp MEs ( 2 ) : 4.3506s for 
8192 events => throughput is 1.88E+03 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (3.6277277311352988E-004) and cpp (3.6277277311352998E-004) differ by less than 2E-14 (2.220446049250313e-16) +OK! xsec from fortran (3.6277277311352982E-004) and cpp (3.6277277311352993E-004) differ by less than 2E-14 (2.220446049250313e-16) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -165,29 +165,29 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000158 [1.5803725748421161E-004] fbridge_mode=1 + [XSECTION] Cross section = 0.000158 [1.5803725748421150E-004] fbridge_mode=1 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 53.6260s - [COUNTERS] Fortran Overhead ( 0 ) : 6.1417s - [COUNTERS] CudaCpp MEs ( 2 ) : 47.4843s for 90112 events => throughput is 1.90E+03 events/s + [COUNTERS] PROGRAM TOTAL : 54.2755s + [COUNTERS] Fortran Overhead ( 0 ) : 6.1909s + [COUNTERS] CudaCpp MEs ( 2 ) : 48.0846s for 90112 events => throughput is 1.87E+03 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.5803725748421164E-004) and cpp (1.5803725748421161E-004) differ by less than 2E-14 (2.220446049250313e-16) +OK! xsec from fortran (1.5803725748421158E-004) and cpp (1.5803725748421150E-004) differ by less than 2E-14 (5.551115123125783e-16) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.955415e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.936301e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.957400e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.940526e+03 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -208,15 +208,15 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0003628 [3.6277277311352993E-004] fbridge_mode=1 + [XSECTION] Cross section = 0.0003628 [3.6277277311352998E-004] fbridge_mode=1 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 4.8602s - [COUNTERS] Fortran Overhead ( 0 ) : 2.5384s - [COUNTERS] CudaCpp MEs ( 2 ) : 2.3219s for 8192 events => throughput is 3.53E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.6979s + [COUNTERS] Fortran Overhead ( 0 ) : 2.4561s + [COUNTERS] CudaCpp MEs ( 2 ) : 2.2418s for 8192 events => throughput is 3.65E+03 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! 
xsec from fortran (3.6277277311352988E-004) and cpp (3.6277277311352993E-004) differ by less than 2E-14 (2.220446049250313e-16) +OK! xsec from fortran (3.6277277311352982E-004) and cpp (3.6277277311352998E-004) differ by less than 2E-14 (4.440892098500626e-16) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -241,29 +241,29 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000158 [1.5803725748421158E-004] fbridge_mode=1 + [XSECTION] Cross section = 0.000158 [1.5803725748421156E-004] fbridge_mode=1 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 29.4526s - [COUNTERS] Fortran Overhead ( 0 ) : 4.1609s - [COUNTERS] CudaCpp MEs ( 2 ) : 25.2916s for 90112 events => throughput is 3.56E+03 events/s + [COUNTERS] PROGRAM TOTAL : 28.8152s + [COUNTERS] Fortran Overhead ( 0 ) : 4.1172s + [COUNTERS] CudaCpp MEs ( 2 ) : 24.6980s for 90112 events => throughput is 3.65E+03 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.5803725748421164E-004) and cpp (1.5803725748421158E-004) differ by less than 2E-14 (3.3306690738754696e-16) +OK! xsec from fortran (1.5803725748421158E-004) and cpp (1.5803725748421156E-004) differ by less than 2E-14 (2.220446049250313e-16) *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.699774e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.802169e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.692908e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.823296e+03 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -286,13 +286,13 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [3.6277277311353009E-004] fbridge_mode=1 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 2.2393s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2482s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.9911s for 8192 events => throughput is 8.27E+03 events/s + [COUNTERS] PROGRAM TOTAL : 2.2047s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2324s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.9723s for 8192 events => throughput is 8.43E+03 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (3.6277277311352988E-004) and cpp (3.6277277311353009E-004) differ by less than 2E-14 (6.661338147750939e-16) +OK! 
xsec from fortran (3.6277277311352982E-004) and cpp (3.6277277311353009E-004) differ by less than 2E-14 (6.661338147750939e-16) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -319,27 +319,27 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000158 [1.5803725748421158E-004] fbridge_mode=1 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 13.8246s - [COUNTERS] Fortran Overhead ( 0 ) : 2.9004s - [COUNTERS] CudaCpp MEs ( 2 ) : 10.9241s for 90112 events => throughput is 8.25E+03 events/s + [COUNTERS] PROGRAM TOTAL : 13.6182s + [COUNTERS] Fortran Overhead ( 0 ) : 2.8737s + [COUNTERS] CudaCpp MEs ( 2 ) : 10.7445s for 90112 events => throughput is 8.39E+03 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.5803725748421164E-004) and cpp (1.5803725748421158E-004) differ by less than 2E-14 (3.3306690738754696e-16) +OK! xsec from fortran (1.5803725748421158E-004) and cpp (1.5803725748421158E-004) differ by less than 2E-14 (0.0) *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.470220e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.612716e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.480723e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.609695e+03 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -362,13 +362,13 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [3.6277277311353009E-004] fbridge_mode=1 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 2.0124s - [COUNTERS] Fortran Overhead ( 0 ) : 1.1378s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.8747s for 8192 events => throughput is 9.37E+03 events/s + [COUNTERS] PROGRAM TOTAL : 1.9770s + [COUNTERS] Fortran Overhead ( 0 ) : 1.1128s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.8641s for 8192 events => throughput is 9.48E+03 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (3.6277277311352988E-004) and cpp (3.6277277311353009E-004) differ by less than 2E-14 (6.661338147750939e-16) +OK! 
xsec from fortran (3.6277277311352982E-004) and cpp (3.6277277311353009E-004) differ by less than 2E-14 (6.661338147750939e-16) *** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -395,27 +395,27 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000158 [1.5803725748421158E-004] fbridge_mode=1 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 12.5302s - [COUNTERS] Fortran Overhead ( 0 ) : 2.7907s - [COUNTERS] CudaCpp MEs ( 2 ) : 9.7395s for 90112 events => throughput is 9.25E+03 events/s + [COUNTERS] PROGRAM TOTAL : 12.1580s + [COUNTERS] Fortran Overhead ( 0 ) : 2.7559s + [COUNTERS] CudaCpp MEs ( 2 ) : 9.4022s for 90112 events => throughput is 9.58E+03 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.5803725748421164E-004) and cpp (1.5803725748421158E-004) differ by less than 2E-14 (3.3306690738754696e-16) +OK! xsec from fortran (1.5803725748421158E-004) and cpp (1.5803725748421158E-004) differ by less than 2E-14 (0.0) *** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.611254e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.847095e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.611296e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.837457e+03 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -438,13 +438,13 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [3.6277277311353009E-004] fbridge_mode=1 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 2.4817s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3731s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.1087s for 8192 events => throughput is 7.39E+03 events/s + [COUNTERS] PROGRAM TOTAL : 2.5594s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4371s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.1223s for 8192 events => throughput is 7.30E+03 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (3.6277277311352988E-004) and cpp (3.6277277311353009E-004) differ by less than 2E-14 (6.661338147750939e-16) +OK! 
xsec from fortran (3.6277277311352982E-004) and cpp (3.6277277311353009E-004) differ by less than 2E-14 (6.661338147750939e-16) *** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -471,27 +471,27 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000158 [1.5803725748421158E-004] fbridge_mode=1 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 15.5304s - [COUNTERS] Fortran Overhead ( 0 ) : 3.0362s - [COUNTERS] CudaCpp MEs ( 2 ) : 12.4941s for 90112 events => throughput is 7.21E+03 events/s + [COUNTERS] PROGRAM TOTAL : 14.6611s + [COUNTERS] Fortran Overhead ( 0 ) : 2.9896s + [COUNTERS] CudaCpp MEs ( 2 ) : 11.6715s for 90112 events => throughput is 7.72E+03 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.5803725748421164E-004) and cpp (1.5803725748421158E-004) differ by less than 2E-14 (3.3306690738754696e-16) +OK! xsec from fortran (1.5803725748421158E-004) and cpp (1.5803725748421158E-004) differ by less than 2E-14 (0.0) *** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.310034e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.812824e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.219164e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.833625e+03 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -514,13 +514,13 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttgg_ [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [3.6277277311352998E-004] fbridge_mode=1 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 0.8199s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7868s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0331s for 8192 events => throughput is 2.48E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.8067s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7739s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0327s for 8192 events => throughput is 2.50E+05 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (3.6277277311352988E-004) and cpp (3.6277277311352998E-004) differ by less than 2E-14 (2.220446049250313e-16) +OK! 
xsec from fortran (3.6277277311352982E-004) and cpp (3.6277277311352998E-004) differ by less than 2E-14 (4.440892098500626e-16) *** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -545,58 +545,58 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttgg_ [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000158 [1.5803725748421161E-004] fbridge_mode=1 + [XSECTION] Cross section = 0.000158 [1.5803725748421166E-004] fbridge_mode=1 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 2.8197s - [COUNTERS] Fortran Overhead ( 0 ) : 2.4552s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3644s for 90112 events => throughput is 2.47E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.7969s + [COUNTERS] Fortran Overhead ( 0 ) : 2.4368s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3600s for 90112 events => throughput is 2.50E+05 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.5803725748421164E-004) and cpp (1.5803725748421161E-004) differ by less than 2E-14 (2.220446049250313e-16) +OK! xsec from fortran (1.5803725748421158E-004) and cpp (1.5803725748421166E-004) differ by less than 2E-14 (4.440892098500626e-16) *** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.292727e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.283605e+05 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.508942e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.518944e+05 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.110003e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.111225e+05 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.156098e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.157335e+05 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.100637e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 
4.104619e+05 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.153684e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.145376e+05 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.109892e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.107697e+05 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.425957e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.434237e+05 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt index dbf6975e6c..c44aa866bb 100644 --- a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt @@ -3,8 +3,8 @@ CUDACPP_BUILDDIR='.' make USEBUILDDIR=1 AVX=none -make USEBUILDDIR=1 AVX=sse4 +make USEBUILDDIR=1 AVX=sse4 make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512y @@ -15,17 +15,17 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. 
@@ -33,7 +33,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2023-11-24_15:43:32 +DATE: 2024-01-26_00:23:52 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg @@ -57,11 +57,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/av [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0003628 [3.6277277311352988E-004] fbridge_mode=0 + [XSECTION] Cross section = 0.0003628 [3.6277277311352982E-004] fbridge_mode=0 [UNWEIGHT] Wrote 48 events (found 439 events) - [COUNTERS] PROGRAM TOTAL : 4.5240s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2823s - [COUNTERS] Fortran MEs ( 1 ) : 4.2417s for 8192 events => throughput is 1.93E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.4312s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2800s + [COUNTERS] Fortran MEs ( 1 ) : 4.1512s for 8192 events => throughput is 1.97E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -82,11 +82,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/av [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0003628 [3.6277277311352988E-004] fbridge_mode=0 + [XSECTION] Cross section = 0.0003628 [3.6277277311352982E-004] fbridge_mode=0 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 4.5067s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2792s - [COUNTERS] Fortran MEs ( 1 ) : 4.2274s for 8192 events => throughput is 1.94E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.4878s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2811s + [COUNTERS] Fortran MEs ( 1 ) : 4.2067s for 8192 events => throughput is 1.95E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -107,11 +107,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x10_fortran > /tmp/a [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000158 [1.5803725748421164E-004] fbridge_mode=0 + [XSECTION] Cross section = 0.000158 [1.5803725748421158E-004] fbridge_mode=0 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 48.8698s - [COUNTERS] Fortran Overhead ( 0 ) : 1.9436s - [COUNTERS] Fortran MEs ( 1 ) : 46.9262s for 90112 events => throughput is 1.92E+03 events/s + [COUNTERS] PROGRAM TOTAL : 47.5920s + [COUNTERS] Fortran Overhead ( 0 ) : 1.9624s + [COUNTERS] Fortran MEs ( 1 ) : 45.6297s for 90112 events => throughput is 1.97E+03 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -132,15 +132,15 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0003628 [3.6277396490802749E-004] fbridge_mode=1 + [XSECTION] Cross section = 0.0003628 [3.6277396352122325E-004] fbridge_mode=1 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 8.6169s - [COUNTERS] Fortran Overhead ( 0 ) : 4.3960s - [COUNTERS] CudaCpp MEs ( 2 ) : 4.2209s for 8192 events => throughput is 1.94E+03 events/s + [COUNTERS] PROGRAM TOTAL : 8.1703s + [COUNTERS] Fortran Overhead ( 0 ) : 4.1746s + [COUNTERS] CudaCpp MEs ( 2 ) : 3.9957s for 
8192 events => throughput is 2.05E+03 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (3.6277277311352988E-004) and cpp (3.6277396490802749E-004) differ by less than 4E-4 (3.2852368918590003e-06) +OK! xsec from fortran (3.6277277311352982E-004) and cpp (3.6277396352122325E-004) differ by less than 4E-4 (3.2814141017745158e-06) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -165,29 +165,29 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000158 [1.5803774602344628E-004] fbridge_mode=1 + [XSECTION] Cross section = 0.000158 [1.5803774048965294E-004] fbridge_mode=1 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 52.7280s - [COUNTERS] Fortran Overhead ( 0 ) : 6.0827s - [COUNTERS] CudaCpp MEs ( 2 ) : 46.6453s for 90112 events => throughput is 1.93E+03 events/s + [COUNTERS] PROGRAM TOTAL : 50.0883s + [COUNTERS] Fortran Overhead ( 0 ) : 5.9292s + [COUNTERS] CudaCpp MEs ( 2 ) : 44.1591s for 90112 events => throughput is 2.04E+03 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.5803725748421164E-004) and cpp (1.5803774602344628E-004) differ by less than 4E-4 (3.0912915247593986e-06) +OK! xsec from fortran (1.5803725748421158E-004) and cpp (1.5803774048965294E-004) differ by less than 4E-4 (3.056275773571926e-06) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.998671e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.109904e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.001275e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.108623e+03 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -208,15 +208,15 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0003628 [3.6277389126121586E-004] fbridge_mode=1 + [XSECTION] Cross section = 0.0003628 [3.6277387698033752E-004] fbridge_mode=1 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 2.5615s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4108s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.1507s for 8192 events => throughput is 7.12E+03 events/s + [COUNTERS] PROGRAM TOTAL : 2.4881s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3708s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.1173s for 8192 events => throughput is 7.33E+03 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! 
xsec from fortran (3.6277277311352988E-004) and cpp (3.6277389126121586E-004) differ by less than 4E-4 (3.0822260348450925e-06) +OK! xsec from fortran (3.6277277311352982E-004) and cpp (3.6277387698033752E-004) differ by less than 4E-4 (3.0428601303089664e-06) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -241,29 +241,29 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000158 [1.5803771887543366E-004] fbridge_mode=1 + [XSECTION] Cross section = 0.000158 [1.5803770691658365E-004] fbridge_mode=1 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 15.7555s - [COUNTERS] Fortran Overhead ( 0 ) : 3.0766s - [COUNTERS] CudaCpp MEs ( 2 ) : 12.6789s for 90112 events => throughput is 7.11E+03 events/s + [COUNTERS] PROGRAM TOTAL : 15.5148s + [COUNTERS] Fortran Overhead ( 0 ) : 3.0458s + [COUNTERS] CudaCpp MEs ( 2 ) : 12.4690s for 90112 events => throughput is 7.23E+03 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.5803725748421164E-004) and cpp (1.5803771887543366E-004) differ by less than 4E-4 (2.9195091675315865e-06) +OK! xsec from fortran (1.5803725748421158E-004) and cpp (1.5803770691658365E-004) differ by less than 4E-4 (2.8438380874629132e-06) *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.214660e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.588414e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.260665e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.664199e+03 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -284,15 +284,15 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0003628 [3.6277390198115864E-004] fbridge_mode=1 + [XSECTION] Cross section = 0.0003628 [3.6277388844638422E-004] fbridge_mode=1 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 1.2961s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7839s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.5123s for 8192 events => throughput is 1.60E+04 events/s + [COUNTERS] PROGRAM TOTAL : 1.2367s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7493s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.4874s for 8192 events => throughput is 1.68E+04 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (3.6277277311352988E-004) and cpp (3.6277390198115864E-004) differ by less than 4E-4 (3.111776055053639e-06) +OK! 
xsec from fortran (3.6277277311352982E-004) and cpp (3.6277388844638422E-004) differ by less than 4E-4 (3.074466820685018e-06) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -317,29 +317,29 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000158 [1.5803774416711566E-004] fbridge_mode=1 + [XSECTION] Cross section = 0.000158 [1.5803773310773457E-004] fbridge_mode=1 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 8.1063s - [COUNTERS] Fortran Overhead ( 0 ) : 2.4519s - [COUNTERS] CudaCpp MEs ( 2 ) : 5.6543s for 90112 events => throughput is 1.59E+04 events/s + [COUNTERS] PROGRAM TOTAL : 7.7878s + [COUNTERS] Fortran Overhead ( 0 ) : 2.4039s + [COUNTERS] CudaCpp MEs ( 2 ) : 5.3838s for 90112 events => throughput is 1.67E+04 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.5803725748421164E-004) and cpp (1.5803774416711566E-004) differ by less than 4E-4 (3.079545366491132e-06) +OK! xsec from fortran (1.5803725748421158E-004) and cpp (1.5803773310773457E-004) differ by less than 4E-4 (3.0095657856943347e-06) *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.649408e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.722525e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.639467e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.724823e+04 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -360,15 +360,15 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0003628 [3.6277390198115864E-004] fbridge_mode=1 + [XSECTION] Cross section = 0.0003628 [3.6277388844638422E-004] fbridge_mode=1 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 1.1678s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7178s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.4500s for 8192 events => throughput is 1.82E+04 events/s + [COUNTERS] PROGRAM TOTAL : 1.1297s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6985s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.4312s for 8192 events => throughput is 1.90E+04 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (3.6277277311352988E-004) and cpp (3.6277390198115864E-004) differ by less than 4E-4 (3.111776055053639e-06) +OK! 
xsec from fortran (3.6277277311352982E-004) and cpp (3.6277388844638422E-004) differ by less than 4E-4 (3.074466820685018e-06) *** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -393,29 +393,29 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000158 [1.5803774416711566E-004] fbridge_mode=1 + [XSECTION] Cross section = 0.000158 [1.5803773310773457E-004] fbridge_mode=1 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 7.3020s - [COUNTERS] Fortran Overhead ( 0 ) : 2.3817s - [COUNTERS] CudaCpp MEs ( 2 ) : 4.9203s for 90112 events => throughput is 1.83E+04 events/s + [COUNTERS] PROGRAM TOTAL : 7.0756s + [COUNTERS] Fortran Overhead ( 0 ) : 2.3360s + [COUNTERS] CudaCpp MEs ( 2 ) : 4.7397s for 90112 events => throughput is 1.90E+04 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.5803725748421164E-004) and cpp (1.5803774416711566E-004) differ by less than 4E-4 (3.079545366491132e-06) +OK! xsec from fortran (1.5803725748421158E-004) and cpp (1.5803773310773457E-004) differ by less than 4E-4 (3.0095657856943347e-06) *** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.885655e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.960961e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.876738e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.950503e+04 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -436,15 +436,15 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0003628 [3.6277396394633404E-004] fbridge_mode=1 + [XSECTION] Cross section = 0.0003628 [3.6277396133530942E-004] fbridge_mode=1 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 1.3886s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8337s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.5549s for 8192 events => throughput is 1.48E+04 events/s + [COUNTERS] PROGRAM TOTAL : 1.3175s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7930s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.5245s for 8192 events => throughput is 1.56E+04 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (3.6277277311352988E-004) and cpp (3.6277396394633404E-004) differ by less than 4E-4 (3.2825859392904277e-06) +OK! 
xsec from fortran (3.6277277311352982E-004) and cpp (3.6277396133530942E-004) differ by less than 4E-4 (3.2753885288450135e-06) *** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -469,29 +469,29 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000158 [1.5803777741065333E-004] fbridge_mode=1 + [XSECTION] Cross section = 0.000158 [1.5803777739454609E-004] fbridge_mode=1 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 8.7321s - [COUNTERS] Fortran Overhead ( 0 ) : 2.5116s - [COUNTERS] CudaCpp MEs ( 2 ) : 6.2205s for 90112 events => throughput is 1.45E+04 events/s + [COUNTERS] PROGRAM TOTAL : 8.2280s + [COUNTERS] Fortran Overhead ( 0 ) : 2.4440s + [COUNTERS] CudaCpp MEs ( 2 ) : 5.7840s for 90112 events => throughput is 1.56E+04 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.5803725748421164E-004) and cpp (1.5803777741065333E-004) differ by less than 4E-4 (3.2898979009932106e-06) +OK! xsec from fortran (1.5803725748421158E-004) and cpp (1.5803777739454609E-004) differ by less than 4E-4 (3.2897959809652377e-06) *** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.496235e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.578889e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.469765e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.586434e+04 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -512,15 +512,15 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttgg_ [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0003628 [3.6277400478491260E-004] fbridge_mode=1 + [XSECTION] Cross section = 0.0003628 [3.6277400478491265E-004] fbridge_mode=1 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 0.7802s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7588s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0214s for 8192 events => throughput is 3.82E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.7712s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7499s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0213s for 8192 events => throughput is 3.85E+05 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (3.6277277311352988E-004) and cpp (3.6277400478491260E-004) differ by less than 4E-4 (3.3951593780834344e-06) +OK! 
xsec from fortran (3.6277277311352982E-004) and cpp (3.6277400478491265E-004) differ by less than 4E-4 (3.395159378305479e-06) *** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -547,56 +547,56 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttgg_ [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000158 [1.5803779990154892E-004] fbridge_mode=1 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 2.6640s - [COUNTERS] Fortran Overhead ( 0 ) : 2.4295s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2346s for 90112 events => throughput is 3.84E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.6112s + [COUNTERS] Fortran Overhead ( 0 ) : 2.3770s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2343s for 90112 events => throughput is 3.85E+05 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.5803725748421164E-004) and cpp (1.5803779990154892E-004) differ by less than 4E-4 (3.4322117830054566e-06) +OK! xsec from fortran (1.5803725748421158E-004) and cpp (1.5803779990154892E-004) differ by less than 4E-4 (3.432211783227501e-06) *** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.580570e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.599685e+05 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.934731e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.950677e+05 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.492770e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.500010e+05 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.733354e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.626742e+05 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.509140e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.496581e+05 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] 
[inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.727290e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.674704e+05 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.483888e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.461510e+05 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.531454e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.527679e+05 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt index cfbc1973b9..f5978616e8 100644 --- a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt @@ -1,11 +1,11 @@ Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg CUDACPP_BUILDDIR='.' -make USEBUILDDIR=1 AVX=none -make USEBUILDDIR=1 AVX=sse4 +make USEBUILDDIR=1 AVX=none +make USEBUILDDIR=1 AVX=sse4 make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512y @@ -15,13 +15,13 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' -CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. 
@@ -33,7 +33,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2023-11-24_15:46:56 +DATE: 2024-01-26_00:27:09 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg @@ -57,11 +57,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/av [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0003628 [3.6277277311352988E-004] fbridge_mode=0 + [XSECTION] Cross section = 0.0003628 [3.6277277311352982E-004] fbridge_mode=0 [UNWEIGHT] Wrote 48 events (found 439 events) - [COUNTERS] PROGRAM TOTAL : 4.5513s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2808s - [COUNTERS] Fortran MEs ( 1 ) : 4.2704s for 8192 events => throughput is 1.92E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.4106s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2778s + [COUNTERS] Fortran MEs ( 1 ) : 4.1329s for 8192 events => throughput is 1.98E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -82,11 +82,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/av [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0003628 [3.6277277311352988E-004] fbridge_mode=0 + [XSECTION] Cross section = 0.0003628 [3.6277277311352982E-004] fbridge_mode=0 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 4.5272s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2803s - [COUNTERS] Fortran MEs ( 1 ) : 4.2469s for 8192 events => throughput is 1.93E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.3910s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2748s + [COUNTERS] Fortran MEs ( 1 ) : 4.1162s for 8192 events => throughput is 1.99E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -107,11 +107,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x10_fortran > /tmp/a [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000158 [1.5803725748421164E-004] fbridge_mode=0 + [XSECTION] Cross section = 0.000158 [1.5803725748421158E-004] fbridge_mode=0 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 48.8098s - [COUNTERS] Fortran Overhead ( 0 ) : 1.9495s - [COUNTERS] Fortran MEs ( 1 ) : 46.8603s for 90112 events => throughput is 1.92E+03 events/s + [COUNTERS] PROGRAM TOTAL : 47.7414s + [COUNTERS] Fortran Overhead ( 0 ) : 1.9612s + [COUNTERS] Fortran MEs ( 1 ) : 45.7803s for 90112 events => throughput is 1.97E+03 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -134,13 +134,13 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [3.6277277432965013E-004] fbridge_mode=1 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 9.0359s - [COUNTERS] Fortran Overhead ( 0 ) : 4.6142s - [COUNTERS] CudaCpp MEs ( 2 ) : 4.4217s for 8192 events => throughput is 1.85E+03 events/s + [COUNTERS] PROGRAM TOTAL : 9.0398s + [COUNTERS] Fortran Overhead ( 0 ) : 4.6040s + [COUNTERS] CudaCpp MEs ( 2 ) : 4.4358s for 8192 events => throughput is 1.85E+03 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! 
xsec from fortran (3.6277277311352988E-004) and cpp (3.6277277432965013E-004) differ by less than 2E-4 (3.352291999547674e-09) +OK! xsec from fortran (3.6277277311352982E-004) and cpp (3.6277277432965013E-004) differ by less than 2E-4 (3.352291999547674e-09) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -165,29 +165,29 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000158 [1.5803725813026109E-004] fbridge_mode=1 + [XSECTION] Cross section = 0.000158 [1.5803725813026107E-004] fbridge_mode=1 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 54.8666s - [COUNTERS] Fortran Overhead ( 0 ) : 6.2546s - [COUNTERS] CudaCpp MEs ( 2 ) : 48.6120s for 90112 events => throughput is 1.85E+03 events/s + [COUNTERS] PROGRAM TOTAL : 55.1214s + [COUNTERS] Fortran Overhead ( 0 ) : 6.2405s + [COUNTERS] CudaCpp MEs ( 2 ) : 48.8809s for 90112 events => throughput is 1.84E+03 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.5803725748421164E-004) and cpp (1.5803725813026109E-004) differ by less than 2E-4 (4.087956639864387e-09) +OK! xsec from fortran (1.5803725748421158E-004) and cpp (1.5803725813026107E-004) differ by less than 2E-4 (4.087956861908992e-09) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.917331e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.904356e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.930542e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.897196e+03 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -208,15 +208,15 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0003628 [3.6277277430934464E-004] fbridge_mode=1 + [XSECTION] Cross section = 0.0003628 [3.6277277430934459E-004] fbridge_mode=1 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 4.7788s - [COUNTERS] Fortran Overhead ( 0 ) : 2.4963s - [COUNTERS] CudaCpp MEs ( 2 ) : 2.2825s for 8192 events => throughput is 3.59E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.6768s + [COUNTERS] Fortran Overhead ( 0 ) : 2.4567s + [COUNTERS] CudaCpp MEs ( 2 ) : 2.2201s for 8192 events => throughput is 3.69E+03 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (3.6277277311352988E-004) and cpp (3.6277277430934464E-004) differ by less than 2E-4 (3.296318995538172e-09) +OK! 
xsec from fortran (3.6277277311352982E-004) and cpp (3.6277277430934459E-004) differ by less than 2E-4 (3.296318995538172e-09) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -241,29 +241,29 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000158 [1.5803725816246317E-004] fbridge_mode=1 + [XSECTION] Cross section = 0.000158 [1.5803725816246315E-004] fbridge_mode=1 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 29.3040s - [COUNTERS] Fortran Overhead ( 0 ) : 4.1597s - [COUNTERS] CudaCpp MEs ( 2 ) : 25.1443s for 90112 events => throughput is 3.58E+03 events/s + [COUNTERS] PROGRAM TOTAL : 28.5645s + [COUNTERS] Fortran Overhead ( 0 ) : 4.0918s + [COUNTERS] CudaCpp MEs ( 2 ) : 24.4727s for 90112 events => throughput is 3.68E+03 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.5803725748421164E-004) and cpp (1.5803725816246317E-004) differ by less than 2E-4 (4.291719202242916e-09) +OK! xsec from fortran (1.5803725748421158E-004) and cpp (1.5803725816246315E-004) differ by less than 2E-4 (4.291719424287521e-09) *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.675549e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.796653e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.694435e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.797971e+03 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -286,13 +286,13 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [3.6277277419683297E-004] fbridge_mode=1 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 2.2311s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2435s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.9875s for 8192 events => throughput is 8.30E+03 events/s + [COUNTERS] PROGRAM TOTAL : 2.2233s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2282s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.9951s for 8192 events => throughput is 8.23E+03 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (3.6277277311352988E-004) and cpp (3.6277277419683297E-004) differ by less than 2E-4 (2.9861753070292707e-09) +OK! 
xsec from fortran (3.6277277311352982E-004) and cpp (3.6277277419683297E-004) differ by less than 2E-4 (2.9861755290738756e-09) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -319,27 +319,27 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000158 [1.5803725810769321E-004] fbridge_mode=1 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 13.8690s - [COUNTERS] Fortran Overhead ( 0 ) : 2.9050s - [COUNTERS] CudaCpp MEs ( 2 ) : 10.9640s for 90112 events => throughput is 8.22E+03 events/s + [COUNTERS] PROGRAM TOTAL : 13.6725s + [COUNTERS] Fortran Overhead ( 0 ) : 2.9431s + [COUNTERS] CudaCpp MEs ( 2 ) : 10.7294s for 90112 events => throughput is 8.40E+03 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.5803725748421164E-004) and cpp (1.5803725810769321E-004) differ by less than 2E-4 (3.945155535589606e-09) +OK! xsec from fortran (1.5803725748421158E-004) and cpp (1.5803725810769321E-004) differ by less than 2E-4 (3.945155979678816e-09) *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.536816e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.734061e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.511555e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.702913e+03 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -362,13 +362,13 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [3.6277277419683297E-004] fbridge_mode=1 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 2.0078s - [COUNTERS] Fortran Overhead ( 0 ) : 1.1337s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.8741s for 8192 events => throughput is 9.37E+03 events/s + [COUNTERS] PROGRAM TOTAL : 1.9431s + [COUNTERS] Fortran Overhead ( 0 ) : 1.0984s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.8447s for 8192 events => throughput is 9.70E+03 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (3.6277277311352988E-004) and cpp (3.6277277419683297E-004) differ by less than 2E-4 (2.9861753070292707e-09) +OK! 
xsec from fortran (3.6277277311352982E-004) and cpp (3.6277277419683297E-004) differ by less than 2E-4 (2.9861755290738756e-09) *** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -395,27 +395,27 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000158 [1.5803725810769321E-004] fbridge_mode=1 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 12.4367s - [COUNTERS] Fortran Overhead ( 0 ) : 2.7931s - [COUNTERS] CudaCpp MEs ( 2 ) : 9.6436s for 90112 events => throughput is 9.34E+03 events/s + [COUNTERS] PROGRAM TOTAL : 12.1703s + [COUNTERS] Fortran Overhead ( 0 ) : 2.7526s + [COUNTERS] CudaCpp MEs ( 2 ) : 9.4177s for 90112 events => throughput is 9.57E+03 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.5803725748421164E-004) and cpp (1.5803725810769321E-004) differ by less than 2E-4 (3.945155535589606e-09) +OK! xsec from fortran (1.5803725748421158E-004) and cpp (1.5803725810769321E-004) differ by less than 2E-4 (3.945155979678816e-09) *** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.658673e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.930286e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.653676e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.925319e+03 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -438,13 +438,13 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [3.6277277419683297E-004] fbridge_mode=1 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 2.5071s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3899s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.1172s for 8192 events => throughput is 7.33E+03 events/s + [COUNTERS] PROGRAM TOTAL : 2.4040s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3330s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.0710s for 8192 events => throughput is 7.65E+03 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (3.6277277311352988E-004) and cpp (3.6277277419683297E-004) differ by less than 2E-4 (2.9861753070292707e-09) +OK! 
xsec from fortran (3.6277277311352982E-004) and cpp (3.6277277419683297E-004) differ by less than 2E-4 (2.9861755290738756e-09) *** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -471,27 +471,27 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000158 [1.5803725810769321E-004] fbridge_mode=1 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 15.4633s - [COUNTERS] Fortran Overhead ( 0 ) : 3.0792s - [COUNTERS] CudaCpp MEs ( 2 ) : 12.3840s for 90112 events => throughput is 7.28E+03 events/s + [COUNTERS] PROGRAM TOTAL : 14.7788s + [COUNTERS] Fortran Overhead ( 0 ) : 2.9945s + [COUNTERS] CudaCpp MEs ( 2 ) : 11.7843s for 90112 events => throughput is 7.65E+03 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.5803725748421164E-004) and cpp (1.5803725810769321E-004) differ by less than 2E-4 (3.945155535589606e-09) +OK! xsec from fortran (1.5803725748421158E-004) and cpp (1.5803725810769321E-004) differ by less than 2E-4 (3.945155979678816e-09) *** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.401661e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.732657e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.428706e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.743920e+03 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -512,15 +512,15 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttgg_ [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0003628 [3.6277277293084707E-004] fbridge_mode=1 + [XSECTION] Cross section = 0.0003628 [3.6277277293084701E-004] fbridge_mode=1 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 0.8222s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7891s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0330s for 8192 events => throughput is 2.48E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.8094s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7765s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0329s for 8192 events => throughput is 2.49E+05 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (3.6277277311352988E-004) and cpp (3.6277277293084707E-004) differ by less than 2E-4 (5.03573627241849e-10) +OK! 
xsec from fortran (3.6277277311352982E-004) and cpp (3.6277277293084701E-004) differ by less than 2E-4 (5.03573627241849e-10) *** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -547,56 +547,56 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttgg_ [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000158 [1.5803725738731039E-004] fbridge_mode=1 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 2.8170s - [COUNTERS] Fortran Overhead ( 0 ) : 2.4535s + [COUNTERS] PROGRAM TOTAL : 2.7913s + [COUNTERS] Fortran Overhead ( 0 ) : 2.4278s [COUNTERS] CudaCpp MEs ( 2 ) : 0.3635s for 90112 events => throughput is 2.48E+05 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.5803725748421164E-004) and cpp (1.5803725738731039E-004) differ by less than 2E-4 (6.131544161291913e-10) +OK! xsec from fortran (1.5803725748421158E-004) and cpp (1.5803725738731039E-004) differ by less than 2E-4 (6.131540830622839e-10) *** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.293320e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.296986e+05 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.523576e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.531398e+05 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.111282e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.106777e+05 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.159178e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.144518e+05 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.106507e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.100756e+05 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] 
[hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.157638e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.161493e+05 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.099330e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.110035e+05 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.430536e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.425529e+05 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt index 4f16911127..429acdedda 100644 --- a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt @@ -1,12 +1,12 @@ Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg CUDACPP_BUILDDIR='.' -make USEBUILDDIR=1 AVX=none +make USEBUILDDIR=1 AVX=none -make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=sse4 +make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512y make USEBUILDDIR=1 AVX=512z @@ -15,10 +15,11 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. @@ -27,13 +28,12 @@ make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' OMP_NUM_THREADS= -DATE: 2023-11-24_15:52:44 +DATE: 2024-01-26_00:32:53 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg @@ -57,11 +57,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/a [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.169e-06 [1.1693100945435802E-006] fbridge_mode=0 + [XSECTION] Cross section = 1.169e-06 [1.1693100945435806E-006] fbridge_mode=0 [UNWEIGHT] Wrote 1 events (found 166 events) - [COUNTERS] PROGRAM TOTAL : 98.5141s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4738s - [COUNTERS] Fortran MEs ( 1 ) : 98.0403s for 8192 events => throughput is 8.36E+01 events/s + [COUNTERS] PROGRAM TOTAL : 95.5184s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4509s + [COUNTERS] Fortran MEs ( 1 ) : 95.0675s for 8192 events => throughput is 8.62E+01 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -82,11 +82,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/a [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.169e-06 [1.1693100945435802E-006] fbridge_mode=0 + [XSECTION] Cross section = 1.169e-06 [1.1693100945435806E-006] fbridge_mode=0 [UNWEIGHT] Wrote 15 events (found 163 events) - [COUNTERS] PROGRAM TOTAL : 98.4757s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4682s - [COUNTERS] Fortran MEs ( 1 ) : 98.0075s for 8192 events => throughput is 8.36E+01 events/s + [COUNTERS] PROGRAM TOTAL : 95.4670s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4447s + [COUNTERS] Fortran MEs ( 1 ) : 95.0223s for 8192 events => throughput is 8.62E+01 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -107,11 +107,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x10_fortran > /tmp/ [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.136e-07 [2.1358436158813976E-007] fbridge_mode=0 + [XSECTION] Cross section = 2.136e-07 [2.1358436158813979E-007] fbridge_mode=0 [UNWEIGHT] Wrote 84 events (found 808 events) - [COUNTERS] PROGRAM TOTAL : 1079.6019s - [COUNTERS] Fortran Overhead ( 0 ) : 4.3004s - [COUNTERS] Fortran MEs ( 1 ) : 1075.3015s for 90112 events => throughput is 8.38E+01 events/s + [COUNTERS] PROGRAM TOTAL : 1051.7230s + [COUNTERS] Fortran Overhead ( 0 ) : 4.1339s + [COUNTERS] Fortran MEs ( 1 ) : 1047.5891s for 90112 events => throughput is 8.60E+01 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -134,13 +134,13 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.169e-06 [1.1693100945435831E-006] fbridge_mode=1 [UNWEIGHT] Wrote 15 events (found 163 events) - [COUNTERS] PROGRAM TOTAL : 225.7194s - [COUNTERS] Fortran Overhead ( 0 ) : 104.5284s - [COUNTERS] CudaCpp MEs ( 2 ) : 121.1910s for 8192 events => throughput is 6.76E+01 events/s + [COUNTERS] PROGRAM TOTAL : 211.8832s + [COUNTERS] Fortran Overhead ( 0 ) : 96.0026s + [COUNTERS] CudaCpp MEs ( 2 ) : 115.8806s for 8192 events => throughput is 7.07E+01 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to 
MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.1693100945435802E-006) and cpp (1.1693100945435831E-006) differ by less than 2E-14 (2.4424906541753444e-15) +OK! xsec from fortran (1.1693100945435806E-006) and cpp (1.1693100945435831E-006) differ by less than 2E-14 (2.220446049250313e-15) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -165,29 +165,29 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.136e-07 [2.1358436158813953E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.136e-07 [2.1358436158813950E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 808 events) - [COUNTERS] PROGRAM TOTAL : 1418.6305s - [COUNTERS] Fortran Overhead ( 0 ) : 107.9097s - [COUNTERS] CudaCpp MEs ( 2 ) : 1310.7208s for 90112 events => throughput is 6.87E+01 events/s + [COUNTERS] PROGRAM TOTAL : 1351.3046s + [COUNTERS] Fortran Overhead ( 0 ) : 101.1941s + [COUNTERS] CudaCpp MEs ( 2 ) : 1250.1105s for 90112 events => throughput is 7.21E+01 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.1358436158813976E-007) and cpp (2.1358436158813953E-007) differ by less than 2E-14 (1.1102230246251565e-15) +OK! xsec from fortran (2.1358436158813979E-007) and cpp (2.1358436158813950E-007) differ by less than 2E-14 (1.3322676295501878e-15) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.929728e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.478531e+01 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.986885e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.393754e+01 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -208,15 +208,15 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.169e-06 [1.1693100945435827E-006] fbridge_mode=1 + [XSECTION] Cross section = 1.169e-06 [1.1693100945435831E-006] fbridge_mode=1 [UNWEIGHT] Wrote 15 events (found 163 events) - [COUNTERS] PROGRAM TOTAL : 110.1960s - [COUNTERS] Fortran Overhead ( 0 ) : 50.7639s - [COUNTERS] CudaCpp MEs ( 2 ) : 59.4321s for 8192 events => throughput is 1.38E+02 events/s + [COUNTERS] PROGRAM TOTAL : 107.0674s + [COUNTERS] Fortran Overhead ( 0 ) : 49.6711s + [COUNTERS] CudaCpp MEs ( 2 ) : 57.3962s for 8192 events => throughput is 1.43E+02 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.1693100945435802E-006) and cpp (1.1693100945435827E-006) differ by less than 2E-14 (2.220446049250313e-15) +OK! 
xsec from fortran (1.1693100945435806E-006) and cpp (1.1693100945435831E-006) differ by less than 2E-14 (2.220446049250313e-15) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -243,27 +243,27 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.136e-07 [2.1358436158813958E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 808 events) - [COUNTERS] PROGRAM TOTAL : 706.5283s - [COUNTERS] Fortran Overhead ( 0 ) : 54.7734s - [COUNTERS] CudaCpp MEs ( 2 ) : 651.7549s for 90112 events => throughput is 1.38E+02 events/s + [COUNTERS] PROGRAM TOTAL : 689.5263s + [COUNTERS] Fortran Overhead ( 0 ) : 53.3356s + [COUNTERS] CudaCpp MEs ( 2 ) : 636.1907s for 90112 events => throughput is 1.42E+02 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.1358436158813976E-007) and cpp (2.1358436158813958E-007) differ by less than 2E-14 (8.881784197001252e-16) +OK! xsec from fortran (2.1358436158813979E-007) and cpp (2.1358436158813958E-007) differ by less than 2E-14 (9.992007221626409e-16) *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.649511e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.669367e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.642121e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.654299e+02 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -284,15 +284,15 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.169e-06 [1.1693100945435829E-006] fbridge_mode=1 + [XSECTION] Cross section = 1.169e-06 [1.1693100945435827E-006] fbridge_mode=1 [UNWEIGHT] Wrote 15 events (found 163 events) - [COUNTERS] PROGRAM TOTAL : 51.5856s - [COUNTERS] Fortran Overhead ( 0 ) : 23.9016s - [COUNTERS] CudaCpp MEs ( 2 ) : 27.6840s for 8192 events => throughput is 2.96E+02 events/s + [COUNTERS] PROGRAM TOTAL : 49.8853s + [COUNTERS] Fortran Overhead ( 0 ) : 23.2240s + [COUNTERS] CudaCpp MEs ( 2 ) : 26.6613s for 8192 events => throughput is 3.07E+02 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.1693100945435802E-006) and cpp (1.1693100945435829E-006) differ by less than 2E-14 (2.4424906541753444e-15) +OK! 
xsec from fortran (1.1693100945435806E-006) and cpp (1.1693100945435827E-006) differ by less than 2E-14 (1.7763568394002505e-15) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -319,27 +319,27 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.136e-07 [2.1358436158813958E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 808 events) - [COUNTERS] PROGRAM TOTAL : 331.4778s - [COUNTERS] Fortran Overhead ( 0 ) : 27.5520s - [COUNTERS] CudaCpp MEs ( 2 ) : 303.9259s for 90112 events => throughput is 2.96E+02 events/s + [COUNTERS] PROGRAM TOTAL : 320.8191s + [COUNTERS] Fortran Overhead ( 0 ) : 26.7578s + [COUNTERS] CudaCpp MEs ( 2 ) : 294.0612s for 90112 events => throughput is 3.06E+02 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.1358436158813976E-007) and cpp (2.1358436158813958E-007) differ by less than 2E-14 (8.881784197001252e-16) +OK! xsec from fortran (2.1358436158813979E-007) and cpp (2.1358436158813958E-007) differ by less than 2E-14 (9.992007221626409e-16) *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.538462e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.617508e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.559381e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.625262e+02 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -360,15 +360,15 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.169e-06 [1.1693100945435829E-006] fbridge_mode=1 + [XSECTION] Cross section = 1.169e-06 [1.1693100945435827E-006] fbridge_mode=1 [UNWEIGHT] Wrote 15 events (found 163 events) - [COUNTERS] PROGRAM TOTAL : 45.7105s - [COUNTERS] Fortran Overhead ( 0 ) : 20.8116s - [COUNTERS] CudaCpp MEs ( 2 ) : 24.8989s for 8192 events => throughput is 3.29E+02 events/s + [COUNTERS] PROGRAM TOTAL : 44.0733s + [COUNTERS] Fortran Overhead ( 0 ) : 20.2334s + [COUNTERS] CudaCpp MEs ( 2 ) : 23.8398s for 8192 events => throughput is 3.44E+02 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.1693100945435802E-006) and cpp (1.1693100945435829E-006) differ by less than 2E-14 (2.4424906541753444e-15) +OK! 
xsec from fortran (1.1693100945435806E-006) and cpp (1.1693100945435827E-006) differ by less than 2E-14 (1.7763568394002505e-15) *** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -395,27 +395,27 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.136e-07 [2.1358436158813958E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 808 events) - [COUNTERS] PROGRAM TOTAL : 299.0798s - [COUNTERS] Fortran Overhead ( 0 ) : 24.6512s - [COUNTERS] CudaCpp MEs ( 2 ) : 274.4286s for 90112 events => throughput is 3.28E+02 events/s + [COUNTERS] PROGRAM TOTAL : 284.5750s + [COUNTERS] Fortran Overhead ( 0 ) : 23.7496s + [COUNTERS] CudaCpp MEs ( 2 ) : 260.8255s for 90112 events => throughput is 3.45E+02 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.1358436158813976E-007) and cpp (2.1358436158813958E-007) differ by less than 2E-14 (8.881784197001252e-16) +OK! xsec from fortran (2.1358436158813979E-007) and cpp (2.1358436158813958E-007) differ by less than 2E-14 (9.992007221626409e-16) *** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.987523e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.130335e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.957972e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.157512e+02 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -436,15 +436,15 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.169e-06 [1.1693100945435829E-006] fbridge_mode=1 + [XSECTION] Cross section = 1.169e-06 [1.1693100945435827E-006] fbridge_mode=1 [UNWEIGHT] Wrote 15 events (found 163 events) - [COUNTERS] PROGRAM TOTAL : 47.6904s - [COUNTERS] Fortran Overhead ( 0 ) : 23.3628s - [COUNTERS] CudaCpp MEs ( 2 ) : 24.3276s for 8192 events => throughput is 3.37E+02 events/s + [COUNTERS] PROGRAM TOTAL : 45.4574s + [COUNTERS] Fortran Overhead ( 0 ) : 22.3247s + [COUNTERS] CudaCpp MEs ( 2 ) : 23.1327s for 8192 events => throughput is 3.54E+02 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.1693100945435802E-006) and cpp (1.1693100945435829E-006) differ by less than 2E-14 (2.4424906541753444e-15) +OK! 
xsec from fortran (1.1693100945435806E-006) and cpp (1.1693100945435827E-006) differ by less than 2E-14 (1.7763568394002505e-15) *** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -471,27 +471,27 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.136e-07 [2.1358436158813958E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 808 events) - [COUNTERS] PROGRAM TOTAL : 293.8850s - [COUNTERS] Fortran Overhead ( 0 ) : 27.2969s - [COUNTERS] CudaCpp MEs ( 2 ) : 266.5881s for 90112 events => throughput is 3.38E+02 events/s + [COUNTERS] PROGRAM TOTAL : 280.8463s + [COUNTERS] Fortran Overhead ( 0 ) : 25.8922s + [COUNTERS] CudaCpp MEs ( 2 ) : 254.9542s for 90112 events => throughput is 3.53E+02 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.1358436158813976E-007) and cpp (2.1358436158813958E-007) differ by less than 2E-14 (8.881784197001252e-16) +OK! xsec from fortran (2.1358436158813979E-007) and cpp (2.1358436158813958E-007) differ by less than 2E-14 (9.992007221626409e-16) *** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.560488e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.776742e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.577427e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.801039e+02 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -512,15 +512,15 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttggg [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.169e-06 [1.1693100945435838E-006] fbridge_mode=1 + [XSECTION] Cross section = 1.169e-06 [1.1693100945435829E-006] fbridge_mode=1 [UNWEIGHT] Wrote 15 events (found 163 events) - [COUNTERS] PROGRAM TOTAL : 4.2450s - [COUNTERS] Fortran Overhead ( 0 ) : 3.1602s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.0848s for 8192 events => throughput is 7.55E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.1976s + [COUNTERS] Fortran Overhead ( 0 ) : 3.1190s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.0786s for 8192 events => throughput is 7.60E+03 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.1693100945435802E-006) and cpp (1.1693100945435838E-006) differ by less than 2E-14 (3.1086244689504383e-15) +OK! 
xsec from fortran (1.1693100945435806E-006) and cpp (1.1693100945435829E-006) differ by less than 2E-14 (1.9984014443252818e-15) *** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -545,58 +545,58 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttggg [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.136e-07 [2.1358436158813958E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.136e-07 [2.1358436158813960E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 808 events) - [COUNTERS] PROGRAM TOTAL : 18.7885s - [COUNTERS] Fortran Overhead ( 0 ) : 6.9034s - [COUNTERS] CudaCpp MEs ( 2 ) : 11.8851s for 90112 events => throughput is 7.58E+03 events/s + [COUNTERS] PROGRAM TOTAL : 18.6950s + [COUNTERS] Fortran Overhead ( 0 ) : 6.7713s + [COUNTERS] CudaCpp MEs ( 2 ) : 11.9237s for 90112 events => throughput is 7.56E+03 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.1358436158813976E-007) and cpp (2.1358436158813958E-007) differ by less than 2E-14 (8.881784197001252e-16) +OK! xsec from fortran (2.1358436158813979E-007) and cpp (2.1358436158813960E-007) differ by less than 2E-14 (8.881784197001252e-16) *** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.552834e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.531209e+03 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.253212e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.272145e+03 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 512 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.219027e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.199498e+03 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 512 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.567176e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.555619e+03 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 128 128 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.273376e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] 
(3a) = ( 9.264388e+03 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 128 128 1 *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.442640e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.490561e+03 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.208706e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.258329e+03 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.238633e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.243688e+03 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt index 8e8ecf354b..3091cfced1 100644 --- a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt @@ -1,11 +1,11 @@ Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg CUDACPP_BUILDDIR='.' +make USEBUILDDIR=1 AVX=none - -make USEBUILDDIR=1 AVX=none make USEBUILDDIR=1 AVX=sse4 + make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512y @@ -16,10 +16,10 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. 
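(Aside, not part of the patch: the repeated "OK! xsec from fortran (...) and cpp (...) differ by less than 2E-14 (...)" lines above report a relative difference between the Fortran and cudacpp cross sections, checked against a tolerance. A minimal Python sketch of that arithmetic follows, assuming the check amounts to abs(a/b - 1) <= tol; the function name and exact formula are illustrative and are not taken from the actual tmad test scripts.)

def xsec_compare(xsec_a, xsec_b, tol):
    """Relative difference between two cross sections, checked against tol.
    Illustrative only: the real madgraph4gpu comparison scripts may differ."""
    rel = abs(xsec_a / xsec_b - 1.0)
    return rel, rel <= tol

# Example with values from the log above (CUDA x10 vs Fortran x10):
rel, ok = xsec_compare(2.1358436158813979e-07, 2.1358436158813960e-07, 2e-14)
# rel is on the order of 1e-15, i.e. a few ULPs in double precision,
# so the check passes with room to spare against the 2E-14 tolerance.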
@@ -33,7 +33,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2023-11-24_17:21:33 +DATE: 2024-01-26_01:58:10 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg @@ -57,11 +57,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/a [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.169e-06 [1.1693100945435802E-006] fbridge_mode=0 + [XSECTION] Cross section = 1.169e-06 [1.1693100945435806E-006] fbridge_mode=0 [UNWEIGHT] Wrote 1 events (found 166 events) - [COUNTERS] PROGRAM TOTAL : 98.7413s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4669s - [COUNTERS] Fortran MEs ( 1 ) : 98.2745s for 8192 events => throughput is 8.34E+01 events/s + [COUNTERS] PROGRAM TOTAL : 95.5916s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4454s + [COUNTERS] Fortran MEs ( 1 ) : 95.1462s for 8192 events => throughput is 8.61E+01 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -82,11 +82,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/a [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.169e-06 [1.1693100945435802E-006] fbridge_mode=0 + [XSECTION] Cross section = 1.169e-06 [1.1693100945435806E-006] fbridge_mode=0 [UNWEIGHT] Wrote 15 events (found 163 events) - [COUNTERS] PROGRAM TOTAL : 98.9213s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4778s - [COUNTERS] Fortran MEs ( 1 ) : 98.4435s for 8192 events => throughput is 8.32E+01 events/s + [COUNTERS] PROGRAM TOTAL : 95.5628s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4434s + [COUNTERS] Fortran MEs ( 1 ) : 95.1194s for 8192 events => throughput is 8.61E+01 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -107,11 +107,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x10_fortran > /tmp/ [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.136e-07 [2.1358436158813976E-007] fbridge_mode=0 + [XSECTION] Cross section = 2.136e-07 [2.1358436158813979E-007] fbridge_mode=0 [UNWEIGHT] Wrote 84 events (found 808 events) - [COUNTERS] PROGRAM TOTAL : 1080.6656s - [COUNTERS] Fortran Overhead ( 0 ) : 4.3073s - [COUNTERS] Fortran MEs ( 1 ) : 1076.3584s for 90112 events => throughput is 8.37E+01 events/s + [COUNTERS] PROGRAM TOTAL : 1054.9783s + [COUNTERS] Fortran Overhead ( 0 ) : 4.1232s + [COUNTERS] Fortran MEs ( 1 ) : 1050.8551s for 90112 events => throughput is 8.58E+01 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -132,15 +132,15 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.169e-06 [1.1694768344939596E-006] fbridge_mode=1 + [XSECTION] Cross section = 1.169e-06 [1.1694768374083672E-006] fbridge_mode=1 [UNWEIGHT] Wrote 15 events (found 163 events) - [COUNTERS] PROGRAM TOTAL : 202.3759s - [COUNTERS] Fortran Overhead ( 0 ) : 92.2956s - [COUNTERS] CudaCpp MEs ( 2 ) : 110.0803s for 8192 events => throughput is 7.44E+01 events/s + [COUNTERS] PROGRAM TOTAL : 191.8272s + [COUNTERS] Fortran Overhead ( 0 ) : 88.5499s + [COUNTERS] CudaCpp 
MEs ( 2 ) : 103.2773s for 8192 events => throughput is 7.93E+01 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.1693100945435802E-006) and cpp (1.1694768344939596E-006) differ by less than 4E-4 (0.00014259686216466783) +OK! xsec from fortran (1.1693100945435806E-006) and cpp (1.1694768374083672E-006) differ by less than 4E-4 (0.00014259935458071915) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -165,29 +165,29 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.136e-07 [2.1361436150871156E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.136e-07 [2.1361435710758843E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 808 events) - [COUNTERS] PROGRAM TOTAL : 1304.1851s - [COUNTERS] Fortran Overhead ( 0 ) : 97.1400s - [COUNTERS] CudaCpp MEs ( 2 ) : 1207.0450s for 90112 events => throughput is 7.47E+01 events/s + [COUNTERS] PROGRAM TOTAL : 1227.9199s + [COUNTERS] Fortran Overhead ( 0 ) : 92.2084s + [COUNTERS] CudaCpp MEs ( 2 ) : 1135.7115s for 90112 events => throughput is 7.93E+01 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.1358436158813976E-007) and cpp (2.1361436150871156E-007) differ by less than 4E-4 (0.00014045934987350073) +OK! xsec from fortran (2.1358436158813979E-007) and cpp (2.1361435710758843E-007) differ by less than 4E-4 (0.0001404387438554977) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.937347e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.299750e+01 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.901357e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.305787e+01 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -208,15 +208,15 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.169e-06 [1.1694765850750953E-006] fbridge_mode=1 + [XSECTION] Cross section = 1.169e-06 [1.1694765360831655E-006] fbridge_mode=1 [UNWEIGHT] Wrote 15 events (found 163 events) - [COUNTERS] PROGRAM TOTAL : 50.8509s - [COUNTERS] Fortran Overhead ( 0 ) : 24.0533s - [COUNTERS] CudaCpp MEs ( 2 ) : 26.7976s for 8192 events => throughput is 3.06E+02 events/s + [COUNTERS] PROGRAM TOTAL : 48.8237s + [COUNTERS] Fortran Overhead ( 0 ) : 23.0435s + [COUNTERS] CudaCpp MEs ( 2 ) : 25.7802s for 8192 events => throughput is 3.18E+02 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! 
xsec from fortran (1.1693100945435802E-006) and cpp (1.1694765850750953E-006) differ by less than 4E-4 (0.00014238355787066226) +OK! xsec from fortran (1.1693100945435806E-006) and cpp (1.1694765360831655E-006) differ by less than 4E-4 (0.00014234165972015766) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -241,29 +241,29 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.136e-07 [2.1361430669586527E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.136e-07 [2.1361429212586563E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 808 events) - [COUNTERS] PROGRAM TOTAL : 321.9393s - [COUNTERS] Fortran Overhead ( 0 ) : 27.9125s - [COUNTERS] CudaCpp MEs ( 2 ) : 294.0268s for 90112 events => throughput is 3.06E+02 events/s + [COUNTERS] PROGRAM TOTAL : 309.9872s + [COUNTERS] Fortran Overhead ( 0 ) : 26.8030s + [COUNTERS] CudaCpp MEs ( 2 ) : 283.1842s for 90112 events => throughput is 3.18E+02 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.1358436158813976E-007) and cpp (2.1361430669586527E-007) differ by less than 4E-4 (0.00014020271663550687) +OK! xsec from fortran (2.1358436158813979E-007) and cpp (2.1361429212586563E-007) differ by less than 4E-4 (0.00014013450003202976) *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.520695e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.631881e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.523057e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.602316e+02 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -284,15 +284,15 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.169e-06 [1.1694764951124567E-006] fbridge_mode=1 + [XSECTION] Cross section = 1.169e-06 [1.1694764906356561E-006] fbridge_mode=1 [UNWEIGHT] Wrote 15 events (found 163 events) - [COUNTERS] PROGRAM TOTAL : 26.0191s - [COUNTERS] Fortran Overhead ( 0 ) : 12.1224s - [COUNTERS] CudaCpp MEs ( 2 ) : 13.8967s for 8192 events => throughput is 5.89E+02 events/s + [COUNTERS] PROGRAM TOTAL : 25.2458s + [COUNTERS] Fortran Overhead ( 0 ) : 11.9347s + [COUNTERS] CudaCpp MEs ( 2 ) : 13.3111s for 8192 events => throughput is 6.15E+02 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.1693100945435802E-006) and cpp (1.1694764951124567E-006) differ by less than 4E-4 (0.00014230662135994443) +OK! 
xsec from fortran (1.1693100945435806E-006) and cpp (1.1694764906356561E-006) differ by less than 4E-4 (0.0001423027927767162) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -317,29 +317,29 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.136e-07 [2.1361430425531218E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.136e-07 [2.1361429111797059E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 808 events) - [COUNTERS] PROGRAM TOTAL : 171.5242s - [COUNTERS] Fortran Overhead ( 0 ) : 15.8508s - [COUNTERS] CudaCpp MEs ( 2 ) : 155.6733s for 90112 events => throughput is 5.79E+02 events/s + [COUNTERS] PROGRAM TOTAL : 162.2911s + [COUNTERS] Fortran Overhead ( 0 ) : 15.4533s + [COUNTERS] CudaCpp MEs ( 2 ) : 146.8378s for 90112 events => throughput is 6.14E+02 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.1358436158813976E-007) and cpp (2.1361430425531218E-007) differ by less than 4E-4 (0.0001401912899885449) +OK! xsec from fortran (2.1358436158813979E-007) and cpp (2.1361429111797059E-007) differ by less than 4E-4 (0.00014012978107680318) *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.026908e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.251206e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.019796e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.245377e+02 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -360,15 +360,15 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.169e-06 [1.1694764951124567E-006] fbridge_mode=1 + [XSECTION] Cross section = 1.169e-06 [1.1694764906356561E-006] fbridge_mode=1 [UNWEIGHT] Wrote 15 events (found 163 events) - [COUNTERS] PROGRAM TOTAL : 23.1132s - [COUNTERS] Fortran Overhead ( 0 ) : 10.7368s - [COUNTERS] CudaCpp MEs ( 2 ) : 12.3764s for 8192 events => throughput is 6.62E+02 events/s + [COUNTERS] PROGRAM TOTAL : 22.5039s + [COUNTERS] Fortran Overhead ( 0 ) : 10.4156s + [COUNTERS] CudaCpp MEs ( 2 ) : 12.0883s for 8192 events => throughput is 6.78E+02 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.1693100945435802E-006) and cpp (1.1694764951124567E-006) differ by less than 4E-4 (0.00014230662135994443) +OK! 
xsec from fortran (1.1693100945435806E-006) and cpp (1.1694764906356561E-006) differ by less than 4E-4 (0.0001423027927767162) *** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -393,29 +393,29 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.136e-07 [2.1361430425531218E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.136e-07 [2.1361429111797059E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 808 events) - [COUNTERS] PROGRAM TOTAL : 151.0249s - [COUNTERS] Fortran Overhead ( 0 ) : 14.5293s - [COUNTERS] CudaCpp MEs ( 2 ) : 136.4957s for 90112 events => throughput is 6.60E+02 events/s + [COUNTERS] PROGRAM TOTAL : 146.3643s + [COUNTERS] Fortran Overhead ( 0 ) : 14.1002s + [COUNTERS] CudaCpp MEs ( 2 ) : 132.2641s for 90112 events => throughput is 6.81E+02 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.1358436158813976E-007) and cpp (2.1361430425531218E-007) differ by less than 4E-4 (0.0001401912899885449) +OK! xsec from fortran (2.1358436158813979E-007) and cpp (2.1361429111797059E-007) differ by less than 4E-4 (0.00014012978107680318) *** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.945001e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.100603e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.958337e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.238263e+02 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -436,15 +436,15 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.169e-06 [1.1694767957195604E-006] fbridge_mode=1 + [XSECTION] Cross section = 1.169e-06 [1.1694768276769753E-006] fbridge_mode=1 [UNWEIGHT] Wrote 15 events (found 163 events) - [COUNTERS] PROGRAM TOTAL : 23.9825s - [COUNTERS] Fortran Overhead ( 0 ) : 11.9109s - [COUNTERS] CudaCpp MEs ( 2 ) : 12.0717s for 8192 events => throughput is 6.79E+02 events/s + [COUNTERS] PROGRAM TOTAL : 22.5776s + [COUNTERS] Fortran Overhead ( 0 ) : 11.2452s + [COUNTERS] CudaCpp MEs ( 2 ) : 11.3325s for 8192 events => throughput is 7.23E+02 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.1693100945435802E-006) and cpp (1.1694767957195604E-006) differ by less than 4E-4 (0.00014256370209930758) +OK! 
xsec from fortran (1.1693100945435806E-006) and cpp (1.1694768276769753E-006) differ by less than 4E-4 (0.00014259103224434355) *** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -469,29 +469,29 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.136e-07 [2.1361435956349820E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.136e-07 [2.1361435948756818E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 808 events) - [COUNTERS] PROGRAM TOTAL : 148.9306s - [COUNTERS] Fortran Overhead ( 0 ) : 15.5819s - [COUNTERS] CudaCpp MEs ( 2 ) : 133.3487s for 90112 events => throughput is 6.76E+02 events/s + [COUNTERS] PROGRAM TOTAL : 140.6031s + [COUNTERS] Fortran Overhead ( 0 ) : 15.1098s + [COUNTERS] CudaCpp MEs ( 2 ) : 125.4933s for 90112 events => throughput is 7.18E+02 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.1358436158813976E-007) and cpp (2.1361435956349820E-007) differ by less than 4E-4 (0.00014045024240250115) +OK! xsec from fortran (2.1358436158813979E-007) and cpp (2.1361435948756818E-007) differ by less than 4E-4 (0.00014044988689865257) *** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.105766e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.608240e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.113491e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.521625e+02 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -512,15 +512,15 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttggg [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.169e-06 [1.1694770708195000E-006] fbridge_mode=1 + [XSECTION] Cross section = 1.169e-06 [1.1694770708194997E-006] fbridge_mode=1 [UNWEIGHT] Wrote 15 events (found 163 events) - [COUNTERS] PROGRAM TOTAL : 2.4884s - [COUNTERS] Fortran Overhead ( 0 ) : 1.9881s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.5003s for 8192 events => throughput is 1.64E+04 events/s + [COUNTERS] PROGRAM TOTAL : 2.4649s + [COUNTERS] Fortran Overhead ( 0 ) : 1.9743s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.4905s for 8192 events => throughput is 1.67E+04 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.1693100945435802E-006) and cpp (1.1694770708195000E-006) differ by less than 4E-4 (0.00014279896898083955) +OK! 
xsec from fortran (1.1693100945435806E-006) and cpp (1.1694770708194997E-006) differ by less than 4E-4 (0.00014279896898039546) *** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -545,58 +545,58 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttggg [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.136e-07 [2.1361443477565659E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.136e-07 [2.1361443477565656E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 808 events) - [COUNTERS] PROGRAM TOTAL : 11.1878s - [COUNTERS] Fortran Overhead ( 0 ) : 5.7450s - [COUNTERS] CudaCpp MEs ( 2 ) : 5.4428s for 90112 events => throughput is 1.66E+04 events/s + [COUNTERS] PROGRAM TOTAL : 11.0984s + [COUNTERS] Fortran Overhead ( 0 ) : 5.5613s + [COUNTERS] CudaCpp MEs ( 2 ) : 5.5371s for 90112 events => throughput is 1.63E+04 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.1358436158813976E-007) and cpp (2.1361443477565659E-007) differ by less than 4E-4 (0.0001408023850304474) +OK! xsec from fortran (2.1358436158813979E-007) and cpp (2.1361443477565656E-007) differ by less than 4E-4 (0.00014080238503022535) *** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.635622e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.642284e+04 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.625946e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.624354e+04 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 512 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.339621e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.288681e+04 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 512 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.381945e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.340792e+04 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 128 128 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.320446e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) 
= ( 2.346593e+04 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 128 128 1 *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.325431e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.342416e+04 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.300828e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.314317e+04 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.393528e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.422033e+03 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt index 963e0ec416..63e196188b 100644 --- a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt @@ -3,8 +3,8 @@ CUDACPP_BUILDDIR='.' -make USEBUILDDIR=1 AVX=none +make USEBUILDDIR=1 AVX=none make USEBUILDDIR=1 AVX=sse4 make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512y @@ -15,25 +15,25 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' -CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
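(Aside, not part of the patch: the throughput figures in the [COUNTERS] lines are simply the event count divided by the matrix-element wall-clock time. A small sketch of that computation, using a hypothetical helper that is not code from this patch:)

def throughput(nevents, seconds):
    """Events per second, as printed in the [COUNTERS] summary lines.
    Hypothetical helper for illustration only."""
    return nevents / seconds

# From the double-precision 'none' log earlier: 90112 events in 1250.1105 s
print(f"{throughput(90112, 1250.1105):.2E}")  # ~7.21E+01 events/s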
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' OMP_NUM_THREADS= -DATE: 2023-11-24_18:28:54 +DATE: 2024-01-26_03:02:40 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg @@ -57,11 +57,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/a [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.169e-06 [1.1693100945435802E-006] fbridge_mode=0 + [XSECTION] Cross section = 1.169e-06 [1.1693100945435806E-006] fbridge_mode=0 [UNWEIGHT] Wrote 1 events (found 166 events) - [COUNTERS] PROGRAM TOTAL : 98.6622s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4669s - [COUNTERS] Fortran MEs ( 1 ) : 98.1953s for 8192 events => throughput is 8.34E+01 events/s + [COUNTERS] PROGRAM TOTAL : 95.5897s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4484s + [COUNTERS] Fortran MEs ( 1 ) : 95.1413s for 8192 events => throughput is 8.61E+01 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -82,11 +82,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/a [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.169e-06 [1.1693100945435802E-006] fbridge_mode=0 + [XSECTION] Cross section = 1.169e-06 [1.1693100945435806E-006] fbridge_mode=0 [UNWEIGHT] Wrote 15 events (found 163 events) - [COUNTERS] PROGRAM TOTAL : 98.6106s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4686s - [COUNTERS] Fortran MEs ( 1 ) : 98.1420s for 8192 events => throughput is 8.35E+01 events/s + [COUNTERS] PROGRAM TOTAL : 95.9550s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4482s + [COUNTERS] Fortran MEs ( 1 ) : 95.5068s for 8192 events => throughput is 8.58E+01 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -107,11 +107,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x10_fortran > /tmp/ [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.136e-07 [2.1358436158813976E-007] fbridge_mode=0 + [XSECTION] Cross section = 2.136e-07 [2.1358436158813979E-007] fbridge_mode=0 [UNWEIGHT] Wrote 84 events (found 808 events) - [COUNTERS] PROGRAM TOTAL : 1080.6388s - [COUNTERS] Fortran Overhead ( 0 ) : 4.2788s - [COUNTERS] Fortran MEs ( 1 ) : 1076.3600s for 90112 events => throughput is 8.37E+01 events/s + [COUNTERS] PROGRAM TOTAL : 1052.1674s + [COUNTERS] Fortran Overhead ( 0 ) : 4.1033s + [COUNTERS] Fortran MEs ( 1 ) : 1048.0641s for 90112 events => throughput is 8.60E+01 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -132,15 +132,15 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.169e-06 [1.1693101016896846E-006] fbridge_mode=1 + [XSECTION] Cross section = 1.169e-06 [1.1693101016896844E-006] fbridge_mode=1 [UNWEIGHT] Wrote 15 events (found 163 events) - [COUNTERS] PROGRAM TOTAL : 227.6225s - [COUNTERS] Fortran Overhead ( 0 ) : 105.5451s - [COUNTERS] CudaCpp MEs ( 2 ) : 122.0773s for 8192 events => throughput is 6.71E+01 events/s + [COUNTERS] PROGRAM TOTAL : 211.1386s + [COUNTERS] Fortran Overhead ( 0 ) : 97.2926s 
+ [COUNTERS] CudaCpp MEs ( 2 ) : 113.8460s for 8192 events => throughput is 7.20E+01 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.1693100945435802E-006) and cpp (1.1693101016896846E-006) differ by less than 2E-4 (6.111385175699979e-09) +OK! xsec from fortran (1.1693100945435806E-006) and cpp (1.1693101016896844E-006) differ by less than 2E-4 (6.1113847316107694e-09) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -167,27 +167,27 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.136e-07 [2.1358436275882778E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 808 events) - [COUNTERS] PROGRAM TOTAL : 1457.2560s - [COUNTERS] Fortran Overhead ( 0 ) : 110.3750s - [COUNTERS] CudaCpp MEs ( 2 ) : 1346.8810s for 90112 events => throughput is 6.69E+01 events/s + [COUNTERS] PROGRAM TOTAL : 1351.7559s + [COUNTERS] Fortran Overhead ( 0 ) : 100.8234s + [COUNTERS] CudaCpp MEs ( 2 ) : 1250.9325s for 90112 events => throughput is 7.20E+01 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.1358436158813976E-007) and cpp (2.1358436275882778E-007) differ by less than 2E-4 (5.48115042242614e-09) +OK! xsec from fortran (2.1358436158813979E-007) and cpp (2.1358436275882778E-007) differ by less than 2E-4 (5.48115042242614e-09) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.788273e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.502446e+01 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.741929e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.479145e+01 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -210,13 +210,13 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.169e-06 [1.1693101020910778E-006] fbridge_mode=1 [UNWEIGHT] Wrote 15 events (found 163 events) - [COUNTERS] PROGRAM TOTAL : 115.6749s - [COUNTERS] Fortran Overhead ( 0 ) : 52.7434s - [COUNTERS] CudaCpp MEs ( 2 ) : 62.9314s for 8192 events => throughput is 1.30E+02 events/s + [COUNTERS] PROGRAM TOTAL : 109.1320s + [COUNTERS] Fortran Overhead ( 0 ) : 50.5216s + [COUNTERS] CudaCpp MEs ( 2 ) : 58.6104s for 8192 events => throughput is 1.40E+02 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.1693100945435802E-006) and cpp (1.1693101020910778E-006) differ by less than 2E-4 (6.454658807442115e-09) +OK! 
xsec from fortran (1.1693100945435806E-006) and cpp (1.1693101020910778E-006) differ by less than 2E-4 (6.454658363352905e-09) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -241,29 +241,29 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.136e-07 [2.1358436284111598E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.136e-07 [2.1358436284111587E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 808 events) - [COUNTERS] PROGRAM TOTAL : 738.7127s - [COUNTERS] Fortran Overhead ( 0 ) : 56.7745s - [COUNTERS] CudaCpp MEs ( 2 ) : 681.9382s for 90112 events => throughput is 1.32E+02 events/s + [COUNTERS] PROGRAM TOTAL : 701.1097s + [COUNTERS] Fortran Overhead ( 0 ) : 54.2795s + [COUNTERS] CudaCpp MEs ( 2 ) : 646.8302s for 90112 events => throughput is 1.39E+02 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.1358436158813976E-007) and cpp (2.1358436284111598E-007) differ by less than 2E-4 (5.866422903011426e-09) +OK! xsec from fortran (2.1358436158813979E-007) and cpp (2.1358436284111587E-007) differ by less than 2E-4 (5.866422458922216e-09) *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.576187e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.636656e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.573070e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.635010e+02 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -284,15 +284,15 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.169e-06 [1.1693101021831071E-006] fbridge_mode=1 + [XSECTION] Cross section = 1.169e-06 [1.1693101021831069E-006] fbridge_mode=1 [UNWEIGHT] Wrote 15 events (found 163 events) - [COUNTERS] PROGRAM TOTAL : 49.9825s - [COUNTERS] Fortran Overhead ( 0 ) : 22.7532s - [COUNTERS] CudaCpp MEs ( 2 ) : 27.2294s for 8192 events => throughput is 3.01E+02 events/s + [COUNTERS] PROGRAM TOTAL : 47.2278s + [COUNTERS] Fortran Overhead ( 0 ) : 21.8795s + [COUNTERS] CudaCpp MEs ( 2 ) : 25.3483s for 8192 events => throughput is 3.23E+02 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.1693100945435802E-006) and cpp (1.1693101021831071E-006) differ by less than 2E-4 (6.5333627397023974e-09) +OK! 
xsec from fortran (1.1693100945435806E-006) and cpp (1.1693101021831069E-006) differ by less than 2E-4 (6.533362073568583e-09) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -317,29 +317,29 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.136e-07 [2.1358436281462142E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.136e-07 [2.1358436281462147E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 808 events) - [COUNTERS] PROGRAM TOTAL : 330.1981s - [COUNTERS] Fortran Overhead ( 0 ) : 26.8024s - [COUNTERS] CudaCpp MEs ( 2 ) : 303.3957s for 90112 events => throughput is 2.97E+02 events/s + [COUNTERS] PROGRAM TOTAL : 308.5589s + [COUNTERS] Fortran Overhead ( 0 ) : 25.6558s + [COUNTERS] CudaCpp MEs ( 2 ) : 282.9030s for 90112 events => throughput is 3.19E+02 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.1358436158813976E-007) and cpp (2.1358436281462142E-007) differ by less than 2E-4 (5.742375686068613e-09) +OK! xsec from fortran (2.1358436158813979E-007) and cpp (2.1358436281462147E-007) differ by less than 2E-4 (5.7423759081132175e-09) *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.700163e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.824293e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.716172e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.845973e+02 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -360,15 +360,15 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.169e-06 [1.1693101021831071E-006] fbridge_mode=1 + [XSECTION] Cross section = 1.169e-06 [1.1693101021831069E-006] fbridge_mode=1 [UNWEIGHT] Wrote 15 events (found 163 events) - [COUNTERS] PROGRAM TOTAL : 43.9707s - [COUNTERS] Fortran Overhead ( 0 ) : 19.9051s - [COUNTERS] CudaCpp MEs ( 2 ) : 24.0656s for 8192 events => throughput is 3.40E+02 events/s + [COUNTERS] PROGRAM TOTAL : 42.5262s + [COUNTERS] Fortran Overhead ( 0 ) : 19.1736s + [COUNTERS] CudaCpp MEs ( 2 ) : 23.3526s for 8192 events => throughput is 3.51E+02 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.1693100945435802E-006) and cpp (1.1693101021831071E-006) differ by less than 2E-4 (6.5333627397023974e-09) +OK! 
xsec from fortran (1.1693100945435806E-006) and cpp (1.1693101021831069E-006) differ by less than 2E-4 (6.533362073568583e-09) *** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -393,29 +393,29 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.136e-07 [2.1358436281462142E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.136e-07 [2.1358436281462147E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 808 events) - [COUNTERS] PROGRAM TOTAL : 289.2968s - [COUNTERS] Fortran Overhead ( 0 ) : 23.7412s - [COUNTERS] CudaCpp MEs ( 2 ) : 265.5556s for 90112 events => throughput is 3.39E+02 events/s + [COUNTERS] PROGRAM TOTAL : 276.3285s + [COUNTERS] Fortran Overhead ( 0 ) : 22.8039s + [COUNTERS] CudaCpp MEs ( 2 ) : 253.5246s for 90112 events => throughput is 3.55E+02 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.1358436158813976E-007) and cpp (2.1358436281462142E-007) differ by less than 2E-4 (5.742375686068613e-09) +OK! xsec from fortran (2.1358436158813979E-007) and cpp (2.1358436281462147E-007) differ by less than 2E-4 (5.7423759081132175e-09) *** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.209510e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.315827e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.226810e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.342660e+02 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -436,15 +436,15 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.169e-06 [1.1693101021831071E-006] fbridge_mode=1 + [XSECTION] Cross section = 1.169e-06 [1.1693101021831069E-006] fbridge_mode=1 [UNWEIGHT] Wrote 15 events (found 163 events) - [COUNTERS] PROGRAM TOTAL : 46.6653s - [COUNTERS] Fortran Overhead ( 0 ) : 22.7630s - [COUNTERS] CudaCpp MEs ( 2 ) : 23.9023s for 8192 events => throughput is 3.43E+02 events/s + [COUNTERS] PROGRAM TOTAL : 44.1656s + [COUNTERS] Fortran Overhead ( 0 ) : 21.4926s + [COUNTERS] CudaCpp MEs ( 2 ) : 22.6730s for 8192 events => throughput is 3.61E+02 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.1693100945435802E-006) and cpp (1.1693101021831071E-006) differ by less than 2E-4 (6.5333627397023974e-09) +OK! 
xsec from fortran (1.1693100945435806E-006) and cpp (1.1693101021831069E-006) differ by less than 2E-4 (6.533362073568583e-09) *** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -469,29 +469,29 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.136e-07 [2.1358436281462142E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.136e-07 [2.1358436281462147E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 808 events) - [COUNTERS] PROGRAM TOTAL : 296.3640s - [COUNTERS] Fortran Overhead ( 0 ) : 26.8877s - [COUNTERS] CudaCpp MEs ( 2 ) : 269.4763s for 90112 events => throughput is 3.34E+02 events/s + [COUNTERS] PROGRAM TOTAL : 271.5361s + [COUNTERS] Fortran Overhead ( 0 ) : 25.1621s + [COUNTERS] CudaCpp MEs ( 2 ) : 246.3740s for 90112 events => throughput is 3.66E+02 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.1358436158813976E-007) and cpp (2.1358436281462142E-007) differ by less than 2E-4 (5.742375686068613e-09) +OK! xsec from fortran (2.1358436158813979E-007) and cpp (2.1358436281462147E-007) differ by less than 2E-4 (5.7423759081132175e-09) *** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.629406e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.884301e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.640208e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.891512e+02 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -514,13 +514,13 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttggg [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.169e-06 [1.1693100942770687E-006] fbridge_mode=1 [UNWEIGHT] Wrote 15 events (found 163 events) - [COUNTERS] PROGRAM TOTAL : 3.5613s - [COUNTERS] Fortran Overhead ( 0 ) : 2.6950s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.8663s for 8192 events => throughput is 9.46E+03 events/s + [COUNTERS] PROGRAM TOTAL : 3.5335s + [COUNTERS] Fortran Overhead ( 0 ) : 2.6723s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.8612s for 8192 events => throughput is 9.51E+03 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.1693100945435802E-006) and cpp (1.1693100942770687E-006) differ by less than 2E-4 (2.2792201459509442e-10) +OK! 
xsec from fortran (1.1693100945435806E-006) and cpp (1.1693100942770687E-006) differ by less than 2E-4 (2.279223476620018e-10) *** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -547,56 +547,56 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttggg [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.136e-07 [2.1358436157495368E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 808 events) - [COUNTERS] PROGRAM TOTAL : 15.9559s - [COUNTERS] Fortran Overhead ( 0 ) : 6.4625s - [COUNTERS] CudaCpp MEs ( 2 ) : 9.4934s for 90112 events => throughput is 9.49E+03 events/s + [COUNTERS] PROGRAM TOTAL : 15.7822s + [COUNTERS] Fortran Overhead ( 0 ) : 6.3002s + [COUNTERS] CudaCpp MEs ( 2 ) : 9.4820s for 90112 events => throughput is 9.50E+03 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.1358436158813976E-007) and cpp (2.1358436157495368E-007) differ by less than 2E-4 (6.173705990875078e-11) +OK! xsec from fortran (2.1358436158813979E-007) and cpp (2.1358436157495368E-007) differ by less than 2E-4 (6.173717093105324e-11) *** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.436689e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.442598e+03 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.080384e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.084385e+04 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 512 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.108886e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.111822e+04 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 512 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.161919e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.156542e+04 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 128 128 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.113576e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.110917e+04 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 128 128 1 *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] 
[inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.109641e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.113389e+04 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.111246e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.108659e+04 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.639262e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.656419e+03 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt index f27ee93a59..18a212f5c3 100644 --- a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt @@ -2,10 +2,10 @@ Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/g CUDACPP_BUILDDIR='.' + make USEBUILDDIR=1 AVX=none make USEBUILDDIR=1 AVX=sse4 - make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512y @@ -15,15 +15,15 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. 
@@ -33,7 +33,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2023-11-24_15:51:16 +DATE: 2024-01-26_00:31:25 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050333309703716] fbridge_mode=0 [UNWEIGHT] Wrote 78 events (found 561 events) - [COUNTERS] PROGRAM TOTAL : 0.3170s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2461s - [COUNTERS] Fortran MEs ( 1 ) : 0.0708s for 8192 events => throughput is 1.16E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3131s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2435s + [COUNTERS] Fortran MEs ( 1 ) : 0.0696s for 8192 events => throughput is 1.18E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050333309703716] fbridge_mode=0 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.3076s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2367s - [COUNTERS] Fortran MEs ( 1 ) : 0.0710s for 8192 events => throughput is 1.15E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3035s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2330s + [COUNTERS] Fortran MEs ( 1 ) : 0.0705s for 8192 events => throughput is 1.16E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -107,11 +107,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x10_fortran > /tmp/av [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.218 [0.21801182648615874] fbridge_mode=0 + [XSECTION] Cross section = 0.218 [0.21801182648615872] fbridge_mode=0 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 2.2394s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4635s - [COUNTERS] Fortran MEs ( 1 ) : 0.7759s for 90112 events => throughput is 1.16E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.2442s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4822s + [COUNTERS] Fortran MEs ( 1 ) : 0.7620s for 90112 events => throughput is 1.18E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -132,15 +132,15 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2605 [0.26050333309703716] fbridge_mode=1 + [XSECTION] Cross section = 0.2605 [0.26050333309703710] fbridge_mode=1 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.3933s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3167s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0766s for 8192 events => throughput is 1.07E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3921s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3155s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0767s for 8192 events => throughput is 1.07E+05 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.26050333309703716) and cpp (0.26050333309703716) differ by less than 2E-14 (0.0) +OK! 
xsec from fortran (0.26050333309703716) and cpp (0.26050333309703710) differ by less than 2E-14 (2.220446049250313e-16) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -165,29 +165,29 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.218 [0.21801182648615863] fbridge_mode=1 + [XSECTION] Cross section = 0.218 [0.21801182648615872] fbridge_mode=1 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 2.4082s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5618s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.8464s for 90112 events => throughput is 1.06E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.4324s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5851s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.8472s for 90112 events => throughput is 1.06E+05 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.21801182648615874) and cpp (0.21801182648615863) differ by less than 2E-14 (5.551115123125783e-16) +OK! xsec from fortran (0.21801182648615872) and cpp (0.21801182648615872) differ by less than 2E-14 (0.0) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.081037e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.071399e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.086206e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.068727e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -208,15 +208,15 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2605 [0.26050333309703733] fbridge_mode=1 + [XSECTION] Cross section = 0.2605 [0.26050333309703727] fbridge_mode=1 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.3295s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2869s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0426s for 8192 events => throughput is 1.92E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3172s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2776s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0397s for 8192 events => throughput is 2.07E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.26050333309703716) and cpp (0.26050333309703733) differ by less than 2E-14 (6.661338147750939e-16) +OK! 
xsec from fortran (0.26050333309703716) and cpp (0.26050333309703727) differ by less than 2E-14 (4.440892098500626e-16) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -243,27 +243,27 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.218 [0.21801182648615872] fbridge_mode=1 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 2.0022s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5391s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.4630s for 90112 events => throughput is 1.95E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.9768s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5408s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.4360s for 90112 events => throughput is 2.07E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.21801182648615874) and cpp (0.21801182648615872) differ by less than 2E-14 (1.1102230246251565e-16) +OK! xsec from fortran (0.21801182648615872) and cpp (0.21801182648615872) differ by less than 2E-14 (0.0) *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.967710e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.067409e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.951729e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.048112e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -284,15 +284,15 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2605 [0.26050333309703733] fbridge_mode=1 + [XSECTION] Cross section = 0.2605 [0.26050333309703727] fbridge_mode=1 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.2892s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2650s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0242s for 8192 events => throughput is 3.38E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2851s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2615s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0236s for 8192 events => throughput is 3.47E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.26050333309703716) and cpp (0.26050333309703733) differ by less than 2E-14 (6.661338147750939e-16) +OK! 
xsec from fortran (0.26050333309703716) and cpp (0.26050333309703727) differ by less than 2E-14 (4.440892098500626e-16) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -317,29 +317,29 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.218 [0.21801182648615863] fbridge_mode=1 + [XSECTION] Cross section = 0.218 [0.21801182648615869] fbridge_mode=1 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 1.7692s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5041s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2652s for 90112 events => throughput is 3.40E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.7839s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5289s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2550s for 90112 events => throughput is 3.53E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.21801182648615874) and cpp (0.21801182648615863) differ by less than 2E-14 (5.551115123125783e-16) +OK! xsec from fortran (0.21801182648615872) and cpp (0.21801182648615869) differ by less than 2E-14 (1.1102230246251565e-16) *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.443138e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.399317e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.417632e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.503492e+05 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -360,15 +360,15 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2605 [0.26050333309703733] fbridge_mode=1 + [XSECTION] Cross section = 0.2605 [0.26050333309703727] fbridge_mode=1 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.2906s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2677s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0228s for 8192 events => throughput is 3.59E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2793s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2587s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0206s for 8192 events => throughput is 3.99E+05 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.26050333309703716) and cpp (0.26050333309703733) differ by less than 2E-14 (6.661338147750939e-16) +OK! 
xsec from fortran (0.26050333309703716) and cpp (0.26050333309703727) differ by less than 2E-14 (4.440892098500626e-16) *** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -393,29 +393,29 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.218 [0.21801182648615863] fbridge_mode=1 + [XSECTION] Cross section = 0.218 [0.21801182648615869] fbridge_mode=1 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 1.7686s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5286s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2400s for 90112 events => throughput is 3.75E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.7452s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5225s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2227s for 90112 events => throughput is 4.05E+05 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.21801182648615874) and cpp (0.21801182648615863) differ by less than 2E-14 (5.551115123125783e-16) +OK! xsec from fortran (0.21801182648615872) and cpp (0.21801182648615869) differ by less than 2E-14 (1.1102230246251565e-16) *** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.729118e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.854052e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.747676e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.940709e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -436,15 +436,15 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2605 [0.26050333309703733] fbridge_mode=1 + [XSECTION] Cross section = 0.2605 [0.26050333309703727] fbridge_mode=1 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.3058s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2735s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0323s for 8192 events => throughput is 2.54E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3198s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2874s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0324s for 8192 events => throughput is 2.53E+05 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.26050333309703716) and cpp (0.26050333309703733) differ by less than 2E-14 (6.661338147750939e-16) +OK! 
xsec from fortran (0.26050333309703716) and cpp (0.26050333309703727) differ by less than 2E-14 (4.440892098500626e-16) *** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -469,29 +469,29 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.218 [0.21801182648615863] fbridge_mode=1 + [XSECTION] Cross section = 0.218 [0.21801182648615869] fbridge_mode=1 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 1.8682s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5123s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3559s for 90112 events => throughput is 2.53E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.8630s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5384s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3247s for 90112 events => throughput is 2.78E+05 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.21801182648615874) and cpp (0.21801182648615863) differ by less than 2E-14 (5.551115123125783e-16) +OK! xsec from fortran (0.21801182648615872) and cpp (0.21801182648615869) differ by less than 2E-14 (1.1102230246251565e-16) *** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.352575e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.752350e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.365058e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.723924e+05 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -514,9 +514,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_gqttq_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050333309703733] fbridge_mode=1 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.6681s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6674s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.21E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.6671s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6665s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.18E+07 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -547,56 +547,56 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_gqttq_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.218 [0.21801182648615869] fbridge_mode=1 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 1.9647s - [COUNTERS] Fortran Overhead ( 0 ) : 1.9564s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0083s for 90112 events => throughput is 1.09E+07 events/s + [COUNTERS] PROGRAM TOTAL : 1.9360s + [COUNTERS] Fortran Overhead ( 0 ) : 1.9284s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0076s for 90112 events => throughput is 
1.19E+07 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.21801182648615874) and cpp (0.21801182648615869) differ by less than 2E-14 (2.220446049250313e-16) +OK! xsec from fortran (0.21801182648615872) and cpp (0.21801182648615869) differ by less than 2E-14 (1.1102230246251565e-16) *** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.568354e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.551656e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.005124e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.992280e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.387855e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.371489e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.497397e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.506906e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.373048e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.393433e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.771015e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.788279e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.382200e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.383321e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = 
SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.783427e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.769455e+07 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt index f629b5c150..f81f538e39 100644 --- a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt @@ -2,10 +2,10 @@ Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/g CUDACPP_BUILDDIR='.' - make USEBUILDDIR=1 AVX=none -make USEBUILDDIR=1 AVX=sse4 + +make USEBUILDDIR=1 AVX=sse4 make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512y @@ -15,13 +15,13 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. 
@@ -33,7 +33,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2023-11-24_15:51:46 +DATE: 2024-01-26_00:31:55 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050333309703716] fbridge_mode=0 [UNWEIGHT] Wrote 78 events (found 561 events) - [COUNTERS] PROGRAM TOTAL : 0.3117s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2399s - [COUNTERS] Fortran MEs ( 1 ) : 0.0718s for 8192 events => throughput is 1.14E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3054s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2364s + [COUNTERS] Fortran MEs ( 1 ) : 0.0691s for 8192 events => throughput is 1.19E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050333309703716] fbridge_mode=0 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.3099s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2384s - [COUNTERS] Fortran MEs ( 1 ) : 0.0715s for 8192 events => throughput is 1.15E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3002s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2314s + [COUNTERS] Fortran MEs ( 1 ) : 0.0688s for 8192 events => throughput is 1.19E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -107,11 +107,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x10_fortran > /tmp/av [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.218 [0.21801182648615874] fbridge_mode=0 + [XSECTION] Cross section = 0.218 [0.21801182648615872] fbridge_mode=0 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 2.2455s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4617s - [COUNTERS] Fortran MEs ( 1 ) : 0.7838s for 90112 events => throughput is 1.15E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.2344s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4729s + [COUNTERS] Fortran MEs ( 1 ) : 0.7615s for 90112 events => throughput is 1.18E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -132,15 +132,15 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2605 [0.26050316058770007] fbridge_mode=1 + [XSECTION] Cross section = 0.2605 [0.26050314903825744] fbridge_mode=1 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.3857s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3127s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0731s for 8192 events => throughput is 1.12E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3899s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3166s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0733s for 8192 events => throughput is 1.12E+05 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.26050333309703716) and cpp (0.26050316058770007) differ by less than 4E-4 (6.622154696822591e-07) +OK! 
xsec from fortran (0.26050333309703716) and cpp (0.26050314903825744) differ by less than 4E-4 (7.065505747139156e-07) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -165,29 +165,29 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.218 [0.21801182797520666] fbridge_mode=1 + [XSECTION] Cross section = 0.218 [0.21801181770186087] fbridge_mode=1 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 2.3684s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5619s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.8065s for 90112 events => throughput is 1.12E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.3620s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6003s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.7617s for 90112 events => throughput is 1.18E+05 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.21801182648615874) and cpp (0.21801182797520666) differ by less than 4E-4 (6.830124466006282e-09) +OK! xsec from fortran (0.21801182648615872) and cpp (0.21801181770186087) differ by less than 4E-4 (4.0292758352045155e-08) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.132102e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.201826e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.140766e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.203566e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -208,15 +208,15 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2605 [0.26050313133963987] fbridge_mode=1 + [XSECTION] Cross section = 0.2605 [0.26050310835231938] fbridge_mode=1 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.2925s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2662s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0263s for 8192 events => throughput is 3.12E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2875s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2639s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0236s for 8192 events => throughput is 3.47E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.26050333309703716) and cpp (0.26050313133963987) differ by less than 4E-4 (7.744906558304621e-07) +OK! 
xsec from fortran (0.26050333309703716) and cpp (0.26050310835231938) differ by less than 4E-4 (8.627325996934943e-07) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -241,29 +241,29 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.218 [0.21801179276862181] fbridge_mode=1 + [XSECTION] Cross section = 0.218 [0.21801177817838580] fbridge_mode=1 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 1.7892s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5021s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2870s for 90112 events => throughput is 3.14E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.7927s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5347s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2579s for 90112 events => throughput is 3.49E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.21801182648615874) and cpp (0.21801179276862181) differ by less than 4E-4 (1.5465921032742358e-07) +OK! xsec from fortran (0.21801182648615872) and cpp (0.21801177817838580) differ by less than 4E-4 (2.2158326773435988e-07) *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.189560e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.501400e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.133717e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.487643e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -284,15 +284,15 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2605 [0.26050313344346482] fbridge_mode=1 + [XSECTION] Cross section = 0.2605 [0.26050310803492405] fbridge_mode=1 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.2638s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2508s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0130s for 8192 events => throughput is 6.28E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2644s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2519s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0125s for 8192 events => throughput is 6.53E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.26050333309703716) and cpp (0.26050313344346482) differ by less than 4E-4 (7.664146557395668e-07) +OK! 
xsec from fortran (0.26050333309703716) and cpp (0.26050310803492405) differ by less than 4E-4 (8.639509921914978e-07) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -317,29 +317,29 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.218 [0.21801179137376883] fbridge_mode=1 + [XSECTION] Cross section = 0.218 [0.21801177493542723] fbridge_mode=1 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 1.6261s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4856s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1405s for 90112 events => throughput is 6.42E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.7527s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6097s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1430s for 90112 events => throughput is 6.30E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.21801182648615874) and cpp (0.21801179137376883) differ by less than 4E-4 (1.6105727140836024e-07) +OK! xsec from fortran (0.21801182648615872) and cpp (0.21801177493542723) differ by less than 4E-4 (2.364584175129636e-07) *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.270432e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.944046e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.321323e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.933575e+05 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -360,15 +360,15 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2605 [0.26050313344346482] fbridge_mode=1 + [XSECTION] Cross section = 0.2605 [0.26050310803492405] fbridge_mode=1 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.2747s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2611s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0135s for 8192 events => throughput is 6.06E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2699s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2582s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0117s for 8192 events => throughput is 6.98E+05 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.26050333309703716) and cpp (0.26050313344346482) differ by less than 4E-4 (7.664146557395668e-07) +OK! 
xsec from fortran (0.26050333309703716) and cpp (0.26050310803492405) differ by less than 4E-4 (8.639509921914978e-07) *** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -393,29 +393,29 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.218 [0.21801179137376883] fbridge_mode=1 + [XSECTION] Cross section = 0.218 [0.21801177493542723] fbridge_mode=1 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 1.6570s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5241s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1330s for 90112 events => throughput is 6.78E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.6415s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5199s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1215s for 90112 events => throughput is 7.42E+05 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.21801182648615874) and cpp (0.21801179137376883) differ by less than 4E-4 (1.6105727140836024e-07) +OK! xsec from fortran (0.21801182648615872) and cpp (0.21801177493542723) differ by less than 4E-4 (2.364584175129636e-07) *** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.825162e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.170126e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.838486e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.259977e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -438,9 +438,9 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050317064561834] fbridge_mode=1 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.2742s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2582s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0161s for 8192 events => throughput is 5.10E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2699s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2547s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0152s for 8192 events => throughput is 5.38E+05 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -471,27 +471,27 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.218 [0.21801182143140752] fbridge_mode=1 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 1.7047s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5193s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1853s for 90112 events => throughput is 4.86E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.6930s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5258s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1672s for 90112 events => throughput 
is 5.39E+05 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.21801182648615874) and cpp (0.21801182143140752) differ by less than 4E-4 (2.3185674380421517e-08) +OK! xsec from fortran (0.21801182648615872) and cpp (0.21801182143140752) differ by less than 4E-4 (2.3185674269399215e-08) *** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.717607e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.209904e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.789235e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.116271e+05 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -514,9 +514,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_gqttq_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050319131407651] fbridge_mode=1 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.6713s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6707s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0005s for 8192 events => throughput is 1.54E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.6657s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6651s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0005s for 8192 events => throughput is 1.56E+07 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -547,56 +547,56 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_gqttq_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.218 [0.21801186038252196] fbridge_mode=1 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 1.9216s - [COUNTERS] Fortran Overhead ( 0 ) : 1.9152s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0064s for 90112 events => throughput is 1.41E+07 events/s + [COUNTERS] PROGRAM TOTAL : 1.9408s + [COUNTERS] Fortran Overhead ( 0 ) : 1.9350s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0059s for 90112 events => throughput is 1.53E+07 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.21801182648615874) and cpp (0.21801186038252196) differ by less than 4E-4 (1.5547946996541384e-07) +OK! xsec from fortran (0.21801182648615872) and cpp (0.21801186038252196) differ by less than 4E-4 (1.5547946996541384e-07) *** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.688652e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.849520e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.419660e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.485887e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.806306e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.841126e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.722148e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.716684e+08 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.808789e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.852475e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.809471e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.797125e+08 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.332763e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.386392e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.066283e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.948163e+07 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt index 8fb8683f4e..655570da22 100644 --- 
a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt @@ -1,9 +1,9 @@ Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu CUDACPP_BUILDDIR='.' +make USEBUILDDIR=1 AVX=none -make USEBUILDDIR=1 AVX=none make USEBUILDDIR=1 AVX=sse4 make USEBUILDDIR=1 AVX=avx2 @@ -16,12 +16,12 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' -CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' -CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. @@ -33,7 +33,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2023-11-24_15:52:14 +DATE: 2024-01-26_00:32:23 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050333309703716] fbridge_mode=0 [UNWEIGHT] Wrote 78 events (found 561 events) - [COUNTERS] PROGRAM TOTAL : 0.3110s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2397s - [COUNTERS] Fortran MEs ( 1 ) : 0.0713s for 8192 events => throughput is 1.15E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3048s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2353s + [COUNTERS] Fortran MEs ( 1 ) : 0.0695s for 8192 events => throughput is 1.18E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050333309703716] fbridge_mode=0 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.3123s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2407s - [COUNTERS] Fortran MEs ( 1 ) : 0.0716s for 8192 events => throughput is 1.14E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3071s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2376s + [COUNTERS] Fortran MEs ( 1 ) : 0.0695s for 8192 events => throughput is 1.18E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -107,11 +107,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x10_fortran > /tmp/av [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.218 [0.21801182648615874] fbridge_mode=0 + [XSECTION] Cross section = 0.218 [0.21801182648615872] fbridge_mode=0 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM 
TOTAL : 2.2374s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4535s - [COUNTERS] Fortran MEs ( 1 ) : 0.7839s for 90112 events => throughput is 1.15E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.2412s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4768s + [COUNTERS] Fortran MEs ( 1 ) : 0.7644s for 90112 events => throughput is 1.18E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -134,9 +134,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050333282657206] fbridge_mode=1 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.3957s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3178s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0780s for 8192 events => throughput is 1.05E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3930s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3153s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0776s for 8192 events => throughput is 1.06E+05 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -165,29 +165,29 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.218 [0.21801182636608796] fbridge_mode=1 + [XSECTION] Cross section = 0.218 [0.21801182636608801] fbridge_mode=1 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 2.4134s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5610s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.8524s for 90112 events => throughput is 1.06E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.4382s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5825s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.8557s for 90112 events => throughput is 1.05E+05 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.21801182648615874) and cpp (0.21801182636608796) differ by less than 2E-4 (5.507535538740171e-10) +OK! xsec from fortran (0.21801182648615872) and cpp (0.21801182636608801) differ by less than 2E-4 (5.507531097848073e-10) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.065790e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.063006e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.078999e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.063531e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -208,15 +208,15 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2605 [0.26050333282657201] fbridge_mode=1 + [XSECTION] Cross section = 0.2605 [0.26050333282657212] fbridge_mode=1 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.3246s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2839s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0407s for 8192 events => throughput is 2.01E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3167s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2775s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0392s for 8192 events => throughput is 2.09E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.26050333309703716) and cpp (0.26050333282657201) differ by less than 2E-4 (1.0382406046005599e-09) +OK! xsec from fortran (0.26050333309703716) and cpp (0.26050333282657212) differ by less than 2E-4 (1.0382402715336525e-09) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -241,29 +241,29 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.218 [0.21801182636608810] fbridge_mode=1 + [XSECTION] Cross section = 0.218 [0.21801182636608804] fbridge_mode=1 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 1.9711s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5247s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.4464s for 90112 events => throughput is 2.02E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.9800s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5472s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.4327s for 90112 events => throughput is 2.08E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.21801182648615874) and cpp (0.21801182636608810) differ by less than 2E-4 (5.507528877402024e-10) +OK! xsec from fortran (0.21801182648615872) and cpp (0.21801182636608804) differ by less than 2E-4 (5.507529987625048e-10) *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.994361e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.019971e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.977417e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.997392e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -286,9 +286,9 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050333291481387] fbridge_mode=1 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.2878s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2640s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0238s for 8192 events => throughput is 3.45E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2859s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2625s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0233s for 8192 events => throughput is 3.51E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -319,27 +319,27 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.218 [0.21801182638680733] fbridge_mode=1 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 1.7622s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4988s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2634s for 90112 events => throughput is 3.42E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.7804s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5288s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2516s for 90112 events => throughput is 3.58E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.21801182648615874) and cpp (0.21801182638680733) differ by less than 2E-4 (4.557156874085422e-10) +OK! xsec from fortran (0.21801182648615872) and cpp (0.21801182638680733) differ by less than 2E-4 (4.557155763862397e-10) *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.380399e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.589769e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.457644e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.540136e+05 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -362,9 +362,9 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050333291481387] fbridge_mode=1 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.2826s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2612s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0213s for 8192 events => throughput is 3.84E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2781s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2581s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0201s for 8192 events => throughput is 4.08E+05 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -395,27 +395,27 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.218 [0.21801182638680733] fbridge_mode=1 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 1.7310s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4992s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2318s for 90112 events => throughput is 3.89E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.7435s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5265s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2170s for 90112 events => throughput is 4.15E+05 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.21801182648615874) and cpp (0.21801182638680733) differ by less than 2E-4 (4.557156874085422e-10) +OK! xsec from fortran (0.21801182648615872) and cpp (0.21801182638680733) differ by less than 2E-4 (4.557155763862397e-10) *** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.844575e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.065243e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.903023e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.047006e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -438,9 +438,9 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050333291481387] fbridge_mode=1 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.3081s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2751s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0330s for 8192 events => throughput is 2.48E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2998s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2695s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0303s for 8192 events => throughput is 2.71E+05 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -471,27 +471,27 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.218 [0.21801182638680733] fbridge_mode=1 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 1.8866s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5159s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3706s for 90112 events => throughput is 2.43E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.8798s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5454s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3344s for 90112 events => throughput is 2.69E+05 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.21801182648615874) and cpp (0.21801182638680733) differ by less than 2E-4 (4.557156874085422e-10) +OK! xsec from fortran (0.21801182648615872) and cpp (0.21801182638680733) differ by less than 2E-4 (4.557155763862397e-10) *** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.345187e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.636835e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.413954e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.671197e+05 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -512,15 +512,15 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_gqttq_x [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2605 [0.26050333301029693] fbridge_mode=1 + [XSECTION] Cross section = 0.2605 [0.26050333301029699] fbridge_mode=1 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.6718s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6711s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.18E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.6635s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6628s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.23E+07 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.26050333309703716) and cpp (0.26050333301029693) differ by less than 2E-4 (3.329716502520341e-10) +OK! xsec from fortran (0.26050333309703716) and cpp (0.26050333301029699) differ by less than 2E-4 (3.329714282074292e-10) *** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -545,58 +545,58 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_gqttq_x [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.218 [0.21801182637219935] fbridge_mode=1 + [XSECTION] Cross section = 0.218 [0.21801182637219937] fbridge_mode=1 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 1.9585s - [COUNTERS] Fortran Overhead ( 0 ) : 1.9503s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0082s for 90112 events => throughput is 1.10E+07 events/s + [COUNTERS] PROGRAM TOTAL : 1.9357s + [COUNTERS] Fortran Overhead ( 0 ) : 1.9282s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0075s for 90112 events => throughput is 1.20E+07 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.21801182648615874) and cpp (0.21801182637219935) differ by less than 2E-4 (5.227211996583492e-10) +OK! xsec from fortran (0.21801182648615872) and cpp (0.21801182637219937) differ by less than 2E-4 (5.227208665914418e-10) *** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.574709e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.571736e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.987655e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.159114e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.382555e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.394463e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.503146e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.528264e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.367209e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.383492e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.807426e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.837843e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.377892e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.384878e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.781332e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.781274e+07 ) sec^-1 TEST COMPLETED From 8698762cd1a1ed1614b0a81d9e6399b50da43074 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Fri, 26 Jan 2024 19:16:12 +0200 Subject: [PATCH 46/96] [jt774] in gg_tt.sa check_sa.cc, disable the 
printout of the gcc toolchain in clang as this relies on the value of the $CXX environment variable at runtime (On LUMI using hipcc, CXX is not defined at all and this crashes with a segfault) --- .../gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.cc | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.cc b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.cc index e7dbb05570..c51f01c456 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.cc +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.cc @@ -571,6 +571,10 @@ namespace mg5amcCpu out << "Apple clang " << __clang_major__ << "." << __clang_minor__ << "." << __clang_patchlevel__; #else out << "clang " << __clang_major__ << "." << __clang_minor__ << "." << __clang_patchlevel__; + /* + // === AV 26-Jan-2024 DISABLE THIS CODE (START) + // === AV 26-Jan-2024 First, it is totally wrong to assume that the CXX environment variable is used in the build! + // === AV 26-Jan-2024 Second and worse, here we need build time values, while CXX in this code is evaluated at runtime! // GCC toolchain version inside CLANG std::string tchainout; std::string tchaincmd = "readelf -p .comment $(${CXX} -print-libgcc-file-name) |& grep 'GCC: (GNU)' | grep -v Warning | sort -u | awk '{print $5}'"; @@ -584,6 +588,8 @@ namespace mg5amcCpu #else out << " (gcc " << tchainout << ")"; #endif + // === AV 26-Jan-2024 DISABLE THIS CODE (END) + */ #endif #else out << "clang UNKNOWKN"; From 3e473afaf2a5c27fc4a99ba8058841af2ef50c47 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Fri, 26 Jan 2024 19:38:06 +0200 Subject: [PATCH 47/96] [jt774] in tput/throughputX.sh, add support for AMD GPUs (still assuming that there is only one GPU, either NVidia or AMD) --- epochX/cudacpp/tput/throughputX.sh | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/epochX/cudacpp/tput/throughputX.sh b/epochX/cudacpp/tput/throughputX.sh index 8160f7fbb9..1e5b427b1f 100755 --- a/epochX/cudacpp/tput/throughputX.sh +++ b/epochX/cudacpp/tput/throughputX.sh @@ -523,6 +523,7 @@ function cmpExe() { # Profile #registers and %divergence only function runNcu() { + if ! ncu -v > /dev/null 2>&1; then return; fi if [ "${maketype}" == "-dryrun" ]; then return; fi exe=$1 args="$2" @@ -545,6 +546,7 @@ function runNcu() { # See https://docs.nvidia.com/gameworks/content/developertools/desktop/analysis/report/cudaexperiments/kernellevel/branchstatistics.htm # See https://docs.nvidia.com/gameworks/content/developertools/desktop/analysis/report/cudaexperiments/sourcelevel/divergentbranch.htm function runNcuDiv() { + if ! ncu -v > /dev/null 2>&1; then return; fi if [ "${maketype}" == "-dryrun" ]; then return; fi exe=$1 args="-p 1 32 1" @@ -567,6 +569,7 @@ function runNcuDiv() { # Profiles sectors and requests function runNcuReq() { + if ! 
ncu -v > /dev/null 2>&1; then return; fi if [ "${maketype}" == "-dryrun" ]; then return; fi exe=$1 ncuArgs="$2" @@ -580,7 +583,13 @@ function runNcuReq() { set +x } -if nvidia-smi -L > /dev/null 2>&1; then gpuTxt="$(nvidia-smi -L | wc -l)x $(nvidia-smi -L | awk '{print $3,$4}' | sort -u)"; else gpuTxt=none; fi +if nvidia-smi -L > /dev/null 2>&1; then + gpuTxt="$(nvidia-smi -L | wc -l)x $(nvidia-smi -L | awk '{print $3,$4}' | sort -u)" +elif rocm-smi -i > /dev/null 2>&1; then + gpuTxt="$(rocm-smi --showproductname | grep 'Card series' | awk '{print $5,$6,$7}')" +else + gpuTxt=none +fi if [ "${unames}" == "Darwin" ]; then cpuTxt=$(sysctl -h machdep.cpu.brand_string) cpuTxt=${cpuTxt/machdep.cpu.brand_string: } From d13493fded57132e4835b7c99c1dbda726e82884 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Sat, 27 Jan 2024 11:02:43 +0100 Subject: [PATCH 48/96] [jt774] in gg_tt.sa, bypass std::filesystem completely to ease portability on LUMI #803 (and undo previous changes to address this issue in __HIPCC__) --- epochX/cudacpp/gg_tt.sa/SubProcesses/Bridge.h | 17 ++++++++++++----- .../gg_tt.sa/SubProcesses/MadgraphTest.h | 13 ++++++++----- epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk | 17 ++++++++++------- epochX/cudacpp/gg_tt.sa/src/read_slha.cc | 13 ++++++++----- 4 files changed, 38 insertions(+), 22 deletions(-) diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/Bridge.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/Bridge.h index 85dcf2763e..7f25eea05c 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/Bridge.h @@ -18,13 +18,14 @@ #include #include #include -#ifdef __HIPCC__ -#include // see https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 -#else -#include -#endif +//#ifdef __HIPCC__ +//#include // see https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 +//#else +//#include // bypass this completely to ease portability on LUMI #803 +//#endif #include #include +#include // bypass std::filesystem #803 #include #ifdef MGONGPUCPP_GPUIMPL @@ -259,11 +260,17 @@ namespace mg5amcCpu // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate is called from several Fortran threads? 
CPPProcess process( /*verbose=*/false ); std::string paramCard = "../../Cards/param_card.dat"; + /* #ifdef __HIPCC__ if( !std::experimental::filesystem::exists( paramCard ) ) paramCard = "../" + paramCard; #else if( !std::filesystem::exists( paramCard ) ) paramCard = "../" + paramCard; #endif + */ + //struct stat dummybuffer; // bypass std::filesystem #803 + //if( !( stat( paramCard.c_str(), &dummyBuffer ) == 0 ) ) paramCard = "../" + paramCard; // + auto fileExists = []( std::string& fileName ){ struct stat buffer; return stat( fileName.c_str(), &buffer ) == 0; }; + if( !fileExists( paramCard ) ) paramCard = "../" + paramCard; // bypass std::filesystem #803 process.initProc( paramCard ); } diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/MadgraphTest.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/MadgraphTest.h index 972ef2d4a6..6054185300 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/MadgraphTest.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/MadgraphTest.h @@ -14,11 +14,11 @@ #include #include -#ifdef __HIPCC__ -#include // see https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 -#else -#include -#endif +//#ifdef __HIPCC__ +//#include // see https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 +//#else +//#include // bypass this completely to ease portability on LUMI #803 +//#endif #include #include #include @@ -223,11 +223,14 @@ TEST_P( MadgraphTest, CompareMomentaAndME ) const char* dumpEventsC = getenv( "CUDACPP_RUNTEST_DUMPEVENTS" ); const bool dumpEvents = ( dumpEventsC != 0 ) && ( std::string( dumpEventsC ) != "" ); const std::string refFileName = testDriver->getRefFileName(); + /* #ifdef __HIPCC__ const std::string dumpFileName = std::experimental::filesystem::path( refFileName ).filename(); #else const std::string dumpFileName = std::filesystem::path( refFileName ).filename(); #endif + */ + const std::string dumpFileName = refFileName; // bypass std::filesystem #803 std::ofstream dumpFile; if( dumpEvents ) { diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk index 43453cf816..4b8ff31256 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk @@ -651,11 +651,13 @@ ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: cu_objects_lib += $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cu_objects_lib) -ifneq ($(findstring hipcc,$(GPUCC)),) - $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -lstdc++fs -else $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -endif +# Bypass std::filesystem completely to ease portability on LUMI #803 +#ifneq ($(findstring hipcc,$(GPUCC)),) +# $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -lstdc++fs +#else +# $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) +#endif endif #------------------------------------------------------------------------------- @@ -783,9 +785,10 @@ $(testmain): LIBFLAGS += -lgomp endif endif -ifneq ($(findstring hipcc,$(GPUCC)),) -$(testmain): LIBFLAGS += -lstdc++fs -endif +# Bypass std::filesystem completely to ease portability on LUMI #803 +#ifneq ($(findstring hipcc,$(GPUCC)),) +#$(testmain): LIBFLAGS += -lstdc++fs +#endif ifeq ($(GPUCC),) # link only runTest.o $(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid 
the need for LD_LIBRARY_PATH diff --git a/epochX/cudacpp/gg_tt.sa/src/read_slha.cc b/epochX/cudacpp/gg_tt.sa/src/read_slha.cc index 5aa08bb503..f8e46f2e66 100644 --- a/epochX/cudacpp/gg_tt.sa/src/read_slha.cc +++ b/epochX/cudacpp/gg_tt.sa/src/read_slha.cc @@ -11,11 +11,11 @@ #include #include -#ifdef __HIPCC__ -#include // see https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 -#else -#include -#endif +//#ifdef __HIPCC__ +//#include // see https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 +//#else +//#include // bypass this completely to ease portability on LUMI #803 +//#endif #include #include @@ -64,11 +64,14 @@ SLHAReader::read_slha_file( std::string file_name, bool verbose ) { std::cout << "WARNING! Card file '" << file_name << "' does not exist:" << " look for the file in directory $" << envpath << "='" << getenv( envpath ) << "'" << std::endl; + /* #ifdef __HIPCC__ const std::string file_name2 = std::experimental::filesystem::path( getenv( envpath ) ) / std::experimental::filesystem::path( file_name ).filename(); #else const std::string file_name2 = std::filesystem::path( getenv( envpath ) ) / std::filesystem::path( file_name ).filename(); #endif + */ + const std::string file_name2 = std::string( getenv( envpath ) ) + "/" + file_name; // bypass std::filesystem #803 param_card.open( file_name2.c_str(), std::ifstream::in ); if( param_card.good() ) { From 9f3e3b501a3caab4b34e70a8ecbea188812ca55f Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Sat, 27 Jan 2024 13:07:55 +0200 Subject: [PATCH 49/96] [jt774] first tput tee test on LUMI with AMD GPU (using ggtt.sa): here fgcheck.exe fails because of link issues #802 Note, the log was created with sa but copied as mad \cp tput/logs_ggtt_sa/log_ggtt_sa_d_inl0_hrd0.txt tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt LUMI setup is the following module load cray-python module load gcc/12.2.0 export FC=`which gfortran` export PATH=~/CCACHE/ccache-4.8.2-INSTALL/bin:$PATH export CCACHE_DIR=~/CCACHE/ccache export USECCACHE=1 --- .../log_ggtt_mad_d_inl0_hrd0.txt | 313 +++++++++--------- 1 file changed, 161 insertions(+), 152 deletions(-) diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt index c48e4a575b..7cf1126960 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt @@ -1,209 +1,218 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx OMPFLAGS=-fopenmp -AVX=512y +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasCurand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) +RNDGEN=hasNoCurand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' +OMPFLAGS=-fopenmp +AVX=avx2 +FPTYPE=d +HELINL=0 +HRDCOD=0 +RNDGEN=hasNoCurand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) -make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' -make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make USEBUILDDIR=1 AVX=none -f makefile +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx' +OMPFLAGS=-fopenmp +AVX=none +FPTYPE=d +HELINL=0 +HRDCOD=0 +RNDGEN=hasNoCurand +Building in BUILDDIR=build.none_d_inl0_hrd0 for tag=none_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) +make[1]: Nothing to be done for 'all.none_d_inl0_hrd0_hasNoCurand'. +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx' -make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make USEBUILDDIR=1 AVX=sse4 -f makefile +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx' +OMPFLAGS=-fopenmp +AVX=sse4 +FPTYPE=d +HELINL=0 +HRDCOD=0 +RNDGEN=hasNoCurand +Building in BUILDDIR=build.sse4_d_inl0_hrd0 for tag=sse4_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) +make[1]: Nothing to be done for 'all.sse4_d_inl0_hrd0_hasNoCurand'. +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx' -make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make USEBUILDDIR=1 AVX=avx2 -f makefile +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx' +OMPFLAGS=-fopenmp +AVX=avx2 +FPTYPE=d +HELINL=0 +HRDCOD=0 +RNDGEN=hasNoCurand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) +make[1]: Nothing to be done for 'all.avx2_d_inl0_hrd0_hasNoCurand'. +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx' -make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make USEBUILDDIR=1 AVX=512y -f makefile +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx' +OMPFLAGS=-fopenmp +AVX=512y +FPTYPE=d +HELINL=0 +HRDCOD=0 +RNDGEN=hasNoCurand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) +make[1]: Nothing to be done for 'all.512y_d_inl0_hrd0_hasNoCurand'. 
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx' -make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make USEBUILDDIR=1 AVX=512z -f makefile +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx' +OMPFLAGS=-fopenmp +AVX=512z +FPTYPE=d +HELINL=0 +HRDCOD=0 +RNDGEN=hasNoCurand +Building in BUILDDIR=build.512z_d_inl0_hrd0 for tag=512z_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) +make[1]: Nothing to be done for 'all.512z_d_inl0_hrd0_hasNoCurand'. +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx' -DATE: 2024-01-25_23:03:53 +DATE: 2024-01-27_13:05:09 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.029909e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.136710e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.272780e+08 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.530683 sec - 2,290,171,294 cycles # 2.993 GHz - 3,254,195,766 instructions # 1.42 insn per cycle - 0.851756610 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 2.454193e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.299845e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.337683e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 1.406467 sec + 3,209,758,603 cycles:u # 1.942 GHz (75.08%) + 10,797,490 stalled-cycles-frontend:u # 0.34% frontend cycles idle (74.43%) + 1,145,522,431 stalled-cycles-backend:u # 35.69% backend cycles idle (75.44%) + 2,964,345,887 instructions:u # 0.92 insn per cycle + # 0.39 stalled cycles per insn (74.97%) + 2.502029972 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 + +Program received signal SIGSEGV: Segmentation fault - invalid memory reference. + +Backtrace for this error: +#0 0x14bd22205640 in ??? +#1 0x14bd22204873 in ??? +#2 0x14bd20cb8dbf in ??? +#3 0x14bd22378460 in ??? +#4 0x14bd22378b64 in ??? +#5 0x14bd223768bf in ??? +#6 0x20c597 in ??? +#7 0x20cd28 in ??? +#8 0x14bd20ca324c in ??? +#9 0x20c3e9 in _start + at ../sysdeps/x86_64/start.S:120 +#10 0xffffffffffffffff in ??? Avg ME (C++/CUDA) = 2.028807e+00 -Avg ME (F77/CUDA) = 2.0288063388516822 -Relative difference = 3.2588034143755247e-07 -OK (relative difference <= 5E-3) +Avg ME (F77/CUDA) = +ERROR! Fortran calculation (F77/CUDA) crashed ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe -p 2048 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/build.none_d_inl0_hrd0/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.196291e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.261557e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.261557e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 4.882033 sec - 14,995,294,473 cycles # 3.069 GHz - 38,723,990,063 instructions # 2.58 insn per cycle - 4.896002071 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 719) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 128 +EvtsPerSec[Rmb+ME] (23) = ( 2.517204e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.581638e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.581638e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 4.356178 sec + 14,971,376,763 cycles:u # 3.415 GHz (75.01%) + 9,932,887 stalled-cycles-frontend:u # 0.07% frontend cycles idle (75.02%) + 18,327,996 stalled-cycles-backend:u # 0.12% backend cycles idle (75.01%) + 38,344,389,999 instructions:u # 2.56 insn per cycle + # 0.00 stalled cycles per insn (75.03%) + 4.400603666 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 674) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515649 Relative difference = 3.258803992249869e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.650142e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.858052e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.858052e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.980112 sec - 8,977,221,889 cycles # 3.014 GHz - 24,433,177,409 instructions # 2.72 insn per cycle - 2.999214008 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2067) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 128 +EvtsPerSec[Rmb+ME] (23) = ( 4.504452e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.731202e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.731202e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 2.509906 sec + 8,575,359,610 cycles:u # 3.372 GHz (74.77%) + 9,992,485 stalled-cycles-frontend:u # 0.12% frontend cycles idle (74.70%) + 450,729,372 stalled-cycles-backend:u # 5.26% backend cycles idle (74.98%) + 24,152,560,008 instructions:u # 2.82 insn per cycle + # 0.02 stalled cycles per insn (75.14%) + 2.557038048 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 2009) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515654 Relative difference = 3.2588039900609506e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.863210e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.372035e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.372035e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.893453 sec - 5,544,307,869 cycles # 2.920 GHz - 11,562,341,019 instructions # 2.09 insn per cycle - 1.911685121 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2396) (512y: 0) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063388516204 -Relative difference = 3.2588037186351226e-07 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe -p 2048 256 2 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.821801e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.520956e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.520956e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.645226 sec - 4,816,104,732 cycles # 2.920 GHz - 10,339,259,007 instructions # 2.15 insn per cycle - 1.663772123 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1972) (512y: 131) (512z: 0) +OMP threads / `nproc --all` = 1 / 128 +EvtsPerSec[Rmb+ME] (23) = ( 7.752184e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.358565e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.358565e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 1.534936 sec + 5,135,515,247 cycles:u # 3.275 GHz (75.00%) + 8,998,284 stalled-cycles-frontend:u # 0.18% frontend cycles idle (75.06%) + 1,024,010,156 stalled-cycles-backend:u # 19.94% backend cycles idle (75.00%) + 11,340,162,513 instructions:u # 2.21 insn per cycle + # 0.09 stalled cycles per insn (75.02%) + 1.576184190 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2257) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe -p 2048 256 2 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.484238e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.775134e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.775134e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.443398 sec - 4,951,976,924 cycles # 2.022 GHz - 7,554,721,580 instructions # 1.53 insn per cycle - 2.461242743 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1212) (512y: 65) (512z: 1543) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/build.512y_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063388516204 -Relative difference = 3.2588037186351226e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/build.512z_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED From 1185c97ea24e8c06f6d44e548c5895822eb44552 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Sat, 27 Jan 2024 12:55:10 +0200 Subject: [PATCH 50/96] [jt774] in gg_tt.sa cudacpp.mk, use FC to link fgcheck.exe, gcheck.exe, runTest.exe when hipcc is used #802 --- epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk index 4b8ff31256..413b38a3ee 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk @@ -294,7 +294,9 @@ endif #=== Configure defaults and check if user-defined choices exist for OMPFLAGS, AVX, FPTYPE, HELINL, HRDCOD, RNDGEN # Set the default OMPFLAGS choice -ifneq ($(shell $(CXX) --version | egrep '^Intel'),) +ifneq ($(findstring hipcc,$(GPUCC)),) +override OMPFLAGS = # disable OpenMP MT when using hipcc #802 +else ifneq ($(shell $(CXX) --version | egrep '^Intel'),) override OMPFLAGS = -fopenmp ###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without GPUCC but not ok with GPUCC before #578) else ifneq ($(shell $(CXX) --version | egrep '^(clang)'),) @@ -707,7 +709,11 @@ $(fcxx_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libg endif $(fcxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(fcxx_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) +ifneq ($(findstring hipcc,$(GPUCC)),) # 
link fortran/c++/hip using $FC when hipcc is used #802
+	$(FC) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) -lstdc++
+else
 	$(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe)
+endif
 
 ifneq ($(GPUCC),)
 ifneq ($(shell $(CXX) --version | grep ^Intel),)
@@ -719,8 +725,12 @@ $(fcu_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgf
 endif
 $(fcu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH
 $(fcu_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe)
+ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802
+	$(FC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) -lstdc++ -L$(shell dirname $(shell $(GPUCC) -print-prog-name=hipcc))/../lib -lamdhip64
+else
 	$(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe)
 endif
+endif
 
 #-------------------------------------------------------------------------------
 
@@ -797,8 +807,12 @@ $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_object
 else # link both runTest.o and runTest_cu.o
 $(testmain): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH
 $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) $(GTESTLIBS)
+ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802
+	$(FC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS) -lstdc++ -L$(shell dirname $(shell $(GPUCC) -print-prog-name=hipcc))/../lib -lamdhip64
+else
 	$(GPUCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS)
 endif
+endif
 
 # Use target gtestlibs to build only googletest
 ifneq ($(GTESTLIBS),)

From 0943ae114891c0dc6c1329dcba99ac8d18bd3dc3 Mon Sep 17 00:00:00 2001
From: Andrea Valassi
Date: Sat, 27 Jan 2024 13:20:13 +0200
Subject: [PATCH 51/96] [jt774] in gg_tt.sa cudacpp.mk, improve resolution of
 path to libamdhip64 for #802 (hipcc may be in /usr/bin/hipcc or
 /opt/rocm/bin/hipcc but I must use /opt/rocm/5.2.3/lib: therefore, determine
 this from the path to clang which is in /opt/rocm/5.2.3/llvm/bin/clang).
 Note: this fails the link when using gfortran. This setup relies on using
 consistent hipcc, clang, flang from /opt/rocm.
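For reference, a minimal sketch of the path derivation (assuming the LUMI ROCm 5.2.3 layout described above; this transcript is illustrative, not taken from a build log):

  $ hipcc -print-prog-name=clang
  /opt/rocm-5.2.3/llvm/bin/clang

so the new link flag
  -L$(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../../lib -lamdhip64
resolves to -L/opt/rocm-5.2.3/llvm/bin/../../lib, i.e. /opt/rocm-5.2.3/lib, where libamdhip64.so resides. The previous '-print-prog-name=hipcc' variant instead gives /usr/lib when hipcc is the /usr/bin/hipcc wrapper, which is why the clang path is used as the anchor.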
--- epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk index 413b38a3ee..1414661db4 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk @@ -726,7 +726,7 @@ endif $(fcu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(fcu_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) -lstdc++ -L$(shell dirname $(shell $(GPUCC) -print-prog-name=hipcc))/../lib -lamdhip64 + $(FC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) -lstdc++ -L$(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../../lib -lamdhip64 else $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) endif @@ -808,7 +808,7 @@ else # link both runTest.o and runTest_cu.o $(testmain): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) $(GTESTLIBS) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS) -lstdc++ -L$(shell dirname $(shell $(GPUCC) -print-prog-name=hipcc))/../lib -lamdhip64 + $(FC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS) -lstdc++ -L$(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../../lib -lamdhip64 else $(GPUCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS) endif From cbf3d36d447271caec6bbcca2c89f3543139a0c5 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Sat, 27 Jan 2024 13:27:35 +0200 Subject: [PATCH 52/96] [jt774] second (first successful) tput tee test on LUMI with AMD GPU (using ggtt.sa), after fixing fgcheck.exe link issues #802 Note, the log was created with sa but copied as mad \cp tput/logs_ggtt_sa/log_ggtt_sa_d_inl0_hrd0.txt tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt LUMI setup is now the following, using clang and flang module load cray-python export CXX=/opt/rocm-5.2.3/llvm/bin/clang++ export CC=/opt/rocm-5.2.3/llvm/bin/clang export FC=/opt/rocm-5.2.3/llvm/bin/flang export PATH=~/CCACHE/ccache-4.8.2-INSTALL/bin:$PATH export CCACHE_DIR=~/CCACHE/ccache export USECCACHE=1 Note, as in previous tests, HIP_HOME is automatically determined as /usr (hipcc is in /usr/bin/hipcc) --- .../log_ggtt_mad_d_inl0_hrd0.txt | 150 ++++++++---------- 1 file changed, 66 insertions(+), 84 deletions(-) diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt index 7cf1126960..6cb59cf783 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt @@ -1,7 +1,7 @@ export CUDACPP_RUNTIME_ENABLEFPE=on Building in 
/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx -OMPFLAGS=-fopenmp +OMPFLAGS= AVX=avx2 FPTYPE=d HELINL=0 @@ -9,7 +9,7 @@ HRDCOD=0 RNDGEN=hasNoCurand Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -OMPFLAGS=-fopenmp +OMPFLAGS= AVX=avx2 FPTYPE=d HELINL=0 @@ -19,7 +19,7 @@ Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand make USEBUILDDIR=1 AVX=none -f makefile make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx' -OMPFLAGS=-fopenmp +OMPFLAGS= AVX=none FPTYPE=d HELINL=0 @@ -31,7 +31,7 @@ make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/e make USEBUILDDIR=1 AVX=sse4 -f makefile make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx' -OMPFLAGS=-fopenmp +OMPFLAGS= AVX=sse4 FPTYPE=d HELINL=0 @@ -43,7 +43,7 @@ make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/e make USEBUILDDIR=1 AVX=avx2 -f makefile make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx' -OMPFLAGS=-fopenmp +OMPFLAGS= AVX=avx2 FPTYPE=d HELINL=0 @@ -55,7 +55,7 @@ make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/e make USEBUILDDIR=1 AVX=512y -f makefile make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx' -OMPFLAGS=-fopenmp +OMPFLAGS= AVX=512y FPTYPE=d HELINL=0 @@ -67,7 +67,7 @@ make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/e make USEBUILDDIR=1 AVX=512z -f makefile make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx' -OMPFLAGS=-fopenmp +OMPFLAGS= AVX=512z FPTYPE=d HELINL=0 @@ -77,7 +77,7 @@ Building in BUILDDIR=build.512z_d_inl0_hrd0 for tag=512z_d_inl0_hrd0_hasNoCurand make[1]: Nothing to be done for 'all.512z_d_inl0_hrd0_hasNoCurand'. make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx' -DATE: 2024-01-27_13:05:09 +DATE: 2024-01-27_13:26:29 On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= @@ -86,59 +86,43 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.454193e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.299845e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.337683e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.439996e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.279520e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.316726e+07 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 1.406467 sec - 3,209,758,603 cycles:u # 1.942 GHz (75.08%) - 10,797,490 stalled-cycles-frontend:u # 0.34% frontend cycles idle (74.43%) - 1,145,522,431 stalled-cycles-backend:u # 35.69% backend cycles idle (75.44%) - 2,964,345,887 instructions:u # 0.92 insn per cycle - # 0.39 stalled cycles per insn (74.97%) - 2.502029972 seconds time elapsed +TOTAL : 1.563399 sec + 3,228,384,880 cycles:u # 1.904 GHz (75.56%) + 11,952,819 stalled-cycles-frontend:u # 0.37% frontend cycles idle (75.66%) + 1,155,234,954 stalled-cycles-backend:u # 35.78% backend cycles idle (74.40%) + 3,085,212,880 instructions:u # 0.96 insn per cycle + # 0.37 stalled cycles per insn (74.30%) + 2.562005892 seconds time elapsed ------------------------------------------------------------------------- cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 - -Program received signal SIGSEGV: Segmentation fault - invalid memory reference. - -Backtrace for this error: -#0 0x14bd22205640 in ??? -#1 0x14bd22204873 in ??? -#2 0x14bd20cb8dbf in ??? -#3 0x14bd22378460 in ??? -#4 0x14bd22378b64 in ??? -#5 0x14bd223768bf in ??? -#6 0x20c597 in ??? -#7 0x20cd28 in ??? -#8 0x14bd20ca324c in ??? -#9 0x20c3e9 in _start - at ../sysdeps/x86_64/start.S:120 -#10 0xffffffffffffffff in ??? Avg ME (C++/CUDA) = 2.028807e+00 -Avg ME (F77/CUDA) = -ERROR! Fortran calculation (F77/CUDA) crashed +Avg ME (F77/CUDA) = 2.028806338851682 +Relative difference = 3.2588034143755247e-07 +OK (relative difference <= 5E-3) ========================================================================= runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/build.none_d_inl0_hrd0/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 2.517204e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.581638e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.581638e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.506702e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.569666e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.569666e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 4.356178 sec - 14,971,376,763 cycles:u # 3.415 GHz (75.01%) - 9,932,887 stalled-cycles-frontend:u # 0.07% frontend cycles idle (75.02%) - 18,327,996 stalled-cycles-backend:u # 0.12% backend cycles idle (75.01%) - 38,344,389,999 instructions:u # 2.56 insn per cycle - # 0.00 stalled cycles per insn (75.03%) - 4.400603666 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 674) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 4.361164 sec + 15,062,604,939 cycles:u # 3.428 GHz (74.91%) + 7,266,330 stalled-cycles-frontend:u # 0.05% frontend cycles idle (75.01%) + 3,949,052,022 stalled-cycles-backend:u # 26.22% backend cycles idle (75.06%) + 42,698,627,244 instructions:u # 2.83 insn per cycle + # 0.09 stalled cycles per insn (74.98%) + 4.416609826 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 842) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -146,29 +130,28 @@ runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.sa/SubProcesse cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063388515649 +Avg ME (F77/C++) = 2.028806338851565 Relative difference = 3.258803992249869e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+NOVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 4.504452e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.731202e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.731202e+05 ) sec^-1 +Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=NO] +EvtsPerSec[Rmb+ME] (23) = ( 4.171928e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.352658e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.352658e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 2.509906 sec - 8,575,359,610 cycles:u # 3.372 GHz (74.77%) - 9,992,485 stalled-cycles-frontend:u # 0.12% frontend cycles idle (74.70%) - 450,729,372 stalled-cycles-backend:u # 5.26% backend cycles idle (74.98%) - 24,152,560,008 instructions:u # 2.82 insn per cycle - # 0.02 stalled cycles per insn (75.14%) - 2.557038048 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2009) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 2.691869 sec + 9,161,333,356 cycles:u # 3.363 GHz (75.06%) + 6,574,969 stalled-cycles-frontend:u # 0.07% frontend cycles idle (75.05%) + 2,444,591,382 stalled-cycles-backend:u # 26.68% backend cycles idle (75.06%) + 26,129,370,357 instructions:u # 2.85 insn per cycle + # 0.09 stalled cycles per insn (74.93%) + 2.734076715 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 2666) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -176,29 +159,28 @@ runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.sa/SubProcesse cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063388515654 -Relative difference = 3.2588039900609506e-07 +Avg ME (F77/C++) = 2.028806338851565 +Relative difference = 3.258803992249869e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+NOVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 128 -EvtsPerSec[Rmb+ME] (23) = ( 7.752184e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.358565e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.358565e+05 ) sec^-1 +Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=NO] +EvtsPerSec[Rmb+ME] (23) = ( 6.857588e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.309976e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.309976e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 1.534936 sec - 5,135,515,247 cycles:u # 3.275 GHz (75.00%) - 8,998,284 stalled-cycles-frontend:u # 0.18% frontend cycles idle (75.06%) - 1,024,010,156 stalled-cycles-backend:u # 19.94% backend cycles idle (75.00%) - 11,340,162,513 instructions:u # 2.21 insn per cycle - # 0.09 stalled cycles per insn (75.02%) - 1.576184190 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2257) (512y: 0) (512z: 0) +TOTAL : 1.708082 sec + 5,760,765,780 cycles:u # 3.307 GHz (74.77%) + 6,878,837 stalled-cycles-frontend:u # 0.12% frontend cycles idle (74.83%) + 1,175,203,447 stalled-cycles-backend:u # 20.40% backend cycles idle (75.02%) + 12,437,140,071 instructions:u # 2.16 insn per cycle + # 0.09 stalled cycles per insn (75.21%) + 1.761202524 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2696) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
@@ -206,8 +188,8 @@ runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.sa/SubProcesse
 cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2
 cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2
 Avg ME (C++/C++) = 2.028807e+00
-Avg ME (F77/C++) = 2.0288063388516204
-Relative difference = 3.2588037186351226e-07
+Avg ME (F77/C++) = 2.028806338851566
+Relative difference = 3.2588039878720327e-07
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
 /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/build.512y_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo)

From 5c27ed64ed7bd9ed37e439aac23284082c06e759 Mon Sep 17 00:00:00 2001
From: Andrea Valassi
Date: Sat, 27 Jan 2024 12:36:55 +0100
Subject: [PATCH 53/96] [jt774] backport AMD HIP changes for PR #801 from
 gg_tt.sa to CODEGEN

Details
- in cudacpp.mk, use FC to link fgcheck.exe, gcheck.exe, runTest.exe when hipcc is used on LUMI #802
- in three .h/.cc files, bypass std::filesystem completely to ease portability on LUMI #803 (and undo previous changes to address this issue in __HIPCC__ and linking libstdc++fs explicitly)
- in check_sa.cc, disable the printout of the gcc toolchain in clang as this relies on the value of the $CXX environment variable at runtime
- move back hip_runtime.h from GpuAbstraction.h to mgOnGpuConfig.h (needed for blockDim, blockIdx, threadIdx)
- in check_sa.cc, replace some __CUDACC__ by MGONGPUCPP_GPUIMPL (in code added after Jorgen's work)
- in cudacpp.mk, fix autodiscovery of HIP_HOME from 'which hipcc' on LUMI
---
 .../iolibs/template_files/gpu/Bridge.h | 22 ++++++++++---
 .../template_files/gpu/GpuAbstraction.h | 2 --
 .../iolibs/template_files/gpu/MadgraphTest.h | 13 +++++++-
 .../iolibs/template_files/gpu/check_sa.cc | 4 +--
 .../iolibs/template_files/gpu/cudacpp.mk | 33 ++++++++++++++++---
 .../iolibs/template_files/gpu/mgOnGpuConfig.h | 3 +-
 .../gpu/process_function_definitions.inc | 6 ++++
 .../iolibs/template_files/read_slha.cc | 13 +++++++-
 8 files changed, 80 insertions(+), 16 deletions(-)

diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/Bridge.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/Bridge.h
index 89437b4c42..7f25eea05c 100644
--- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/Bridge.h
+++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/Bridge.h
@@ -18,9 +18,14 @@
 #include
 #include
 #include
-#include
+//#ifdef __HIPCC__
+//#include // see https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79
+//#else
+//#include // bypass this completely to ease portability on LUMI #803
+//#endif
 #include
 #include
+#include // bypass std::filesystem #803
 #include
 
 #ifdef MGONGPUCPP_GPUIMPL
@@ -255,10 +260,17 @@ namespace mg5amcCpu
 // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate is called from several Fortran threads?
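// (illustrative sketch, not part of this patch) The stat()-based check
// introduced in this hunk replaces std::filesystem::exists; a standalone
// equivalent, assuming only POSIX <sys/stat.h>, would be:
//
//   #include <string>
//   #include <sys/stat.h>
//   inline bool fileExists( const std::string& fileName )
//   {
//     struct stat buffer;                            // filled by stat(2) on success
//     return stat( fileName.c_str(), &buffer ) == 0; // 0 iff the path exists and is accessible
//   }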
CPPProcess process( /*verbose=*/false ); std::string paramCard = "../../Cards/param_card.dat"; - if( !std::filesystem::exists( paramCard ) ) - { - paramCard = "../" + paramCard; - } + /* +#ifdef __HIPCC__ + if( !std::experimental::filesystem::exists( paramCard ) ) paramCard = "../" + paramCard; +#else + if( !std::filesystem::exists( paramCard ) ) paramCard = "../" + paramCard; +#endif + */ + //struct stat dummybuffer; // bypass std::filesystem #803 + //if( !( stat( paramCard.c_str(), &dummyBuffer ) == 0 ) ) paramCard = "../" + paramCard; // + auto fileExists = []( std::string& fileName ){ struct stat buffer; return stat( fileName.c_str(), &buffer ) == 0; }; + if( !fileExists( paramCard ) ) paramCard = "../" + paramCard; // bypass std::filesystem #803 process.initProc( paramCard ); } diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/GpuAbstraction.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/GpuAbstraction.h index 6a7d9c05c0..9c467b1e04 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/GpuAbstraction.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/GpuAbstraction.h @@ -39,8 +39,6 @@ #elif defined __HIPCC__ -#include "hip/hip_runtime.h" - #define gpuError_t hipError_t #define gpuPeekAtLastError hipPeekAtLastError #define gpuGetErrorString hipGetErrorString diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MadgraphTest.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MadgraphTest.h index a64c05c26a..6054185300 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MadgraphTest.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MadgraphTest.h @@ -14,7 +14,11 @@ #include #include -#include +//#ifdef __HIPCC__ +//#include // see https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 +//#else +//#include // bypass this completely to ease portability on LUMI #803 +//#endif #include #include #include @@ -219,7 +223,14 @@ TEST_P( MadgraphTest, CompareMomentaAndME ) const char* dumpEventsC = getenv( "CUDACPP_RUNTEST_DUMPEVENTS" ); const bool dumpEvents = ( dumpEventsC != 0 ) && ( std::string( dumpEventsC ) != "" ); const std::string refFileName = testDriver->getRefFileName(); + /* +#ifdef __HIPCC__ + const std::string dumpFileName = std::experimental::filesystem::path( refFileName ).filename(); +#else const std::string dumpFileName = std::filesystem::path( refFileName ).filename(); +#endif + */ + const std::string dumpFileName = refFileName; // bypass std::filesystem #803 std::ofstream dumpFile; if( dumpEvents ) { diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/check_sa.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/check_sa.cc index 7cac5ab47b..aab490dc5b 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/check_sa.cc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/check_sa.cc @@ -76,7 +76,7 @@ usage( char* argv0, int ret = 1 ) return ret; } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -84,7 +84,7 @@ namespace mg5amcCpu { inline void FPEhandler( int sig ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL std::cerr << "Floating Point Exception 
(GPU)" << std::endl; #else std::cerr << "Floating Point Exception (CPU)" << std::endl; diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk index dbca8e330f..fe6cb442da 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk @@ -139,13 +139,13 @@ endif # If CUDA_HOME is not set, try to set it from the path to nvcc ifndef CUDA_HOME - CUDA_HOME = $(patsubst %%bin/nvcc,%%,$(shell which nvcc 2>/dev/null)) + CUDA_HOME = $(patsubst %%/bin/nvcc,%%,$(shell which nvcc 2>/dev/null)) $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") endif # If HIP_HOME is not set, try to set it from the path to hipcc ifndef HIP_HOME - HIP_HOME = $(patsubst %%bin/hipcc,%%,$(HIP_COMPILER_PATH)) + HIP_HOME = $(patsubst %%/bin/hipcc,%%,$(shell which hipcc 2>/dev/null)) $(warning HIP_HOME was not set: using "$(HIP_HOME)") endif @@ -294,7 +294,9 @@ endif #=== Configure defaults and check if user-defined choices exist for OMPFLAGS, AVX, FPTYPE, HELINL, HRDCOD, RNDGEN # Set the default OMPFLAGS choice -ifneq ($(shell $(CXX) --version | egrep '^Intel'),) +ifneq ($(findstring hipcc,$(GPUCC)),) +override OMPFLAGS = # disable OpenMP MT when using hipcc #802 +else ifneq ($(shell $(CXX) --version | egrep '^Intel'),) override OMPFLAGS = -fopenmp ###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without GPUCC but not ok with GPUCC before #578) else ifneq ($(shell $(CXX) --version | egrep '^(clang)'),) @@ -652,6 +654,12 @@ $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: cu_objects_lib += $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cu_objects_lib) $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) +# Bypass std::filesystem completely to ease portability on LUMI #803 +#ifneq ($(findstring hipcc,$(GPUCC)),) +# $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -lstdc++fs +#else +# $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) +#endif endif #------------------------------------------------------------------------------- @@ -701,7 +709,11 @@ $(fcxx_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libg endif $(fcxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(fcxx_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) +ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 + $(FC) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) -lstdc++ +else $(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) +endif ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) @@ -713,8 +725,12 @@ $(fcu_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgf endif $(fcu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(fcu_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) +ifneq 
($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 + $(FC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) -lstdc++ -L$(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../../lib -lamdhip64 +else $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) endif +endif #------------------------------------------------------------------------------- @@ -779,6 +795,11 @@ $(testmain): LIBFLAGS += -lgomp endif endif +# Bypass std::filesystem completely to ease portability on LUMI #803 +#ifneq ($(findstring hipcc,$(GPUCC)),) +#$(testmain): LIBFLAGS += -lstdc++fs +#endif + ifeq ($(GPUCC),) # link only runTest.o $(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS) @@ -786,7 +807,11 @@ $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_object else # link both runTest.o and runTest_cu.o $(testmain): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) $(GTESTLIBS) - $(GPUCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS) +ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 + $(FC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS) -lstdc++ -L$(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../../lib -lamdhip64 +else + $(GPUCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS) +endif endif # Use target gtestlibs to build only googletest diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuConfig.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuConfig.h index 46a8f0efc0..d16a94431d 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuConfig.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuConfig.h @@ -8,13 +8,14 @@ // HARDCODED AT CODE GENERATION TIME: DO NOT MODIFY (#473) // There are two different code bases for standalone_cudacpp (without multichannel) and madevent+cudacpp (with multichannel) -%(mgongpu_supports_multichannel)s +#undef MGONGPU_SUPPORTS_MULTICHANNEL // Is this a GPU (CUDA, HIP) or CPU implementation? 
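// (illustrative note) nvcc predefines __CUDACC__ and hipcc predefines __HIPCC__,
// so the block below defines MGONGPUCPP_GPUIMPL only in GPU builds and leaves it
// undefined in plain C++ builds; hip_runtime.h must be included here because in
// HIP the blockDim, blockIdx and threadIdx coordinates are declared in that
// header, rather than being compiler builtins as in CUDA.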
#ifdef __CUDACC__ #define MGONGPUCPP_GPUIMPL cuda #elif defined __HIPCC__ #define MGONGPUCPP_GPUIMPL hip +#include "hip/hip_runtime.h" // needed for blockDim, blockIdx, threadIdx: better in mgOnGpuConfig.h than in GpuAbstraction.h #else #undef MGONGPUCPP_GPUIMPL #endif diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_function_definitions.inc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_function_definitions.inc index 21a4c6aa74..9d024183db 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_function_definitions.inc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_function_definitions.inc @@ -179,6 +179,10 @@ namespace mg5amcCpu out << "Apple clang " << __clang_major__ << "." << __clang_minor__ << "." << __clang_patchlevel__; #else out << "clang " << __clang_major__ << "." << __clang_minor__ << "." << __clang_patchlevel__; + /* + // === AV 26-Jan-2024 DISABLE THIS CODE (START) + // === AV 26-Jan-2024 First, it is totally wrong to assume that the CXX environment variable is used in the build! + // === AV 26-Jan-2024 Second and worse, here we need build time values, while CXX in this code is evaluated at runtime! // GCC toolchain version inside CLANG std::string tchainout; std::string tchaincmd = "readelf -p .comment $(${CXX} -print-libgcc-file-name) |& grep 'GCC: (GNU)' | grep -v Warning | sort -u | awk '{print $5}'"; @@ -192,6 +196,8 @@ namespace mg5amcCpu #else out << " (gcc " << tchainout << ")"; #endif + // === AV 26-Jan-2024 DISABLE THIS CODE (END) + */ #endif #else out << "clang UNKNOWKN"; diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/read_slha.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/read_slha.cc index 055b19a779..f8e46f2e66 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/read_slha.cc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/read_slha.cc @@ -11,7 +11,11 @@ #include #include -#include +//#ifdef __HIPCC__ +//#include // see https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 +//#else +//#include // bypass this completely to ease portability on LUMI #803 +//#endif #include #include @@ -60,7 +64,14 @@ SLHAReader::read_slha_file( std::string file_name, bool verbose ) { std::cout << "WARNING! 
Card file '" << file_name << "' does not exist:" << " look for the file in directory $" << envpath << "='" << getenv( envpath ) << "'" << std::endl; + /* +#ifdef __HIPCC__ + const std::string file_name2 = std::experimental::filesystem::path( getenv( envpath ) ) / std::experimental::filesystem::path( file_name ).filename(); +#else const std::string file_name2 = std::filesystem::path( getenv( envpath ) ) / std::filesystem::path( file_name ).filename(); +#endif + */ + const std::string file_name2 = std::string( getenv( envpath ) ) + "/" + file_name; // bypass std::filesystem #803 param_card.open( file_name2.c_str(), std::ifstream::in ); if( param_card.good() ) { From d8f71cb0e565a40ca8b9dbe3f6147b67a016e1cc Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Sat, 27 Jan 2024 12:53:11 +0100 Subject: [PATCH 54/96] [jt774] fix clang formatting in CODEGEN backport --- .../madgraph/iolibs/template_files/gpu/Bridge.h | 15 +++++++++------ .../gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt | 10 +++++----- epochX/cudacpp/gg_tt.sa/SubProcesses/Bridge.h | 15 +++++++++------ 3 files changed, 23 insertions(+), 17 deletions(-) diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/Bridge.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/Bridge.h index 7f25eea05c..f9ed70dfde 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/Bridge.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/Bridge.h @@ -14,18 +14,20 @@ #include "MemoryAccessMomenta.h" // for MemoryAccessMomenta::neppM #include "MemoryBuffers.h" // for HostBufferMomenta, DeviceBufferMomenta etc -#include -#include -#include -#include //#ifdef __HIPCC__ //#include // see https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 //#else //#include // bypass this completely to ease portability on LUMI #803 //#endif + +#include // bypass std::filesystem #803 + +#include +#include +#include +#include #include #include -#include // bypass std::filesystem #803 #include #ifdef MGONGPUCPP_GPUIMPL @@ -269,7 +271,8 @@ namespace mg5amcCpu */ //struct stat dummybuffer; // bypass std::filesystem #803 //if( !( stat( paramCard.c_str(), &dummyBuffer ) == 0 ) ) paramCard = "../" + paramCard; // - auto fileExists = []( std::string& fileName ){ struct stat buffer; return stat( fileName.c_str(), &buffer ) == 0; }; + auto fileExists = []( std::string& fileName ) + { struct stat buffer; return stat( fileName.c_str(), &buffer ) == 0; }; if( !fileExists( paramCard ) ) paramCard = "../" + paramCard; // bypass std::filesystem #803 process.initProc( paramCard ); } diff --git a/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt b/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt index 5542e5323b..85a118c983 100644 --- a/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt +++ b/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005713224411010742  +DEBUG: model prefixing takes 0.005553722381591797  INFO: Restrict model sm with file models/sm/restrict_default.dat . 
DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -180,7 +180,7 @@ Generated helas calls for 1 subprocesses (3 diagrams) in 0.006 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 2 routines in 0.145 s +ALOHA: aloha creates 2 routines in 0.147 s VVV1 FFV1 FFV1 @@ -195,7 +195,7 @@ INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/. quit -real 0m0.623s -user 0m0.466s -sys 0m0.061s +real 0m0.546s +user 0m0.487s +sys 0m0.054s Code generation completed in 1 seconds diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/Bridge.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/Bridge.h index 7f25eea05c..f9ed70dfde 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/Bridge.h @@ -14,18 +14,20 @@ #include "MemoryAccessMomenta.h" // for MemoryAccessMomenta::neppM #include "MemoryBuffers.h" // for HostBufferMomenta, DeviceBufferMomenta etc -#include -#include -#include -#include //#ifdef __HIPCC__ //#include // see https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 //#else //#include // bypass this completely to ease portability on LUMI #803 //#endif + +#include // bypass std::filesystem #803 + +#include +#include +#include +#include #include #include -#include // bypass std::filesystem #803 #include #ifdef MGONGPUCPP_GPUIMPL @@ -269,7 +271,8 @@ namespace mg5amcCpu */ //struct stat dummybuffer; // bypass std::filesystem #803 //if( !( stat( paramCard.c_str(), &dummyBuffer ) == 0 ) ) paramCard = "../" + paramCard; // - auto fileExists = []( std::string& fileName ){ struct stat buffer; return stat( fileName.c_str(), &buffer ) == 0; }; + auto fileExists = []( std::string& fileName ) + { struct stat buffer; return stat( fileName.c_str(), &buffer ) == 0; }; if( !fileExists( paramCard ) ) paramCard = "../" + paramCard; // bypass std::filesystem #803 process.initProc( paramCard ); } From a80e630c3badcc0ab4d24c79bd36009523b6b53e Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Sat, 27 Jan 2024 12:54:00 +0100 Subject: [PATCH 55/96] [jt774] regenerate gg_tt.sa (all ok no change) and gg_tt.mad (will now test on LUMI) --- .../gg_tt.mad/CODEGEN_mad_gg_tt_log.txt | 18 +++++----- .../cudacpp/gg_tt.mad/SubProcesses/Bridge.h | 25 +++++++++++--- .../gg_tt.mad/SubProcesses/GpuAbstraction.h | 2 -- .../gg_tt.mad/SubProcesses/MadgraphTest.h | 13 +++++++- .../SubProcesses/P1_gg_ttx/CPPProcess.cc | 6 ++++ .../SubProcesses/P1_gg_ttx/check_sa.cc | 4 +-- .../cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk | 33 ++++++++++++++++--- epochX/cudacpp/gg_tt.mad/src/mgOnGpuConfig.h | 3 +- epochX/cudacpp/gg_tt.mad/src/read_slha.cc | 13 +++++++- .../gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt | 12 +++---- 10 files changed, 98 insertions(+), 31 deletions(-) diff --git a/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt b/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt index 75c84e12fb..7011761fc7 100644 --- a/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt +++ b/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005261659622192383  +DEBUG: model prefixing takes 0.005719423294067383  
INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -175,7 +175,7 @@ INFO: Generating Helas calls for process: g g > t t~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t t~ @1 INFO: Creating files in directory P1_gg_ttx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -191,16 +191,16 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. INFO: Generating Feynman diagrams for Process: g g > t t~ WEIGHTED<=2 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttx Generated helas calls for 1 subprocesses (3 diagrams) in 0.006 s -Wrote files for 10 helas calls in 0.100 s +Wrote files for 10 helas calls in 0.113 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 2 routines in 0.144 s +ALOHA: aloha creates 2 routines in 0.152 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 4 routines in 0.131 s +ALOHA: aloha creates 4 routines in 0.136 s VVV1 FFV1 FFV1 @@ -237,10 +237,10 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. quit -real 0m1.690s -user 0m1.458s -sys 0m0.220s -Code generation completed in 2 seconds +real 0m2.627s +user 0m1.526s +sys 0m0.244s +Code generation completed in 3 seconds ************************************************************ * * * W E L C O M E to * diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/Bridge.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/Bridge.h index 89437b4c42..f9ed70dfde 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/Bridge.h @@ -14,11 +14,18 @@ #include "MemoryAccessMomenta.h" // for MemoryAccessMomenta::neppM #include "MemoryBuffers.h" // for HostBufferMomenta, DeviceBufferMomenta etc +//#ifdef __HIPCC__ +//#include // see https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 +//#else +//#include // bypass this completely to ease portability on LUMI #803 +//#endif + +#include // bypass std::filesystem #803 + #include #include #include #include -#include #include #include #include @@ -255,10 +262,18 @@ namespace mg5amcCpu // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate is called from several Fortran threads? 
CPPProcess process( /*verbose=*/false ); std::string paramCard = "../../Cards/param_card.dat"; - if( !std::filesystem::exists( paramCard ) ) - { - paramCard = "../" + paramCard; - } + /* +#ifdef __HIPCC__ + if( !std::experimental::filesystem::exists( paramCard ) ) paramCard = "../" + paramCard; +#else + if( !std::filesystem::exists( paramCard ) ) paramCard = "../" + paramCard; +#endif + */ + //struct stat dummybuffer; // bypass std::filesystem #803 + //if( !( stat( paramCard.c_str(), &dummyBuffer ) == 0 ) ) paramCard = "../" + paramCard; // + auto fileExists = []( std::string& fileName ) + { struct stat buffer; return stat( fileName.c_str(), &buffer ) == 0; }; + if( !fileExists( paramCard ) ) paramCard = "../" + paramCard; // bypass std::filesystem #803 process.initProc( paramCard ); } diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/GpuAbstraction.h index 6a7d9c05c0..9c467b1e04 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/GpuAbstraction.h @@ -39,8 +39,6 @@ #elif defined __HIPCC__ -#include "hip/hip_runtime.h" - #define gpuError_t hipError_t #define gpuPeekAtLastError hipPeekAtLastError #define gpuGetErrorString hipGetErrorString diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MadgraphTest.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/MadgraphTest.h index a64c05c26a..6054185300 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MadgraphTest.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MadgraphTest.h @@ -14,7 +14,11 @@ #include #include -#include +//#ifdef __HIPCC__ +//#include // see https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 +//#else +//#include // bypass this completely to ease portability on LUMI #803 +//#endif #include #include #include @@ -219,7 +223,14 @@ TEST_P( MadgraphTest, CompareMomentaAndME ) const char* dumpEventsC = getenv( "CUDACPP_RUNTEST_DUMPEVENTS" ); const bool dumpEvents = ( dumpEventsC != 0 ) && ( std::string( dumpEventsC ) != "" ); const std::string refFileName = testDriver->getRefFileName(); + /* +#ifdef __HIPCC__ + const std::string dumpFileName = std::experimental::filesystem::path( refFileName ).filename(); +#else const std::string dumpFileName = std::filesystem::path( refFileName ).filename(); +#endif + */ + const std::string dumpFileName = refFileName; // bypass std::filesystem #803 std::ofstream dumpFile; if( dumpEvents ) { diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc index f20c229897..dbaa56b35c 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc @@ -574,6 +574,10 @@ namespace mg5amcCpu out << "Apple clang " << __clang_major__ << "." << __clang_minor__ << "." << __clang_patchlevel__; #else out << "clang " << __clang_major__ << "." << __clang_minor__ << "." << __clang_patchlevel__; + /* + // === AV 26-Jan-2024 DISABLE THIS CODE (START) + // === AV 26-Jan-2024 First, it is totally wrong to assume that the CXX environment variable is used in the build! + // === AV 26-Jan-2024 Second and worse, here we need build time values, while CXX in this code is evaluated at runtime! 
// GCC toolchain version inside CLANG std::string tchainout; std::string tchaincmd = "readelf -p .comment $(${CXX} -print-libgcc-file-name) |& grep 'GCC: (GNU)' | grep -v Warning | sort -u | awk '{print $5}'"; @@ -587,6 +591,8 @@ namespace mg5amcCpu #else out << " (gcc " << tchainout << ")"; #endif + // === AV 26-Jan-2024 DISABLE THIS CODE (END) + */ #endif #else out << "clang UNKNOWKN"; diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/check_sa.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/check_sa.cc index 7cac5ab47b..aab490dc5b 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/check_sa.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/check_sa.cc @@ -76,7 +76,7 @@ usage( char* argv0, int ret = 1 ) return ret; } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -84,7 +84,7 @@ namespace mg5amcCpu { inline void FPEhandler( int sig ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL std::cerr << "Floating Point Exception (GPU)" << std::endl; #else std::cerr << "Floating Point Exception (CPU)" << std::endl; diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk index f2cfa349da..1414661db4 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk @@ -139,13 +139,13 @@ endif # If CUDA_HOME is not set, try to set it from the path to nvcc ifndef CUDA_HOME - CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null)) + CUDA_HOME = $(patsubst %/bin/nvcc,%,$(shell which nvcc 2>/dev/null)) $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") endif # If HIP_HOME is not set, try to set it from the path to hipcc ifndef HIP_HOME - HIP_HOME = $(patsubst %bin/hipcc,%,$(HIP_COMPILER_PATH)) + HIP_HOME = $(patsubst %/bin/hipcc,%,$(shell which hipcc 2>/dev/null)) $(warning HIP_HOME was not set: using "$(HIP_HOME)") endif @@ -294,7 +294,9 @@ endif #=== Configure defaults and check if user-defined choices exist for OMPFLAGS, AVX, FPTYPE, HELINL, HRDCOD, RNDGEN # Set the default OMPFLAGS choice -ifneq ($(shell $(CXX) --version | egrep '^Intel'),) +ifneq ($(findstring hipcc,$(GPUCC)),) +override OMPFLAGS = # disable OpenMP MT when using hipcc #802 +else ifneq ($(shell $(CXX) --version | egrep '^Intel'),) override OMPFLAGS = -fopenmp ###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without GPUCC but not ok with GPUCC before #578) else ifneq ($(shell $(CXX) --version | egrep '^(clang)'),) @@ -652,6 +654,12 @@ $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: cu_objects_lib += $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cu_objects_lib) $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) +# Bypass std::filesystem completely to ease portability on LUMI #803 +#ifneq ($(findstring hipcc,$(GPUCC)),) +# $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -lstdc++fs +#else +# $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) +#endif endif #------------------------------------------------------------------------------- @@ -701,7 +709,11 @@ $(fcxx_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libg endif $(fcxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(fcxx_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o 
$(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) +ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 + $(FC) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) -lstdc++ +else $(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) +endif ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) @@ -713,8 +725,12 @@ $(fcu_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgf endif $(fcu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(fcu_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) +ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 + $(FC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) -lstdc++ -L$(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../../lib -lamdhip64 +else $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) endif +endif #------------------------------------------------------------------------------- @@ -779,6 +795,11 @@ $(testmain): LIBFLAGS += -lgomp endif endif +# Bypass std::filesystem completely to ease portability on LUMI #803 +#ifneq ($(findstring hipcc,$(GPUCC)),) +#$(testmain): LIBFLAGS += -lstdc++fs +#endif + ifeq ($(GPUCC),) # link only runTest.o $(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS) @@ -786,7 +807,11 @@ $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_object else # link both runTest.o and runTest_cu.o $(testmain): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) $(GTESTLIBS) - $(GPUCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS) +ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 + $(FC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS) -lstdc++ -L$(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../../lib -lamdhip64 +else + $(GPUCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS) +endif endif # Use target gtestlibs to build only googletest diff --git a/epochX/cudacpp/gg_tt.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_tt.mad/src/mgOnGpuConfig.h index 55d03f1252..06787c1c5e 100644 --- a/epochX/cudacpp/gg_tt.mad/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/gg_tt.mad/src/mgOnGpuConfig.h @@ -8,13 +8,14 @@ // HARDCODED AT CODE GENERATION TIME: DO NOT MODIFY (#473) // There are two different code bases for standalone_cudacpp (without multichannel) and madevent+cudacpp (with multichannel) -#define MGONGPU_SUPPORTS_MULTICHANNEL 1 +#undef MGONGPU_SUPPORTS_MULTICHANNEL // Is this a GPU (CUDA, HIP) or CPU implementation? 
#ifdef __CUDACC__ #define MGONGPUCPP_GPUIMPL cuda #elif defined __HIPCC__ #define MGONGPUCPP_GPUIMPL hip +#include "hip/hip_runtime.h" // needed for blockDim, blockIdx, threadIdx: better in mgOnGpuConfig.h than in GpuAbstraction.h #else #undef MGONGPUCPP_GPUIMPL #endif diff --git a/epochX/cudacpp/gg_tt.mad/src/read_slha.cc b/epochX/cudacpp/gg_tt.mad/src/read_slha.cc index 055b19a779..f8e46f2e66 100644 --- a/epochX/cudacpp/gg_tt.mad/src/read_slha.cc +++ b/epochX/cudacpp/gg_tt.mad/src/read_slha.cc @@ -11,7 +11,11 @@ #include #include -#include +//#ifdef __HIPCC__ +//#include // see https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 +//#else +//#include // bypass this completely to ease portability on LUMI #803 +//#endif #include #include @@ -60,7 +64,14 @@ SLHAReader::read_slha_file( std::string file_name, bool verbose ) { std::cout << "WARNING! Card file '" << file_name << "' does not exist:" << " look for the file in directory $" << envpath << "='" << getenv( envpath ) << "'" << std::endl; + /* +#ifdef __HIPCC__ + const std::string file_name2 = std::experimental::filesystem::path( getenv( envpath ) ) / std::experimental::filesystem::path( file_name ).filename(); +#else const std::string file_name2 = std::filesystem::path( getenv( envpath ) ) / std::filesystem::path( file_name ).filename(); +#endif + */ + const std::string file_name2 = std::string( getenv( envpath ) ) + "/" + file_name; // bypass std::filesystem #803 param_card.open( file_name2.c_str(), std::ifstream::in ); if( param_card.good() ) { diff --git a/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt b/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt index 85a118c983..b2d1c1a436 100644 --- a/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt +++ b/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005553722381591797  +DEBUG: model prefixing takes 0.0057523250579833984  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,7 +155,7 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=2: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ WEIGHTED<=2 @1 INFO: Process has 3 diagrams -1 processes with 3 diagrams generated in 0.008 s +1 processes with 3 diagrams generated in 0.009 s Total: 1 processes with 3 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gg_tt Load PLUGIN.CUDACPP_OUTPUT @@ -180,7 +180,7 @@ Generated helas calls for 1 subprocesses (3 diagrams) in 0.006 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 2 routines in 0.147 s +ALOHA: aloha creates 2 routines in 0.150 s VVV1 FFV1 FFV1 @@ -195,7 +195,7 @@ INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/. 
quit -real 0m0.546s -user 0m0.487s -sys 0m0.054s +real 0m0.551s +user 0m0.495s +sys 0m0.051s Code generation completed in 1 seconds From 2bdc44000338f4953313943a9d8490f01825b2af Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Sat, 27 Jan 2024 14:42:22 +0200 Subject: [PATCH 56/96] [jt774] in gg_tt.mad cudacpp.mk, add -lpthread when linking runTest.exe using $FC=gfortran on LUMI/HIP #802 Otherwise the link was failing using gfortran /opt/cray/pe/gcc/12.2.0/snos/bin/gfortran -o runTest.exe ./CPPProcess.o ./MatrixElementKernels.o ./BridgeKernels.o ./CrossSectionKernels.o ./CommonRandomNumberKernel.o ./RamboSamplingKernels.o ./testxxx.o ./testmisc.o ./runTest.o ./gCPPProcess.o ./gMatrixElementKernels.o ./gBridgeKernels.o ./gCrossSectionKernels.o ./gCommonRandomNumberKernel.o ./gRamboSamplingKernels.o ./testxxx_cu.o ./testmisc_cu.o ./runTest_cu.o -ldl -L../../lib -lmg5amc_common -L../../../../../test/googletest/install_gcc12.2.0/lib64/ -lgtest -lgtest_main -Xlinker -rpath='$ORIGIN/../../lib' -lstdc++ -L/opt/rocm-5.2.3/llvm/bin/../../lib -lamdhip64 /usr/bin/ld: ../../../../../test/googletest/install_gcc12.2.0/lib64//libgtest.a(gtest-all.cc.o): undefined reference to symbol 'pthread_setspecific@@GLIBC_2.2.5' /usr/bin/ld: /lib64/libpthread.so.0: error adding symbols: DSO missing from command line I went back to gfortran because flang gives too many F90 errors on madevent #804 --- epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk index 1414661db4..4a14c9c3a1 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk @@ -808,7 +808,7 @@ else # link both runTest.o and runTest_cu.o $(testmain): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) $(GTESTLIBS) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS) -lstdc++ -L$(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../../lib -lamdhip64 + $(FC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS) -lstdc++ -lpthread -L$(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../../lib -lamdhip64 else $(GPUCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS) endif From ee5e389dbafa65baf5115ef43adc9d87810e2261 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Sat, 27 Jan 2024 14:48:21 +0200 Subject: [PATCH 57/96] [jt774] first (successful) tput tee test on LUMI with AMD GPU using ggtt.mad, after adding pthread in the link #802 Note, the log was created with mad so there was no need to copy it LUMI setup is now the following, using again gcc and gfortran (I dropped flang because of f90 issues #804 in madevent) module load cray-python module load gcc/12.2.0 export FC=`which gfortran` export PATH=~/CCACHE/ccache-4.8.2-INSTALL/bin:$PATH export CCACHE_DIR=~/CCACHE/ccache export USECCACHE=1 Note, as in previous tests, HIP_HOME is automatically determined as /usr (hipcc is in /usr/bin/hipcc) --- .../log_ggtt_mad_d_inl0_hrd0.txt | 240 ++++++++---------- 1 file changed, 102 insertions(+), 138 deletions(-) 
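Note: the #802 workaround applied in the cudacpp.mk hunks above, completed by the -lpthread addition of PATCH 56, reduces to one pattern: when GPUCC is hipcc, drive the final link of the mixed Fortran/C++/HIP objects with the Fortran compiler and add the missing runtimes explicitly. The following is a minimal sketch of that pattern, assuming the cudacpp.mk conventions for GPUCC, FC and LIBFLAGS; the LINKER and EXTRALIBS variables, the example.exe target and the example_objects list are illustrative placeholders, not actual cudacpp.mk content.

# Minimal sketch of the #802 link-driver selection (not the verbatim cudacpp.mk).
# With hipcc, linking via $(FC) pulls in the Fortran runtime implicitly, so the
# C++ runtime (-lstdc++), pthreads (-lpthread, needed by the static googletest
# archives) and the HIP runtime (-lamdhip64) must be added by hand; libamdhip64
# is located relative to ROCm's bundled clang, e.g. /opt/rocm-5.2.3/llvm/bin/../../lib.
ifneq ($(findstring hipcc,$(GPUCC)),)
LINKER = $(FC)
EXTRALIBS = -lstdc++ -lpthread -L$(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../../lib -lamdhip64
else
LINKER = $(GPUCC) # with nvcc, keep the original link driver
EXTRALIBS =
endif
example.exe: $(example_objects)
	$(LINKER) -o $@ $(example_objects) $(LIBFLAGS) $(EXTRALIBS)

The tput log diff that follows was produced with exactly this link setup on LUMI.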
diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt index 6cb59cf783..bf7ba27714 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt @@ -1,6 +1,6 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx +Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx OMPFLAGS= AVX=avx2 FPTYPE=d @@ -9,192 +9,156 @@ HRDCOD=0 RNDGEN=hasNoCurand Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -OMPFLAGS= -AVX=avx2 -FPTYPE=d -HELINL=0 -HRDCOD=0 -RNDGEN=hasNoCurand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) +CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' -make USEBUILDDIR=1 AVX=none -f makefile -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx' -OMPFLAGS= -AVX=none -FPTYPE=d -HELINL=0 -HRDCOD=0 -RNDGEN=hasNoCurand -Building in BUILDDIR=build.none_d_inl0_hrd0 for tag=none_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) -make[1]: Nothing to be done for 'all.none_d_inl0_hrd0_hasNoCurand'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx' +make USEBUILDDIR=1 AVX=none +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 AVX=sse4 -f makefile -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx' -OMPFLAGS= -AVX=sse4 -FPTYPE=d -HELINL=0 -HRDCOD=0 -RNDGEN=hasNoCurand -Building in BUILDDIR=build.sse4_d_inl0_hrd0 for tag=sse4_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) -make[1]: Nothing to be done for 'all.sse4_d_inl0_hrd0_hasNoCurand'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx' +make USEBUILDDIR=1 AVX=sse4 +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 AVX=avx2 -f makefile -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx' -OMPFLAGS= -AVX=avx2 -FPTYPE=d -HELINL=0 -HRDCOD=0 -RNDGEN=hasNoCurand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) -make[1]: Nothing to be done for 'all.avx2_d_inl0_hrd0_hasNoCurand'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx' +make USEBUILDDIR=1 AVX=avx2 +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 AVX=512y -f makefile -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx' -OMPFLAGS= -AVX=512y -FPTYPE=d -HELINL=0 -HRDCOD=0 -RNDGEN=hasNoCurand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) -make[1]: Nothing to be done for 'all.512y_d_inl0_hrd0_hasNoCurand'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx' +make USEBUILDDIR=1 AVX=512y +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 AVX=512z -f makefile -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx' -OMPFLAGS= -AVX=512z -FPTYPE=d -HELINL=0 -HRDCOD=0 -RNDGEN=hasNoCurand -Building in BUILDDIR=build.512z_d_inl0_hrd0 for tag=512z_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) -make[1]: Nothing to be done for 'all.512z_d_inl0_hrd0_hasNoCurand'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx' +make USEBUILDDIR=1 AVX=512z +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-01-27_13:26:29 +DATE: 2024-01-27_14:46:58 On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.439996e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.279520e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.316726e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.404499e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.288002e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.325396e+07 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 1.563399 sec - 3,228,384,880 cycles:u # 1.904 GHz (75.56%) - 11,952,819 stalled-cycles-frontend:u # 0.37% frontend cycles idle (75.66%) - 1,155,234,954 stalled-cycles-backend:u # 35.78% backend cycles idle (74.40%) - 3,085,212,880 instructions:u # 0.96 insn per cycle - # 0.37 stalled cycles per insn (74.30%) - 2.562005892 seconds time elapsed +TOTAL : 1.637562 sec + 3,236,775,925 cycles:u # 1.957 GHz (74.95%) + 10,679,224 stalled-cycles-frontend:u # 0.33% frontend cycles idle (75.83%) + 1,160,637,440 stalled-cycles-backend:u # 35.86% backend cycles idle (75.18%) + 3,096,350,011 instructions:u # 0.96 insn per cycle + # 0.37 stalled cycles per insn (74.24%) + 2.326736373 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 2.028807e+00 -Avg ME (F77/CUDA) = 2.028806338851682 -Relative difference = 3.2588034143755247e-07 +Avg ME (F77/CUDA) = 2.0288063388516817 +Relative difference = 3.258803416564443e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/build.none_d_inl0_hrd0/check.exe -p 2048 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.506702e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.569666e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.569666e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.518940e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.583706e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.583706e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 4.361164 sec - 15,062,604,939 cycles:u # 3.428 GHz (74.91%) - 7,266,330 stalled-cycles-frontend:u # 0.05% frontend cycles idle (75.01%) - 3,949,052,022 stalled-cycles-backend:u # 26.22% backend cycles idle (75.06%) - 42,698,627,244 instructions:u # 2.83 insn per cycle - # 0.09 stalled cycles per insn (74.98%) - 4.416609826 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 842) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 4.342627 sec + 14,964,849,700 cycles:u # 3.421 GHz (74.98%) + 9,503,935 stalled-cycles-frontend:u # 0.06% frontend cycles idle (74.95%) + 82,033,110 stalled-cycles-backend:u # 0.55% backend cycles idle (74.88%) + 38,338,800,684 instructions:u # 2.56 insn per cycle + # 0.00 stalled cycles per insn (74.97%) + 4.381488281 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 674) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/build.none_d_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.028806338851565 +Avg ME (F77/C++) = 2.0288063388515649 Relative difference = 3.258803992249869e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+NOVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=NO] -EvtsPerSec[Rmb+ME] (23) = ( 4.171928e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.352658e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.352658e+05 ) sec^-1 +Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 4.501722e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.730702e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.730702e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 2.691869 sec - 9,161,333,356 cycles:u # 3.363 GHz (75.06%) - 6,574,969 stalled-cycles-frontend:u # 0.07% frontend cycles idle (75.05%) - 2,444,591,382 stalled-cycles-backend:u # 26.68% backend cycles idle (75.06%) - 26,129,370,357 instructions:u # 2.85 insn per cycle - # 0.09 stalled cycles per insn (74.93%) - 2.734076715 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2666) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 2.508724 sec + 8,574,921,805 cycles:u # 3.374 GHz (74.91%) + 9,628,469 stalled-cycles-frontend:u # 0.11% frontend cycles idle (74.84%) + 638,187,821 stalled-cycles-backend:u # 7.44% backend cycles idle (74.87%) + 24,227,857,787 instructions:u # 2.83 insn per cycle + # 0.03 stalled cycles per insn (75.03%) + 2.549619793 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 2003) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.028806338851565 -Relative difference = 3.258803992249869e-07 +Avg ME (F77/C++) = 2.0288063388515654 +Relative difference = 3.2588039900609506e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+NOVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=NO] -EvtsPerSec[Rmb+ME] (23) = ( 6.857588e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.309976e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.309976e+05 ) sec^-1 +Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] +EvtsPerSec[Rmb+ME] (23) = ( 7.751144e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.359339e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.359339e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 1.708082 sec - 5,760,765,780 cycles:u # 3.307 GHz (74.77%) - 6,878,837 stalled-cycles-frontend:u # 0.12% frontend cycles idle (74.83%) - 1,175,203,447 stalled-cycles-backend:u # 20.40% backend cycles idle (75.02%) - 12,437,140,071 instructions:u # 2.16 insn per cycle - # 0.09 stalled cycles per insn (75.21%) - 1.761202524 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2696) (512y: 0) (512z: 0) +TOTAL : 1.532068 sec + 5,123,766,171 cycles:u # 3.273 GHz (75.04%) + 8,652,731 stalled-cycles-frontend:u # 0.17% frontend cycles idle (75.04%) + 1,100,091,964 stalled-cycles-backend:u # 21.47% backend cycles idle (74.75%) + 11,390,920,751 instructions:u # 2.22 insn per cycle + # 0.10 stalled cycles per insn (74.76%) + 9.287713600 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2248) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.028806338851566 -Relative difference = 3.2588039878720327e-07 +Avg ME (F77/C++) = 2.0288063388516204 +Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/build.512y_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/build.512z_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED From 3862f10fa683f28d64d6155ecf0a6207b3459de9 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Sat, 27 Jan 2024 15:25:38 +0200 Subject: [PATCH 58/96] [jt774] first tmad test on LUMI - it fails for Fortran vs C++ (no HIP)... why? due to link issues #802 again? LUMI setup is now the following, using again gcc and gfortran (I dropped flang because of f90 issues #804 in madevent) module load cray-python module load gcc/12.2.0 export FC=`which gfortran` export PATH=~/CCACHE/ccache-4.8.2-INSTALL/bin:$PATH export CCACHE_DIR=~/CCACHE/ccache export USECCACHE=1 --- .../log_ggtt_mad_d_inl0_hrd0.txt | 541 ++---------------- 1 file changed, 41 insertions(+), 500 deletions(-) diff --git a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt index 56f6811ac2..fff22b3e62 100644 --- a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +Working directory (build): /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx CUDACPP_BUILDDIR='.' 
- make USEBUILDDIR=1 AVX=none -make USEBUILDDIR=1 AVX=sse4 +make USEBUILDDIR=1 AVX=sse4 +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 + make USEBUILDDIR=1 AVX=512y +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' OMP_NUM_THREADS= -DATE: 2024-01-26_00:16:16 +DATE: 2024-01-27_14:51:04 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: none]: +Working directory (run): /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -50,8 +50,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/avalassi/output_ggtt_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./madevent_fortran < /tmp/valassia/input_ggtt_x1_fortran > /tmp/valassia/output_ggtt_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/aval [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690708277600116] fbridge_mode=0 [UNWEIGHT] Wrote 420 events (found 1577 events) - [COUNTERS] PROGRAM TOTAL : 0.3633s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3222s - [COUNTERS] Fortran MEs ( 1 ) : 0.0411s for 8192 events => throughput is 1.99E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3723s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3367s + [COUNTERS] Fortran MEs ( 1 ) : 0.0356s for 8192 events => throughput is 2.30E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -75,8 +75,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/avalassi/output_ggtt_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./madevent_fortran < /tmp/valassia/input_ggtt_x1_fortran > /tmp/valassia/output_ggtt_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/aval [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690708277600116] fbridge_mode=0 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.3109s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2699s - [COUNTERS] Fortran MEs ( 1 ) : 0.0410s for 8192 events => throughput is 2.00E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2603s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2247s + [COUNTERS] Fortran MEs ( 1 ) : 0.0356s for 8192 events => throughput is 2.30E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -100,8 +100,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! 
Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x10_fortran > /tmp/avalassi/output_ggtt_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./madevent_fortran < /tmp/valassia/input_ggtt_x10_fortran > /tmp/valassia/output_ggtt_x10_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x10_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 46.22 [46.223782291775372] fbridge_mode=0 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.7399s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2919s - [COUNTERS] Fortran MEs ( 1 ) : 0.4481s for 90112 events => throughput is 2.01E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.4015s + [COUNTERS] Fortran Overhead ( 0 ) : 1.0121s + [COUNTERS] Fortran MEs ( 1 ) : 0.3894s for 90112 events => throughput is 2.31E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -125,478 +125,19 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x1_cudacpp > /tmp/valassia/output_ggtt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.69 [47.690708277600102] fbridge_mode=1 - [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.3452s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3086s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0366s for 8192 events => throughput is 2.24E+05 events/s + [XSECTION] Cross section = 459.7 [459.65112292885999] fbridge_mode=1 + [UNWEIGHT] Wrote 221 events (found 1195 events) + [COUNTERS] PROGRAM TOTAL : 0.3015s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2698s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0317s for 8192 events => throughput is 2.58E+05 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.690708277600116) and cpp (47.690708277600102) differ by less than 2E-14 (3.3306690738754696e-16) - -*** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical - -*** (2-none) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -81920 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/16 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 46.22 [46.223782291775379] fbridge_mode=1 - [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.7204s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3188s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.4016s for 90112 events => throughput is 2.24E+05 events/s - -*** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (46.223782291775372) and cpp (46.223782291775379) differ by less than 2E-14 (2.220446049250313e-16) - -*** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical - -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.108786e+05 ) sec^-1 - -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.114532e+05 ) sec^-1 - -*** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/16 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.69 [47.690708277600109] fbridge_mode=1 - [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.3157s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2946s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0211s for 8192 events => throughput is 3.88E+05 events/s - -*** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (47.690708277600116) and cpp (47.690708277600109) differ by less than 2E-14 (1.1102230246251565e-16) - -*** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical - -*** (2-sse4) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -81920 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! 
Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/16 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 46.22 [46.223782291775379] fbridge_mode=1 - [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.5307s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2994s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2313s for 90112 events => throughput is 3.90E+05 events/s - -*** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (46.223782291775372) and cpp (46.223782291775379) differ by less than 2E-14 (2.220446049250313e-16) - -*** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical - -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.834431e+05 ) sec^-1 - -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.792482e+05 ) sec^-1 - -*** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/16 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.69 [47.690708277600109] fbridge_mode=1 - [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.3007s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2877s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0130s for 8192 events => throughput is 6.31E+05 events/s - -*** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (47.690708277600116) and cpp (47.690708277600109) differ by less than 2E-14 (1.1102230246251565e-16) - -*** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical - -*** (2-avx2) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -81920 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/16 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 46.22 [46.223782291775393] fbridge_mode=1 - [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.4410s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2952s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1458s for 90112 events => throughput is 6.18E+05 events/s - -*** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (46.223782291775372) and cpp (46.223782291775393) differ by less than 2E-14 (4.440892098500626e-16) - -*** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical - -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.990625e+05 ) sec^-1 - -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.045420e+05 ) sec^-1 - -*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/16 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.69 [47.690708277600109] fbridge_mode=1 - [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.2945s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2834s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0112s for 8192 events => throughput is 7.32E+05 events/s - -*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (47.690708277600116) and cpp (47.690708277600109) differ by less than 2E-14 (1.1102230246251565e-16) - -*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical - -*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -81920 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/16 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 46.22 [46.223782291775393] fbridge_mode=1 - [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.4155s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2915s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1240s for 90112 events => throughput is 7.27E+05 events/s - -*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (46.223782291775372) and cpp (46.223782291775393) differ by less than 2E-14 (4.440892098500626e-16) - -*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical - -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.054793e+05 ) sec^-1 - -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.092964e+05 ) sec^-1 - -*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! 
Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/16 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.69 [47.690708277600109] fbridge_mode=1 - [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.3090s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2913s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0177s for 8192 events => throughput is 4.63E+05 events/s - -*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (47.690708277600116) and cpp (47.690708277600109) differ by less than 2E-14 (1.1102230246251565e-16) - -*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical - -*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -81920 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/16 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 46.22 [46.223782291775393] fbridge_mode=1 - [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.4912s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2990s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1922s for 90112 events => throughput is 4.69E+05 events/s - -*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (46.223782291775372) and cpp (46.223782291775393) differ by less than 2E-14 (4.440892098500626e-16) - -*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical - -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.673944e+05 ) sec^-1 - -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.586634e+05 ) sec^-1 - -*** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/16 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.69 [47.690708277600109] fbridge_mode=1 - [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.6952s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6946s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0006s for 8192 events => throughput is 1.46E+07 events/s - -*** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (47.690708277600116) and cpp (47.690708277600109) differ by less than 2E-14 (1.1102230246251565e-16) - -*** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical - -*** (3) EXECUTE MADEVENT_CUDA x10 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -81920 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/16 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 46.22 [46.223782291775393] fbridge_mode=1 - [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.7071s - [COUNTERS] Fortran Overhead ( 0 ) : 1.7008s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0064s for 90112 events => throughput is 1.42E+07 events/s - -*** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** - -OK! 
xsec from fortran (46.223782291775372) and cpp (46.223782291775393) differ by less than 2E-14 (4.440892098500626e-16) - -*** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical - -*** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.042409e+07 ) sec^-1 - -*** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.702115e+07 ) sec^-1 - -*** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.012892e+07 ) sec^-1 - -*** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.068226e+08 ) sec^-1 - -*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.025873e+07 ) sec^-1 - -*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.153095e+08 ) sec^-1 - -*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.985581e+07 ) sec^-1 - -*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.067889e+07 ) sec^-1 - -TEST COMPLETED +ERROR! xsec from fortran (47.690708277600116) and cpp (459.65112292885999) differ by more than 2E-14 (8.638169352681933) From c5a8bdc9e063218fe2426dcaa893c553e0ad2415 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Sat, 27 Jan 2024 15:28:57 +0200 Subject: [PATCH 59/96] [jt774] confirm the tmad ggtt test failure even with HIP_HOME=none on LUMI?... 
HIP_HOME=none ./tmad/teeMadX.sh -ggtt +10x -makeclean --- .../log_ggtt_mad_d_inl0_hrd0.txt | 30 +++++++++---------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt index fff22b3e62..a439da20fb 100644 --- a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt @@ -15,11 +15,11 @@ make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/ make USEBUILDDIR=1 AVX=512z make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. @@ -33,7 +33,7 @@ make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/e OMP_NUM_THREADS= -DATE: 2024-01-27_14:51:04 +DATE: 2024-01-27_15:28:20 On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: none]: Working directory (run): /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx @@ -59,8 +59,8 @@ Executing ' ./madevent_fortran < /tmp/valassia/input_ggtt_x1_fortran > /tmp/vala [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690708277600116] fbridge_mode=0 [UNWEIGHT] Wrote 420 events (found 1577 events) - [COUNTERS] PROGRAM TOTAL : 0.3723s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3367s + [COUNTERS] PROGRAM TOTAL : 0.3722s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3366s [COUNTERS] Fortran MEs ( 1 ) : 0.0356s for 8192 events => throughput is 2.30E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/valassia/input_ggtt_x1_fortran > /tmp/vala [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690708277600116] fbridge_mode=0 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.2603s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2247s - [COUNTERS] Fortran MEs ( 1 ) : 0.0356s for 8192 events => throughput is 2.30E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2590s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2234s + [COUNTERS] Fortran MEs ( 1 ) : 0.0355s for 8192 events => throughput is 2.31E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/valassia/input_ggtt_x10_fortran > /tmp/val [XSECTION] ChannelId = 1 [XSECTION] Cross section = 46.22 [46.223782291775372] fbridge_mode=0 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.4015s - [COUNTERS] Fortran Overhead ( 0 ) : 1.0121s - [COUNTERS] Fortran MEs ( 1 ) : 0.3894s for 90112 events => throughput is 2.31E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.4024s + [COUNTERS] Fortran Overhead ( 0 ) : 1.0130s + [COUNTERS] Fortran MEs ( 1 ) : 0.3893s for 90112 events => throughput is 2.31E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -134,9 +134,9 
@@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 459.7 [459.65112292885999] fbridge_mode=1 [UNWEIGHT] Wrote 221 events (found 1195 events) - [COUNTERS] PROGRAM TOTAL : 0.3015s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2698s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0317s for 8192 events => throughput is 2.58E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2882s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2564s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0318s for 8192 events => throughput is 2.58E+05 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** From 083982d3774013605a2a23294c5754f5a4c97cf5 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Sat, 27 Jan 2024 14:32:53 +0100 Subject: [PATCH 60/96] [jt774] well, tmad tests for ggtt fail also on itscrd90 now ./tmad/teeMadX.sh -ggtt +10x -makeclean --- .../log_ggtt_mad_d_inl0_hrd0.txt | 74 +++++++++---------- 1 file changed, 37 insertions(+), 37 deletions(-) diff --git a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt index a439da20fb..68cfe5b9f7 100644 --- a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx CUDACPP_BUILDDIR='.' + make USEBUILDDIR=1 AVX=none -make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make USEBUILDDIR=1 AVX=sse4 make USEBUILDDIR=1 AVX=avx2 - make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
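Note on the [COUNTERS] lines quoted throughout these logs: the "throughput" figure is simply the event count divided by the matrix-element time reported on the same line. A quick shell check, using the numbers from the hunks above:

  awk 'BEGIN{printf "%.2E events/s\n", 8192/0.0356}'    # -> 2.30E+05, matching "0.0356s for 8192 events"
  awk 'BEGIN{printf "%.2E events/s\n", 90112/0.3893}'   # -> 2.31E+05, matching the x10 run over 90112 events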
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' OMP_NUM_THREADS= -DATE: 2024-01-27_15:28:20 +DATE: 2024-01-27_14:32:31 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: none]: -Working directory (run): /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -50,8 +50,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggtt_x1_fortran > /tmp/valassia/output_ggtt_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/avalassi/output_ggtt_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/valassia/input_ggtt_x1_fortran > /tmp/vala [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690708277600116] fbridge_mode=0 [UNWEIGHT] Wrote 420 events (found 1577 events) - [COUNTERS] PROGRAM TOTAL : 0.3722s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3366s - [COUNTERS] Fortran MEs ( 1 ) : 0.0356s for 8192 events => throughput is 2.30E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3669s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3248s + [COUNTERS] Fortran MEs ( 1 ) : 0.0420s for 8192 events => throughput is 1.95E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -75,8 +75,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggtt_x1_fortran > /tmp/valassia/output_ggtt_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/avalassi/output_ggtt_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/valassia/input_ggtt_x1_fortran > /tmp/vala [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690708277600116] fbridge_mode=0 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.2590s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2234s - [COUNTERS] Fortran MEs ( 1 ) : 0.0355s for 8192 events => throughput is 2.31E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3222s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2800s + [COUNTERS] Fortran MEs ( 1 ) : 0.0422s for 8192 events => throughput is 1.94E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -100,8 +100,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggtt_x10_fortran > /tmp/valassia/output_ggtt_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x10_fortran > /tmp/avalassi/output_ggtt_x10_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/valassia/input_ggtt_x10_fortran > /tmp/val [XSECTION] ChannelId = 1 [XSECTION] Cross section = 46.22 [46.223782291775372] fbridge_mode=0 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.4024s - [COUNTERS] Fortran Overhead ( 0 ) : 1.0130s - [COUNTERS] Fortran MEs ( 1 ) : 0.3893s for 90112 events => throughput is 2.31E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.8019s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3411s + [COUNTERS] Fortran MEs ( 1 ) : 0.4608s for 90112 events => throughput is 1.96E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -125,8 +125,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x1_cudacpp > /tmp/valassia/output_ggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -134,9 +134,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 459.7 [459.65112292885999] fbridge_mode=1 [UNWEIGHT] Wrote 221 events (found 1195 events) - [COUNTERS] PROGRAM TOTAL : 0.2882s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2564s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0318s for 8192 events => throughput is 2.58E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3569s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3199s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0371s for 8192 events => throughput is 2.21E+05 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** From 4f0f2777fb8c56c2d8fd9aa2b4cf8e45d0a6fdfc Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Sat, 27 Jan 2024 14:35:34 +0100 Subject: [PATCH 61/96] [jt774] fix CODEGEN mgOnGpuConfig.h where I had accidentally hardcoded the no-multichannel case as in sa (so tmad results are wrong because of that) --- .../madgraph/iolibs/template_files/gpu/mgOnGpuConfig.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuConfig.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuConfig.h index d16a94431d..989b3f0eea 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuConfig.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuConfig.h @@ -8,7 +8,7 @@ // HARDCODED AT CODE GENERATION TIME: DO NOT MODIFY (#473) // There are two different code bases for standalone_cudacpp (without multichannel) and madevent+cudacpp (with multichannel) -#undef MGONGPU_SUPPORTS_MULTICHANNEL +%(mgongpu_supports_multichannel)s // Is this a GPU (CUDA, HIP) or CPU implementation? 
#ifdef __CUDACC__ From 2fc0d87823bdbfb461899e1454c3ea8a0b90490b Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Sat, 27 Jan 2024 14:38:48 +0100 Subject: [PATCH 62/96] [jt774] in CODEGEN cudacpp.mk, backport from gg_tt.mad, add -lpthread when linking runTest.exe using $FC=gfortran on LUMI/HIP #802. Otherwise the link was failing using gfortran /opt/cray/pe/gcc/12.2.0/snos/bin/gfortran -o runTest.exe ./CPPProcess.o ./MatrixElementKernels.o ./BridgeKernels.o ./CrossSectionKernels.o ./CommonRa> /usr/bin/ld: ../../../../../test/googletest/install_gcc12.2.0/lib64//libgtest.a(gtest-all.cc.o): undefined reference to symbol 'pthread_setspecific@@> /usr/bin/ld: /lib64/libpthread.so.0: error adding symbols: DSO missing from command line. I went back to gfortran because flang gives too many F90 errors on madevent #804 --- .../madgraph/iolibs/template_files/gpu/cudacpp.mk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk index fe6cb442da..8b5fc00a83 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk @@ -808,7 +808,7 @@ else # link both runTest.o and runTest_cu.o $(testmain): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) $(GTESTLIBS) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS) -lstdc++ -L$(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../../lib -lamdhip64 + $(FC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS) -lstdc++ -lpthread -L$(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../../lib -lamdhip64 else $(GPUCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS) endif From d1368d0488bd2f930064958502636eeece719a98 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Sat, 27 Jan 2024 14:39:44 +0100 Subject: [PATCH 63/96] [jt774] regenerate gg_tt.mad with the latest fixes --- .../gg_tt.mad/CODEGEN_mad_gg_tt_log.txt | 18 +++++++++--------- .../cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk | 2 +- epochX/cudacpp/gg_tt.mad/src/mgOnGpuConfig.h | 2 +- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt b/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt index 7011761fc7..dec257ae6a 100644 --- a/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt +++ b/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005719423294067383  +DEBUG: model prefixing takes 0.005725383758544922  INFO: Restrict model sm with file models/sm/restrict_default.dat .
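Note on the -lpthread fix in PATCH 62: libgtest.a references pthread symbols (pthread_setspecific), and gfortran, unlike g++ or hipcc, does not implicitly pull in libpthread when it drives the final link, hence the "DSO missing from command line" error. A minimal sketch of the failure mode, with hypothetical object-file names:

  # Hypothetical reproducer: gtest needs pthreads, gfortran does not add them by itself
  gfortran -o runTest.exe runTest.o CPPProcess.o libgtest.a -lstdc++             # fails: DSO missing from command line
  gfortran -o runTest.exe runTest.o CPPProcess.o libgtest.a -lstdc++ -lpthread   # links: libpthread named explicitly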
DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -175,7 +175,7 @@ INFO: Generating Helas calls for process: g g > t t~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t t~ @1 INFO: Creating files in directory P1_gg_ttx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -191,16 +191,16 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. INFO: Generating Feynman diagrams for Process: g g > t t~ WEIGHTED<=2 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttx Generated helas calls for 1 subprocesses (3 diagrams) in 0.006 s -Wrote files for 10 helas calls in 0.113 s +Wrote files for 10 helas calls in 0.103 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 2 routines in 0.152 s +ALOHA: aloha creates 2 routines in 0.148 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 4 routines in 0.136 s +ALOHA: aloha creates 4 routines in 0.134 s VVV1 FFV1 FFV1 @@ -237,10 +237,10 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. quit -real 0m2.627s -user 0m1.526s -sys 0m0.244s -Code generation completed in 3 seconds +real 0m1.739s +user 0m1.511s +sys 0m0.225s +Code generation completed in 1 seconds ************************************************************ * * * W E L C O M E to * diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk index 4a14c9c3a1..eefac8ff0d 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk @@ -808,7 +808,7 @@ else # link both runTest.o and runTest_cu.o $(testmain): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) $(GTESTLIBS) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS) -lstdc++ -lpthread -L$(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../../lib -lamdhip64 + $(FC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS) -lstdc++ -lpthread -L$(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../../lib -lamdhip64 else $(GPUCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS) endif diff --git a/epochX/cudacpp/gg_tt.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_tt.mad/src/mgOnGpuConfig.h index 06787c1c5e..69cee0085b 100644 --- a/epochX/cudacpp/gg_tt.mad/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/gg_tt.mad/src/mgOnGpuConfig.h @@ -8,7 +8,7 @@ // HARDCODED AT CODE GENERATION TIME: DO NOT MODIFY (#473) // There are two different code bases for standalone_cudacpp (without multichannel) and 
madevent+cudacpp (with multichannel) -#undef MGONGPU_SUPPORTS_MULTICHANNEL +#define MGONGPU_SUPPORTS_MULTICHANNEL 1 // Is this a GPU (CUDA, HIP) or CPU implementation? #ifdef __CUDACC__ From 8bceeae053a285b1288805ba93f0bff6519c8cb9 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Sat, 27 Jan 2024 14:42:12 +0100 Subject: [PATCH 64/96] [jt774] rerun ggtt tmad on itscrd90, now all ok --- .../log_ggtt_mad_d_inl0_hrd0.txt | 495 +++++++++++++++++- 1 file changed, 477 insertions(+), 18 deletions(-) diff --git a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt index 68cfe5b9f7..f1ecea3090 100644 --- a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt @@ -2,8 +2,8 @@ Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/g CUDACPP_BUILDDIR='.' -make USEBUILDDIR=1 AVX=none +make USEBUILDDIR=1 AVX=none make USEBUILDDIR=1 AVX=sse4 make USEBUILDDIR=1 AVX=avx2 @@ -16,11 +16,11 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
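Note on the MGONGPU_SUPPORTS_MULTICHANNEL change in PATCHes 61 and 63: the codegen template now emits the %(mgongpu_supports_multichannel)s placeholder, so the generated header carries "#define MGONGPU_SUPPORTS_MULTICHANNEL 1" for madevent+cudacpp output and "#undef MGONGPU_SUPPORTS_MULTICHANNEL" for standalone output. A quick way to verify a regenerated tree (the gg_tt.sa path is assumed by analogy with gg_tt.mad):

  grep MGONGPU_SUPPORTS_MULTICHANNEL epochX/cudacpp/gg_tt.mad/src/mgOnGpuConfig.h   # expect: #define MGONGPU_SUPPORTS_MULTICHANNEL 1
  grep MGONGPU_SUPPORTS_MULTICHANNEL epochX/cudacpp/gg_tt.sa/src/mgOnGpuConfig.h    # expect: #undef MGONGPU_SUPPORTS_MULTICHANNEL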
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' @@ -33,7 +33,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2024-01-27_14:32:31 +DATE: 2024-01-27_14:41:27 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/aval [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690708277600116] fbridge_mode=0 [UNWEIGHT] Wrote 420 events (found 1577 events) - [COUNTERS] PROGRAM TOTAL : 0.3669s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3248s - [COUNTERS] Fortran MEs ( 1 ) : 0.0420s for 8192 events => throughput is 1.95E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3665s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3235s + [COUNTERS] Fortran MEs ( 1 ) : 0.0430s for 8192 events => throughput is 1.90E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -84,8 +84,8 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/aval [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690708277600116] fbridge_mode=0 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.3222s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2800s + [COUNTERS] PROGRAM TOTAL : 0.3237s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2815s [COUNTERS] Fortran MEs ( 1 ) : 0.0422s for 8192 events => throughput is 1.94E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x10_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 46.22 [46.223782291775372] fbridge_mode=0 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.8019s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3411s - [COUNTERS] Fortran MEs ( 1 ) : 0.4608s for 90112 events => throughput is 1.96E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.8163s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3533s + [COUNTERS] Fortran MEs ( 1 ) : 0.4630s for 90112 events => throughput is 1.95E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -132,12 +132,471 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 459.7 [459.65112292885999] fbridge_mode=1 - [UNWEIGHT] Wrote 221 events (found 1195 events) - [COUNTERS] PROGRAM TOTAL : 0.3569s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3199s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0371s for 8192 events => throughput is 2.21E+05 events/s + [XSECTION] Cross section = 47.69 [47.690708277600102] fbridge_mode=1 + [UNWEIGHT] Wrote 434 events (found 1125 events) + [COUNTERS] PROGRAM TOTAL : 0.3558s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3178s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0380s for 8192 events => throughput is 2.16E+05 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -ERROR! xsec from fortran (47.690708277600116) and cpp (459.65112292885999) differ by more than 2E-14 (8.638169352681933) +OK! 
xsec from fortran (47.690708277600116) and cpp (47.690708277600102) differ by less than 2E-14 (3.3306690738754696e-16) + +*** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-none) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 46.22 [46.223782291775379] fbridge_mode=1 + [UNWEIGHT] Wrote 1727 events (found 1732 events) + [COUNTERS] PROGRAM TOTAL : 1.7892s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3701s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.4191s for 90112 events => throughput is 2.15E+05 events/s + +*** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (46.223782291775372) and cpp (46.223782291775379) differ by less than 2E-14 (2.220446049250313e-16) + +*** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.181466e+05 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.182746e+05 ) sec^-1 + +*** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
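Note on the ERROR/OK comparison lines above: the value in parentheses is the relative difference of the two cross sections, evidently computed as cpp/fortran - 1 and tested against the 2E-14 tolerance. The failing value from PATCH 60 can be reproduced in the shell:

  awk 'BEGIN{f=47.690708277600116; c=459.65112292885999; print c/f-1}'   # -> 8.63817, the multichannel bug fixed in PATCH 61
  awk 'BEGIN{f=47.690708277600116; c=47.690708277600102; print f/c-1}'   # -> O(1e-16), well below 2E-14, hence OK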
+-------------------- +Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 47.69 [47.690708277600109] fbridge_mode=1 + [UNWEIGHT] Wrote 434 events (found 1125 events) + [COUNTERS] PROGRAM TOTAL : 0.3240s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3022s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0218s for 8192 events => throughput is 3.76E+05 events/s + +*** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (47.690708277600116) and cpp (47.690708277600109) differ by less than 2E-14 (1.1102230246251565e-16) + +*** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-sse4) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 46.22 [46.223782291775379] fbridge_mode=1 + [UNWEIGHT] Wrote 1727 events (found 1732 events) + [COUNTERS] PROGRAM TOTAL : 1.5908s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3496s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2413s for 90112 events => throughput is 3.73E+05 events/s + +*** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (46.223782291775372) and cpp (46.223782291775379) differ by less than 2E-14 (2.220446049250313e-16) + +*** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.705559e+05 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.718386e+05 ) sec^-1 + +*** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! 
Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 47.69 [47.690708277600109] fbridge_mode=1 + [UNWEIGHT] Wrote 434 events (found 1125 events) + [COUNTERS] PROGRAM TOTAL : 0.3102s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2966s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0135s for 8192 events => throughput is 6.05E+05 events/s + +*** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (47.690708277600116) and cpp (47.690708277600109) differ by less than 2E-14 (1.1102230246251565e-16) + +*** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-avx2) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 46.22 [46.223782291775393] fbridge_mode=1 + [UNWEIGHT] Wrote 1727 events (found 1732 events) + [COUNTERS] PROGRAM TOTAL : 1.4980s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3456s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1524s for 90112 events => throughput is 5.91E+05 events/s + +*** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (46.223782291775372) and cpp (46.223782291775393) differ by less than 2E-14 (4.440892098500626e-16) + +*** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.873762e+05 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.005966e+05 ) sec^-1 + +*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 47.69 [47.690708277600109] fbridge_mode=1 + [UNWEIGHT] Wrote 434 events (found 1125 events) + [COUNTERS] PROGRAM TOTAL : 0.3061s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2945s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0116s for 8192 events => throughput is 7.08E+05 events/s + +*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (47.690708277600116) and cpp (47.690708277600109) differ by less than 2E-14 (1.1102230246251565e-16) + +*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 46.22 [46.223782291775393] fbridge_mode=1 + [UNWEIGHT] Wrote 1727 events (found 1732 events) + [COUNTERS] PROGRAM TOTAL : 1.4755s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3444s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1311s for 90112 events => throughput is 6.87E+05 events/s + +*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (46.223782291775372) and cpp (46.223782291775393) differ by less than 2E-14 (4.440892098500626e-16) + +*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.775303e+05 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.835683e+05 ) sec^-1 + +*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 47.69 [47.690708277600109] fbridge_mode=1 + [UNWEIGHT] Wrote 434 events (found 1125 events) + [COUNTERS] PROGRAM TOTAL : 0.3182s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2998s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0184s for 8192 events => throughput is 4.45E+05 events/s + +*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (47.690708277600116) and cpp (47.690708277600109) differ by less than 2E-14 (1.1102230246251565e-16) + +*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! 
Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 46.22 [46.223782291775393] fbridge_mode=1 + [UNWEIGHT] Wrote 1727 events (found 1732 events) + [COUNTERS] PROGRAM TOTAL : 1.5550s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3507s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2043s for 90112 events => throughput is 4.41E+05 events/s + +*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (46.223782291775372) and cpp (46.223782291775393) differ by less than 2E-14 (4.440892098500626e-16) + +*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.412661e+05 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.322755e+05 ) sec^-1 + +*** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 47.69 [47.690708277600109] fbridge_mode=1 + [UNWEIGHT] Wrote 434 events (found 1125 events) + [COUNTERS] PROGRAM TOTAL : 0.7269s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7263s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0006s for 8192 events => throughput is 1.43E+07 events/s + +*** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (47.690708277600116) and cpp (47.690708277600109) differ by less than 2E-14 (1.1102230246251565e-16) + +*** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! 
events.lhe.cuda.1 and events.lhe.ref.1 are identical + +*** (3) EXECUTE MADEVENT_CUDA x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 46.22 [46.223782291775393] fbridge_mode=1 + [UNWEIGHT] Wrote 1727 events (found 1732 events) + [COUNTERS] PROGRAM TOTAL : 1.7703s + [COUNTERS] Fortran Overhead ( 0 ) : 1.7632s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0071s for 90112 events => throughput is 1.26E+07 events/s + +*** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (46.223782291775372) and cpp (46.223782291775393) differ by less than 2E-14 (4.440892098500626e-16) + +*** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical + +*** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.883799e+07 ) sec^-1 + +*** EXECUTE GCHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.692882e+07 ) sec^-1 + +*** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.982245e+07 ) sec^-1 + +*** EXECUTE GCHECK(MAX) -p 16384 32 1 *** +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.072110e+08 ) sec^-1 + +*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.987082e+07 ) sec^-1 + +*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.153423e+08 ) sec^-1 + +*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 
2.980458e+07 ) sec^-1 + +*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.035030e+07 ) sec^-1 + +TEST COMPLETED From 39472bdd9e382894c81cd11594c099d6c8c2ff8c Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Sat, 27 Jan 2024 15:01:01 +0100 Subject: [PATCH 65/96] [jt774] in tmad/madX.sh, add support for AMD GPUs as done in tput (still assuming that there is only one GPU, either NVidia or AMD) --- epochX/cudacpp/tmad/madX.sh | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/epochX/cudacpp/tmad/madX.sh b/epochX/cudacpp/tmad/madX.sh index d03ca9f65f..991ab62ea2 100755 --- a/epochX/cudacpp/tmad/madX.sh +++ b/epochX/cudacpp/tmad/madX.sh @@ -420,7 +420,13 @@ printf "\nOMP_NUM_THREADS=$OMP_NUM_THREADS\n" printf "\nDATE: $(date '+%Y-%m-%d_%H:%M:%S')\n\n" -if nvidia-smi -L > /dev/null 2>&1; then gpuTxt="$(nvidia-smi -L | wc -l)x $(nvidia-smi -L | awk '{print $3,$4}' | sort -u)"; else gpuTxt=none; fi +if nvidia-smi -L > /dev/null 2>&1; then + gpuTxt="$(nvidia-smi -L | wc -l)x $(nvidia-smi -L | awk '{print $3,$4}' | sort -u)" +elif rocm-smi -i > /dev/null 2>&1; then + gpuTxt="$(rocm-smi --showproductname | grep 'Card series' | awk '{print $5,$6,$7}')" +else + gpuTxt=none +fi if [ "${unames}" == "Darwin" ]; then cpuTxt=$(sysctl -h machdep.cpu.brand_string) cpuTxt=${cpuTxt/machdep.cpu.brand_string: } From a6643d88cea8db64a7474519eba5c0d9b1c7db23 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Sat, 27 Jan 2024 16:02:53 +0200 Subject: [PATCH 66/96] [jt774] first successful tmad test for ggtt on LUMI/HIP (note, no 512y/z), including HIP (indicated as 'cuda') --- .../log_ggtt_mad_d_inl0_hrd0.txt | 392 ++++++------------ 1 file changed, 122 insertions(+), 270 deletions(-) diff --git a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt index f1ecea3090..92fac99c28 100644 --- a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +Working directory (build): /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx CUDACPP_BUILDDIR='.' 
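Note on the rocm-smi fallback added to madX.sh in PATCH 65: the detection can be exercised standalone. This sketch repeats the logic from the diff and prints the same gpuTxt string quoted in the log headers (a single visible GPU of one vendor is assumed, as the commit message says):

  if nvidia-smi -L > /dev/null 2>&1; then
    gpuTxt="$(nvidia-smi -L | wc -l)x $(nvidia-smi -L | awk '{print $3,$4}' | sort -u)"
  elif rocm-smi -i > /dev/null 2>&1; then
    gpuTxt="$(rocm-smi --showproductname | grep 'Card series' | awk '{print $5,$6,$7}')"
  else
    gpuTxt=none
  fi
  echo "[GPU: ${gpuTxt}]"   # e.g. "[GPU: AMD INSTINCT MI200]" on LUMI, "[GPU: none]" on a CPU-only node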
- - make USEBUILDDIR=1 AVX=none make USEBUILDDIR=1 AVX=sse4 +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + make USEBUILDDIR=1 AVX=avx2 + make USEBUILDDIR=1 AVX=512y +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' OMP_NUM_THREADS= -DATE: 2024-01-27_14:41:27 +DATE: 2024-01-27_16:01:47 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +Working directory (run): /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -50,8 +50,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/avalassi/output_ggtt_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./madevent_fortran < /tmp/valassia/input_ggtt_x1_fortran > /tmp/valassia/output_ggtt_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/aval [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690708277600116] fbridge_mode=0 [UNWEIGHT] Wrote 420 events (found 1577 events) - [COUNTERS] PROGRAM TOTAL : 0.3665s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3235s - [COUNTERS] Fortran MEs ( 1 ) : 0.0430s for 8192 events => throughput is 1.90E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3288s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2931s + [COUNTERS] Fortran MEs ( 1 ) : 0.0356s for 8192 events => throughput is 2.30E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -75,8 +75,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/avalassi/output_ggtt_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./madevent_fortran < /tmp/valassia/input_ggtt_x1_fortran > /tmp/valassia/output_ggtt_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/aval [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690708277600116] fbridge_mode=0 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.3237s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2815s - [COUNTERS] Fortran MEs ( 1 ) : 0.0422s for 8192 events => throughput is 1.94E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2612s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2256s + [COUNTERS] Fortran MEs ( 1 ) : 0.0356s for 8192 events => throughput is 2.30E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -100,8 +100,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! 
Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x10_fortran > /tmp/avalassi/output_ggtt_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./madevent_fortran < /tmp/valassia/input_ggtt_x10_fortran > /tmp/valassia/output_ggtt_x10_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x10_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 46.22 [46.223782291775372] fbridge_mode=0 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.8163s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3533s - [COUNTERS] Fortran MEs ( 1 ) : 0.4630s for 90112 events => throughput is 1.95E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.4043s + [COUNTERS] Fortran Overhead ( 0 ) : 1.0148s + [COUNTERS] Fortran MEs ( 1 ) : 0.3895s for 90112 events => throughput is 2.31E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -125,8 +125,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x1_cudacpp > /tmp/valassia/output_ggtt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -134,9 +134,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690708277600102] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.3558s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3178s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0380s for 8192 events => throughput is 2.16E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3026s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2706s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0320s for 8192 events => throughput is 2.56E+05 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -158,8 +158,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x10_cudacpp > /tmp/valassia/output_ggtt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -167,9 +167,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 46.22 [46.223782291775379] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.7892s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3701s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.4191s for 90112 events => throughput is 2.15E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.4012s + [COUNTERS] Fortran Overhead ( 0 ) : 1.0493s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3518s for 90112 events => throughput is 2.56E+05 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -180,14 +180,14 @@ OK! xsec from fortran (46.223782291775372) and cpp (46.223782291775379) differ b OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.181466e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.610898e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.182746e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.592454e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -201,8 +201,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
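(The [COUNTERS] throughput figures are simply events divided by ME time: for the none x10 run above, 90112 events / 0.3518 s ≈ 2.56E+05 events/s.)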
-------------------- -Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x1_cudacpp > /tmp/valassia/output_ggtt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -210,9 +210,9 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690708277600109] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.3240s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3022s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0218s for 8192 events => throughput is 3.76E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2670s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2493s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0178s for 8192 events => throughput is 4.61E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -234,8 +234,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x10_cudacpp > /tmp/valassia/output_ggtt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -243,9 +243,9 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 46.22 [46.223782291775379] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.5908s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3496s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2413s for 90112 events => throughput is 3.73E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.2278s + [COUNTERS] Fortran Overhead ( 0 ) : 1.0323s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1955s for 90112 events => throughput is 4.61E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -256,14 +256,14 @@ OK! xsec from fortran (46.223782291775372) and cpp (46.223782291775379) differ b OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.705559e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.735718e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.718386e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.759032e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -277,8 +277,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x1_cudacpp > /tmp/valassia/output_ggtt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -286,9 +286,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690708277600109] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.3102s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2966s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0135s for 8192 events => throughput is 6.05E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2506s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2403s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0102s for 8192 events => throughput is 8.01E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -310,8 +310,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x10_cudacpp > /tmp/valassia/output_ggtt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -319,9 +319,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 46.22 [46.223782291775393] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.4980s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3456s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1524s for 90112 events => throughput is 5.91E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.1386s + [COUNTERS] Fortran Overhead ( 0 ) : 1.0262s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1124s for 90112 events => throughput is 8.02E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -332,166 +332,18 @@ OK! xsec from fortran (46.223782291775372) and cpp (46.223782291775393) differ b OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.873762e+05 ) sec^-1 - -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.005966e+05 ) sec^-1 - -*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/16 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.69 [47.690708277600109] fbridge_mode=1 - [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.3061s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2945s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0116s for 8192 events => throughput is 7.08E+05 events/s - -*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (47.690708277600116) and cpp (47.690708277600109) differ by less than 2E-14 (1.1102230246251565e-16) - -*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical - -*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -81920 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/16 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 46.22 [46.223782291775393] fbridge_mode=1 - [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.4755s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3444s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1311s for 90112 events => throughput is 6.87E+05 events/s - -*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (46.223782291775372) and cpp (46.223782291775393) differ by less than 2E-14 (4.440892098500626e-16) - -*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical - -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.775303e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.251333e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.835683e+05 ) sec^-1 - -*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/16 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.69 [47.690708277600109] fbridge_mode=1 - [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.3182s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2998s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0184s for 8192 events => throughput is 4.45E+05 events/s - -*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (47.690708277600116) and cpp (47.690708277600109) differ by less than 2E-14 (1.1102230246251565e-16) - -*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical - -*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -81920 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/16 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 46.22 [46.223782291775393] fbridge_mode=1 - [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.5550s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3507s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2043s for 90112 events => throughput is 4.41E+05 events/s +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.283006e+05 ) sec^-1 -*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** +*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) *** -OK! xsec from fortran (46.223782291775372) and cpp (46.223782291775393) differ by less than 2E-14 (4.440892098500626e-16) - -*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical - -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.412661e+05 ) sec^-1 - -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.322755e+05 ) sec^-1 +*** (2-512z) WARNING! 
SKIP MADEVENT_CPP (512z is not supported on this node) *** *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -505,22 +357,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggtt_x1_cudacpp > /tmp/valassia/output_ggtt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.69 [47.690708277600109] fbridge_mode=1 + [XSECTION] Cross section = 47.69 [47.690708277600102] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.7269s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7263s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0006s for 8192 events => throughput is 1.43E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.8015s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8008s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.20E+07 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.690708277600116) and cpp (47.690708277600109) differ by less than 2E-14 (1.1102230246251565e-16) +OK! xsec from fortran (47.690708277600116) and cpp (47.690708277600102) differ by less than 2E-14 (3.3306690738754696e-16) *** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -538,65 +390,65 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggtt_x10_cudacpp > /tmp/valassia/output_ggtt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 46.22 [46.223782291775393] fbridge_mode=1 + [XSECTION] Cross section = 46.22 [46.223782291775379] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.7703s - [COUNTERS] Fortran Overhead ( 0 ) : 1.7632s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0071s for 90112 events => throughput is 1.26E+07 events/s + [COUNTERS] PROGRAM TOTAL : 1.3066s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2989s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0077s for 90112 events => throughput is 1.18E+07 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (46.223782291775372) and cpp (46.223782291775393) differ by less than 2E-14 (4.440892098500626e-16) +OK! xsec from fortran (46.223782291775372) and cpp (46.223782291775379) differ by less than 2E-14 (2.220446049250313e-16) *** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.883799e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.540459e+06 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.692882e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.038013e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.982245e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.788135e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.072110e+08 ) sec^-1 +Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.755971e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.987082e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.786660e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.153423e+08 ) sec^-1 +Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.950104e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.980458e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.754388e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.035030e+07 ) sec^-1 +Process = 
SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.139605e+07 ) sec^-1 TEST COMPLETED From 410550905c25ea10f5b22005dbfb30197498fbf3 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Sat, 27 Jan 2024 15:08:14 +0100 Subject: [PATCH 67/96] [jt774] regenerate all processes --- .../ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt | 18 +++--- .../cudacpp/ee_mumu.mad/SubProcesses/Bridge.h | 25 ++++++-- .../ee_mumu.mad/SubProcesses/GpuAbstraction.h | 2 - .../ee_mumu.mad/SubProcesses/MadgraphTest.h | 13 ++++- .../SubProcesses/P1_epem_mupmum/CPPProcess.cc | 6 ++ .../SubProcesses/P1_epem_mupmum/check_sa.cc | 4 +- .../ee_mumu.mad/SubProcesses/cudacpp.mk | 33 +++++++++-- .../cudacpp/ee_mumu.mad/src/mgOnGpuConfig.h | 1 + epochX/cudacpp/ee_mumu.mad/src/read_slha.cc | 13 ++++- .../CODEGEN_cudacpp_ee_mumu_log.txt | 12 ++-- .../cudacpp/ee_mumu.sa/SubProcesses/Bridge.h | 25 ++++++-- .../ee_mumu.sa/SubProcesses/GpuAbstraction.h | 2 - .../ee_mumu.sa/SubProcesses/MadgraphTest.h | 13 ++++- .../P1_Sigma_sm_epem_mupmum/CPPProcess.cc | 6 ++ .../P1_Sigma_sm_epem_mupmum/check_sa.cc | 4 +- .../ee_mumu.sa/SubProcesses/cudacpp.mk | 33 +++++++++-- epochX/cudacpp/ee_mumu.sa/src/mgOnGpuConfig.h | 1 + epochX/cudacpp/ee_mumu.sa/src/read_slha.cc | 13 ++++- .../gg_tt.mad/CODEGEN_mad_gg_tt_log.txt | 16 ++--- .../gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt | 12 ++-- .../cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk | 2 +- .../gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt | 22 +++---- .../gg_tt01g.mad/SubProcesses/Bridge.h | 25 ++++++-- .../SubProcesses/GpuAbstraction.h | 2 - .../gg_tt01g.mad/SubProcesses/MadgraphTest.h | 13 ++++- .../SubProcesses/P1_gg_ttx/CPPProcess.cc | 6 ++ .../SubProcesses/P1_gg_ttx/check_sa.cc | 4 +- .../SubProcesses/P2_gg_ttxg/CPPProcess.cc | 6 ++ .../SubProcesses/P2_gg_ttxg/check_sa.cc | 4 +- .../gg_tt01g.mad/SubProcesses/cudacpp.mk | 33 +++++++++-- .../cudacpp/gg_tt01g.mad/src/mgOnGpuConfig.h | 1 + epochX/cudacpp/gg_tt01g.mad/src/read_slha.cc | 13 ++++- .../gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt | 22 +++---- .../cudacpp/gg_ttg.mad/SubProcesses/Bridge.h | 25 ++++++-- .../gg_ttg.mad/SubProcesses/GpuAbstraction.h | 2 - .../gg_ttg.mad/SubProcesses/MadgraphTest.h | 13 ++++- .../SubProcesses/P1_gg_ttxg/CPPProcess.cc | 6 ++ .../SubProcesses/P1_gg_ttxg/check_sa.cc | 4 +- .../gg_ttg.mad/SubProcesses/cudacpp.mk | 33 +++++++++-- epochX/cudacpp/gg_ttg.mad/src/mgOnGpuConfig.h | 1 + epochX/cudacpp/gg_ttg.mad/src/read_slha.cc | 13 ++++- .../gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt | 12 ++-- .../cudacpp/gg_ttg.sa/SubProcesses/Bridge.h | 25 ++++++-- .../gg_ttg.sa/SubProcesses/GpuAbstraction.h | 2 - .../gg_ttg.sa/SubProcesses/MadgraphTest.h | 13 ++++- .../P1_Sigma_sm_gg_ttxg/CPPProcess.cc | 6 ++ .../P1_Sigma_sm_gg_ttxg/check_sa.cc | 4 +- .../cudacpp/gg_ttg.sa/SubProcesses/cudacpp.mk | 33 +++++++++-- epochX/cudacpp/gg_ttg.sa/src/mgOnGpuConfig.h | 1 + epochX/cudacpp/gg_ttg.sa/src/read_slha.cc | 13 ++++- .../gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt | 20 +++---- .../cudacpp/gg_ttgg.mad/SubProcesses/Bridge.h | 25 ++++++-- .../gg_ttgg.mad/SubProcesses/GpuAbstraction.h | 2 - .../gg_ttgg.mad/SubProcesses/MadgraphTest.h | 13 ++++- .../SubProcesses/P1_gg_ttxgg/CPPProcess.cc | 6 ++ .../SubProcesses/P1_gg_ttxgg/check_sa.cc | 4 +- .../gg_ttgg.mad/SubProcesses/cudacpp.mk | 33 +++++++++-- .../cudacpp/gg_ttgg.mad/src/mgOnGpuConfig.h | 1 + epochX/cudacpp/gg_ttgg.mad/src/read_slha.cc | 13 ++++- .../CODEGEN_cudacpp_gg_ttgg_log.txt | 16 ++--- 
.../cudacpp/gg_ttgg.sa/SubProcesses/Bridge.h | 25 ++++++-- .../gg_ttgg.sa/SubProcesses/GpuAbstraction.h | 2 - .../gg_ttgg.sa/SubProcesses/MadgraphTest.h | 13 ++++- .../P1_Sigma_sm_gg_ttxgg/CPPProcess.cc | 6 ++ .../P1_Sigma_sm_gg_ttxgg/check_sa.cc | 4 +- .../gg_ttgg.sa/SubProcesses/cudacpp.mk | 33 +++++++++-- epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuConfig.h | 1 + epochX/cudacpp/gg_ttgg.sa/src/read_slha.cc | 13 ++++- .../gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt | 20 +++---- .../gg_ttggg.mad/SubProcesses/Bridge.h | 25 ++++++-- .../SubProcesses/GpuAbstraction.h | 2 - .../gg_ttggg.mad/SubProcesses/MadgraphTest.h | 13 ++++- .../SubProcesses/P1_gg_ttxggg/CPPProcess.cc | 6 ++ .../SubProcesses/P1_gg_ttxggg/check_sa.cc | 4 +- .../gg_ttggg.mad/SubProcesses/cudacpp.mk | 33 +++++++++-- .../cudacpp/gg_ttggg.mad/src/mgOnGpuConfig.h | 1 + epochX/cudacpp/gg_ttggg.mad/src/read_slha.cc | 13 ++++- .../CODEGEN_cudacpp_gg_ttggg_log.txt | 16 ++--- .../cudacpp/gg_ttggg.sa/SubProcesses/Bridge.h | 25 ++++++-- .../gg_ttggg.sa/SubProcesses/GpuAbstraction.h | 2 - .../gg_ttggg.sa/SubProcesses/MadgraphTest.h | 13 ++++- .../P1_Sigma_sm_gg_ttxggg/CPPProcess.cc | 6 ++ .../P1_Sigma_sm_gg_ttxggg/check_sa.cc | 4 +- .../gg_ttggg.sa/SubProcesses/cudacpp.mk | 33 +++++++++-- .../cudacpp/gg_ttggg.sa/src/mgOnGpuConfig.h | 1 + epochX/cudacpp/gg_ttggg.sa/src/read_slha.cc | 13 ++++- .../gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt | 22 +++---- .../cudacpp/gq_ttq.mad/SubProcesses/Bridge.h | 25 ++++++-- .../gq_ttq.mad/SubProcesses/GpuAbstraction.h | 2 - .../gq_ttq.mad/SubProcesses/MadgraphTest.h | 13 ++++- .../SubProcesses/P1_gu_ttxu/CPPProcess.cc | 6 ++ .../SubProcesses/P1_gu_ttxu/check_sa.cc | 4 +- .../SubProcesses/P1_gux_ttxux/CPPProcess.cc | 6 ++ .../SubProcesses/P1_gux_ttxux/check_sa.cc | 4 +- .../gq_ttq.mad/SubProcesses/cudacpp.mk | 33 +++++++++-- epochX/cudacpp/gq_ttq.mad/src/mgOnGpuConfig.h | 1 + epochX/cudacpp/gq_ttq.mad/src/read_slha.cc | 13 ++++- .../gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt | 16 ++--- .../cudacpp/gq_ttq.sa/SubProcesses/Bridge.h | 25 ++++++-- .../gq_ttq.sa/SubProcesses/GpuAbstraction.h | 2 - .../gq_ttq.sa/SubProcesses/MadgraphTest.h | 13 ++++- .../P1_Sigma_sm_gu_ttxu/CPPProcess.cc | 6 ++ .../P1_Sigma_sm_gu_ttxu/check_sa.cc | 4 +- .../P1_Sigma_sm_gux_ttxux/CPPProcess.cc | 6 ++ .../P1_Sigma_sm_gux_ttxux/check_sa.cc | 4 +- .../cudacpp/gq_ttq.sa/SubProcesses/cudacpp.mk | 33 +++++++++-- epochX/cudacpp/gq_ttq.sa/src/mgOnGpuConfig.h | 1 + epochX/cudacpp/gq_ttq.sa/src/read_slha.cc | 13 ++++- .../CODEGEN_cudacpp_heft_gg_h_log.txt | 10 ++-- .../heft_gg_h.sa/SubProcesses/Bridge.h | 25 ++++++-- .../SubProcesses/GpuAbstraction.h | 2 - .../heft_gg_h.sa/SubProcesses/MadgraphTest.h | 13 ++++- .../P1_Sigma_heft_gg_h/CPPProcess.cc | 6 ++ .../P1_Sigma_heft_gg_h/check_sa.cc | 4 +- .../heft_gg_h.sa/SubProcesses/cudacpp.mk | 33 +++++++++-- .../cudacpp/heft_gg_h.sa/src/mgOnGpuConfig.h | 1 + epochX/cudacpp/heft_gg_h.sa/src/read_slha.cc | 13 ++++- .../CODEGEN_mad_pp_tt012j_log.txt | 58 +++++++++---------- .../pp_tt012j.mad/SubProcesses/Bridge.h | 25 ++++++-- .../SubProcesses/GpuAbstraction.h | 2 - .../pp_tt012j.mad/SubProcesses/MadgraphTest.h | 13 ++++- .../SubProcesses/P0_gg_ttx/CPPProcess.cc | 6 ++ .../SubProcesses/P0_gg_ttx/check_sa.cc | 4 +- .../SubProcesses/P0_uux_ttx/CPPProcess.cc | 6 ++ .../SubProcesses/P0_uux_ttx/check_sa.cc | 4 +- .../SubProcesses/P1_gg_ttxg/CPPProcess.cc | 6 ++ .../SubProcesses/P1_gg_ttxg/check_sa.cc | 4 +- .../SubProcesses/P1_gu_ttxu/CPPProcess.cc | 6 ++ .../SubProcesses/P1_gu_ttxu/check_sa.cc | 4 +- 
.../SubProcesses/P1_gux_ttxux/CPPProcess.cc | 6 ++ .../SubProcesses/P1_gux_ttxux/check_sa.cc | 4 +- .../SubProcesses/P1_uux_ttxg/CPPProcess.cc | 6 ++ .../SubProcesses/P1_uux_ttxg/check_sa.cc | 4 +- .../SubProcesses/P2_gg_ttxgg/CPPProcess.cc | 6 ++ .../SubProcesses/P2_gg_ttxgg/check_sa.cc | 4 +- .../SubProcesses/P2_gg_ttxuux/CPPProcess.cc | 6 ++ .../SubProcesses/P2_gg_ttxuux/check_sa.cc | 4 +- .../SubProcesses/P2_gu_ttxgu/CPPProcess.cc | 6 ++ .../SubProcesses/P2_gu_ttxgu/check_sa.cc | 4 +- .../SubProcesses/P2_gux_ttxgux/CPPProcess.cc | 6 ++ .../SubProcesses/P2_gux_ttxgux/check_sa.cc | 4 +- .../SubProcesses/P2_uc_ttxuc/CPPProcess.cc | 6 ++ .../SubProcesses/P2_uc_ttxuc/check_sa.cc | 4 +- .../SubProcesses/P2_ucx_ttxucx/CPPProcess.cc | 6 ++ .../SubProcesses/P2_ucx_ttxucx/check_sa.cc | 4 +- .../SubProcesses/P2_uu_ttxuu/CPPProcess.cc | 6 ++ .../SubProcesses/P2_uu_ttxuu/check_sa.cc | 4 +- .../SubProcesses/P2_uux_ttxccx/CPPProcess.cc | 6 ++ .../SubProcesses/P2_uux_ttxccx/check_sa.cc | 4 +- .../SubProcesses/P2_uux_ttxgg/CPPProcess.cc | 6 ++ .../SubProcesses/P2_uux_ttxgg/check_sa.cc | 4 +- .../SubProcesses/P2_uux_ttxuux/CPPProcess.cc | 6 ++ .../SubProcesses/P2_uux_ttxuux/check_sa.cc | 4 +- .../P2_uxcx_ttxuxcx/CPPProcess.cc | 6 ++ .../SubProcesses/P2_uxcx_ttxuxcx/check_sa.cc | 4 +- .../P2_uxux_ttxuxux/CPPProcess.cc | 6 ++ .../SubProcesses/P2_uxux_ttxuxux/check_sa.cc | 4 +- .../pp_tt012j.mad/SubProcesses/cudacpp.mk | 33 +++++++++-- .../cudacpp/pp_tt012j.mad/src/mgOnGpuConfig.h | 1 + epochX/cudacpp/pp_tt012j.mad/src/read_slha.cc | 13 ++++- 160 files changed, 1373 insertions(+), 382 deletions(-) diff --git a/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt b/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt index dd0f31341f..b9e8f6df36 100644 --- a/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt +++ b/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt @@ -62,7 +62,7 @@ generate e+ e- > mu+ mu- No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005403280258178711  +DEBUG: model prefixing takes 0.005430698394775391  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -154,7 +154,7 @@ INFO: Checking for minimal orders which gives processes. INFO: Please specify coupling orders to bypass this step. INFO: Trying process: e+ e- > mu+ mu- WEIGHTED<=4 @1 INFO: Process has 2 diagrams -1 processes with 2 diagrams generated in 0.004 s +1 processes with 2 diagrams generated in 0.005 s Total: 1 processes with 2 diagrams output madevent ../TMPOUT/CODEGEN_mad_ee_mumu --hel_recycling=False --vector_size=32 --me_exporter=standalone_cudacpp Load PLUGIN.CUDACPP_OUTPUT @@ -174,7 +174,7 @@ INFO: Generating Helas calls for process: e+ e- > mu+ mu- WEIGHTED<=4 @1 INFO: Processing color information for process: e+ e- > mu+ mu- @1 INFO: Creating files in directory P1_epem_mupmum DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -191,19 +191,19 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
INFO: Generating Feynman diagrams for Process: e+ e- > mu+ mu- WEIGHTED<=4 @1 INFO: Finding symmetric diagrams for subprocess group epem_mupmum Generated helas calls for 1 subprocesses (2 diagrams) in 0.004 s -Wrote files for 8 helas calls in 0.098 s +Wrote files for 8 helas calls in 0.101 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFV2 routines ALOHA: aloha creates FFV4 routines -ALOHA: aloha creates 3 routines in 0.200 s +ALOHA: aloha creates 3 routines in 0.205 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFV2 routines ALOHA: aloha creates FFV4 routines ALOHA: aloha creates FFV2_4 routines -ALOHA: aloha creates 7 routines in 0.537 s +ALOHA: aloha creates 7 routines in 0.260 s FFV1 FFV1 FFV2 @@ -248,9 +248,9 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. quit -real 0m2.147s -user 0m1.627s -sys 0m0.231s +real 0m1.923s +user 0m1.688s +sys 0m0.227s Code generation completed in 2 seconds ************************************************************ * * diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/Bridge.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/Bridge.h index 89437b4c42..f9ed70dfde 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/Bridge.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/Bridge.h @@ -14,11 +14,18 @@ #include "MemoryAccessMomenta.h" // for MemoryAccessMomenta::neppM #include "MemoryBuffers.h" // for HostBufferMomenta, DeviceBufferMomenta etc +//#ifdef __HIPCC__ +//#include // see https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 +//#else +//#include // bypass this completely to ease portability on LUMI #803 +//#endif + +#include // bypass std::filesystem #803 + #include #include #include #include -#include #include #include #include @@ -255,10 +262,18 @@ namespace mg5amcCpu // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate is called from several Fortran threads? 
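The paramCard lookup just below replaces std::filesystem::exists with a plain POSIX stat() call (bypass std::filesystem #803). A minimal self-contained sketch of the same pattern, with illustrative names only:

  #include <sys/stat.h>
  #include <string>

  // stat() returns 0 if the path exists (file or directory), which is all
  // the param_card fallback needs - no <filesystem> dependency.
  inline bool fileExists( const std::string& fileName )
  {
    struct stat buffer;
    return stat( fileName.c_str(), &buffer ) == 0;
  }

  // Usage, mirroring the Bridge.h fallback below:
  //   std::string paramCard = "../../Cards/param_card.dat";
  //   if( !fileExists( paramCard ) ) paramCard = "../" + paramCard;

The same issue #803 also drives the read_slha.cc hunk further down, where a std::filesystem::path join is replaced by a plain string concatenation ( getenv( envpath ) + "/" + file_name ).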
CPPProcess process( /*verbose=*/false ); std::string paramCard = "../../Cards/param_card.dat"; - if( !std::filesystem::exists( paramCard ) ) - { - paramCard = "../" + paramCard; - } + /* +#ifdef __HIPCC__ + if( !std::experimental::filesystem::exists( paramCard ) ) paramCard = "../" + paramCard; +#else + if( !std::filesystem::exists( paramCard ) ) paramCard = "../" + paramCard; +#endif + */ + //struct stat dummybuffer; // bypass std::filesystem #803 + //if( !( stat( paramCard.c_str(), &dummyBuffer ) == 0 ) ) paramCard = "../" + paramCard; // + auto fileExists = []( std::string& fileName ) + { struct stat buffer; return stat( fileName.c_str(), &buffer ) == 0; }; + if( !fileExists( paramCard ) ) paramCard = "../" + paramCard; // bypass std::filesystem #803 process.initProc( paramCard ); } diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h index 6a7d9c05c0..9c467b1e04 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h @@ -39,8 +39,6 @@ #elif defined __HIPCC__ -#include "hip/hip_runtime.h" - #define gpuError_t hipError_t #define gpuPeekAtLastError hipPeekAtLastError #define gpuGetErrorString hipGetErrorString diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MadgraphTest.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MadgraphTest.h index a64c05c26a..6054185300 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MadgraphTest.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MadgraphTest.h @@ -14,7 +14,11 @@ #include #include -#include +//#ifdef __HIPCC__ +//#include // see https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 +//#else +//#include // bypass this completely to ease portability on LUMI #803 +//#endif #include #include #include @@ -219,7 +223,14 @@ TEST_P( MadgraphTest, CompareMomentaAndME ) const char* dumpEventsC = getenv( "CUDACPP_RUNTEST_DUMPEVENTS" ); const bool dumpEvents = ( dumpEventsC != 0 ) && ( std::string( dumpEventsC ) != "" ); const std::string refFileName = testDriver->getRefFileName(); + /* +#ifdef __HIPCC__ + const std::string dumpFileName = std::experimental::filesystem::path( refFileName ).filename(); +#else const std::string dumpFileName = std::filesystem::path( refFileName ).filename(); +#endif + */ + const std::string dumpFileName = refFileName; // bypass std::filesystem #803 std::ofstream dumpFile; if( dumpEvents ) { diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.cc index 83e5b15013..c57ff8d2b0 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.cc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.cc @@ -559,6 +559,10 @@ namespace mg5amcCpu out << "Apple clang " << __clang_major__ << "." << __clang_minor__ << "." << __clang_patchlevel__; #else out << "clang " << __clang_major__ << "." << __clang_minor__ << "." << __clang_patchlevel__; + /* + // === AV 26-Jan-2024 DISABLE THIS CODE (START) + // === AV 26-Jan-2024 First, it is totally wrong to assume that the CXX environment variable is used in the build! + // === AV 26-Jan-2024 Second and worse, here we need build time values, while CXX in this code is evaluated at runtime! 
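A hedged alternative to the runtime ${CXX} query being disabled here (a sketch only, not part of this patch) is to capture the toolchain version at build time via a preprocessor define, e.g. by having the build pass -DMG_BUILD_TOOLCHAIN="\"gcc 12.2.0\"" (MG_BUILD_TOOLCHAIN is a hypothetical name used for illustration):

  #ifdef MG_BUILD_TOOLCHAIN
    out << " (" << MG_BUILD_TOOLCHAIN << ")"; // value fixed at build time, not at runtime
  #endif

This sidesteps both objections above: nothing assumes CXX was the compiler actually used in the build, and the reported version is the build-time one.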
// GCC toolchain version inside CLANG std::string tchainout; std::string tchaincmd = "readelf -p .comment $(${CXX} -print-libgcc-file-name) |& grep 'GCC: (GNU)' | grep -v Warning | sort -u | awk '{print $5}'"; @@ -572,6 +576,8 @@ namespace mg5amcCpu #else out << " (gcc " << tchainout << ")"; #endif + // === AV 26-Jan-2024 DISABLE THIS CODE (END) + */ #endif #else out << "clang UNKNOWKN"; diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/check_sa.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/check_sa.cc index 7cac5ab47b..aab490dc5b 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/check_sa.cc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/check_sa.cc @@ -76,7 +76,7 @@ usage( char* argv0, int ret = 1 ) return ret; } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -84,7 +84,7 @@ namespace mg5amcCpu { inline void FPEhandler( int sig ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL std::cerr << "Floating Point Exception (GPU)" << std::endl; #else std::cerr << "Floating Point Exception (CPU)" << std::endl; diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk index f2cfa349da..eefac8ff0d 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk @@ -139,13 +139,13 @@ endif # If CUDA_HOME is not set, try to set it from the path to nvcc ifndef CUDA_HOME - CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null)) + CUDA_HOME = $(patsubst %/bin/nvcc,%,$(shell which nvcc 2>/dev/null)) $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") endif # If HIP_HOME is not set, try to set it from the path to hipcc ifndef HIP_HOME - HIP_HOME = $(patsubst %bin/hipcc,%,$(HIP_COMPILER_PATH)) + HIP_HOME = $(patsubst %/bin/hipcc,%,$(shell which hipcc 2>/dev/null)) $(warning HIP_HOME was not set: using "$(HIP_HOME)") endif @@ -294,7 +294,9 @@ endif #=== Configure defaults and check if user-defined choices exist for OMPFLAGS, AVX, FPTYPE, HELINL, HRDCOD, RNDGEN # Set the default OMPFLAGS choice -ifneq ($(shell $(CXX) --version | egrep '^Intel'),) +ifneq ($(findstring hipcc,$(GPUCC)),) +override OMPFLAGS = # disable OpenMP MT when using hipcc #802 +else ifneq ($(shell $(CXX) --version | egrep '^Intel'),) override OMPFLAGS = -fopenmp ###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without GPUCC but not ok with GPUCC before #578) else ifneq ($(shell $(CXX) --version | egrep '^(clang)'),) @@ -652,6 +654,12 @@ $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: cu_objects_lib += $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cu_objects_lib) $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) +# Bypass std::filesystem completely to ease portability on LUMI #803 +#ifneq ($(findstring hipcc,$(GPUCC)),) +# $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -lstdc++fs +#else +# $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) +#endif endif #------------------------------------------------------------------------------- @@ -701,7 +709,11 @@ $(fcxx_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libg endif $(fcxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(fcxx_main): $(BUILDDIR)/fcheck_sa.o 
$(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) +ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 + $(FC) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) -lstdc++ +else $(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) +endif ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) @@ -713,8 +725,12 @@ $(fcu_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgf endif $(fcu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(fcu_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) +ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 + $(FC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) -lstdc++ -L$(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../../lib -lamdhip64 +else $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) endif +endif #------------------------------------------------------------------------------- @@ -779,6 +795,11 @@ $(testmain): LIBFLAGS += -lgomp endif endif +# Bypass std::filesystem completely to ease portability on LUMI #803 +#ifneq ($(findstring hipcc,$(GPUCC)),) +#$(testmain): LIBFLAGS += -lstdc++fs +#endif + ifeq ($(GPUCC),) # link only runTest.o $(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS) @@ -786,7 +807,11 @@ $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_object else # link both runTest.o and runTest_cu.o $(testmain): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) $(GTESTLIBS) - $(GPUCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS) +ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 + $(FC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS) -lstdc++ -lpthread -L$(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../../lib -lamdhip64 +else + $(GPUCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS) +endif endif # Use target gtestlibs to build only googletest diff --git a/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuConfig.h index 55d03f1252..69cee0085b 100644 --- a/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuConfig.h @@ -15,6 +15,7 @@ #define MGONGPUCPP_GPUIMPL cuda #elif defined __HIPCC__ #define MGONGPUCPP_GPUIMPL hip +#include "hip/hip_runtime.h" // needed for blockDim, blockIdx, threadIdx: better in mgOnGpuConfig.h than in GpuAbstraction.h #else #undef MGONGPUCPP_GPUIMPL #endif diff --git a/epochX/cudacpp/ee_mumu.mad/src/read_slha.cc b/epochX/cudacpp/ee_mumu.mad/src/read_slha.cc index 055b19a779..f8e46f2e66 100644 --- 
a/epochX/cudacpp/ee_mumu.mad/src/read_slha.cc +++ b/epochX/cudacpp/ee_mumu.mad/src/read_slha.cc @@ -11,7 +11,11 @@ #include #include -#include +//#ifdef __HIPCC__ +//#include // see https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 +//#else +//#include // bypass this completely to ease portability on LUMI #803 +//#endif #include #include @@ -60,7 +64,14 @@ SLHAReader::read_slha_file( std::string file_name, bool verbose ) { std::cout << "WARNING! Card file '" << file_name << "' does not exist:" << " look for the file in directory $" << envpath << "='" << getenv( envpath ) << "'" << std::endl; + /* +#ifdef __HIPCC__ + const std::string file_name2 = std::experimental::filesystem::path( getenv( envpath ) ) / std::experimental::filesystem::path( file_name ).filename(); +#else const std::string file_name2 = std::filesystem::path( getenv( envpath ) ) / std::filesystem::path( file_name ).filename(); +#endif + */ + const std::string file_name2 = std::string( getenv( envpath ) ) + "/" + file_name; // bypass std::filesystem #803 param_card.open( file_name2.c_str(), std::ifstream::in ); if( param_card.good() ) { diff --git a/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt b/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt index 20d35a4a26..e9dbec802c 100644 --- a/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt +++ b/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt @@ -62,7 +62,7 @@ generate e+ e- > mu+ mu- No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005757331848144531  +DEBUG: model prefixing takes 0.0057604312896728516  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -154,7 +154,7 @@ INFO: Checking for minimal orders which gives processes. INFO: Please specify coupling orders to bypass this step. INFO: Trying process: e+ e- > mu+ mu- WEIGHTED<=4 @1 INFO: Process has 2 diagrams -1 processes with 2 diagrams generated in 0.004 s +1 processes with 2 diagrams generated in 0.005 s Total: 1 processes with 2 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_ee_mumu Load PLUGIN.CUDACPP_OUTPUT @@ -181,7 +181,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFV2 routines ALOHA: aloha creates FFV4 routines ALOHA: aloha creates FFV2_4 routines -ALOHA: aloha creates 4 routines in 0.267 s +ALOHA: aloha creates 4 routines in 0.277 s FFV1 FFV1 FFV2 @@ -200,7 +200,7 @@ INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/. 
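The mgOnGpuConfig.h hunk above adds #include "hip/hip_runtime.h" because, as its in-line comment notes, blockDim, blockIdx and threadIdx need that header under HIP, whereas nvcc declares them implicitly. A minimal sketch (illustrative, not from the patch) of kernel code that relies on it:

  // Under hipcc, the kernel built-ins are only visible once hip/hip_runtime.h
  // has been included (here via mgOnGpuConfig.h).
  __global__ void scaleArray( double* data, double factor )
  {
    // assumes the launch configuration covers exactly the array size
    const int idx = blockDim.x * blockIdx.x + threadIdx.x;
    data[idx] *= factor;
  }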
quit -real 0m0.662s -user 0m0.596s -sys 0m0.051s +real 0m0.683s +user 0m0.604s +sys 0m0.071s Code generation completed in 1 seconds diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/Bridge.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/Bridge.h index 89437b4c42..f9ed70dfde 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/Bridge.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/Bridge.h @@ -14,11 +14,18 @@ #include "MemoryAccessMomenta.h" // for MemoryAccessMomenta::neppM #include "MemoryBuffers.h" // for HostBufferMomenta, DeviceBufferMomenta etc +//#ifdef __HIPCC__ +//#include // see https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 +//#else +//#include // bypass this completely to ease portability on LUMI #803 +//#endif + +#include // bypass std::filesystem #803 + #include #include #include #include -#include #include #include #include @@ -255,10 +262,18 @@ namespace mg5amcCpu // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate is called from several Fortran threads? CPPProcess process( /*verbose=*/false ); std::string paramCard = "../../Cards/param_card.dat"; - if( !std::filesystem::exists( paramCard ) ) - { - paramCard = "../" + paramCard; - } + /* +#ifdef __HIPCC__ + if( !std::experimental::filesystem::exists( paramCard ) ) paramCard = "../" + paramCard; +#else + if( !std::filesystem::exists( paramCard ) ) paramCard = "../" + paramCard; +#endif + */ + //struct stat dummybuffer; // bypass std::filesystem #803 + //if( !( stat( paramCard.c_str(), &dummyBuffer ) == 0 ) ) paramCard = "../" + paramCard; // + auto fileExists = []( std::string& fileName ) + { struct stat buffer; return stat( fileName.c_str(), &buffer ) == 0; }; + if( !fileExists( paramCard ) ) paramCard = "../" + paramCard; // bypass std::filesystem #803 process.initProc( paramCard ); } diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/GpuAbstraction.h index 6a7d9c05c0..9c467b1e04 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/GpuAbstraction.h @@ -39,8 +39,6 @@ #elif defined __HIPCC__ -#include "hip/hip_runtime.h" - #define gpuError_t hipError_t #define gpuPeekAtLastError hipPeekAtLastError #define gpuGetErrorString hipGetErrorString diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MadgraphTest.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MadgraphTest.h index a64c05c26a..6054185300 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MadgraphTest.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MadgraphTest.h @@ -14,7 +14,11 @@ #include #include -#include +//#ifdef __HIPCC__ +//#include // see https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 +//#else +//#include // bypass this completely to ease portability on LUMI #803 +//#endif #include #include #include @@ -219,7 +223,14 @@ TEST_P( MadgraphTest, CompareMomentaAndME ) const char* dumpEventsC = getenv( "CUDACPP_RUNTEST_DUMPEVENTS" ); const bool dumpEvents = ( dumpEventsC != 0 ) && ( std::string( dumpEventsC ) != "" ); const std::string refFileName = testDriver->getRefFileName(); + /* +#ifdef __HIPCC__ + const std::string dumpFileName = std::experimental::filesystem::path( refFileName ).filename(); +#else const std::string dumpFileName = std::filesystem::path( refFileName ).filename(); +#endif + */ + const std::string dumpFileName = refFileName; // bypass std::filesystem #803 std::ofstream dumpFile; if( dumpEvents ) { diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.cc 
b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.cc index 13429436af..b87b14d41f 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.cc +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.cc @@ -557,6 +557,10 @@ namespace mg5amcCpu out << "Apple clang " << __clang_major__ << "." << __clang_minor__ << "." << __clang_patchlevel__; #else out << "clang " << __clang_major__ << "." << __clang_minor__ << "." << __clang_patchlevel__; + /* + // === AV 26-Jan-2024 DISABLE THIS CODE (START) + // === AV 26-Jan-2024 First, it is totally wrong to assume that the CXX environment variable is used in the build! + // === AV 26-Jan-2024 Second and worse, here we need build time values, while CXX in this code is evaluated at runtime! // GCC toolchain version inside CLANG std::string tchainout; std::string tchaincmd = "readelf -p .comment $(${CXX} -print-libgcc-file-name) |& grep 'GCC: (GNU)' | grep -v Warning | sort -u | awk '{print $5}'"; @@ -570,6 +574,8 @@ namespace mg5amcCpu #else out << " (gcc " << tchainout << ")"; #endif + // === AV 26-Jan-2024 DISABLE THIS CODE (END) + */ #endif #else out << "clang UNKNOWKN"; diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/check_sa.cc b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/check_sa.cc index 7cac5ab47b..aab490dc5b 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/check_sa.cc +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/check_sa.cc @@ -76,7 +76,7 @@ usage( char* argv0, int ret = 1 ) return ret; } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -84,7 +84,7 @@ namespace mg5amcCpu { inline void FPEhandler( int sig ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL std::cerr << "Floating Point Exception (GPU)" << std::endl; #else std::cerr << "Floating Point Exception (CPU)" << std::endl; diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp.mk index f2cfa349da..eefac8ff0d 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp.mk @@ -139,13 +139,13 @@ endif # If CUDA_HOME is not set, try to set it from the path to nvcc ifndef CUDA_HOME - CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null)) + CUDA_HOME = $(patsubst %/bin/nvcc,%,$(shell which nvcc 2>/dev/null)) $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") endif # If HIP_HOME is not set, try to set it from the path to hipcc ifndef HIP_HOME - HIP_HOME = $(patsubst %bin/hipcc,%,$(HIP_COMPILER_PATH)) + HIP_HOME = $(patsubst %/bin/hipcc,%,$(shell which hipcc 2>/dev/null)) $(warning HIP_HOME was not set: using "$(HIP_HOME)") endif @@ -294,7 +294,9 @@ endif #=== Configure defaults and check if user-defined choices exist for OMPFLAGS, AVX, FPTYPE, HELINL, HRDCOD, RNDGEN # Set the default OMPFLAGS choice -ifneq ($(shell $(CXX) --version | egrep '^Intel'),) +ifneq ($(findstring hipcc,$(GPUCC)),) +override OMPFLAGS = # disable OpenMP MT when using hipcc #802 +else ifneq ($(shell $(CXX) --version | egrep '^Intel'),) override OMPFLAGS = -fopenmp ###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without GPUCC but not ok with GPUCC before #578) else ifneq ($(shell $(CXX) --version | egrep '^(clang)'),) @@ -652,6 +654,12 @@ $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: cu_objects_lib += $(BUILDDIR)/fbridge_cu.o 
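# Two recurring fixes in these cudacpp.mk hunks (repeated for each process
# directory) are worth spelling out; the variable names below are sketches only.
# (1) The patsubst patterns now include the leading slash, so the derived homes
#     carry no trailing slash: with nvcc at /usr/local/cuda/bin/nvcc,
#       $(patsubst %/bin/nvcc,%,/usr/local/cuda/bin/nvcc) -> /usr/local/cuda
#     whereas the old %bin/nvcc pattern left /usr/local/cuda/ instead.
# (2) For #802, every link step switches driver when hipcc is detected, e.g.:
#       ifneq ($(findstring hipcc,$(GPUCC)),)
#         LINKER = $(FC)                    # Fortran driver for mixed fortran/c++/hip
#         EXTRALIBS = -lstdc++ -lamdhip64   # add the C++ and HIP runtimes explicitly
#       else
#         LINKER = $(GPUCC)
#         EXTRALIBS =
#       endif
#     LINKER and EXTRALIBS are illustrative names, not variables from the patch.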
$(LIBDIR)/lib$(MG5AMC_CULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cu_objects_lib) $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) +# Bypass std::filesystem completely to ease portability on LUMI #803 +#ifneq ($(findstring hipcc,$(GPUCC)),) +# $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -lstdc++fs +#else +# $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) +#endif endif #------------------------------------------------------------------------------- @@ -701,7 +709,11 @@ $(fcxx_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libg endif $(fcxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(fcxx_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) +ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 + $(FC) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) -lstdc++ +else $(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) +endif ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) @@ -713,8 +725,12 @@ $(fcu_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgf endif $(fcu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(fcu_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) +ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 + $(FC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) -lstdc++ -L$(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../../lib -lamdhip64 +else $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) endif +endif #------------------------------------------------------------------------------- @@ -779,6 +795,11 @@ $(testmain): LIBFLAGS += -lgomp endif endif +# Bypass std::filesystem completely to ease portability on LUMI #803 +#ifneq ($(findstring hipcc,$(GPUCC)),) +#$(testmain): LIBFLAGS += -lstdc++fs +#endif + ifeq ($(GPUCC),) # link only runTest.o $(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS) @@ -786,7 +807,11 @@ $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_object else # link both runTest.o and runTest_cu.o $(testmain): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) $(GTESTLIBS) - $(GPUCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS) +ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 + $(FC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS) -lstdc++ -lpthread -L$(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../../lib -lamdhip64 +else + $(GPUCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) 
$(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS) +endif endif # Use target gtestlibs to build only googletest diff --git a/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuConfig.h b/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuConfig.h index da4ba36ad8..06787c1c5e 100644 --- a/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuConfig.h @@ -15,6 +15,7 @@ #define MGONGPUCPP_GPUIMPL cuda #elif defined __HIPCC__ #define MGONGPUCPP_GPUIMPL hip +#include "hip/hip_runtime.h" // needed for blockDim, blockIdx, threadIdx: better in mgOnGpuConfig.h than in GpuAbstraction.h #else #undef MGONGPUCPP_GPUIMPL #endif diff --git a/epochX/cudacpp/ee_mumu.sa/src/read_slha.cc b/epochX/cudacpp/ee_mumu.sa/src/read_slha.cc index 055b19a779..f8e46f2e66 100644 --- a/epochX/cudacpp/ee_mumu.sa/src/read_slha.cc +++ b/epochX/cudacpp/ee_mumu.sa/src/read_slha.cc @@ -11,7 +11,11 @@ #include #include -#include +//#ifdef __HIPCC__ +//#include // see https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 +//#else +//#include // bypass this completely to ease portability on LUMI #803 +//#endif #include #include @@ -60,7 +64,14 @@ SLHAReader::read_slha_file( std::string file_name, bool verbose ) { std::cout << "WARNING! Card file '" << file_name << "' does not exist:" << " look for the file in directory $" << envpath << "='" << getenv( envpath ) << "'" << std::endl; + /* +#ifdef __HIPCC__ + const std::string file_name2 = std::experimental::filesystem::path( getenv( envpath ) ) / std::experimental::filesystem::path( file_name ).filename(); +#else const std::string file_name2 = std::filesystem::path( getenv( envpath ) ) / std::filesystem::path( file_name ).filename(); +#endif + */ + const std::string file_name2 = std::string( getenv( envpath ) ) + "/" + file_name; // bypass std::filesystem #803 param_card.open( file_name2.c_str(), std::ifstream::in ); if( param_card.good() ) { diff --git a/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt b/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt index dec257ae6a..9e8e783b82 100644 --- a/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt +++ b/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005725383758544922  +DEBUG: model prefixing takes 0.0057849884033203125  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -175,7 +175,7 @@ INFO: Generating Helas calls for process: g g > t t~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t t~ @1 INFO: Creating files in directory P1_gg_ttx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -191,7 +191,7 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
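A note on the recurring __CUDACC__ -> MGONGPUCPP_GPUIMPL replacements above (e.g. in check_sa.cc around FPEhandler): the new macro is defined for both nvcc and hipcc builds (see the mgOnGpuConfig.h hunks), so a single ifdef now selects the GPU namespace and GPU/CPU messages for either backend instead of testing for CUDA alone. A minimal sketch of the pattern, condensed from the hunks above; the printMode helper is hypothetical and added only for illustration:

// Sketch of the MGONGPUCPP_GPUIMPL selection pattern. The macro is defined
// (to cuda or hip) when compiling with nvcc or hipcc, as in mgOnGpuConfig.h,
// and left undefined for plain C++ builds.
#ifdef __CUDACC__
#define MGONGPUCPP_GPUIMPL cuda
#elif defined __HIPCC__
#define MGONGPUCPP_GPUIMPL hip
#include "hip/hip_runtime.h" // needed for blockDim, blockIdx, threadIdx
#endif

#include <iostream>

#ifdef MGONGPUCPP_GPUIMPL
namespace mg5amcGpu
#else
namespace mg5amcCpu
#endif
{
  // printMode is a hypothetical helper: code inside the namespace sees one
  // GPU/CPU switch that covers CUDA and HIP, mirroring the FPEhandler hunks.
  inline void printMode()
  {
#ifdef MGONGPUCPP_GPUIMPL
    std::cout << "GPU build (CUDA or HIP)" << std::endl;
#else
    std::cout << "CPU build" << std::endl;
#endif
  }
}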
INFO: Generating Feynman diagrams for Process: g g > t t~ WEIGHTED<=2 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttx Generated helas calls for 1 subprocesses (3 diagrams) in 0.006 s -Wrote files for 10 helas calls in 0.103 s +Wrote files for 10 helas calls in 0.104 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines @@ -200,7 +200,7 @@ ALOHA: aloha creates 2 routines in 0.148 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 4 routines in 0.134 s +ALOHA: aloha creates 4 routines in 0.135 s VVV1 FFV1 FFV1 @@ -237,10 +237,10 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. quit -real 0m1.739s -user 0m1.511s -sys 0m0.225s -Code generation completed in 1 seconds +real 0m1.738s +user 0m1.518s +sys 0m0.218s +Code generation completed in 2 seconds ************************************************************ * * * W E L C O M E to * diff --git a/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt b/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt index b2d1c1a436..5091b00a3e 100644 --- a/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt +++ b/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.0057523250579833984  +DEBUG: model prefixing takes 0.005614757537841797  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,7 +155,7 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=2: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ WEIGHTED<=2 @1 INFO: Process has 3 diagrams -1 processes with 3 diagrams generated in 0.009 s +1 processes with 3 diagrams generated in 0.008 s Total: 1 processes with 3 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gg_tt Load PLUGIN.CUDACPP_OUTPUT @@ -180,7 +180,7 @@ Generated helas calls for 1 subprocesses (3 diagrams) in 0.006 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 2 routines in 0.150 s +ALOHA: aloha creates 2 routines in 0.147 s VVV1 FFV1 FFV1 @@ -195,7 +195,7 @@ INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/. 
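The read_slha.cc hunks above replace std::filesystem::path composition with plain string concatenation, because <filesystem> (and <experimental/filesystem> on the ROCm 5.4 toolchain) hampered portability on LUMI (#803). A self-contained sketch of the fallback; resolveCardPath is a hypothetical wrapper around the inline logic of the patch, and envpath is assumed to name an environment variable holding a directory:

// Sketch of the std::filesystem bypass in read_slha.cc (#803): compose
// "<$envpath>/<file_name>" by string concatenation.
#include <cstdlib>
#include <string>

std::string resolveCardPath( const char* envpath, const std::string& file_name )
{
  const char* dir = std::getenv( envpath );
  if( dir == nullptr ) return file_name; // guard added here; the patch assumes the variable is set
  return std::string( dir ) + "/" + file_name; // bypass std::filesystem #803
}

Note that, unlike the disabled std::filesystem code, the fallback appends the full file_name rather than only its basename; that matches the patch as written.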
quit -real 0m0.551s -user 0m0.495s -sys 0m0.051s +real 0m0.552s +user 0m0.489s +sys 0m0.054s Code generation completed in 1 seconds diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk index 1414661db4..eefac8ff0d 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk @@ -808,7 +808,7 @@ else # link both runTest.o and runTest_cu.o $(testmain): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) $(GTESTLIBS) ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 - $(FC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS) -lstdc++ -L$(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../../lib -lamdhip64 + $(FC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS) -lstdc++ -lpthread -L$(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../../lib -lamdhip64 else $(GPUCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS) endif diff --git a/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt b/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt index f38b6ec6e6..8042cf580a 100644 --- a/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt +++ b/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005505561828613281  +DEBUG: model prefixing takes 0.0056362152099609375  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -185,7 +185,7 @@ INFO: Generating Helas calls for process: g g > t t~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t t~ @1 INFO: Creating files in directory P2_gg_ttxg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -202,7 +202,7 @@ INFO: Generating Feynman diagrams for Process: g g > t t~ g WEIGHTED<=3 @2 INFO: Finding symmetric diagrams for subprocess group gg_ttxg INFO: Creating files in directory P1_gg_ttx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -217,15 +217,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872] 
INFO: Generating Feynman diagrams for Process: g g > t t~ WEIGHTED<=2 @1
INFO: Finding symmetric diagrams for subprocess group gg_ttx
-Generated helas calls for 2 subprocesses (19 diagrams) in 0.042 s
-Wrote files for 46 helas calls in 0.243 s
+Generated helas calls for 2 subprocesses (19 diagrams) in 0.043 s
+Wrote files for 46 helas calls in 0.250 s
ALOHA: aloha starts to compute helicity amplitudes
ALOHA: aloha creates VVV1 routines
ALOHA: aloha creates FFV1 routines
ALOHA: aloha creates VVVV1 set of routines with options: P0
ALOHA: aloha creates VVVV3 set of routines with options: P0
ALOHA: aloha creates VVVV4 set of routines with options: P0
-ALOHA: aloha creates 5 routines in 0.324 s
+ALOHA: aloha creates 5 routines in 0.334 s
DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202] 
ALOHA: aloha starts to compute helicity amplitudes
ALOHA: aloha creates VVV1 routines
@@ -233,7 +233,7 @@ ALOHA: aloha creates FFV1 routines
ALOHA: aloha creates VVVV1 set of routines with options: P0
ALOHA: aloha creates VVVV3 set of routines with options: P0
ALOHA: aloha creates VVVV4 set of routines with options: P0
-ALOHA: aloha creates 10 routines in 0.308 s
+ALOHA: aloha creates 10 routines in 0.327 s
VVV1
VVV1
FFV1
@@ -283,10 +283,10 @@ Type "launch" to generate events from this process, or see
Run "open index.html" to see more information about this process.
quit
-real 0m2.484s
-user 0m2.030s
-sys 0m0.256s
-Code generation completed in 3 seconds
+real 0m2.353s
+user 0m2.114s
+sys 0m0.226s
+Code generation completed in 2 seconds
************************************************************
* *
* W E L C O M E to *
diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/Bridge.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/Bridge.h
index 89437b4c42..f9ed70dfde 100644
--- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/Bridge.h
+++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/Bridge.h
@@ -14,11 +14,18 @@
#include "MemoryAccessMomenta.h" // for MemoryAccessMomenta::neppM
#include "MemoryBuffers.h" // for HostBufferMomenta, DeviceBufferMomenta etc
+//#ifdef __HIPCC__
+//#include <experimental/filesystem> // see https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79
+//#else
+//#include <filesystem> // bypass this completely to ease portability on LUMI #803
+//#endif
+
+#include <sys/stat.h> // bypass std::filesystem #803
+
#include
#include
#include
#include
-#include <filesystem>
#include
#include
#include
@@ -255,10 +262,18 @@ namespace mg5amcCpu
// FIXME: the CPPProcess should really be a singleton? what if fbridgecreate is called from several Fortran threads?
CPPProcess process( /*verbose=*/false ); std::string paramCard = "../../Cards/param_card.dat"; - if( !std::filesystem::exists( paramCard ) ) - { - paramCard = "../" + paramCard; - } + /* +#ifdef __HIPCC__ + if( !std::experimental::filesystem::exists( paramCard ) ) paramCard = "../" + paramCard; +#else + if( !std::filesystem::exists( paramCard ) ) paramCard = "../" + paramCard; +#endif + */ + //struct stat dummybuffer; // bypass std::filesystem #803 + //if( !( stat( paramCard.c_str(), &dummyBuffer ) == 0 ) ) paramCard = "../" + paramCard; // + auto fileExists = []( std::string& fileName ) + { struct stat buffer; return stat( fileName.c_str(), &buffer ) == 0; }; + if( !fileExists( paramCard ) ) paramCard = "../" + paramCard; // bypass std::filesystem #803 process.initProc( paramCard ); } diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/GpuAbstraction.h index 6a7d9c05c0..9c467b1e04 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/GpuAbstraction.h @@ -39,8 +39,6 @@ #elif defined __HIPCC__ -#include "hip/hip_runtime.h" - #define gpuError_t hipError_t #define gpuPeekAtLastError hipPeekAtLastError #define gpuGetErrorString hipGetErrorString diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MadgraphTest.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MadgraphTest.h index a64c05c26a..6054185300 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MadgraphTest.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MadgraphTest.h @@ -14,7 +14,11 @@ #include #include -#include +//#ifdef __HIPCC__ +//#include // see https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 +//#else +//#include // bypass this completely to ease portability on LUMI #803 +//#endif #include #include #include @@ -219,7 +223,14 @@ TEST_P( MadgraphTest, CompareMomentaAndME ) const char* dumpEventsC = getenv( "CUDACPP_RUNTEST_DUMPEVENTS" ); const bool dumpEvents = ( dumpEventsC != 0 ) && ( std::string( dumpEventsC ) != "" ); const std::string refFileName = testDriver->getRefFileName(); + /* +#ifdef __HIPCC__ + const std::string dumpFileName = std::experimental::filesystem::path( refFileName ).filename(); +#else const std::string dumpFileName = std::filesystem::path( refFileName ).filename(); +#endif + */ + const std::string dumpFileName = refFileName; // bypass std::filesystem #803 std::ofstream dumpFile; if( dumpEvents ) { diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc index f20c229897..dbaa56b35c 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc @@ -574,6 +574,10 @@ namespace mg5amcCpu out << "Apple clang " << __clang_major__ << "." << __clang_minor__ << "." << __clang_patchlevel__; #else out << "clang " << __clang_major__ << "." << __clang_minor__ << "." << __clang_patchlevel__; + /* + // === AV 26-Jan-2024 DISABLE THIS CODE (START) + // === AV 26-Jan-2024 First, it is totally wrong to assume that the CXX environment variable is used in the build! + // === AV 26-Jan-2024 Second and worse, here we need build time values, while CXX in this code is evaluated at runtime! 
// GCC toolchain version inside CLANG std::string tchainout; std::string tchaincmd = "readelf -p .comment $(${CXX} -print-libgcc-file-name) |& grep 'GCC: (GNU)' | grep -v Warning | sort -u | awk '{print $5}'"; @@ -587,6 +591,8 @@ namespace mg5amcCpu #else out << " (gcc " << tchainout << ")"; #endif + // === AV 26-Jan-2024 DISABLE THIS CODE (END) + */ #endif #else out << "clang UNKNOWKN"; diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/check_sa.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/check_sa.cc index 7cac5ab47b..aab490dc5b 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/check_sa.cc +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/check_sa.cc @@ -76,7 +76,7 @@ usage( char* argv0, int ret = 1 ) return ret; } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -84,7 +84,7 @@ namespace mg5amcCpu { inline void FPEhandler( int sig ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL std::cerr << "Floating Point Exception (GPU)" << std::endl; #else std::cerr << "Floating Point Exception (CPU)" << std::endl; diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CPPProcess.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CPPProcess.cc index 3c7715b235..f80a0127b0 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CPPProcess.cc +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CPPProcess.cc @@ -795,6 +795,10 @@ namespace mg5amcCpu out << "Apple clang " << __clang_major__ << "." << __clang_minor__ << "." << __clang_patchlevel__; #else out << "clang " << __clang_major__ << "." << __clang_minor__ << "." << __clang_patchlevel__; + /* + // === AV 26-Jan-2024 DISABLE THIS CODE (START) + // === AV 26-Jan-2024 First, it is totally wrong to assume that the CXX environment variable is used in the build! + // === AV 26-Jan-2024 Second and worse, here we need build time values, while CXX in this code is evaluated at runtime! 
// GCC toolchain version inside CLANG std::string tchainout; std::string tchaincmd = "readelf -p .comment $(${CXX} -print-libgcc-file-name) |& grep 'GCC: (GNU)' | grep -v Warning | sort -u | awk '{print $5}'"; @@ -808,6 +812,8 @@ namespace mg5amcCpu #else out << " (gcc " << tchainout << ")"; #endif + // === AV 26-Jan-2024 DISABLE THIS CODE (END) + */ #endif #else out << "clang UNKNOWKN"; diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/check_sa.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/check_sa.cc index 7cac5ab47b..aab490dc5b 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/check_sa.cc +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/check_sa.cc @@ -76,7 +76,7 @@ usage( char* argv0, int ret = 1 ) return ret; } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -84,7 +84,7 @@ namespace mg5amcCpu { inline void FPEhandler( int sig ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL std::cerr << "Floating Point Exception (GPU)" << std::endl; #else std::cerr << "Floating Point Exception (CPU)" << std::endl; diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cudacpp.mk index f2cfa349da..eefac8ff0d 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cudacpp.mk @@ -139,13 +139,13 @@ endif # If CUDA_HOME is not set, try to set it from the path to nvcc ifndef CUDA_HOME - CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null)) + CUDA_HOME = $(patsubst %/bin/nvcc,%,$(shell which nvcc 2>/dev/null)) $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") endif # If HIP_HOME is not set, try to set it from the path to hipcc ifndef HIP_HOME - HIP_HOME = $(patsubst %bin/hipcc,%,$(HIP_COMPILER_PATH)) + HIP_HOME = $(patsubst %/bin/hipcc,%,$(shell which hipcc 2>/dev/null)) $(warning HIP_HOME was not set: using "$(HIP_HOME)") endif @@ -294,7 +294,9 @@ endif #=== Configure defaults and check if user-defined choices exist for OMPFLAGS, AVX, FPTYPE, HELINL, HRDCOD, RNDGEN # Set the default OMPFLAGS choice -ifneq ($(shell $(CXX) --version | egrep '^Intel'),) +ifneq ($(findstring hipcc,$(GPUCC)),) +override OMPFLAGS = # disable OpenMP MT when using hipcc #802 +else ifneq ($(shell $(CXX) --version | egrep '^Intel'),) override OMPFLAGS = -fopenmp ###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without GPUCC but not ok with GPUCC before #578) else ifneq ($(shell $(CXX) --version | egrep '^(clang)'),) @@ -652,6 +654,12 @@ $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: cu_objects_lib += $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cu_objects_lib) $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) +# Bypass std::filesystem completely to ease portability on LUMI #803 +#ifneq ($(findstring hipcc,$(GPUCC)),) +# $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -lstdc++fs +#else +# $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) +#endif endif #------------------------------------------------------------------------------- @@ -701,7 +709,11 @@ $(fcxx_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libg endif $(fcxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(fcxx_main): $(BUILDDIR)/fcheck_sa.o 
$(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) +ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 + $(FC) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) -lstdc++ +else $(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) +endif ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) @@ -713,8 +725,12 @@ $(fcu_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgf endif $(fcu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(fcu_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) +ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 + $(FC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) -lstdc++ -L$(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../../lib -lamdhip64 +else $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) endif +endif #------------------------------------------------------------------------------- @@ -779,6 +795,11 @@ $(testmain): LIBFLAGS += -lgomp endif endif +# Bypass std::filesystem completely to ease portability on LUMI #803 +#ifneq ($(findstring hipcc,$(GPUCC)),) +#$(testmain): LIBFLAGS += -lstdc++fs +#endif + ifeq ($(GPUCC),) # link only runTest.o $(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS) @@ -786,7 +807,11 @@ $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_object else # link both runTest.o and runTest_cu.o $(testmain): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) $(GTESTLIBS) - $(GPUCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS) +ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 + $(FC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS) -lstdc++ -lpthread -L$(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../../lib -lamdhip64 +else + $(GPUCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS) +endif endif # Use target gtestlibs to build only googletest diff --git a/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuConfig.h index 55d03f1252..69cee0085b 100644 --- a/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuConfig.h @@ -15,6 +15,7 @@ #define MGONGPUCPP_GPUIMPL cuda #elif defined __HIPCC__ #define MGONGPUCPP_GPUIMPL hip +#include "hip/hip_runtime.h" // needed for blockDim, blockIdx, threadIdx: better in mgOnGpuConfig.h than in GpuAbstraction.h #else #undef MGONGPUCPP_GPUIMPL #endif diff --git a/epochX/cudacpp/gg_tt01g.mad/src/read_slha.cc b/epochX/cudacpp/gg_tt01g.mad/src/read_slha.cc index 055b19a779..f8e46f2e66 100644 --- 
a/epochX/cudacpp/gg_tt01g.mad/src/read_slha.cc +++ b/epochX/cudacpp/gg_tt01g.mad/src/read_slha.cc @@ -11,7 +11,11 @@ #include #include -#include +//#ifdef __HIPCC__ +//#include // see https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 +//#else +//#include // bypass this completely to ease portability on LUMI #803 +//#endif #include #include @@ -60,7 +64,14 @@ SLHAReader::read_slha_file( std::string file_name, bool verbose ) { std::cout << "WARNING! Card file '" << file_name << "' does not exist:" << " look for the file in directory $" << envpath << "='" << getenv( envpath ) << "'" << std::endl; + /* +#ifdef __HIPCC__ + const std::string file_name2 = std::experimental::filesystem::path( getenv( envpath ) ) / std::experimental::filesystem::path( file_name ).filename(); +#else const std::string file_name2 = std::filesystem::path( getenv( envpath ) ) / std::filesystem::path( file_name ).filename(); +#endif + */ + const std::string file_name2 = std::string( getenv( envpath ) ) + "/" + file_name; // bypass std::filesystem #803 param_card.open( file_name2.c_str(), std::ifstream::in ); if( param_card.good() ) { diff --git a/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt b/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt index 00ae96c5fb..1dbabb94a0 100644 --- a/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt +++ b/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.0055010318756103516  +DEBUG: model prefixing takes 0.005898475646972656  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,7 +155,7 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=3: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g WEIGHTED<=3 @1 INFO: Process has 16 diagrams -1 processes with 16 diagrams generated in 0.022 s +1 processes with 16 diagrams generated in 0.023 s Total: 1 processes with 16 diagrams output madevent ../TMPOUT/CODEGEN_mad_gg_ttg --hel_recycling=False --vector_size=32 --me_exporter=standalone_cudacpp Load PLUGIN.CUDACPP_OUTPUT @@ -175,7 +175,7 @@ INFO: Generating Helas calls for process: g g > t t~ g WEIGHTED<=3 @1 INFO: Processing color information for process: g g > t t~ g @1 INFO: Creating files in directory P1_gg_ttxg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -190,15 +190,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
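The Bridge.h hunks above swap std::filesystem::exists for a POSIX stat() probe wrapped in a lambda. A standalone sketch of the same check, using the names from the patch inside a trivial main for self-containment:

// Sketch of the stat()-based existence check that replaces
// std::filesystem::exists in Bridge.h (#803).
#include <string>
#include <sys/stat.h>

int main()
{
  std::string paramCard = "../../Cards/param_card.dat";
  // stat() returns 0 when the directory entry exists, which is all the
  // original std::filesystem::exists call was used for here.
  auto fileExists = []( const std::string& fileName )
  { struct stat buffer; return stat( fileName.c_str(), &buffer ) == 0; };
  if( !fileExists( paramCard ) ) paramCard = "../" + paramCard; // bypass std::filesystem #803
  // process.initProc( paramCard ) would follow here in the real code
  return 0;
}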
DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872] 
INFO: Generating Feynman diagrams for Process: g g > t t~ g WEIGHTED<=3 @1
INFO: Finding symmetric diagrams for subprocess group gg_ttxg
-Generated helas calls for 1 subprocesses (16 diagrams) in 0.039 s
-Wrote files for 36 helas calls in 0.184 s
+Generated helas calls for 1 subprocesses (16 diagrams) in 0.040 s
+Wrote files for 36 helas calls in 0.158 s
ALOHA: aloha starts to compute helicity amplitudes
ALOHA: aloha creates VVV1 routines
ALOHA: aloha creates FFV1 routines
ALOHA: aloha creates VVVV1 set of routines with options: P0
ALOHA: aloha creates VVVV3 set of routines with options: P0
ALOHA: aloha creates VVVV4 set of routines with options: P0
-ALOHA: aloha creates 5 routines in 0.325 s
+ALOHA: aloha creates 5 routines in 0.348 s
DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202] 
ALOHA: aloha starts to compute helicity amplitudes
ALOHA: aloha creates VVV1 routines
@@ -206,7 +206,7 @@ ALOHA: aloha creates FFV1 routines
ALOHA: aloha creates VVVV1 set of routines with options: P0
ALOHA: aloha creates VVVV3 set of routines with options: P0
ALOHA: aloha creates VVVV4 set of routines with options: P0
-ALOHA: aloha creates 10 routines in 0.310 s
+ALOHA: aloha creates 10 routines in 0.329 s
VVV1
VVV1
FFV1
@@ -252,10 +252,10 @@ Type "launch" to generate events from this process, or see
Run "open index.html" to see more information about this process.
quit
-real 0m2.571s
-user 0m1.941s
-sys 0m0.238s
-Code generation completed in 2 seconds
+real 0m4.472s
+user 0m2.033s
+sys 0m0.236s
+Code generation completed in 5 seconds
************************************************************
* *
* W E L C O M E to *
diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/Bridge.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/Bridge.h
index 89437b4c42..f9ed70dfde 100644
--- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/Bridge.h
+++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/Bridge.h
@@ -14,11 +14,18 @@
#include "MemoryAccessMomenta.h" // for MemoryAccessMomenta::neppM
#include "MemoryBuffers.h" // for HostBufferMomenta, DeviceBufferMomenta etc
+//#ifdef __HIPCC__
+//#include <experimental/filesystem> // see https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79
+//#else
+//#include <filesystem> // bypass this completely to ease portability on LUMI #803
+//#endif
+
+#include <sys/stat.h> // bypass std::filesystem #803
+
#include
#include
#include
#include
-#include <filesystem>
#include
#include
#include
@@ -255,10 +262,18 @@ namespace mg5amcCpu
// FIXME: the CPPProcess should really be a singleton? what if fbridgecreate is called from several Fortran threads?
CPPProcess process( /*verbose=*/false ); std::string paramCard = "../../Cards/param_card.dat"; - if( !std::filesystem::exists( paramCard ) ) - { - paramCard = "../" + paramCard; - } + /* +#ifdef __HIPCC__ + if( !std::experimental::filesystem::exists( paramCard ) ) paramCard = "../" + paramCard; +#else + if( !std::filesystem::exists( paramCard ) ) paramCard = "../" + paramCard; +#endif + */ + //struct stat dummybuffer; // bypass std::filesystem #803 + //if( !( stat( paramCard.c_str(), &dummyBuffer ) == 0 ) ) paramCard = "../" + paramCard; // + auto fileExists = []( std::string& fileName ) + { struct stat buffer; return stat( fileName.c_str(), &buffer ) == 0; }; + if( !fileExists( paramCard ) ) paramCard = "../" + paramCard; // bypass std::filesystem #803 process.initProc( paramCard ); } diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/GpuAbstraction.h index 6a7d9c05c0..9c467b1e04 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/GpuAbstraction.h @@ -39,8 +39,6 @@ #elif defined __HIPCC__ -#include "hip/hip_runtime.h" - #define gpuError_t hipError_t #define gpuPeekAtLastError hipPeekAtLastError #define gpuGetErrorString hipGetErrorString diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MadgraphTest.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MadgraphTest.h index a64c05c26a..6054185300 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MadgraphTest.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MadgraphTest.h @@ -14,7 +14,11 @@ #include #include -#include +//#ifdef __HIPCC__ +//#include // see https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 +//#else +//#include // bypass this completely to ease portability on LUMI #803 +//#endif #include #include #include @@ -219,7 +223,14 @@ TEST_P( MadgraphTest, CompareMomentaAndME ) const char* dumpEventsC = getenv( "CUDACPP_RUNTEST_DUMPEVENTS" ); const bool dumpEvents = ( dumpEventsC != 0 ) && ( std::string( dumpEventsC ) != "" ); const std::string refFileName = testDriver->getRefFileName(); + /* +#ifdef __HIPCC__ + const std::string dumpFileName = std::experimental::filesystem::path( refFileName ).filename(); +#else const std::string dumpFileName = std::filesystem::path( refFileName ).filename(); +#endif + */ + const std::string dumpFileName = refFileName; // bypass std::filesystem #803 std::ofstream dumpFile; if( dumpEvents ) { diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc index 0e4d5d1157..2fa9b4f651 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc @@ -795,6 +795,10 @@ namespace mg5amcCpu out << "Apple clang " << __clang_major__ << "." << __clang_minor__ << "." << __clang_patchlevel__; #else out << "clang " << __clang_major__ << "." << __clang_minor__ << "." << __clang_patchlevel__; + /* + // === AV 26-Jan-2024 DISABLE THIS CODE (START) + // === AV 26-Jan-2024 First, it is totally wrong to assume that the CXX environment variable is used in the build! + // === AV 26-Jan-2024 Second and worse, here we need build time values, while CXX in this code is evaluated at runtime! 
// GCC toolchain version inside CLANG std::string tchainout; std::string tchaincmd = "readelf -p .comment $(${CXX} -print-libgcc-file-name) |& grep 'GCC: (GNU)' | grep -v Warning | sort -u | awk '{print $5}'"; @@ -808,6 +812,8 @@ namespace mg5amcCpu #else out << " (gcc " << tchainout << ")"; #endif + // === AV 26-Jan-2024 DISABLE THIS CODE (END) + */ #endif #else out << "clang UNKNOWKN"; diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/check_sa.cc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/check_sa.cc index 7cac5ab47b..aab490dc5b 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/check_sa.cc +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/check_sa.cc @@ -76,7 +76,7 @@ usage( char* argv0, int ret = 1 ) return ret; } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -84,7 +84,7 @@ namespace mg5amcCpu { inline void FPEhandler( int sig ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL std::cerr << "Floating Point Exception (GPU)" << std::endl; #else std::cerr << "Floating Point Exception (CPU)" << std::endl; diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp.mk index f2cfa349da..eefac8ff0d 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp.mk @@ -139,13 +139,13 @@ endif # If CUDA_HOME is not set, try to set it from the path to nvcc ifndef CUDA_HOME - CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null)) + CUDA_HOME = $(patsubst %/bin/nvcc,%,$(shell which nvcc 2>/dev/null)) $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") endif # If HIP_HOME is not set, try to set it from the path to hipcc ifndef HIP_HOME - HIP_HOME = $(patsubst %bin/hipcc,%,$(HIP_COMPILER_PATH)) + HIP_HOME = $(patsubst %/bin/hipcc,%,$(shell which hipcc 2>/dev/null)) $(warning HIP_HOME was not set: using "$(HIP_HOME)") endif @@ -294,7 +294,9 @@ endif #=== Configure defaults and check if user-defined choices exist for OMPFLAGS, AVX, FPTYPE, HELINL, HRDCOD, RNDGEN # Set the default OMPFLAGS choice -ifneq ($(shell $(CXX) --version | egrep '^Intel'),) +ifneq ($(findstring hipcc,$(GPUCC)),) +override OMPFLAGS = # disable OpenMP MT when using hipcc #802 +else ifneq ($(shell $(CXX) --version | egrep '^Intel'),) override OMPFLAGS = -fopenmp ###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without GPUCC but not ok with GPUCC before #578) else ifneq ($(shell $(CXX) --version | egrep '^(clang)'),) @@ -652,6 +654,12 @@ $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: cu_objects_lib += $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cu_objects_lib) $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) +# Bypass std::filesystem completely to ease portability on LUMI #803 +#ifneq ($(findstring hipcc,$(GPUCC)),) +# $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -lstdc++fs +#else +# $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) +#endif endif #------------------------------------------------------------------------------- @@ -701,7 +709,11 @@ $(fcxx_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libg endif $(fcxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(fcxx_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o 
$(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) +ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 + $(FC) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) -lstdc++ +else $(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) +endif ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) @@ -713,8 +725,12 @@ $(fcu_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgf endif $(fcu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(fcu_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) +ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 + $(FC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) -lstdc++ -L$(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../../lib -lamdhip64 +else $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) endif +endif #------------------------------------------------------------------------------- @@ -779,6 +795,11 @@ $(testmain): LIBFLAGS += -lgomp endif endif +# Bypass std::filesystem completely to ease portability on LUMI #803 +#ifneq ($(findstring hipcc,$(GPUCC)),) +#$(testmain): LIBFLAGS += -lstdc++fs +#endif + ifeq ($(GPUCC),) # link only runTest.o $(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS) @@ -786,7 +807,11 @@ $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_object else # link both runTest.o and runTest_cu.o $(testmain): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) $(GTESTLIBS) - $(GPUCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS) +ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 + $(FC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS) -lstdc++ -lpthread -L$(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../../lib -lamdhip64 +else + $(GPUCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS) +endif endif # Use target gtestlibs to build only googletest diff --git a/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuConfig.h index 55d03f1252..69cee0085b 100644 --- a/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuConfig.h @@ -15,6 +15,7 @@ #define MGONGPUCPP_GPUIMPL cuda #elif defined __HIPCC__ #define MGONGPUCPP_GPUIMPL hip +#include "hip/hip_runtime.h" // needed for blockDim, blockIdx, threadIdx: better in mgOnGpuConfig.h than in GpuAbstraction.h #else #undef MGONGPUCPP_GPUIMPL #endif diff --git a/epochX/cudacpp/gg_ttg.mad/src/read_slha.cc b/epochX/cudacpp/gg_ttg.mad/src/read_slha.cc index 055b19a779..f8e46f2e66 100644 --- a/epochX/cudacpp/gg_ttg.mad/src/read_slha.cc +++ 
b/epochX/cudacpp/gg_ttg.mad/src/read_slha.cc @@ -11,7 +11,11 @@ #include #include -#include +//#ifdef __HIPCC__ +//#include // see https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 +//#else +//#include // bypass this completely to ease portability on LUMI #803 +//#endif #include #include @@ -60,7 +64,14 @@ SLHAReader::read_slha_file( std::string file_name, bool verbose ) { std::cout << "WARNING! Card file '" << file_name << "' does not exist:" << " look for the file in directory $" << envpath << "='" << getenv( envpath ) << "'" << std::endl; + /* +#ifdef __HIPCC__ + const std::string file_name2 = std::experimental::filesystem::path( getenv( envpath ) ) / std::experimental::filesystem::path( file_name ).filename(); +#else const std::string file_name2 = std::filesystem::path( getenv( envpath ) ) / std::filesystem::path( file_name ).filename(); +#endif + */ + const std::string file_name2 = std::string( getenv( envpath ) ) + "/" + file_name; // bypass std::filesystem #803 param_card.open( file_name2.c_str(), std::ifstream::in ); if( param_card.good() ) { diff --git a/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt b/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt index ee1a51555d..81fcb8c8ed 100644 --- a/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt +++ b/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.0054416656494140625  +DEBUG: model prefixing takes 0.005418062210083008  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,7 +155,7 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=3: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g WEIGHTED<=3 @1 INFO: Process has 16 diagrams -1 processes with 16 diagrams generated in 0.021 s +1 processes with 16 diagrams generated in 0.022 s Total: 1 processes with 16 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gg_ttg Load PLUGIN.CUDACPP_OUTPUT @@ -183,7 +183,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 5 routines in 0.345 s +ALOHA: aloha creates 5 routines in 0.335 s VVV1 VVV1 FFV1 @@ -203,7 +203,7 @@ INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/. 
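The GpuAbstraction.h hunks above only remove the hip_runtime include, which now lives in mgOnGpuConfig.h next to the MGONGPUCPP_GPUIMPL definition so that blockDim, blockIdx and threadIdx are declared wherever the config header is used (hipcc, unlike nvcc, does not provide them implicitly). The rest of the header is one-to-one aliasing of runtime API names; a sketch condensed from the aliases visible in the hunks:

// Condensed from GpuAbstraction.h: the same gpu* names resolve to the CUDA
// or HIP runtime API, so kernels and host code never spell out either one.
#ifdef __CUDACC__
#define gpuError_t cudaError_t
#define gpuPeekAtLastError cudaPeekAtLastError
#define gpuGetErrorString cudaGetErrorString
#elif defined __HIPCC__
#define gpuError_t hipError_t
#define gpuPeekAtLastError hipPeekAtLastError
#define gpuGetErrorString hipGetErrorString
#endif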
quit -real 0m0.803s -user 0m0.731s -sys 0m0.066s +real 0m0.807s +user 0m0.747s +sys 0m0.049s Code generation completed in 1 seconds diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/Bridge.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/Bridge.h index 89437b4c42..f9ed70dfde 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/Bridge.h @@ -14,11 +14,18 @@ #include "MemoryAccessMomenta.h" // for MemoryAccessMomenta::neppM #include "MemoryBuffers.h" // for HostBufferMomenta, DeviceBufferMomenta etc +//#ifdef __HIPCC__ +//#include // see https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 +//#else +//#include // bypass this completely to ease portability on LUMI #803 +//#endif + +#include // bypass std::filesystem #803 + #include #include #include #include -#include #include #include #include @@ -255,10 +262,18 @@ namespace mg5amcCpu // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate is called from several Fortran threads? CPPProcess process( /*verbose=*/false ); std::string paramCard = "../../Cards/param_card.dat"; - if( !std::filesystem::exists( paramCard ) ) - { - paramCard = "../" + paramCard; - } + /* +#ifdef __HIPCC__ + if( !std::experimental::filesystem::exists( paramCard ) ) paramCard = "../" + paramCard; +#else + if( !std::filesystem::exists( paramCard ) ) paramCard = "../" + paramCard; +#endif + */ + //struct stat dummybuffer; // bypass std::filesystem #803 + //if( !( stat( paramCard.c_str(), &dummyBuffer ) == 0 ) ) paramCard = "../" + paramCard; // + auto fileExists = []( std::string& fileName ) + { struct stat buffer; return stat( fileName.c_str(), &buffer ) == 0; }; + if( !fileExists( paramCard ) ) paramCard = "../" + paramCard; // bypass std::filesystem #803 process.initProc( paramCard ); } diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/GpuAbstraction.h index 6a7d9c05c0..9c467b1e04 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/GpuAbstraction.h @@ -39,8 +39,6 @@ #elif defined __HIPCC__ -#include "hip/hip_runtime.h" - #define gpuError_t hipError_t #define gpuPeekAtLastError hipPeekAtLastError #define gpuGetErrorString hipGetErrorString diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MadgraphTest.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MadgraphTest.h index a64c05c26a..6054185300 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MadgraphTest.h +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MadgraphTest.h @@ -14,7 +14,11 @@ #include #include -#include +//#ifdef __HIPCC__ +//#include // see https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 +//#else +//#include // bypass this completely to ease portability on LUMI #803 +//#endif #include #include #include @@ -219,7 +223,14 @@ TEST_P( MadgraphTest, CompareMomentaAndME ) const char* dumpEventsC = getenv( "CUDACPP_RUNTEST_DUMPEVENTS" ); const bool dumpEvents = ( dumpEventsC != 0 ) && ( std::string( dumpEventsC ) != "" ); const std::string refFileName = testDriver->getRefFileName(); + /* +#ifdef __HIPCC__ + const std::string dumpFileName = std::experimental::filesystem::path( refFileName ).filename(); +#else const std::string dumpFileName = std::filesystem::path( refFileName ).filename(); +#endif + */ + const std::string dumpFileName = refFileName; // bypass std::filesystem #803 std::ofstream dumpFile; if( dumpEvents ) { diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.cc 
b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.cc index 2e02593919..661197ace8 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.cc +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.cc @@ -789,6 +789,10 @@ namespace mg5amcCpu out << "Apple clang " << __clang_major__ << "." << __clang_minor__ << "." << __clang_patchlevel__; #else out << "clang " << __clang_major__ << "." << __clang_minor__ << "." << __clang_patchlevel__; + /* + // === AV 26-Jan-2024 DISABLE THIS CODE (START) + // === AV 26-Jan-2024 First, it is totally wrong to assume that the CXX environment variable is used in the build! + // === AV 26-Jan-2024 Second and worse, here we need build time values, while CXX in this code is evaluated at runtime! // GCC toolchain version inside CLANG std::string tchainout; std::string tchaincmd = "readelf -p .comment $(${CXX} -print-libgcc-file-name) |& grep 'GCC: (GNU)' | grep -v Warning | sort -u | awk '{print $5}'"; @@ -802,6 +806,8 @@ namespace mg5amcCpu #else out << " (gcc " << tchainout << ")"; #endif + // === AV 26-Jan-2024 DISABLE THIS CODE (END) + */ #endif #else out << "clang UNKNOWKN"; diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/check_sa.cc b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/check_sa.cc index 7cac5ab47b..aab490dc5b 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/check_sa.cc +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/check_sa.cc @@ -76,7 +76,7 @@ usage( char* argv0, int ret = 1 ) return ret; } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -84,7 +84,7 @@ namespace mg5amcCpu { inline void FPEhandler( int sig ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL std::cerr << "Floating Point Exception (GPU)" << std::endl; #else std::cerr << "Floating Point Exception (CPU)" << std::endl; diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp.mk index f2cfa349da..eefac8ff0d 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp.mk @@ -139,13 +139,13 @@ endif # If CUDA_HOME is not set, try to set it from the path to nvcc ifndef CUDA_HOME - CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null)) + CUDA_HOME = $(patsubst %/bin/nvcc,%,$(shell which nvcc 2>/dev/null)) $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") endif # If HIP_HOME is not set, try to set it from the path to hipcc ifndef HIP_HOME - HIP_HOME = $(patsubst %bin/hipcc,%,$(HIP_COMPILER_PATH)) + HIP_HOME = $(patsubst %/bin/hipcc,%,$(shell which hipcc 2>/dev/null)) $(warning HIP_HOME was not set: using "$(HIP_HOME)") endif @@ -294,7 +294,9 @@ endif #=== Configure defaults and check if user-defined choices exist for OMPFLAGS, AVX, FPTYPE, HELINL, HRDCOD, RNDGEN # Set the default OMPFLAGS choice -ifneq ($(shell $(CXX) --version | egrep '^Intel'),) +ifneq ($(findstring hipcc,$(GPUCC)),) +override OMPFLAGS = # disable OpenMP MT when using hipcc #802 +else ifneq ($(shell $(CXX) --version | egrep '^Intel'),) override OMPFLAGS = -fopenmp ###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without GPUCC but not ok with GPUCC before #578) else ifneq ($(shell $(CXX) --version | egrep '^(clang)'),) @@ -652,6 +654,12 @@ $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: cu_objects_lib += $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: 
$(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cu_objects_lib) $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) +# Bypass std::filesystem completely to ease portability on LUMI #803 +#ifneq ($(findstring hipcc,$(GPUCC)),) +# $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -lstdc++fs +#else +# $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) +#endif endif #------------------------------------------------------------------------------- @@ -701,7 +709,11 @@ $(fcxx_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libg endif $(fcxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(fcxx_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) +ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 + $(FC) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) -lstdc++ +else $(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) +endif ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) @@ -713,8 +725,12 @@ $(fcu_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgf endif $(fcu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(fcu_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) +ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 + $(FC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) -lstdc++ -L$(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../../lib -lamdhip64 +else $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) endif +endif #------------------------------------------------------------------------------- @@ -779,6 +795,11 @@ $(testmain): LIBFLAGS += -lgomp endif endif +# Bypass std::filesystem completely to ease portability on LUMI #803 +#ifneq ($(findstring hipcc,$(GPUCC)),) +#$(testmain): LIBFLAGS += -lstdc++fs +#endif + ifeq ($(GPUCC),) # link only runTest.o $(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS) @@ -786,7 +807,11 @@ $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_object else # link both runTest.o and runTest_cu.o $(testmain): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) $(GTESTLIBS) - $(GPUCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS) +ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 + $(FC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS) -lstdc++ -lpthread -L$(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../../lib -lamdhip64 +else + $(GPUCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) 
$(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS) +endif endif # Use target gtestlibs to build only googletest diff --git a/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuConfig.h index da4ba36ad8..06787c1c5e 100644 --- a/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuConfig.h @@ -15,6 +15,7 @@ #define MGONGPUCPP_GPUIMPL cuda #elif defined __HIPCC__ #define MGONGPUCPP_GPUIMPL hip +#include "hip/hip_runtime.h" // needed for blockDim, blockIdx, threadIdx: better in mgOnGpuConfig.h than in GpuAbstraction.h #else #undef MGONGPUCPP_GPUIMPL #endif diff --git a/epochX/cudacpp/gg_ttg.sa/src/read_slha.cc b/epochX/cudacpp/gg_ttg.sa/src/read_slha.cc index 055b19a779..f8e46f2e66 100644 --- a/epochX/cudacpp/gg_ttg.sa/src/read_slha.cc +++ b/epochX/cudacpp/gg_ttg.sa/src/read_slha.cc @@ -11,7 +11,11 @@ #include #include -#include +//#ifdef __HIPCC__ +//#include // see https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 +//#else +//#include // bypass this completely to ease portability on LUMI #803 +//#endif #include #include @@ -60,7 +64,14 @@ SLHAReader::read_slha_file( std::string file_name, bool verbose ) { std::cout << "WARNING! Card file '" << file_name << "' does not exist:" << " look for the file in directory $" << envpath << "='" << getenv( envpath ) << "'" << std::endl; + /* +#ifdef __HIPCC__ + const std::string file_name2 = std::experimental::filesystem::path( getenv( envpath ) ) / std::experimental::filesystem::path( file_name ).filename(); +#else const std::string file_name2 = std::filesystem::path( getenv( envpath ) ) / std::filesystem::path( file_name ).filename(); +#endif + */ + const std::string file_name2 = std::string( getenv( envpath ) ) + "/" + file_name; // bypass std::filesystem #803 param_card.open( file_name2.c_str(), std::ifstream::in ); if( param_card.good() ) { diff --git a/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt b/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt index 3a2b1ad647..a9bcc2504b 100644 --- a/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt +++ b/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.0053348541259765625  +DEBUG: model prefixing takes 0.005433797836303711  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,7 +155,7 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=4: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Process has 123 diagrams -1 processes with 123 diagrams generated in 0.156 s +1 processes with 123 diagrams generated in 0.161 s Total: 1 processes with 123 diagrams output madevent ../TMPOUT/CODEGEN_mad_gg_ttgg --hel_recycling=False --vector_size=32 --me_exporter=standalone_cudacpp Load PLUGIN.CUDACPP_OUTPUT @@ -175,7 +175,7 @@ INFO: Generating Helas calls for process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Processing color information for process: g g > t t~ g g @1 INFO: Creating files in directory P1_gg_ttxgg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . 
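[Editor's note on the mgOnGpuConfig.h hunks above: unlike CUDA, where blockDim, blockIdx and threadIdx are compiler builtins, HIP declares them in its runtime header, so that header must be visible wherever kernels are compiled. A minimal sketch of the dispatch as this patch sets it up; the illustrativeKernel below is hypothetical, added only to show why the include is needed.]

    #ifdef __CUDACC__
    #define MGONGPUCPP_GPUIMPL cuda
    #elif defined __HIPCC__
    #define MGONGPUCPP_GPUIMPL hip
    #include "hip/hip_runtime.h" // provides blockDim, blockIdx, threadIdx under hipcc
    #else
    #undef MGONGPUCPP_GPUIMPL
    #endif

    #ifdef MGONGPUCPP_GPUIMPL
    // Hypothetical kernel (not part of this patch): it compiles under nvcc via
    // builtins, and under hipcc only because hip_runtime.h is already in scope.
    __global__ void illustrativeKernel( double* out )
    {
      const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
      out[ievt] = 0.;
    }
    #endif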
FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -190,15 +190,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxgg -Generated helas calls for 1 subprocesses (123 diagrams) in 0.437 s -Wrote files for 222 helas calls in 0.735 s +Generated helas calls for 1 subprocesses (123 diagrams) in 0.446 s +Wrote files for 222 helas calls in 0.719 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.441 s +ALOHA: aloha creates 5 routines in 0.347 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines @@ -206,7 +206,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 10 routines in 0.309 s +ALOHA: aloha creates 10 routines in 0.327 s VVV1 VVV1 FFV1 @@ -255,9 +255,9 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. quit -real 0m3.582s -user 0m3.061s -sys 0m0.243s +real 0m3.429s +user 0m3.139s +sys 0m0.247s Code generation completed in 3 seconds ************************************************************ * * diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/Bridge.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/Bridge.h index 89437b4c42..f9ed70dfde 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/Bridge.h @@ -14,11 +14,18 @@ #include "MemoryAccessMomenta.h" // for MemoryAccessMomenta::neppM #include "MemoryBuffers.h" // for HostBufferMomenta, DeviceBufferMomenta etc +//#ifdef __HIPCC__ +//#include // see https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 +//#else +//#include // bypass this completely to ease portability on LUMI #803 +//#endif + +#include // bypass std::filesystem #803 + #include #include #include #include -#include #include #include #include @@ -255,10 +262,18 @@ namespace mg5amcCpu // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate is called from several Fortran threads? 
CPPProcess process( /*verbose=*/false ); std::string paramCard = "../../Cards/param_card.dat"; - if( !std::filesystem::exists( paramCard ) ) - { - paramCard = "../" + paramCard; - } + /* +#ifdef __HIPCC__ + if( !std::experimental::filesystem::exists( paramCard ) ) paramCard = "../" + paramCard; +#else + if( !std::filesystem::exists( paramCard ) ) paramCard = "../" + paramCard; +#endif + */ + //struct stat dummybuffer; // bypass std::filesystem #803 + //if( !( stat( paramCard.c_str(), &dummyBuffer ) == 0 ) ) paramCard = "../" + paramCard; // + auto fileExists = []( std::string& fileName ) + { struct stat buffer; return stat( fileName.c_str(), &buffer ) == 0; }; + if( !fileExists( paramCard ) ) paramCard = "../" + paramCard; // bypass std::filesystem #803 process.initProc( paramCard ); } diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/GpuAbstraction.h index 6a7d9c05c0..9c467b1e04 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/GpuAbstraction.h @@ -39,8 +39,6 @@ #elif defined __HIPCC__ -#include "hip/hip_runtime.h" - #define gpuError_t hipError_t #define gpuPeekAtLastError hipPeekAtLastError #define gpuGetErrorString hipGetErrorString diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MadgraphTest.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MadgraphTest.h index a64c05c26a..6054185300 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MadgraphTest.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MadgraphTest.h @@ -14,7 +14,11 @@ #include #include -#include +//#ifdef __HIPCC__ +//#include // see https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 +//#else +//#include // bypass this completely to ease portability on LUMI #803 +//#endif #include #include #include @@ -219,7 +223,14 @@ TEST_P( MadgraphTest, CompareMomentaAndME ) const char* dumpEventsC = getenv( "CUDACPP_RUNTEST_DUMPEVENTS" ); const bool dumpEvents = ( dumpEventsC != 0 ) && ( std::string( dumpEventsC ) != "" ); const std::string refFileName = testDriver->getRefFileName(); + /* +#ifdef __HIPCC__ + const std::string dumpFileName = std::experimental::filesystem::path( refFileName ).filename(); +#else const std::string dumpFileName = std::filesystem::path( refFileName ).filename(); +#endif + */ + const std::string dumpFileName = refFileName; // bypass std::filesystem #803 std::ofstream dumpFile; if( dumpEvents ) { diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.cc index 2f4b1f9d0e..0bb184f0e6 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.cc +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.cc @@ -2741,6 +2741,10 @@ namespace mg5amcCpu out << "Apple clang " << __clang_major__ << "." << __clang_minor__ << "." << __clang_patchlevel__; #else out << "clang " << __clang_major__ << "." << __clang_minor__ << "." << __clang_patchlevel__; + /* + // === AV 26-Jan-2024 DISABLE THIS CODE (START) + // === AV 26-Jan-2024 First, it is totally wrong to assume that the CXX environment variable is used in the build! + // === AV 26-Jan-2024 Second and worse, here we need build time values, while CXX in this code is evaluated at runtime! 
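[Editor's note: the AV comments above disable the readelf probe for a good reason. A command string built around ${CXX} is expanded when the binary runs, not when it was compiled, so it can report a different toolchain than the one that actually built the object files. A minimal sketch of the build-time alternative, using only predefined macros; compilerVersion is a hypothetical helper, not part of this patch.]

    #include <sstream>
    #include <string>

    inline std::string compilerVersion()
    {
      std::ostringstream out;
      // These macros are expanded by the compiler building this translation
      // unit, so the reported version is by construction the build-time one.
    #if defined __clang__
      out << "clang " << __clang_major__ << "." << __clang_minor__ << "." << __clang_patchlevel__;
    #elif defined __GNUC__
      out << "gcc " << __GNUC__ << "." << __GNUC_MINOR__ << "." << __GNUC_PATCHLEVEL__;
    #else
      out << "unknown compiler";
    #endif
      return out.str();
    }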
// GCC toolchain version inside CLANG std::string tchainout; std::string tchaincmd = "readelf -p .comment $(${CXX} -print-libgcc-file-name) |& grep 'GCC: (GNU)' | grep -v Warning | sort -u | awk '{print $5}'"; @@ -2754,6 +2758,8 @@ namespace mg5amcCpu #else out << " (gcc " << tchainout << ")"; #endif + // === AV 26-Jan-2024 DISABLE THIS CODE (END) + */ #endif #else out << "clang UNKNOWKN"; diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/check_sa.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/check_sa.cc index 7cac5ab47b..aab490dc5b 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/check_sa.cc +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/check_sa.cc @@ -76,7 +76,7 @@ usage( char* argv0, int ret = 1 ) return ret; } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -84,7 +84,7 @@ namespace mg5amcCpu { inline void FPEhandler( int sig ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL std::cerr << "Floating Point Exception (GPU)" << std::endl; #else std::cerr << "Floating Point Exception (CPU)" << std::endl; diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp.mk index f2cfa349da..eefac8ff0d 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp.mk @@ -139,13 +139,13 @@ endif # If CUDA_HOME is not set, try to set it from the path to nvcc ifndef CUDA_HOME - CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null)) + CUDA_HOME = $(patsubst %/bin/nvcc,%,$(shell which nvcc 2>/dev/null)) $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") endif # If HIP_HOME is not set, try to set it from the path to hipcc ifndef HIP_HOME - HIP_HOME = $(patsubst %bin/hipcc,%,$(HIP_COMPILER_PATH)) + HIP_HOME = $(patsubst %/bin/hipcc,%,$(shell which hipcc 2>/dev/null)) $(warning HIP_HOME was not set: using "$(HIP_HOME)") endif @@ -294,7 +294,9 @@ endif #=== Configure defaults and check if user-defined choices exist for OMPFLAGS, AVX, FPTYPE, HELINL, HRDCOD, RNDGEN # Set the default OMPFLAGS choice -ifneq ($(shell $(CXX) --version | egrep '^Intel'),) +ifneq ($(findstring hipcc,$(GPUCC)),) +override OMPFLAGS = # disable OpenMP MT when using hipcc #802 +else ifneq ($(shell $(CXX) --version | egrep '^Intel'),) override OMPFLAGS = -fopenmp ###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without GPUCC but not ok with GPUCC before #578) else ifneq ($(shell $(CXX) --version | egrep '^(clang)'),) @@ -652,6 +654,12 @@ $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: cu_objects_lib += $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cu_objects_lib) $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) +# Bypass std::filesystem completely to ease portability on LUMI #803 +#ifneq ($(findstring hipcc,$(GPUCC)),) +# $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -lstdc++fs +#else +# $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) +#endif endif #------------------------------------------------------------------------------- @@ -701,7 +709,11 @@ $(fcxx_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libg endif $(fcxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(fcxx_main): $(BUILDDIR)/fcheck_sa.o 
$(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) +ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 + $(FC) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) -lstdc++ +else $(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) +endif ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) @@ -713,8 +725,12 @@ $(fcu_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgf endif $(fcu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(fcu_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) +ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 + $(FC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) -lstdc++ -L$(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../../lib -lamdhip64 +else $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) endif +endif #------------------------------------------------------------------------------- @@ -779,6 +795,11 @@ $(testmain): LIBFLAGS += -lgomp endif endif +# Bypass std::filesystem completely to ease portability on LUMI #803 +#ifneq ($(findstring hipcc,$(GPUCC)),) +#$(testmain): LIBFLAGS += -lstdc++fs +#endif + ifeq ($(GPUCC),) # link only runTest.o $(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS) @@ -786,7 +807,11 @@ $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_object else # link both runTest.o and runTest_cu.o $(testmain): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) $(GTESTLIBS) - $(GPUCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS) +ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 + $(FC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS) -lstdc++ -lpthread -L$(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../../lib -lamdhip64 +else + $(GPUCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS) +endif endif # Use target gtestlibs to build only googletest diff --git a/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuConfig.h index 55d03f1252..69cee0085b 100644 --- a/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuConfig.h @@ -15,6 +15,7 @@ #define MGONGPUCPP_GPUIMPL cuda #elif defined __HIPCC__ #define MGONGPUCPP_GPUIMPL hip +#include "hip/hip_runtime.h" // needed for blockDim, blockIdx, threadIdx: better in mgOnGpuConfig.h than in GpuAbstraction.h #else #undef MGONGPUCPP_GPUIMPL #endif diff --git a/epochX/cudacpp/gg_ttgg.mad/src/read_slha.cc b/epochX/cudacpp/gg_ttgg.mad/src/read_slha.cc index 055b19a779..f8e46f2e66 100644 --- 
a/epochX/cudacpp/gg_ttgg.mad/src/read_slha.cc +++ b/epochX/cudacpp/gg_ttgg.mad/src/read_slha.cc @@ -11,7 +11,11 @@ #include #include -#include +//#ifdef __HIPCC__ +//#include // see https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 +//#else +//#include // bypass this completely to ease portability on LUMI #803 +//#endif #include #include @@ -60,7 +64,14 @@ SLHAReader::read_slha_file( std::string file_name, bool verbose ) { std::cout << "WARNING! Card file '" << file_name << "' does not exist:" << " look for the file in directory $" << envpath << "='" << getenv( envpath ) << "'" << std::endl; + /* +#ifdef __HIPCC__ + const std::string file_name2 = std::experimental::filesystem::path( getenv( envpath ) ) / std::experimental::filesystem::path( file_name ).filename(); +#else const std::string file_name2 = std::filesystem::path( getenv( envpath ) ) / std::filesystem::path( file_name ).filename(); +#endif + */ + const std::string file_name2 = std::string( getenv( envpath ) ) + "/" + file_name; // bypass std::filesystem #803 param_card.open( file_name2.c_str(), std::ifstream::in ); if( param_card.good() ) { diff --git a/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt b/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt index 1b6c420503..f908e4a331 100644 --- a/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt +++ b/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005376100540161133  +DEBUG: model prefixing takes 0.005699872970581055  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,7 +155,7 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=4: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Process has 123 diagrams -1 processes with 123 diagrams generated in 0.156 s +1 processes with 123 diagrams generated in 0.160 s Total: 1 processes with 123 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gg_ttgg Load PLUGIN.CUDACPP_OUTPUT @@ -175,7 +175,7 @@ INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TM FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg/./CPPProcess.h FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg/./CPPProcess.cc INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg/. 
-Generated helas calls for 1 subprocesses (123 diagrams) in 0.427 s +Generated helas calls for 1 subprocesses (123 diagrams) in 0.432 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines @@ -183,7 +183,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.319 s +ALOHA: aloha creates 5 routines in 0.351 s VVV1 VVV1 FFV1 @@ -206,7 +206,7 @@ INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/. quit -real 0m1.461s -user 0m1.381s -sys 0m0.050s -Code generation completed in 1 seconds +real 0m1.496s +user 0m1.397s +sys 0m0.068s +Code generation completed in 2 seconds diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/Bridge.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/Bridge.h index 89437b4c42..f9ed70dfde 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/Bridge.h @@ -14,11 +14,18 @@ #include "MemoryAccessMomenta.h" // for MemoryAccessMomenta::neppM #include "MemoryBuffers.h" // for HostBufferMomenta, DeviceBufferMomenta etc +//#ifdef __HIPCC__ +//#include // see https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 +//#else +//#include // bypass this completely to ease portability on LUMI #803 +//#endif + +#include // bypass std::filesystem #803 + #include #include #include #include -#include #include #include #include @@ -255,10 +262,18 @@ namespace mg5amcCpu // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate is called from several Fortran threads? 
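[Editor's note: just below, the Bridge.h hunk replaces std::filesystem::exists with a stat()-based lambda, sidestepping the libstdc++fs link issue on LUMI (#803). A self-contained sketch of that bypass, assuming only POSIX <sys/stat.h>:]

    #include <string>
    #include <sys/stat.h>

    // stat() returns 0 iff the path exists and is accessible, which is all the
    // param_card lookup needs (no file/directory distinction is required).
    inline bool fileExists( const std::string& fileName )
    {
      struct stat buffer;
      return stat( fileName.c_str(), &buffer ) == 0;
    }

    // Usage mirroring the patched Bridge.h logic:
    //   std::string paramCard = "../../Cards/param_card.dat";
    //   if( !fileExists( paramCard ) ) paramCard = "../" + paramCard;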
CPPProcess process( /*verbose=*/false ); std::string paramCard = "../../Cards/param_card.dat"; - if( !std::filesystem::exists( paramCard ) ) - { - paramCard = "../" + paramCard; - } + /* +#ifdef __HIPCC__ + if( !std::experimental::filesystem::exists( paramCard ) ) paramCard = "../" + paramCard; +#else + if( !std::filesystem::exists( paramCard ) ) paramCard = "../" + paramCard; +#endif + */ + //struct stat dummybuffer; // bypass std::filesystem #803 + //if( !( stat( paramCard.c_str(), &dummyBuffer ) == 0 ) ) paramCard = "../" + paramCard; // + auto fileExists = []( std::string& fileName ) + { struct stat buffer; return stat( fileName.c_str(), &buffer ) == 0; }; + if( !fileExists( paramCard ) ) paramCard = "../" + paramCard; // bypass std::filesystem #803 process.initProc( paramCard ); } diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/GpuAbstraction.h index 6a7d9c05c0..9c467b1e04 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/GpuAbstraction.h @@ -39,8 +39,6 @@ #elif defined __HIPCC__ -#include "hip/hip_runtime.h" - #define gpuError_t hipError_t #define gpuPeekAtLastError hipPeekAtLastError #define gpuGetErrorString hipGetErrorString diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MadgraphTest.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MadgraphTest.h index a64c05c26a..6054185300 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MadgraphTest.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MadgraphTest.h @@ -14,7 +14,11 @@ #include #include -#include +//#ifdef __HIPCC__ +//#include // see https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 +//#else +//#include // bypass this completely to ease portability on LUMI #803 +//#endif #include #include #include @@ -219,7 +223,14 @@ TEST_P( MadgraphTest, CompareMomentaAndME ) const char* dumpEventsC = getenv( "CUDACPP_RUNTEST_DUMPEVENTS" ); const bool dumpEvents = ( dumpEventsC != 0 ) && ( std::string( dumpEventsC ) != "" ); const std::string refFileName = testDriver->getRefFileName(); + /* +#ifdef __HIPCC__ + const std::string dumpFileName = std::experimental::filesystem::path( refFileName ).filename(); +#else const std::string dumpFileName = std::filesystem::path( refFileName ).filename(); +#endif + */ + const std::string dumpFileName = refFileName; // bypass std::filesystem #803 std::ofstream dumpFile; if( dumpEvents ) { diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CPPProcess.cc b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CPPProcess.cc index d59cc349e3..f2a85b9b75 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CPPProcess.cc +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CPPProcess.cc @@ -2798,6 +2798,10 @@ namespace mg5amcCpu out << "Apple clang " << __clang_major__ << "." << __clang_minor__ << "." << __clang_patchlevel__; #else out << "clang " << __clang_major__ << "." << __clang_minor__ << "." << __clang_patchlevel__; + /* + // === AV 26-Jan-2024 DISABLE THIS CODE (START) + // === AV 26-Jan-2024 First, it is totally wrong to assume that the CXX environment variable is used in the build! + // === AV 26-Jan-2024 Second and worse, here we need build time values, while CXX in this code is evaluated at runtime! 
// GCC toolchain version inside CLANG std::string tchainout; std::string tchaincmd = "readelf -p .comment $(${CXX} -print-libgcc-file-name) |& grep 'GCC: (GNU)' | grep -v Warning | sort -u | awk '{print $5}'"; @@ -2811,6 +2815,8 @@ namespace mg5amcCpu #else out << " (gcc " << tchainout << ")"; #endif + // === AV 26-Jan-2024 DISABLE THIS CODE (END) + */ #endif #else out << "clang UNKNOWKN"; diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/check_sa.cc b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/check_sa.cc index 7cac5ab47b..aab490dc5b 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/check_sa.cc +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/check_sa.cc @@ -76,7 +76,7 @@ usage( char* argv0, int ret = 1 ) return ret; } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -84,7 +84,7 @@ namespace mg5amcCpu { inline void FPEhandler( int sig ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL std::cerr << "Floating Point Exception (GPU)" << std::endl; #else std::cerr << "Floating Point Exception (CPU)" << std::endl; diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp.mk index f2cfa349da..eefac8ff0d 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp.mk @@ -139,13 +139,13 @@ endif # If CUDA_HOME is not set, try to set it from the path to nvcc ifndef CUDA_HOME - CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null)) + CUDA_HOME = $(patsubst %/bin/nvcc,%,$(shell which nvcc 2>/dev/null)) $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") endif # If HIP_HOME is not set, try to set it from the path to hipcc ifndef HIP_HOME - HIP_HOME = $(patsubst %bin/hipcc,%,$(HIP_COMPILER_PATH)) + HIP_HOME = $(patsubst %/bin/hipcc,%,$(shell which hipcc 2>/dev/null)) $(warning HIP_HOME was not set: using "$(HIP_HOME)") endif @@ -294,7 +294,9 @@ endif #=== Configure defaults and check if user-defined choices exist for OMPFLAGS, AVX, FPTYPE, HELINL, HRDCOD, RNDGEN # Set the default OMPFLAGS choice -ifneq ($(shell $(CXX) --version | egrep '^Intel'),) +ifneq ($(findstring hipcc,$(GPUCC)),) +override OMPFLAGS = # disable OpenMP MT when using hipcc #802 +else ifneq ($(shell $(CXX) --version | egrep '^Intel'),) override OMPFLAGS = -fopenmp ###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without GPUCC but not ok with GPUCC before #578) else ifneq ($(shell $(CXX) --version | egrep '^(clang)'),) @@ -652,6 +654,12 @@ $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: cu_objects_lib += $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cu_objects_lib) $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) +# Bypass std::filesystem completely to ease portability on LUMI #803 +#ifneq ($(findstring hipcc,$(GPUCC)),) +# $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -lstdc++fs +#else +# $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) +#endif endif #------------------------------------------------------------------------------- @@ -701,7 +709,11 @@ $(fcxx_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libg endif $(fcxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(fcxx_main): 
$(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) +ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 + $(FC) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) -lstdc++ +else $(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) +endif ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) @@ -713,8 +725,12 @@ $(fcu_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgf endif $(fcu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(fcu_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) +ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 + $(FC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) -lstdc++ -L$(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../../lib -lamdhip64 +else $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) endif +endif #------------------------------------------------------------------------------- @@ -779,6 +795,11 @@ $(testmain): LIBFLAGS += -lgomp endif endif +# Bypass std::filesystem completely to ease portability on LUMI #803 +#ifneq ($(findstring hipcc,$(GPUCC)),) +#$(testmain): LIBFLAGS += -lstdc++fs +#endif + ifeq ($(GPUCC),) # link only runTest.o $(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS) @@ -786,7 +807,11 @@ $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_object else # link both runTest.o and runTest_cu.o $(testmain): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) $(GTESTLIBS) - $(GPUCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS) +ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 + $(FC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS) -lstdc++ -lpthread -L$(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../../lib -lamdhip64 +else + $(GPUCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS) +endif endif # Use target gtestlibs to build only googletest diff --git a/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuConfig.h index da4ba36ad8..06787c1c5e 100644 --- a/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuConfig.h @@ -15,6 +15,7 @@ #define MGONGPUCPP_GPUIMPL cuda #elif defined __HIPCC__ #define MGONGPUCPP_GPUIMPL hip +#include "hip/hip_runtime.h" // needed for blockDim, blockIdx, threadIdx: better in mgOnGpuConfig.h than in GpuAbstraction.h #else #undef MGONGPUCPP_GPUIMPL #endif diff --git a/epochX/cudacpp/gg_ttgg.sa/src/read_slha.cc b/epochX/cudacpp/gg_ttgg.sa/src/read_slha.cc index 055b19a779..f8e46f2e66 100644 --- 
a/epochX/cudacpp/gg_ttgg.sa/src/read_slha.cc +++ b/epochX/cudacpp/gg_ttgg.sa/src/read_slha.cc @@ -11,7 +11,11 @@ #include #include -#include +//#ifdef __HIPCC__ +//#include // see https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 +//#else +//#include // bypass this completely to ease portability on LUMI #803 +//#endif #include #include @@ -60,7 +64,14 @@ SLHAReader::read_slha_file( std::string file_name, bool verbose ) { std::cout << "WARNING! Card file '" << file_name << "' does not exist:" << " look for the file in directory $" << envpath << "='" << getenv( envpath ) << "'" << std::endl; + /* +#ifdef __HIPCC__ + const std::string file_name2 = std::experimental::filesystem::path( getenv( envpath ) ) / std::experimental::filesystem::path( file_name ).filename(); +#else const std::string file_name2 = std::filesystem::path( getenv( envpath ) ) / std::filesystem::path( file_name ).filename(); +#endif + */ + const std::string file_name2 = std::string( getenv( envpath ) ) + "/" + file_name; // bypass std::filesystem #803 param_card.open( file_name2.c_str(), std::ifstream::in ); if( param_card.good() ) { diff --git a/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt b/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt index f222e5a6b5..cdbfdbd3d7 100644 --- a/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt +++ b/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ g g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005517005920410156  +DEBUG: model prefixing takes 0.005588531494140625  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,7 +155,7 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=5: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Process has 1240 diagrams -1 processes with 1240 diagrams generated in 1.861 s +1 processes with 1240 diagrams generated in 1.900 s Total: 1 processes with 1240 diagrams output madevent ../TMPOUT/CODEGEN_mad_gg_ttggg --hel_recycling=False --vector_size=32 --me_exporter=standalone_cudacpp Load PLUGIN.CUDACPP_OUTPUT @@ -177,7 +177,7 @@ INFO: Creating files in directory P1_gg_ttxggg INFO: Computing Color-Flow optimization [15120 term] INFO: Color-Flow passed to 1630 term in 8s. Introduce 3030 contraction DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -192,15 +192,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
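[Editor's note on the read_slha.cc hunks repeated above: the bypass builds the fallback path by plain string concatenation of the environment directory and the card name. One small behavioural difference is visible in the hunk itself: the disabled std::filesystem version kept only filename(), whereas the concatenation keeps file_name as given. A hedged sketch; openCardWithFallback is a hypothetical condensation, not a function in this patch.]

    #include <cstdlib>
    #include <fstream>
    #include <string>

    std::ifstream openCardWithFallback( const std::string& file_name, const char* envpath )
    {
      std::ifstream card( file_name.c_str(), std::ifstream::in );
      if( !card.good() && getenv( envpath ) != nullptr )
      {
        // Bypass std::filesystem (#803): "/" is a safe separator on the
        // POSIX platforms this code targets.
        const std::string file_name2 = std::string( getenv( envpath ) ) + "/" + file_name;
        card.open( file_name2.c_str(), std::ifstream::in );
      }
      return card;
    }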
DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxggg -Generated helas calls for 1 subprocesses (1240 diagrams) in 6.528 s -Wrote files for 2281 helas calls in 18.450 s +Generated helas calls for 1 subprocesses (1240 diagrams) in 6.686 s +Wrote files for 2281 helas calls in 18.830 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.314 s +ALOHA: aloha creates 5 routines in 0.329 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines @@ -208,7 +208,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 10 routines in 0.309 s +ALOHA: aloha creates 10 routines in 0.323 s VVV1 VVV1 FFV1 @@ -257,9 +257,9 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. quit -real 0m29.049s -user 0m28.554s -sys 0m0.393s +real 0m29.715s +user 0m29.204s +sys 0m0.406s Code generation completed in 30 seconds ************************************************************ * * diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/Bridge.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/Bridge.h index 89437b4c42..f9ed70dfde 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/Bridge.h @@ -14,11 +14,18 @@ #include "MemoryAccessMomenta.h" // for MemoryAccessMomenta::neppM #include "MemoryBuffers.h" // for HostBufferMomenta, DeviceBufferMomenta etc +//#ifdef __HIPCC__ +//#include // see https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 +//#else +//#include // bypass this completely to ease portability on LUMI #803 +//#endif + +#include // bypass std::filesystem #803 + #include #include #include #include -#include #include #include #include @@ -255,10 +262,18 @@ namespace mg5amcCpu // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate is called from several Fortran threads? 
CPPProcess process( /*verbose=*/false ); std::string paramCard = "../../Cards/param_card.dat"; - if( !std::filesystem::exists( paramCard ) ) - { - paramCard = "../" + paramCard; - } + /* +#ifdef __HIPCC__ + if( !std::experimental::filesystem::exists( paramCard ) ) paramCard = "../" + paramCard; +#else + if( !std::filesystem::exists( paramCard ) ) paramCard = "../" + paramCard; +#endif + */ + //struct stat dummybuffer; // bypass std::filesystem #803 + //if( !( stat( paramCard.c_str(), &dummyBuffer ) == 0 ) ) paramCard = "../" + paramCard; // + auto fileExists = []( std::string& fileName ) + { struct stat buffer; return stat( fileName.c_str(), &buffer ) == 0; }; + if( !fileExists( paramCard ) ) paramCard = "../" + paramCard; // bypass std::filesystem #803 process.initProc( paramCard ); } diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/GpuAbstraction.h index 6a7d9c05c0..9c467b1e04 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/GpuAbstraction.h @@ -39,8 +39,6 @@ #elif defined __HIPCC__ -#include "hip/hip_runtime.h" - #define gpuError_t hipError_t #define gpuPeekAtLastError hipPeekAtLastError #define gpuGetErrorString hipGetErrorString diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MadgraphTest.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MadgraphTest.h index a64c05c26a..6054185300 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MadgraphTest.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MadgraphTest.h @@ -14,7 +14,11 @@ #include #include -#include +//#ifdef __HIPCC__ +//#include // see https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 +//#else +//#include // bypass this completely to ease portability on LUMI #803 +//#endif #include #include #include @@ -219,7 +223,14 @@ TEST_P( MadgraphTest, CompareMomentaAndME ) const char* dumpEventsC = getenv( "CUDACPP_RUNTEST_DUMPEVENTS" ); const bool dumpEvents = ( dumpEventsC != 0 ) && ( std::string( dumpEventsC ) != "" ); const std::string refFileName = testDriver->getRefFileName(); + /* +#ifdef __HIPCC__ + const std::string dumpFileName = std::experimental::filesystem::path( refFileName ).filename(); +#else const std::string dumpFileName = std::filesystem::path( refFileName ).filename(); +#endif + */ + const std::string dumpFileName = refFileName; // bypass std::filesystem #803 std::ofstream dumpFile; if( dumpEvents ) { diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.cc index a478ecb28e..a041636caf 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.cc +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.cc @@ -30408,6 +30408,10 @@ namespace mg5amcCpu out << "Apple clang " << __clang_major__ << "." << __clang_minor__ << "." << __clang_patchlevel__; #else out << "clang " << __clang_major__ << "." << __clang_minor__ << "." << __clang_patchlevel__; + /* + // === AV 26-Jan-2024 DISABLE THIS CODE (START) + // === AV 26-Jan-2024 First, it is totally wrong to assume that the CXX environment variable is used in the build! + // === AV 26-Jan-2024 Second and worse, here we need build time values, while CXX in this code is evaluated at runtime! 
// GCC toolchain version inside CLANG std::string tchainout; std::string tchaincmd = "readelf -p .comment $(${CXX} -print-libgcc-file-name) |& grep 'GCC: (GNU)' | grep -v Warning | sort -u | awk '{print $5}'"; @@ -30421,6 +30425,8 @@ namespace mg5amcCpu #else out << " (gcc " << tchainout << ")"; #endif + // === AV 26-Jan-2024 DISABLE THIS CODE (END) + */ #endif #else out << "clang UNKNOWKN"; diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/check_sa.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/check_sa.cc index 7cac5ab47b..aab490dc5b 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/check_sa.cc +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/check_sa.cc @@ -76,7 +76,7 @@ usage( char* argv0, int ret = 1 ) return ret; } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -84,7 +84,7 @@ namespace mg5amcCpu { inline void FPEhandler( int sig ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL std::cerr << "Floating Point Exception (GPU)" << std::endl; #else std::cerr << "Floating Point Exception (CPU)" << std::endl; diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp.mk index f2cfa349da..eefac8ff0d 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp.mk @@ -139,13 +139,13 @@ endif # If CUDA_HOME is not set, try to set it from the path to nvcc ifndef CUDA_HOME - CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null)) + CUDA_HOME = $(patsubst %/bin/nvcc,%,$(shell which nvcc 2>/dev/null)) $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") endif # If HIP_HOME is not set, try to set it from the path to hipcc ifndef HIP_HOME - HIP_HOME = $(patsubst %bin/hipcc,%,$(HIP_COMPILER_PATH)) + HIP_HOME = $(patsubst %/bin/hipcc,%,$(shell which hipcc 2>/dev/null)) $(warning HIP_HOME was not set: using "$(HIP_HOME)") endif @@ -294,7 +294,9 @@ endif #=== Configure defaults and check if user-defined choices exist for OMPFLAGS, AVX, FPTYPE, HELINL, HRDCOD, RNDGEN # Set the default OMPFLAGS choice -ifneq ($(shell $(CXX) --version | egrep '^Intel'),) +ifneq ($(findstring hipcc,$(GPUCC)),) +override OMPFLAGS = # disable OpenMP MT when using hipcc #802 +else ifneq ($(shell $(CXX) --version | egrep '^Intel'),) override OMPFLAGS = -fopenmp ###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without GPUCC but not ok with GPUCC before #578) else ifneq ($(shell $(CXX) --version | egrep '^(clang)'),) @@ -652,6 +654,12 @@ $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: cu_objects_lib += $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cu_objects_lib) $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) +# Bypass std::filesystem completely to ease portability on LUMI #803 +#ifneq ($(findstring hipcc,$(GPUCC)),) +# $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -lstdc++fs +#else +# $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) +#endif endif #------------------------------------------------------------------------------- @@ -701,7 +709,11 @@ $(fcxx_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libg endif $(fcxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(fcxx_main): $(BUILDDIR)/fcheck_sa.o 
$(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) +ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 + $(FC) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) -lstdc++ +else $(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) +endif ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) @@ -713,8 +725,12 @@ $(fcu_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgf endif $(fcu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(fcu_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) +ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 + $(FC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) -lstdc++ -L$(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../../lib -lamdhip64 +else $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) endif +endif #------------------------------------------------------------------------------- @@ -779,6 +795,11 @@ $(testmain): LIBFLAGS += -lgomp endif endif +# Bypass std::filesystem completely to ease portability on LUMI #803 +#ifneq ($(findstring hipcc,$(GPUCC)),) +#$(testmain): LIBFLAGS += -lstdc++fs +#endif + ifeq ($(GPUCC),) # link only runTest.o $(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS) @@ -786,7 +807,11 @@ $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_object else # link both runTest.o and runTest_cu.o $(testmain): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) $(GTESTLIBS) - $(GPUCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS) +ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 + $(FC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS) -lstdc++ -lpthread -L$(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../../lib -lamdhip64 +else + $(GPUCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS) +endif endif # Use target gtestlibs to build only googletest diff --git a/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuConfig.h index 55d03f1252..69cee0085b 100644 --- a/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuConfig.h @@ -15,6 +15,7 @@ #define MGONGPUCPP_GPUIMPL cuda #elif defined __HIPCC__ #define MGONGPUCPP_GPUIMPL hip +#include "hip/hip_runtime.h" // needed for blockDim, blockIdx, threadIdx: better in mgOnGpuConfig.h than in GpuAbstraction.h #else #undef MGONGPUCPP_GPUIMPL #endif diff --git a/epochX/cudacpp/gg_ttggg.mad/src/read_slha.cc b/epochX/cudacpp/gg_ttggg.mad/src/read_slha.cc index 055b19a779..f8e46f2e66 100644 --- 
a/epochX/cudacpp/gg_ttggg.mad/src/read_slha.cc +++ b/epochX/cudacpp/gg_ttggg.mad/src/read_slha.cc @@ -11,7 +11,11 @@ #include #include -#include +//#ifdef __HIPCC__ +//#include // see https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 +//#else +//#include // bypass this completely to ease portability on LUMI #803 +//#endif #include #include @@ -60,7 +64,14 @@ SLHAReader::read_slha_file( std::string file_name, bool verbose ) { std::cout << "WARNING! Card file '" << file_name << "' does not exist:" << " look for the file in directory $" << envpath << "='" << getenv( envpath ) << "'" << std::endl; + /* +#ifdef __HIPCC__ + const std::string file_name2 = std::experimental::filesystem::path( getenv( envpath ) ) / std::experimental::filesystem::path( file_name ).filename(); +#else const std::string file_name2 = std::filesystem::path( getenv( envpath ) ) / std::filesystem::path( file_name ).filename(); +#endif + */ + const std::string file_name2 = std::string( getenv( envpath ) ) + "/" + file_name; // bypass std::filesystem #803 param_card.open( file_name2.c_str(), std::ifstream::in ); if( param_card.good() ) { diff --git a/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt b/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt index 2720870321..31cde146a9 100644 --- a/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt +++ b/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ g g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005664825439453125  +DEBUG: model prefixing takes 0.0054972171783447266  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,7 +155,7 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=5: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Process has 1240 diagrams -1 processes with 1240 diagrams generated in 1.872 s +1 processes with 1240 diagrams generated in 1.905 s Total: 1 processes with 1240 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gg_ttggg Load PLUGIN.CUDACPP_OUTPUT @@ -175,7 +175,7 @@ INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TM FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/./CPPProcess.h FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/./CPPProcess.cc INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/. 
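[Editor's note as background to the GpuAbstraction.h hunks in this series, which only drop the hip_runtime include now supplied by mgOnGpuConfig.h: the header maps a single gpu* spelling onto whichever runtime API the active compiler provides. A condensed sketch of that pattern; the gpuSuccess mapping is shown on the assumption that the full header defines it alongside the error helpers.]

    #if defined __CUDACC__
    #define gpuError_t cudaError_t
    #define gpuSuccess cudaSuccess
    #define gpuPeekAtLastError cudaPeekAtLastError
    #define gpuGetErrorString cudaGetErrorString
    #elif defined __HIPCC__
    // No #include "hip/hip_runtime.h" here any more: after this patch it is
    // pulled in once via mgOnGpuConfig.h instead.
    #define gpuError_t hipError_t
    #define gpuSuccess hipSuccess
    #define gpuPeekAtLastError hipPeekAtLastError
    #define gpuGetErrorString hipGetErrorString
    #endif

    // Typical use through the abstraction (either backend):
    //   gpuError_t err = gpuPeekAtLastError();
    //   if( err != gpuSuccess ) std::cerr << gpuGetErrorString( err ) << std::endl;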
-Generated helas calls for 1 subprocesses (1240 diagrams) in 6.609 s +Generated helas calls for 1 subprocesses (1240 diagrams) in 6.694 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines @@ -183,7 +183,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.345 s +ALOHA: aloha creates 5 routines in 1.198 s VVV1 VVV1 FFV1 @@ -206,7 +206,7 @@ INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/. quit -real 0m12.978s -user 0m12.813s -sys 0m0.111s -Code generation completed in 13 seconds +real 0m14.110s +user 0m13.103s +sys 0m0.119s +Code generation completed in 14 seconds diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/Bridge.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/Bridge.h index 89437b4c42..f9ed70dfde 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/Bridge.h @@ -14,11 +14,18 @@ #include "MemoryAccessMomenta.h" // for MemoryAccessMomenta::neppM #include "MemoryBuffers.h" // for HostBufferMomenta, DeviceBufferMomenta etc +//#ifdef __HIPCC__ +//#include // see https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 +//#else +//#include // bypass this completely to ease portability on LUMI #803 +//#endif + +#include // bypass std::filesystem #803 + #include #include #include #include -#include #include #include #include @@ -255,10 +262,18 @@ namespace mg5amcCpu // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate is called from several Fortran threads? 
CPPProcess process( /*verbose=*/false ); std::string paramCard = "../../Cards/param_card.dat"; - if( !std::filesystem::exists( paramCard ) ) - { - paramCard = "../" + paramCard; - } + /* +#ifdef __HIPCC__ + if( !std::experimental::filesystem::exists( paramCard ) ) paramCard = "../" + paramCard; +#else + if( !std::filesystem::exists( paramCard ) ) paramCard = "../" + paramCard; +#endif + */ + //struct stat dummybuffer; // bypass std::filesystem #803 + //if( !( stat( paramCard.c_str(), &dummyBuffer ) == 0 ) ) paramCard = "../" + paramCard; // + auto fileExists = []( std::string& fileName ) + { struct stat buffer; return stat( fileName.c_str(), &buffer ) == 0; }; + if( !fileExists( paramCard ) ) paramCard = "../" + paramCard; // bypass std::filesystem #803 process.initProc( paramCard ); } diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/GpuAbstraction.h index 6a7d9c05c0..9c467b1e04 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/GpuAbstraction.h @@ -39,8 +39,6 @@ #elif defined __HIPCC__ -#include "hip/hip_runtime.h" - #define gpuError_t hipError_t #define gpuPeekAtLastError hipPeekAtLastError #define gpuGetErrorString hipGetErrorString diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MadgraphTest.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MadgraphTest.h index a64c05c26a..6054185300 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MadgraphTest.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MadgraphTest.h @@ -14,7 +14,11 @@ #include #include -#include +//#ifdef __HIPCC__ +//#include // see https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 +//#else +//#include // bypass this completely to ease portability on LUMI #803 +//#endif #include #include #include @@ -219,7 +223,14 @@ TEST_P( MadgraphTest, CompareMomentaAndME ) const char* dumpEventsC = getenv( "CUDACPP_RUNTEST_DUMPEVENTS" ); const bool dumpEvents = ( dumpEventsC != 0 ) && ( std::string( dumpEventsC ) != "" ); const std::string refFileName = testDriver->getRefFileName(); + /* +#ifdef __HIPCC__ + const std::string dumpFileName = std::experimental::filesystem::path( refFileName ).filename(); +#else const std::string dumpFileName = std::filesystem::path( refFileName ).filename(); +#endif + */ + const std::string dumpFileName = refFileName; // bypass std::filesystem #803 std::ofstream dumpFile; if( dumpEvents ) { diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.cc b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.cc index fa23301c50..ff20b7ba63 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.cc +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.cc @@ -32298,6 +32298,10 @@ namespace mg5amcCpu out << "Apple clang " << __clang_major__ << "." << __clang_minor__ << "." << __clang_patchlevel__; #else out << "clang " << __clang_major__ << "." << __clang_minor__ << "." << __clang_patchlevel__; + /* + // === AV 26-Jan-2024 DISABLE THIS CODE (START) + // === AV 26-Jan-2024 First, it is totally wrong to assume that the CXX environment variable is used in the build! + // === AV 26-Jan-2024 Second and worse, here we need build time values, while CXX in this code is evaluated at runtime! 
// GCC toolchain version inside CLANG std::string tchainout; std::string tchaincmd = "readelf -p .comment $(${CXX} -print-libgcc-file-name) |& grep 'GCC: (GNU)' | grep -v Warning | sort -u | awk '{print $5}'"; @@ -32311,6 +32315,8 @@ namespace mg5amcCpu #else out << " (gcc " << tchainout << ")"; #endif + // === AV 26-Jan-2024 DISABLE THIS CODE (END) + */ #endif #else out << "clang UNKNOWKN"; diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/check_sa.cc b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/check_sa.cc index 7cac5ab47b..aab490dc5b 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/check_sa.cc +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/check_sa.cc @@ -76,7 +76,7 @@ usage( char* argv0, int ret = 1 ) return ret; } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -84,7 +84,7 @@ namespace mg5amcCpu { inline void FPEhandler( int sig ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL std::cerr << "Floating Point Exception (GPU)" << std::endl; #else std::cerr << "Floating Point Exception (CPU)" << std::endl; diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp.mk index f2cfa349da..eefac8ff0d 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp.mk @@ -139,13 +139,13 @@ endif # If CUDA_HOME is not set, try to set it from the path to nvcc ifndef CUDA_HOME - CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null)) + CUDA_HOME = $(patsubst %/bin/nvcc,%,$(shell which nvcc 2>/dev/null)) $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") endif # If HIP_HOME is not set, try to set it from the path to hipcc ifndef HIP_HOME - HIP_HOME = $(patsubst %bin/hipcc,%,$(HIP_COMPILER_PATH)) + HIP_HOME = $(patsubst %/bin/hipcc,%,$(shell which hipcc 2>/dev/null)) $(warning HIP_HOME was not set: using "$(HIP_HOME)") endif @@ -294,7 +294,9 @@ endif #=== Configure defaults and check if user-defined choices exist for OMPFLAGS, AVX, FPTYPE, HELINL, HRDCOD, RNDGEN # Set the default OMPFLAGS choice -ifneq ($(shell $(CXX) --version | egrep '^Intel'),) +ifneq ($(findstring hipcc,$(GPUCC)),) +override OMPFLAGS = # disable OpenMP MT when using hipcc #802 +else ifneq ($(shell $(CXX) --version | egrep '^Intel'),) override OMPFLAGS = -fopenmp ###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without GPUCC but not ok with GPUCC before #578) else ifneq ($(shell $(CXX) --version | egrep '^(clang)'),) @@ -652,6 +654,12 @@ $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: cu_objects_lib += $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cu_objects_lib) $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) +# Bypass std::filesystem completely to ease portability on LUMI #803 +#ifneq ($(findstring hipcc,$(GPUCC)),) +# $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -lstdc++fs +#else +# $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) +#endif endif #------------------------------------------------------------------------------- @@ -701,7 +709,11 @@ $(fcxx_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libg endif $(fcxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(fcxx_main): 
$(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) +ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 + $(FC) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) -lstdc++ +else $(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) +endif ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) @@ -713,8 +725,12 @@ $(fcu_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgf endif $(fcu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(fcu_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) +ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 + $(FC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) -lstdc++ -L$(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../../lib -lamdhip64 +else $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) endif +endif #------------------------------------------------------------------------------- @@ -779,6 +795,11 @@ $(testmain): LIBFLAGS += -lgomp endif endif +# Bypass std::filesystem completely to ease portability on LUMI #803 +#ifneq ($(findstring hipcc,$(GPUCC)),) +#$(testmain): LIBFLAGS += -lstdc++fs +#endif + ifeq ($(GPUCC),) # link only runTest.o $(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS) @@ -786,7 +807,11 @@ $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_object else # link both runTest.o and runTest_cu.o $(testmain): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) $(GTESTLIBS) - $(GPUCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS) +ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 + $(FC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS) -lstdc++ -lpthread -L$(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../../lib -lamdhip64 +else + $(GPUCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS) +endif endif # Use target gtestlibs to build only googletest diff --git a/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuConfig.h index da4ba36ad8..06787c1c5e 100644 --- a/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuConfig.h @@ -15,6 +15,7 @@ #define MGONGPUCPP_GPUIMPL cuda #elif defined __HIPCC__ #define MGONGPUCPP_GPUIMPL hip +#include "hip/hip_runtime.h" // needed for blockDim, blockIdx, threadIdx: better in mgOnGpuConfig.h than in GpuAbstraction.h #else #undef MGONGPUCPP_GPUIMPL #endif diff --git a/epochX/cudacpp/gg_ttggg.sa/src/read_slha.cc b/epochX/cudacpp/gg_ttggg.sa/src/read_slha.cc index 055b19a779..f8e46f2e66 100644 --- 
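The mgOnGpuConfig.h hunk above moves #include "hip/hip_runtime.h" into the __HIPCC__ branch; per the patch comment, that header is what provides blockDim, blockIdx and threadIdx for HIP device code (nvcc makes the CUDA equivalents available implicitly). A minimal sketch, assuming a HIP toolchain, of the kind of kernel that needs those builtins; the scale() kernel and the main() driver are illustrative, not from the patch:

    #include "hip/hip_runtime.h" // declares blockDim, blockIdx, threadIdx under hipcc

    // Without the include above, hipcc cannot resolve the thread-indexing
    // builtins used in device code such as this.
    __global__ void scale( double* out, const double* in, double factor, int n )
    {
      const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // event index
      if( ievt < n ) out[ievt] = factor * in[ievt];
    }

    int main()
    {
      const int n = 256;
      double *d_in = nullptr, *d_out = nullptr;
      hipMalloc( (void**)&d_in, n * sizeof( double ) );
      hipMalloc( (void**)&d_out, n * sizeof( double ) );
      scale<<<1, n>>>( d_out, d_in, 2.0, n ); // 1 block of n threads
      hipDeviceSynchronize();
      hipFree( d_in );
      hipFree( d_out );
      return 0;
    }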
a/epochX/cudacpp/gg_ttggg.sa/src/read_slha.cc +++ b/epochX/cudacpp/gg_ttggg.sa/src/read_slha.cc @@ -11,7 +11,11 @@ #include #include -#include +//#ifdef __HIPCC__ +//#include // see https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 +//#else +//#include // bypass this completely to ease portability on LUMI #803 +//#endif #include #include @@ -60,7 +64,14 @@ SLHAReader::read_slha_file( std::string file_name, bool verbose ) { std::cout << "WARNING! Card file '" << file_name << "' does not exist:" << " look for the file in directory $" << envpath << "='" << getenv( envpath ) << "'" << std::endl; + /* +#ifdef __HIPCC__ + const std::string file_name2 = std::experimental::filesystem::path( getenv( envpath ) ) / std::experimental::filesystem::path( file_name ).filename(); +#else const std::string file_name2 = std::filesystem::path( getenv( envpath ) ) / std::filesystem::path( file_name ).filename(); +#endif + */ + const std::string file_name2 = std::string( getenv( envpath ) ) + "/" + file_name; // bypass std::filesystem #803 param_card.open( file_name2.c_str(), std::ifstream::in ); if( param_card.good() ) { diff --git a/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt b/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt index bb803498ee..2a21d715bb 100644 --- a/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt +++ b/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt @@ -61,7 +61,7 @@ set zerowidth_tchannel F define q = u c d s u~ c~ d~ s~ INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005455732345581055  +DEBUG: model prefixing takes 0.005668163299560547  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -170,7 +170,7 @@ INFO: Crossed process found for g u~ > t t~ u~, reuse diagrams. INFO: Crossed process found for g c~ > t t~ c~, reuse diagrams. INFO: Crossed process found for g d~ > t t~ d~, reuse diagrams. INFO: Crossed process found for g s~ > t t~ s~, reuse diagrams. -8 processes with 40 diagrams generated in 0.077 s +8 processes with 40 diagrams generated in 0.079 s Total: 8 processes with 40 diagrams output madevent ../TMPOUT/CODEGEN_mad_gq_ttq --hel_recycling=False --vector_size=32 --me_exporter=standalone_cudacpp Load PLUGIN.CUDACPP_OUTPUT @@ -198,7 +198,7 @@ INFO: Combined process g d~ > t t~ d~ WEIGHTED<=3 @1 with process g u~ > t t~ u~ INFO: Combined process g s~ > t t~ s~ WEIGHTED<=3 @1 with process g u~ > t t~ u~ WEIGHTED<=3 @1 INFO: Creating files in directory P1_gu_ttxu DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -215,7 +215,7 @@ INFO: Generating Feynman diagrams for Process: g u > t t~ u WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gu_ttxu INFO: Creating files in directory P1_gux_ttxux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -230,17 +230,17 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
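In the read_slha.cc hunks above, the angle-bracket header names were lost in extraction: the deleted include is <filesystem>, and the commented-out alternative referenced for older ROCm is <experimental/filesystem>. The retained fix (#803) avoids both by composing the fallback path with plain string concatenation. A self-contained sketch of that logic, assuming envpath names an environment variable that is set; openCard() and the variable name used in main() are illustrative, not the patch's own:

    #include <cstdlib>
    #include <fstream>
    #include <string>

    // Open 'file_name'; if that fails, retry as $envpath/file_name,
    // without using std::filesystem (#803).
    std::ifstream openCard( const std::string& file_name, const char* envpath )
    {
      std::ifstream card( file_name );
      if( !card.good() && getenv( envpath ) )
      {
        const std::string file_name2 = std::string( getenv( envpath ) ) + "/" + file_name;
        card.open( file_name2 );
      }
      return card;
    }

    int main()
    {
      std::ifstream card = openCard( "param_card.dat", "CARD_PATH" ); // hypothetical env var
      return card.good() ? 0 : 1;
    }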
DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: g u~ > t t~ u~ WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gux_ttxux -Generated helas calls for 2 subprocesses (10 diagrams) in 0.030 s -Wrote files for 32 helas calls in 0.216 s +Generated helas calls for 2 subprocesses (10 diagrams) in 0.032 s +Wrote files for 32 helas calls in 0.229 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVV1 routines -ALOHA: aloha creates 2 routines in 0.143 s +ALOHA: aloha creates 2 routines in 0.150 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVV1 routines -ALOHA: aloha creates 4 routines in 0.130 s +ALOHA: aloha creates 4 routines in 0.136 s FFV1 FFV1 FFV1 @@ -294,9 +294,9 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. quit -real 0m1.916s -user 0m1.672s -sys 0m0.240s +real 0m1.985s +user 0m1.757s +sys 0m0.224s Code generation completed in 2 seconds ************************************************************ * * diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/Bridge.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/Bridge.h index 89437b4c42..f9ed70dfde 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/Bridge.h @@ -14,11 +14,18 @@ #include "MemoryAccessMomenta.h" // for MemoryAccessMomenta::neppM #include "MemoryBuffers.h" // for HostBufferMomenta, DeviceBufferMomenta etc +//#ifdef __HIPCC__ +//#include // see https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 +//#else +//#include // bypass this completely to ease portability on LUMI #803 +//#endif + +#include // bypass std::filesystem #803 + #include #include #include #include -#include #include #include #include @@ -255,10 +262,18 @@ namespace mg5amcCpu // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate is called from several Fortran threads? 
CPPProcess process( /*verbose=*/false ); std::string paramCard = "../../Cards/param_card.dat"; - if( !std::filesystem::exists( paramCard ) ) - { - paramCard = "../" + paramCard; - } + /* +#ifdef __HIPCC__ + if( !std::experimental::filesystem::exists( paramCard ) ) paramCard = "../" + paramCard; +#else + if( !std::filesystem::exists( paramCard ) ) paramCard = "../" + paramCard; +#endif + */ + //struct stat dummybuffer; // bypass std::filesystem #803 + //if( !( stat( paramCard.c_str(), &dummyBuffer ) == 0 ) ) paramCard = "../" + paramCard; // + auto fileExists = []( std::string& fileName ) + { struct stat buffer; return stat( fileName.c_str(), &buffer ) == 0; }; + if( !fileExists( paramCard ) ) paramCard = "../" + paramCard; // bypass std::filesystem #803 process.initProc( paramCard ); } diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/GpuAbstraction.h index 6a7d9c05c0..9c467b1e04 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/GpuAbstraction.h @@ -39,8 +39,6 @@ #elif defined __HIPCC__ -#include "hip/hip_runtime.h" - #define gpuError_t hipError_t #define gpuPeekAtLastError hipPeekAtLastError #define gpuGetErrorString hipGetErrorString diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MadgraphTest.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MadgraphTest.h index a64c05c26a..6054185300 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MadgraphTest.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MadgraphTest.h @@ -14,7 +14,11 @@ #include #include -#include +//#ifdef __HIPCC__ +//#include // see https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 +//#else +//#include // bypass this completely to ease portability on LUMI #803 +//#endif #include #include #include @@ -219,7 +223,14 @@ TEST_P( MadgraphTest, CompareMomentaAndME ) const char* dumpEventsC = getenv( "CUDACPP_RUNTEST_DUMPEVENTS" ); const bool dumpEvents = ( dumpEventsC != 0 ) && ( std::string( dumpEventsC ) != "" ); const std::string refFileName = testDriver->getRefFileName(); + /* +#ifdef __HIPCC__ + const std::string dumpFileName = std::experimental::filesystem::path( refFileName ).filename(); +#else const std::string dumpFileName = std::filesystem::path( refFileName ).filename(); +#endif + */ + const std::string dumpFileName = refFileName; // bypass std::filesystem #803 std::ofstream dumpFile; if( dumpEvents ) { diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CPPProcess.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CPPProcess.cc index a376b0c455..4ece50575c 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CPPProcess.cc +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CPPProcess.cc @@ -631,6 +631,10 @@ namespace mg5amcCpu out << "Apple clang " << __clang_major__ << "." << __clang_minor__ << "." << __clang_patchlevel__; #else out << "clang " << __clang_major__ << "." << __clang_minor__ << "." << __clang_patchlevel__; + /* + // === AV 26-Jan-2024 DISABLE THIS CODE (START) + // === AV 26-Jan-2024 First, it is totally wrong to assume that the CXX environment variable is used in the build! + // === AV 26-Jan-2024 Second and worse, here we need build time values, while CXX in this code is evaluated at runtime! 
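The Bridge.h hunk above (repeated identically in every process directory) swaps std::filesystem::exists for a POSIX stat() probe wrapped in a lambda. Note that the commented-out draft just before it declares dummybuffer but uses dummyBuffer, which may be why that variant was left disabled. The retained lambda, as a standalone compilable sketch:

    #include <string>
    #include <sys/stat.h> // POSIX stat(), used to bypass std::filesystem (#803)

    int main()
    {
      std::string paramCard = "../../Cards/param_card.dat";
      // Returns true if the file exists (i.e. stat() succeeds)
      auto fileExists = []( const std::string& fileName )
      { struct stat buffer; return stat( fileName.c_str(), &buffer ) == 0; };
      if( !fileExists( paramCard ) ) paramCard = "../" + paramCard; // one-level fallback
      return 0;
    }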
// GCC toolchain version inside CLANG std::string tchainout; std::string tchaincmd = "readelf -p .comment $(${CXX} -print-libgcc-file-name) |& grep 'GCC: (GNU)' | grep -v Warning | sort -u | awk '{print $5}'"; @@ -644,6 +648,8 @@ namespace mg5amcCpu #else out << " (gcc " << tchainout << ")"; #endif + // === AV 26-Jan-2024 DISABLE THIS CODE (END) + */ #endif #else out << "clang UNKNOWKN"; diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/check_sa.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/check_sa.cc index 7cac5ab47b..aab490dc5b 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/check_sa.cc +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/check_sa.cc @@ -76,7 +76,7 @@ usage( char* argv0, int ret = 1 ) return ret; } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -84,7 +84,7 @@ namespace mg5amcCpu { inline void FPEhandler( int sig ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL std::cerr << "Floating Point Exception (GPU)" << std::endl; #else std::cerr << "Floating Point Exception (CPU)" << std::endl; diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CPPProcess.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CPPProcess.cc index 41f17b9fb0..fee492fbc1 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CPPProcess.cc +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CPPProcess.cc @@ -631,6 +631,10 @@ namespace mg5amcCpu out << "Apple clang " << __clang_major__ << "." << __clang_minor__ << "." << __clang_patchlevel__; #else out << "clang " << __clang_major__ << "." << __clang_minor__ << "." << __clang_patchlevel__; + /* + // === AV 26-Jan-2024 DISABLE THIS CODE (START) + // === AV 26-Jan-2024 First, it is totally wrong to assume that the CXX environment variable is used in the build! + // === AV 26-Jan-2024 Second and worse, here we need build time values, while CXX in this code is evaluated at runtime! 
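The block disabled by the AV 26-Jan-2024 comments tried to recover the GCC toolchain behind clang by shelling out at runtime: readelf -p .comment on the libgcc path printed by ${CXX}. The comments give the two reasons for disabling it: CXX need not be the compiler that actually built the binary, and the probe runs at execution time when build-time information is wanted. For reference, a hedged sketch of the command-capture idiom such a probe relies on; capture() is illustrative, and whether the original code used popen is not visible in this patch:

    #include <array>
    #include <cstdio>
    #include <string>

    // Run a shell command and capture its stdout (POSIX popen/pclose).
    std::string capture( const std::string& cmd )
    {
      std::array<char, 128> buf;
      std::string out;
      FILE* pipe = popen( cmd.c_str(), "r" );
      if( !pipe ) return out;
      while( fgets( buf.data(), buf.size(), pipe ) ) out += buf.data();
      pclose( pipe );
      return out;
    }

    int main()
    {
      // Illustrative only: this queries whatever $CXX is at *runtime*,
      // which is exactly the flaw the disabled code suffered from.
      printf( "%s", capture( "${CXX:-g++} --version | head -1" ).c_str() );
      return 0;
    }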
// GCC toolchain version inside CLANG std::string tchainout; std::string tchaincmd = "readelf -p .comment $(${CXX} -print-libgcc-file-name) |& grep 'GCC: (GNU)' | grep -v Warning | sort -u | awk '{print $5}'"; @@ -644,6 +648,8 @@ namespace mg5amcCpu #else out << " (gcc " << tchainout << ")"; #endif + // === AV 26-Jan-2024 DISABLE THIS CODE (END) + */ #endif #else out << "clang UNKNOWKN"; diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/check_sa.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/check_sa.cc index 7cac5ab47b..aab490dc5b 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/check_sa.cc +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/check_sa.cc @@ -76,7 +76,7 @@ usage( char* argv0, int ret = 1 ) return ret; } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -84,7 +84,7 @@ namespace mg5amcCpu { inline void FPEhandler( int sig ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL std::cerr << "Floating Point Exception (GPU)" << std::endl; #else std::cerr << "Floating Point Exception (CPU)" << std::endl; diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gq_ttq.mad/SubProcesses/cudacpp.mk index f2cfa349da..eefac8ff0d 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/cudacpp.mk @@ -139,13 +139,13 @@ endif # If CUDA_HOME is not set, try to set it from the path to nvcc ifndef CUDA_HOME - CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null)) + CUDA_HOME = $(patsubst %/bin/nvcc,%,$(shell which nvcc 2>/dev/null)) $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") endif # If HIP_HOME is not set, try to set it from the path to hipcc ifndef HIP_HOME - HIP_HOME = $(patsubst %bin/hipcc,%,$(HIP_COMPILER_PATH)) + HIP_HOME = $(patsubst %/bin/hipcc,%,$(shell which hipcc 2>/dev/null)) $(warning HIP_HOME was not set: using "$(HIP_HOME)") endif @@ -294,7 +294,9 @@ endif #=== Configure defaults and check if user-defined choices exist for OMPFLAGS, AVX, FPTYPE, HELINL, HRDCOD, RNDGEN # Set the default OMPFLAGS choice -ifneq ($(shell $(CXX) --version | egrep '^Intel'),) +ifneq ($(findstring hipcc,$(GPUCC)),) +override OMPFLAGS = # disable OpenMP MT when using hipcc #802 +else ifneq ($(shell $(CXX) --version | egrep '^Intel'),) override OMPFLAGS = -fopenmp ###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without GPUCC but not ok with GPUCC before #578) else ifneq ($(shell $(CXX) --version | egrep '^(clang)'),) @@ -652,6 +654,12 @@ $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: cu_objects_lib += $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cu_objects_lib) $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) +# Bypass std::filesystem completely to ease portability on LUMI #803 +#ifneq ($(findstring hipcc,$(GPUCC)),) +# $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -lstdc++fs +#else +# $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) +#endif endif #------------------------------------------------------------------------------- @@ -701,7 +709,11 @@ $(fcxx_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libg endif $(fcxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(fcxx_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o 
$(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) +ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 + $(FC) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) -lstdc++ +else $(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) +endif ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) @@ -713,8 +725,12 @@ $(fcu_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgf endif $(fcu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(fcu_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) +ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 + $(FC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) -lstdc++ -L$(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../../lib -lamdhip64 +else $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) endif +endif #------------------------------------------------------------------------------- @@ -779,6 +795,11 @@ $(testmain): LIBFLAGS += -lgomp endif endif +# Bypass std::filesystem completely to ease portability on LUMI #803 +#ifneq ($(findstring hipcc,$(GPUCC)),) +#$(testmain): LIBFLAGS += -lstdc++fs +#endif + ifeq ($(GPUCC),) # link only runTest.o $(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS) @@ -786,7 +807,11 @@ $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_object else # link both runTest.o and runTest_cu.o $(testmain): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) $(GTESTLIBS) - $(GPUCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS) +ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 + $(FC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS) -lstdc++ -lpthread -L$(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../../lib -lamdhip64 +else + $(GPUCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS) +endif endif # Use target gtestlibs to build only googletest diff --git a/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuConfig.h index 55d03f1252..69cee0085b 100644 --- a/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuConfig.h @@ -15,6 +15,7 @@ #define MGONGPUCPP_GPUIMPL cuda #elif defined __HIPCC__ #define MGONGPUCPP_GPUIMPL hip +#include "hip/hip_runtime.h" // needed for blockDim, blockIdx, threadIdx: better in mgOnGpuConfig.h than in GpuAbstraction.h #else #undef MGONGPUCPP_GPUIMPL #endif diff --git a/epochX/cudacpp/gq_ttq.mad/src/read_slha.cc b/epochX/cudacpp/gq_ttq.mad/src/read_slha.cc index 055b19a779..f8e46f2e66 100644 --- a/epochX/cudacpp/gq_ttq.mad/src/read_slha.cc +++ 
b/epochX/cudacpp/gq_ttq.mad/src/read_slha.cc @@ -11,7 +11,11 @@ #include #include -#include +//#ifdef __HIPCC__ +//#include // see https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 +//#else +//#include // bypass this completely to ease portability on LUMI #803 +//#endif #include #include @@ -60,7 +64,14 @@ SLHAReader::read_slha_file( std::string file_name, bool verbose ) { std::cout << "WARNING! Card file '" << file_name << "' does not exist:" << " look for the file in directory $" << envpath << "='" << getenv( envpath ) << "'" << std::endl; + /* +#ifdef __HIPCC__ + const std::string file_name2 = std::experimental::filesystem::path( getenv( envpath ) ) / std::experimental::filesystem::path( file_name ).filename(); +#else const std::string file_name2 = std::filesystem::path( getenv( envpath ) ) / std::filesystem::path( file_name ).filename(); +#endif + */ + const std::string file_name2 = std::string( getenv( envpath ) ) + "/" + file_name; // bypass std::filesystem #803 param_card.open( file_name2.c_str(), std::ifstream::in ); if( param_card.good() ) { diff --git a/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt b/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt index 5a07808142..c91b123988 100644 --- a/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt +++ b/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt @@ -61,7 +61,7 @@ set zerowidth_tchannel F define q = u c d s u~ c~ d~ s~ INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005926370620727539  +DEBUG: model prefixing takes 0.005486726760864258  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -170,7 +170,7 @@ INFO: Crossed process found for g u~ > t t~ u~, reuse diagrams. INFO: Crossed process found for g c~ > t t~ c~, reuse diagrams. INFO: Crossed process found for g d~ > t t~ d~, reuse diagrams. INFO: Crossed process found for g s~ > t t~ s~, reuse diagrams. -8 processes with 40 diagrams generated in 0.082 s +8 processes with 40 diagrams generated in 0.080 s Total: 8 processes with 40 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gq_ttq Load PLUGIN.CUDACPP_OUTPUT @@ -206,12 +206,12 @@ INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TM FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux/./CPPProcess.h FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux/./CPPProcess.cc INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux/. -Generated helas calls for 2 subprocesses (10 diagrams) in 0.031 s +Generated helas calls for 2 subprocesses (10 diagrams) in 0.030 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVV1 routines -ALOHA: aloha creates 2 routines in 0.179 s +ALOHA: aloha creates 2 routines in 0.148 s FFV1 FFV1 FFV1 @@ -227,7 +227,7 @@ INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/. 
quit -real 0m1.076s -user 0m0.601s -sys 0m0.061s -Code generation completed in 1 seconds +real 0m0.784s +user 0m0.611s +sys 0m0.049s +Code generation completed in 0 seconds diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/Bridge.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/Bridge.h index 89437b4c42..f9ed70dfde 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/Bridge.h @@ -14,11 +14,18 @@ #include "MemoryAccessMomenta.h" // for MemoryAccessMomenta::neppM #include "MemoryBuffers.h" // for HostBufferMomenta, DeviceBufferMomenta etc +//#ifdef __HIPCC__ +//#include // see https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 +//#else +//#include // bypass this completely to ease portability on LUMI #803 +//#endif + +#include // bypass std::filesystem #803 + #include #include #include #include -#include #include #include #include @@ -255,10 +262,18 @@ namespace mg5amcCpu // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate is called from several Fortran threads? CPPProcess process( /*verbose=*/false ); std::string paramCard = "../../Cards/param_card.dat"; - if( !std::filesystem::exists( paramCard ) ) - { - paramCard = "../" + paramCard; - } + /* +#ifdef __HIPCC__ + if( !std::experimental::filesystem::exists( paramCard ) ) paramCard = "../" + paramCard; +#else + if( !std::filesystem::exists( paramCard ) ) paramCard = "../" + paramCard; +#endif + */ + //struct stat dummybuffer; // bypass std::filesystem #803 + //if( !( stat( paramCard.c_str(), &dummyBuffer ) == 0 ) ) paramCard = "../" + paramCard; // + auto fileExists = []( std::string& fileName ) + { struct stat buffer; return stat( fileName.c_str(), &buffer ) == 0; }; + if( !fileExists( paramCard ) ) paramCard = "../" + paramCard; // bypass std::filesystem #803 process.initProc( paramCard ); } diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/GpuAbstraction.h index 6a7d9c05c0..9c467b1e04 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/GpuAbstraction.h @@ -39,8 +39,6 @@ #elif defined __HIPCC__ -#include "hip/hip_runtime.h" - #define gpuError_t hipError_t #define gpuPeekAtLastError hipPeekAtLastError #define gpuGetErrorString hipGetErrorString diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MadgraphTest.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MadgraphTest.h index a64c05c26a..6054185300 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MadgraphTest.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MadgraphTest.h @@ -14,7 +14,11 @@ #include #include -#include +//#ifdef __HIPCC__ +//#include // see https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 +//#else +//#include // bypass this completely to ease portability on LUMI #803 +//#endif #include #include #include @@ -219,7 +223,14 @@ TEST_P( MadgraphTest, CompareMomentaAndME ) const char* dumpEventsC = getenv( "CUDACPP_RUNTEST_DUMPEVENTS" ); const bool dumpEvents = ( dumpEventsC != 0 ) && ( std::string( dumpEventsC ) != "" ); const std::string refFileName = testDriver->getRefFileName(); + /* +#ifdef __HIPCC__ + const std::string dumpFileName = std::experimental::filesystem::path( refFileName ).filename(); +#else const std::string dumpFileName = std::filesystem::path( refFileName ).filename(); +#endif + */ + const std::string dumpFileName = refFileName; // bypass std::filesystem #803 std::ofstream dumpFile; if( dumpEvents ) { diff --git 
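The MadgraphTest.h hunks change behaviour slightly as a side effect of #803: the old code reduced the reference file name to its basename via std::filesystem::path(...).filename(), while the bypass keeps the full refFileName, so a dump would now be written alongside the reference file rather than in the working directory. If only the basename were still wanted without <filesystem>, a plain-string equivalent would be (a sketch of an alternative, not what the patch does):

    #include <string>

    // Basename without std::filesystem: keep everything after the last '/'.
    std::string basenameOf( const std::string& path )
    {
      const std::string::size_type pos = path.rfind( '/' );
      return ( pos == std::string::npos ) ? path : path.substr( pos + 1 );
    }

    int main()
    {
      return basenameOf( "a/b/dump.txt" ) == "dump.txt" ? 0 : 1; // illustrative path
    }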
a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CPPProcess.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CPPProcess.cc index c1543791ca..497f35fa66 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CPPProcess.cc +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CPPProcess.cc @@ -626,6 +626,10 @@ namespace mg5amcCpu out << "Apple clang " << __clang_major__ << "." << __clang_minor__ << "." << __clang_patchlevel__; #else out << "clang " << __clang_major__ << "." << __clang_minor__ << "." << __clang_patchlevel__; + /* + // === AV 26-Jan-2024 DISABLE THIS CODE (START) + // === AV 26-Jan-2024 First, it is totally wrong to assume that the CXX environment variable is used in the build! + // === AV 26-Jan-2024 Second and worse, here we need build time values, while CXX in this code is evaluated at runtime! // GCC toolchain version inside CLANG std::string tchainout; std::string tchaincmd = "readelf -p .comment $(${CXX} -print-libgcc-file-name) |& grep 'GCC: (GNU)' | grep -v Warning | sort -u | awk '{print $5}'"; @@ -639,6 +643,8 @@ namespace mg5amcCpu #else out << " (gcc " << tchainout << ")"; #endif + // === AV 26-Jan-2024 DISABLE THIS CODE (END) + */ #endif #else out << "clang UNKNOWKN"; diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/check_sa.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/check_sa.cc index 7cac5ab47b..aab490dc5b 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/check_sa.cc +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/check_sa.cc @@ -76,7 +76,7 @@ usage( char* argv0, int ret = 1 ) return ret; } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -84,7 +84,7 @@ namespace mg5amcCpu { inline void FPEhandler( int sig ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL std::cerr << "Floating Point Exception (GPU)" << std::endl; #else std::cerr << "Floating Point Exception (CPU)" << std::endl; diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CPPProcess.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CPPProcess.cc index a9294d1fea..61269d2eac 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CPPProcess.cc +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CPPProcess.cc @@ -626,6 +626,10 @@ namespace mg5amcCpu out << "Apple clang " << __clang_major__ << "." << __clang_minor__ << "." << __clang_patchlevel__; #else out << "clang " << __clang_major__ << "." << __clang_minor__ << "." << __clang_patchlevel__; + /* + // === AV 26-Jan-2024 DISABLE THIS CODE (START) + // === AV 26-Jan-2024 First, it is totally wrong to assume that the CXX environment variable is used in the build! + // === AV 26-Jan-2024 Second and worse, here we need build time values, while CXX in this code is evaluated at runtime! 
// GCC toolchain version inside CLANG std::string tchainout; std::string tchaincmd = "readelf -p .comment $(${CXX} -print-libgcc-file-name) |& grep 'GCC: (GNU)' | grep -v Warning | sort -u | awk '{print $5}'"; @@ -639,6 +643,8 @@ namespace mg5amcCpu #else out << " (gcc " << tchainout << ")"; #endif + // === AV 26-Jan-2024 DISABLE THIS CODE (END) + */ #endif #else out << "clang UNKNOWKN"; diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/check_sa.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/check_sa.cc index 7cac5ab47b..aab490dc5b 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/check_sa.cc +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/check_sa.cc @@ -76,7 +76,7 @@ usage( char* argv0, int ret = 1 ) return ret; } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -84,7 +84,7 @@ namespace mg5amcCpu { inline void FPEhandler( int sig ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL std::cerr << "Floating Point Exception (GPU)" << std::endl; #else std::cerr << "Floating Point Exception (CPU)" << std::endl; diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gq_ttq.sa/SubProcesses/cudacpp.mk index f2cfa349da..eefac8ff0d 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/cudacpp.mk @@ -139,13 +139,13 @@ endif # If CUDA_HOME is not set, try to set it from the path to nvcc ifndef CUDA_HOME - CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null)) + CUDA_HOME = $(patsubst %/bin/nvcc,%,$(shell which nvcc 2>/dev/null)) $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") endif # If HIP_HOME is not set, try to set it from the path to hipcc ifndef HIP_HOME - HIP_HOME = $(patsubst %bin/hipcc,%,$(HIP_COMPILER_PATH)) + HIP_HOME = $(patsubst %/bin/hipcc,%,$(shell which hipcc 2>/dev/null)) $(warning HIP_HOME was not set: using "$(HIP_HOME)") endif @@ -294,7 +294,9 @@ endif #=== Configure defaults and check if user-defined choices exist for OMPFLAGS, AVX, FPTYPE, HELINL, HRDCOD, RNDGEN # Set the default OMPFLAGS choice -ifneq ($(shell $(CXX) --version | egrep '^Intel'),) +ifneq ($(findstring hipcc,$(GPUCC)),) +override OMPFLAGS = # disable OpenMP MT when using hipcc #802 +else ifneq ($(shell $(CXX) --version | egrep '^Intel'),) override OMPFLAGS = -fopenmp ###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without GPUCC but not ok with GPUCC before #578) else ifneq ($(shell $(CXX) --version | egrep '^(clang)'),) @@ -652,6 +654,12 @@ $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: cu_objects_lib += $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cu_objects_lib) $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) +# Bypass std::filesystem completely to ease portability on LUMI #803 +#ifneq ($(findstring hipcc,$(GPUCC)),) +# $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -lstdc++fs +#else +# $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) +#endif endif #------------------------------------------------------------------------------- @@ -701,7 +709,11 @@ $(fcxx_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libg endif $(fcxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(fcxx_main): 
$(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) +ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 + $(FC) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) -lstdc++ +else $(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) +endif ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) @@ -713,8 +725,12 @@ $(fcu_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgf endif $(fcu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(fcu_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) +ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 + $(FC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) -lstdc++ -L$(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../../lib -lamdhip64 +else $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) endif +endif #------------------------------------------------------------------------------- @@ -779,6 +795,11 @@ $(testmain): LIBFLAGS += -lgomp endif endif +# Bypass std::filesystem completely to ease portability on LUMI #803 +#ifneq ($(findstring hipcc,$(GPUCC)),) +#$(testmain): LIBFLAGS += -lstdc++fs +#endif + ifeq ($(GPUCC),) # link only runTest.o $(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS) @@ -786,7 +807,11 @@ $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_object else # link both runTest.o and runTest_cu.o $(testmain): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) $(GTESTLIBS) - $(GPUCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS) +ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 + $(FC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS) -lstdc++ -lpthread -L$(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../../lib -lamdhip64 +else + $(GPUCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS) +endif endif # Use target gtestlibs to build only googletest diff --git a/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuConfig.h b/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuConfig.h index da4ba36ad8..06787c1c5e 100644 --- a/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuConfig.h @@ -15,6 +15,7 @@ #define MGONGPUCPP_GPUIMPL cuda #elif defined __HIPCC__ #define MGONGPUCPP_GPUIMPL hip +#include "hip/hip_runtime.h" // needed for blockDim, blockIdx, threadIdx: better in mgOnGpuConfig.h than in GpuAbstraction.h #else #undef MGONGPUCPP_GPUIMPL #endif diff --git a/epochX/cudacpp/gq_ttq.sa/src/read_slha.cc b/epochX/cudacpp/gq_ttq.sa/src/read_slha.cc index 055b19a779..f8e46f2e66 100644 --- 
a/epochX/cudacpp/gq_ttq.sa/src/read_slha.cc +++ b/epochX/cudacpp/gq_ttq.sa/src/read_slha.cc @@ -11,7 +11,11 @@ #include #include -#include +//#ifdef __HIPCC__ +//#include // see https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 +//#else +//#include // bypass this completely to ease portability on LUMI #803 +//#endif #include #include @@ -60,7 +64,14 @@ SLHAReader::read_slha_file( std::string file_name, bool verbose ) { std::cout << "WARNING! Card file '" << file_name << "' does not exist:" << " look for the file in directory $" << envpath << "='" << getenv( envpath ) << "'" << std::endl; + /* +#ifdef __HIPCC__ + const std::string file_name2 = std::experimental::filesystem::path( getenv( envpath ) ) / std::experimental::filesystem::path( file_name ).filename(); +#else const std::string file_name2 = std::filesystem::path( getenv( envpath ) ) / std::filesystem::path( file_name ).filename(); +#endif + */ + const std::string file_name2 = std::string( getenv( envpath ) ) + "/" + file_name; // bypass std::filesystem #803 param_card.open( file_name2.c_str(), std::ifstream::in ); if( param_card.good() ) { diff --git a/epochX/cudacpp/heft_gg_h.sa/CODEGEN_cudacpp_heft_gg_h_log.txt b/epochX/cudacpp/heft_gg_h.sa/CODEGEN_cudacpp_heft_gg_h_log.txt index 9bac4b3aae..db94d58de4 100644 --- a/epochX/cudacpp/heft_gg_h.sa/CODEGEN_cudacpp_heft_gg_h_log.txt +++ b/epochX/cudacpp/heft_gg_h.sa/CODEGEN_cudacpp_heft_gg_h_log.txt @@ -153,7 +153,7 @@ Generated helas calls for 1 subprocesses (1 diagrams) in 0.002 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVS3 routines -ALOHA: aloha creates 1 routines in 0.060 s +ALOHA: aloha creates 1 routines in 0.063 s VVS3 FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_h/src/./HelAmps_heft.h INFO: Created file HelAmps_heft.h in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_h/src/. @@ -165,7 +165,7 @@ INFO: Created files Parameters_heft.h and Parameters_heft.cc in directory INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_h/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_h/src/. quit -real 0m0.414s -user 0m0.350s -sys 0m0.059s -Code generation completed in 1 seconds +real 0m0.441s +user 0m0.375s +sys 0m0.052s +Code generation completed in 0 seconds diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/Bridge.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/Bridge.h index 89437b4c42..f9ed70dfde 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/Bridge.h +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/Bridge.h @@ -14,11 +14,18 @@ #include "MemoryAccessMomenta.h" // for MemoryAccessMomenta::neppM #include "MemoryBuffers.h" // for HostBufferMomenta, DeviceBufferMomenta etc +//#ifdef __HIPCC__ +//#include // see https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 +//#else +//#include // bypass this completely to ease portability on LUMI #803 +//#endif + +#include // bypass std::filesystem #803 + #include #include #include #include -#include #include #include #include @@ -255,10 +262,18 @@ namespace mg5amcCpu // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate is called from several Fortran threads? 
CPPProcess process( /*verbose=*/false ); std::string paramCard = "../../Cards/param_card.dat"; - if( !std::filesystem::exists( paramCard ) ) - { - paramCard = "../" + paramCard; - } + /* +#ifdef __HIPCC__ + if( !std::experimental::filesystem::exists( paramCard ) ) paramCard = "../" + paramCard; +#else + if( !std::filesystem::exists( paramCard ) ) paramCard = "../" + paramCard; +#endif + */ + //struct stat dummybuffer; // bypass std::filesystem #803 + //if( !( stat( paramCard.c_str(), &dummyBuffer ) == 0 ) ) paramCard = "../" + paramCard; // + auto fileExists = []( std::string& fileName ) + { struct stat buffer; return stat( fileName.c_str(), &buffer ) == 0; }; + if( !fileExists( paramCard ) ) paramCard = "../" + paramCard; // bypass std::filesystem #803 process.initProc( paramCard ); } diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/GpuAbstraction.h index 6a7d9c05c0..9c467b1e04 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/GpuAbstraction.h @@ -39,8 +39,6 @@ #elif defined __HIPCC__ -#include "hip/hip_runtime.h" - #define gpuError_t hipError_t #define gpuPeekAtLastError hipPeekAtLastError #define gpuGetErrorString hipGetErrorString diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MadgraphTest.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MadgraphTest.h index a64c05c26a..6054185300 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MadgraphTest.h +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MadgraphTest.h @@ -14,7 +14,11 @@ #include #include -#include +//#ifdef __HIPCC__ +//#include // see https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 +//#else +//#include // bypass this completely to ease portability on LUMI #803 +//#endif #include #include #include @@ -219,7 +223,14 @@ TEST_P( MadgraphTest, CompareMomentaAndME ) const char* dumpEventsC = getenv( "CUDACPP_RUNTEST_DUMPEVENTS" ); const bool dumpEvents = ( dumpEventsC != 0 ) && ( std::string( dumpEventsC ) != "" ); const std::string refFileName = testDriver->getRefFileName(); + /* +#ifdef __HIPCC__ + const std::string dumpFileName = std::experimental::filesystem::path( refFileName ).filename(); +#else const std::string dumpFileName = std::filesystem::path( refFileName ).filename(); +#endif + */ + const std::string dumpFileName = refFileName; // bypass std::filesystem #803 std::ofstream dumpFile; if( dumpEvents ) { diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/CPPProcess.cc b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/CPPProcess.cc index 3b6085c784..624791e8b3 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/CPPProcess.cc +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/CPPProcess.cc @@ -525,6 +525,10 @@ namespace mg5amcCpu out << "Apple clang " << __clang_major__ << "." << __clang_minor__ << "." << __clang_patchlevel__; #else out << "clang " << __clang_major__ << "." << __clang_minor__ << "." << __clang_patchlevel__; + /* + // === AV 26-Jan-2024 DISABLE THIS CODE (START) + // === AV 26-Jan-2024 First, it is totally wrong to assume that the CXX environment variable is used in the build! + // === AV 26-Jan-2024 Second and worse, here we need build time values, while CXX in this code is evaluated at runtime! 
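The GpuAbstraction.h hunks above only remove the now-redundant hip/hip_runtime.h include (relocated to mgOnGpuConfig.h); the header's job is to alias a single gpu* vocabulary onto either runtime API, as the gpuError_t / gpuPeekAtLastError / gpuGetErrorString defines show. A condensed sketch of that pattern follows; gpuSuccess and the checkGpu() helper are assumptions following the same naming scheme, not defines visible in this patch, and the snippet assumes compilation with nvcc or hipcc:

    #if defined __CUDACC__
    #define gpuError_t cudaError_t
    #define gpuPeekAtLastError cudaPeekAtLastError
    #define gpuGetErrorString cudaGetErrorString
    #define gpuSuccess cudaSuccess // assumed alias, not shown in this patch
    #elif defined __HIPCC__
    #include "hip/hip_runtime.h" // kept here only so this sketch is standalone
    #define gpuError_t hipError_t
    #define gpuPeekAtLastError hipPeekAtLastError
    #define gpuGetErrorString hipGetErrorString
    #define gpuSuccess hipSuccess // assumed alias, not shown in this patch
    #endif

    #include <cassert>
    #include <cstdio>

    // Hypothetical helper: abort with a message if a gpu* API call failed.
    inline void checkGpu( gpuError_t code, const char* file, int line )
    {
      if( code != gpuSuccess )
      {
        printf( "GPU error '%s' at %s:%d\n", gpuGetErrorString( code ), file, line );
        assert( code == gpuSuccess );
      }
    }

    int main()
    {
      checkGpu( gpuPeekAtLastError(), __FILE__, __LINE__ ); // no-op if no prior errors
      return 0;
    }

The same one-vocabulary approach is what lets the rest of the patch replace __CUDACC__ guards with MGONGPUCPP_GPUIMPL throughout.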
// GCC toolchain version inside CLANG std::string tchainout; std::string tchaincmd = "readelf -p .comment $(${CXX} -print-libgcc-file-name) |& grep 'GCC: (GNU)' | grep -v Warning | sort -u | awk '{print $5}'"; @@ -538,6 +542,8 @@ namespace mg5amcCpu #else out << " (gcc " << tchainout << ")"; #endif + // === AV 26-Jan-2024 DISABLE THIS CODE (END) + */ #endif #else out << "clang UNKNOWKN"; diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/check_sa.cc b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/check_sa.cc index 7cac5ab47b..aab490dc5b 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/check_sa.cc +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/check_sa.cc @@ -76,7 +76,7 @@ usage( char* argv0, int ret = 1 ) return ret; } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -84,7 +84,7 @@ namespace mg5amcCpu { inline void FPEhandler( int sig ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL std::cerr << "Floating Point Exception (GPU)" << std::endl; #else std::cerr << "Floating Point Exception (CPU)" << std::endl; diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/cudacpp.mk index f2cfa349da..eefac8ff0d 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/cudacpp.mk @@ -139,13 +139,13 @@ endif # If CUDA_HOME is not set, try to set it from the path to nvcc ifndef CUDA_HOME - CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null)) + CUDA_HOME = $(patsubst %/bin/nvcc,%,$(shell which nvcc 2>/dev/null)) $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") endif # If HIP_HOME is not set, try to set it from the path to hipcc ifndef HIP_HOME - HIP_HOME = $(patsubst %bin/hipcc,%,$(HIP_COMPILER_PATH)) + HIP_HOME = $(patsubst %/bin/hipcc,%,$(shell which hipcc 2>/dev/null)) $(warning HIP_HOME was not set: using "$(HIP_HOME)") endif @@ -294,7 +294,9 @@ endif #=== Configure defaults and check if user-defined choices exist for OMPFLAGS, AVX, FPTYPE, HELINL, HRDCOD, RNDGEN # Set the default OMPFLAGS choice -ifneq ($(shell $(CXX) --version | egrep '^Intel'),) +ifneq ($(findstring hipcc,$(GPUCC)),) +override OMPFLAGS = # disable OpenMP MT when using hipcc #802 +else ifneq ($(shell $(CXX) --version | egrep '^Intel'),) override OMPFLAGS = -fopenmp ###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without GPUCC but not ok with GPUCC before #578) else ifneq ($(shell $(CXX) --version | egrep '^(clang)'),) @@ -652,6 +654,12 @@ $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: cu_objects_lib += $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cu_objects_lib) $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) +# Bypass std::filesystem completely to ease portability on LUMI #803 +#ifneq ($(findstring hipcc,$(GPUCC)),) +# $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -lstdc++fs +#else +# $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) +#endif endif #------------------------------------------------------------------------------- @@ -701,7 +709,11 @@ $(fcxx_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libg endif $(fcxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(fcxx_main): 
$(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) +ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 + $(FC) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) -lstdc++ +else $(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) +endif ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) @@ -713,8 +725,12 @@ $(fcu_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgf endif $(fcu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(fcu_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) +ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 + $(FC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) -lstdc++ -L$(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../../lib -lamdhip64 +else $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) endif +endif #------------------------------------------------------------------------------- @@ -779,6 +795,11 @@ $(testmain): LIBFLAGS += -lgomp endif endif +# Bypass std::filesystem completely to ease portability on LUMI #803 +#ifneq ($(findstring hipcc,$(GPUCC)),) +#$(testmain): LIBFLAGS += -lstdc++fs +#endif + ifeq ($(GPUCC),) # link only runTest.o $(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS) @@ -786,7 +807,11 @@ $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_object else # link both runTest.o and runTest_cu.o $(testmain): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) $(GTESTLIBS) - $(GPUCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS) +ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 + $(FC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS) -lstdc++ -lpthread -L$(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../../lib -lamdhip64 +else + $(GPUCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS) +endif endif # Use target gtestlibs to build only googletest diff --git a/epochX/cudacpp/heft_gg_h.sa/src/mgOnGpuConfig.h b/epochX/cudacpp/heft_gg_h.sa/src/mgOnGpuConfig.h index da4ba36ad8..06787c1c5e 100644 --- a/epochX/cudacpp/heft_gg_h.sa/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/heft_gg_h.sa/src/mgOnGpuConfig.h @@ -15,6 +15,7 @@ #define MGONGPUCPP_GPUIMPL cuda #elif defined __HIPCC__ #define MGONGPUCPP_GPUIMPL hip +#include "hip/hip_runtime.h" // needed for blockDim, blockIdx, threadIdx: better in mgOnGpuConfig.h than in GpuAbstraction.h #else #undef MGONGPUCPP_GPUIMPL #endif diff --git a/epochX/cudacpp/heft_gg_h.sa/src/read_slha.cc b/epochX/cudacpp/heft_gg_h.sa/src/read_slha.cc index 055b19a779..f8e46f2e66 100644 --- 
a/epochX/cudacpp/heft_gg_h.sa/src/read_slha.cc +++ b/epochX/cudacpp/heft_gg_h.sa/src/read_slha.cc @@ -11,7 +11,11 @@ #include #include -#include +//#ifdef __HIPCC__ +//#include // see https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 +//#else +//#include // bypass this completely to ease portability on LUMI #803 +//#endif #include #include @@ -60,7 +64,14 @@ SLHAReader::read_slha_file( std::string file_name, bool verbose ) { std::cout << "WARNING! Card file '" << file_name << "' does not exist:" << " look for the file in directory $" << envpath << "='" << getenv( envpath ) << "'" << std::endl; + /* +#ifdef __HIPCC__ + const std::string file_name2 = std::experimental::filesystem::path( getenv( envpath ) ) / std::experimental::filesystem::path( file_name ).filename(); +#else const std::string file_name2 = std::filesystem::path( getenv( envpath ) ) / std::filesystem::path( file_name ).filename(); +#endif + */ + const std::string file_name2 = std::string( getenv( envpath ) ) + "/" + file_name; // bypass std::filesystem #803 param_card.open( file_name2.c_str(), std::ifstream::in ); if( param_card.good() ) { diff --git a/epochX/cudacpp/pp_tt012j.mad/CODEGEN_mad_pp_tt012j_log.txt b/epochX/cudacpp/pp_tt012j.mad/CODEGEN_mad_pp_tt012j_log.txt index adfd21027c..15bcd183c5 100644 --- a/epochX/cudacpp/pp_tt012j.mad/CODEGEN_mad_pp_tt012j_log.txt +++ b/epochX/cudacpp/pp_tt012j.mad/CODEGEN_mad_pp_tt012j_log.txt @@ -61,7 +61,7 @@ set zerowidth_tchannel F define j = p INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.0053827762603759766  +DEBUG: model prefixing takes 0.005548954010009766  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -172,7 +172,7 @@ INFO: Process u~ u > t t~ added to mirror process u u~ > t t~ INFO: Process c~ c > t t~ added to mirror process c c~ > t t~ INFO: Process d~ d > t t~ added to mirror process d d~ > t t~ INFO: Process s~ s > t t~ added to mirror process s s~ > t t~ -5 processes with 7 diagrams generated in 0.029 s +5 processes with 7 diagrams generated in 0.030 s Total: 5 processes with 7 diagrams add process p p > t t~ j @1 INFO: Checking for minimal orders which gives processes. @@ -212,7 +212,7 @@ INFO: Process d~ g > t t~ d~ added to mirror process g d~ > t t~ d~ INFO: Process d~ d > t t~ g added to mirror process d d~ > t t~ g INFO: Process s~ g > t t~ s~ added to mirror process g s~ > t t~ s~ INFO: Process s~ s > t t~ g added to mirror process s s~ > t t~ g -13 processes with 76 diagrams generated in 0.136 s +13 processes with 76 diagrams generated in 0.140 s Total: 18 processes with 83 diagrams add process p p > t t~ j j @2 INFO: Checking for minimal orders which gives processes. @@ -497,7 +497,7 @@ INFO: Combined process d d~ > t t~ WEIGHTED<=2 with process u u~ > t t~ WEIGHTED INFO: Combined process s s~ > t t~ WEIGHTED<=2 with process u u~ > t t~ WEIGHTED<=2 INFO: Creating files in directory P2_gg_ttxgg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . 
FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -514,7 +514,7 @@ INFO: Generating Feynman diagrams for Process: g g > t t~ g g WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group gg_ttxgg INFO: Creating files in directory P2_gg_ttxuux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -531,7 +531,7 @@ INFO: Generating Feynman diagrams for Process: g g > t t~ u u~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group gg_ttxuux INFO: Creating files in directory P2_gu_ttxgu DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -548,7 +548,7 @@ INFO: Generating Feynman diagrams for Process: g u > t t~ g u WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group gu_ttxgu INFO: Creating files in directory P2_gux_ttxgux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -565,7 +565,7 @@ INFO: Generating Feynman diagrams for Process: g u~ > t t~ g u~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group gux_ttxgux INFO: Creating files in directory P2_uux_ttxgg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -582,7 +582,7 @@ INFO: Generating Feynman diagrams for Process: u u~ > t t~ g g WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uux_ttxgg INFO: Creating files in directory P1_gg_ttxg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -599,7 +599,7 @@ INFO: Generating Feynman diagrams for Process: g g > t t~ g WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxg INFO: Creating files in directory P2_uu_ttxuu DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -616,7 +616,7 @@ INFO: Generating Feynman diagrams for Process: u u > t t~ u u WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uu_ttxuu INFO: Creating files in directory P2_uux_ttxuux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . 
FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -633,7 +633,7 @@ INFO: Generating Feynman diagrams for Process: u u~ > t t~ u u~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uux_ttxuux INFO: Creating files in directory P2_uxux_ttxuxux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -650,7 +650,7 @@ INFO: Generating Feynman diagrams for Process: u~ u~ > t t~ u~ u~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uxux_ttxuxux INFO: Creating files in directory P2_uc_ttxuc DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -667,7 +667,7 @@ INFO: Generating Feynman diagrams for Process: u c > t t~ u c WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uc_ttxuc INFO: Creating files in directory P2_uux_ttxccx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -684,7 +684,7 @@ INFO: Generating Feynman diagrams for Process: u u~ > t t~ c c~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uux_ttxccx INFO: Creating files in directory P2_ucx_ttxucx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -701,7 +701,7 @@ INFO: Generating Feynman diagrams for Process: u c~ > t t~ u c~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group ucx_ttxucx INFO: Creating files in directory P2_uxcx_ttxuxcx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -718,7 +718,7 @@ INFO: Generating Feynman diagrams for Process: u~ c~ > t t~ u~ c~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uxcx_ttxuxcx INFO: Creating files in directory P1_gu_ttxu DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -735,7 +735,7 @@ INFO: Generating Feynman diagrams for Process: g u > t t~ u WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gu_ttxu INFO: Creating files in directory P1_gux_ttxux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . 
FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -752,7 +752,7 @@ INFO: Generating Feynman diagrams for Process: g u~ > t t~ u~ WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gux_ttxux INFO: Creating files in directory P1_uux_ttxg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -769,7 +769,7 @@ INFO: Generating Feynman diagrams for Process: u u~ > t t~ g WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group uux_ttxg INFO: Creating files in directory P0_gg_ttx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -786,7 +786,7 @@ INFO: Generating Feynman diagrams for Process: g g > t t~ WEIGHTED<=2 INFO: Finding symmetric diagrams for subprocess group gg_ttx INFO: Creating files in directory P0_uux_ttx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -801,15 +801,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: u u~ > t t~ WEIGHTED<=2 INFO: Finding symmetric diagrams for subprocess group uux_ttx -Generated helas calls for 18 subprocesses (372 diagrams) in 1.297 s -Wrote files for 810 helas calls in 3.533 s +Generated helas calls for 18 subprocesses (372 diagrams) in 1.313 s +Wrote files for 810 helas calls in 3.363 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.333 s +ALOHA: aloha creates 5 routines in 0.361 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines @@ -817,7 +817,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 10 routines in 0.310 s +ALOHA: aloha creates 10 routines in 0.330 s VVV1 VVV1 FFV1 @@ -1028,10 +1028,10 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. 
quit -real 0m9.184s -user 0m8.370s -sys 0m0.508s -Code generation completed in 9 seconds +real 0m9.143s +user 0m8.594s +sys 0m0.500s +Code generation completed in 10 seconds ************************************************************ * * * W E L C O M E to * diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/Bridge.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/Bridge.h index 89437b4c42..f9ed70dfde 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/Bridge.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/Bridge.h @@ -14,11 +14,18 @@ #include "MemoryAccessMomenta.h" // for MemoryAccessMomenta::neppM #include "MemoryBuffers.h" // for HostBufferMomenta, DeviceBufferMomenta etc +//#ifdef __HIPCC__ +//#include // see https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 +//#else +//#include // bypass this completely to ease portability on LUMI #803 +//#endif + +#include // bypass std::filesystem #803 + #include #include #include #include -#include #include #include #include @@ -255,10 +262,18 @@ namespace mg5amcCpu // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate is called from several Fortran threads? CPPProcess process( /*verbose=*/false ); std::string paramCard = "../../Cards/param_card.dat"; - if( !std::filesystem::exists( paramCard ) ) - { - paramCard = "../" + paramCard; - } + /* +#ifdef __HIPCC__ + if( !std::experimental::filesystem::exists( paramCard ) ) paramCard = "../" + paramCard; +#else + if( !std::filesystem::exists( paramCard ) ) paramCard = "../" + paramCard; +#endif + */ + //struct stat dummybuffer; // bypass std::filesystem #803 + //if( !( stat( paramCard.c_str(), &dummyBuffer ) == 0 ) ) paramCard = "../" + paramCard; // + auto fileExists = []( std::string& fileName ) + { struct stat buffer; return stat( fileName.c_str(), &buffer ) == 0; }; + if( !fileExists( paramCard ) ) paramCard = "../" + paramCard; // bypass std::filesystem #803 process.initProc( paramCard ); } diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/GpuAbstraction.h index 6a7d9c05c0..9c467b1e04 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/GpuAbstraction.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/GpuAbstraction.h @@ -39,8 +39,6 @@ #elif defined __HIPCC__ -#include "hip/hip_runtime.h" - #define gpuError_t hipError_t #define gpuPeekAtLastError hipPeekAtLastError #define gpuGetErrorString hipGetErrorString diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MadgraphTest.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MadgraphTest.h index a64c05c26a..6054185300 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MadgraphTest.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MadgraphTest.h @@ -14,7 +14,11 @@ #include #include -#include +//#ifdef __HIPCC__ +//#include // see https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 +//#else +//#include // bypass this completely to ease portability on LUMI #803 +//#endif #include #include #include @@ -219,7 +223,14 @@ TEST_P( MadgraphTest, CompareMomentaAndME ) const char* dumpEventsC = getenv( "CUDACPP_RUNTEST_DUMPEVENTS" ); const bool dumpEvents = ( dumpEventsC != 0 ) && ( std::string( dumpEventsC ) != "" ); const std::string refFileName = testDriver->getRefFileName(); + /* +#ifdef __HIPCC__ + const std::string dumpFileName = std::experimental::filesystem::path( refFileName ).filename(); +#else const std::string dumpFileName = std::filesystem::path( refFileName ).filename(); +#endif + */ + const std::string dumpFileName = refFileName; // 
bypass std::filesystem #803 std::ofstream dumpFile; if( dumpEvents ) { diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/CPPProcess.cc index 40d8bdea5f..c465192676 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/CPPProcess.cc @@ -574,6 +574,10 @@ namespace mg5amcCpu out << "Apple clang " << __clang_major__ << "." << __clang_minor__ << "." << __clang_patchlevel__; #else out << "clang " << __clang_major__ << "." << __clang_minor__ << "." << __clang_patchlevel__; + /* + // === AV 26-Jan-2024 DISABLE THIS CODE (START) + // === AV 26-Jan-2024 First, it is totally wrong to assume that the CXX environment variable is used in the build! + // === AV 26-Jan-2024 Second and worse, here we need build time values, while CXX in this code is evaluated at runtime! // GCC toolchain version inside CLANG std::string tchainout; std::string tchaincmd = "readelf -p .comment $(${CXX} -print-libgcc-file-name) |& grep 'GCC: (GNU)' | grep -v Warning | sort -u | awk '{print $5}'"; @@ -587,6 +591,8 @@ namespace mg5amcCpu #else out << " (gcc " << tchainout << ")"; #endif + // === AV 26-Jan-2024 DISABLE THIS CODE (END) + */ #endif #else out << "clang UNKNOWKN"; diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/check_sa.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/check_sa.cc index 7cac5ab47b..aab490dc5b 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/check_sa.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/check_sa.cc @@ -76,7 +76,7 @@ usage( char* argv0, int ret = 1 ) return ret; } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -84,7 +84,7 @@ namespace mg5amcCpu { inline void FPEhandler( int sig ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL std::cerr << "Floating Point Exception (GPU)" << std::endl; #else std::cerr << "Floating Point Exception (CPU)" << std::endl; diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/CPPProcess.cc index 5f57cf55f3..85bdc6bf24 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/CPPProcess.cc @@ -551,6 +551,10 @@ namespace mg5amcCpu out << "Apple clang " << __clang_major__ << "." << __clang_minor__ << "." << __clang_patchlevel__; #else out << "clang " << __clang_major__ << "." << __clang_minor__ << "." << __clang_patchlevel__; + /* + // === AV 26-Jan-2024 DISABLE THIS CODE (START) + // === AV 26-Jan-2024 First, it is totally wrong to assume that the CXX environment variable is used in the build! + // === AV 26-Jan-2024 Second and worse, here we need build time values, while CXX in this code is evaluated at runtime! 
// GCC toolchain version inside CLANG std::string tchainout; std::string tchaincmd = "readelf -p .comment $(${CXX} -print-libgcc-file-name) |& grep 'GCC: (GNU)' | grep -v Warning | sort -u | awk '{print $5}'"; @@ -564,6 +568,8 @@ namespace mg5amcCpu #else out << " (gcc " << tchainout << ")"; #endif + // === AV 26-Jan-2024 DISABLE THIS CODE (END) + */ #endif #else out << "clang UNKNOWKN"; diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/check_sa.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/check_sa.cc index 7cac5ab47b..aab490dc5b 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/check_sa.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/check_sa.cc @@ -76,7 +76,7 @@ usage( char* argv0, int ret = 1 ) return ret; } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -84,7 +84,7 @@ namespace mg5amcCpu { inline void FPEhandler( int sig ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL std::cerr << "Floating Point Exception (GPU)" << std::endl; #else std::cerr << "Floating Point Exception (CPU)" << std::endl; diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc index 0e4d5d1157..2fa9b4f651 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc @@ -795,6 +795,10 @@ namespace mg5amcCpu out << "Apple clang " << __clang_major__ << "." << __clang_minor__ << "." << __clang_patchlevel__; #else out << "clang " << __clang_major__ << "." << __clang_minor__ << "." << __clang_patchlevel__; + /* + // === AV 26-Jan-2024 DISABLE THIS CODE (START) + // === AV 26-Jan-2024 First, it is totally wrong to assume that the CXX environment variable is used in the build! + // === AV 26-Jan-2024 Second and worse, here we need build time values, while CXX in this code is evaluated at runtime! // GCC toolchain version inside CLANG std::string tchainout; std::string tchaincmd = "readelf -p .comment $(${CXX} -print-libgcc-file-name) |& grep 'GCC: (GNU)' | grep -v Warning | sort -u | awk '{print $5}'"; @@ -808,6 +812,8 @@ namespace mg5amcCpu #else out << " (gcc " << tchainout << ")"; #endif + // === AV 26-Jan-2024 DISABLE THIS CODE (END) + */ #endif #else out << "clang UNKNOWKN"; diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/check_sa.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/check_sa.cc index 7cac5ab47b..aab490dc5b 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/check_sa.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/check_sa.cc @@ -76,7 +76,7 @@ usage( char* argv0, int ret = 1 ) return ret; } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -84,7 +84,7 @@ namespace mg5amcCpu { inline void FPEhandler( int sig ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL std::cerr << "Floating Point Exception (GPU)" << std::endl; #else std::cerr << "Floating Point Exception (CPU)" << std::endl; diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/CPPProcess.cc index e098c03e3a..f505a0d8c0 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/CPPProcess.cc @@ -631,6 +631,10 @@ namespace mg5amcCpu out << "Apple clang " << __clang_major__ << "." << __clang_minor__ << "." 
<< __clang_patchlevel__; #else out << "clang " << __clang_major__ << "." << __clang_minor__ << "." << __clang_patchlevel__; + /* + // === AV 26-Jan-2024 DISABLE THIS CODE (START) + // === AV 26-Jan-2024 First, it is totally wrong to assume that the CXX environment variable is used in the build! + // === AV 26-Jan-2024 Second and worse, here we need build time values, while CXX in this code is evaluated at runtime! // GCC toolchain version inside CLANG std::string tchainout; std::string tchaincmd = "readelf -p .comment $(${CXX} -print-libgcc-file-name) |& grep 'GCC: (GNU)' | grep -v Warning | sort -u | awk '{print $5}'"; @@ -644,6 +648,8 @@ namespace mg5amcCpu #else out << " (gcc " << tchainout << ")"; #endif + // === AV 26-Jan-2024 DISABLE THIS CODE (END) + */ #endif #else out << "clang UNKNOWKN"; diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/check_sa.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/check_sa.cc index 7cac5ab47b..aab490dc5b 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/check_sa.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/check_sa.cc @@ -76,7 +76,7 @@ usage( char* argv0, int ret = 1 ) return ret; } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -84,7 +84,7 @@ namespace mg5amcCpu { inline void FPEhandler( int sig ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL std::cerr << "Floating Point Exception (GPU)" << std::endl; #else std::cerr << "Floating Point Exception (CPU)" << std::endl; diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/CPPProcess.cc index 7308f8a2c7..9cba1a6d8c 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/CPPProcess.cc @@ -631,6 +631,10 @@ namespace mg5amcCpu out << "Apple clang " << __clang_major__ << "." << __clang_minor__ << "." << __clang_patchlevel__; #else out << "clang " << __clang_major__ << "." << __clang_minor__ << "." << __clang_patchlevel__; + /* + // === AV 26-Jan-2024 DISABLE THIS CODE (START) + // === AV 26-Jan-2024 First, it is totally wrong to assume that the CXX environment variable is used in the build! + // === AV 26-Jan-2024 Second and worse, here we need build time values, while CXX in this code is evaluated at runtime! 
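
// A condensed sketch, assembled from the hunks in this patch: the repeated
// __CUDACC__ -> MGONGPUCPP_GPUIMPL substitutions all rely on the dispatch
// added to mgOnGpuConfig.h (hunk further below), which identifies "some GPU
// implementation" rather than "CUDA specifically", so the same guards now
// select the GPU path for both nvcc and hipcc builds.
#if defined __CUDACC__
#define MGONGPUCPP_GPUIMPL cuda
#elif defined __HIPCC__
#define MGONGPUCPP_GPUIMPL hip
#include "hip/hip_runtime.h" // needed for blockDim, blockIdx, threadIdx
#else
#undef MGONGPUCPP_GPUIMPL
#endif
// Usage pattern, as in every check_sa.cc hunk: one source, two namespaces.
#ifdef MGONGPUCPP_GPUIMPL
namespace mg5amcGpu
#else
namespace mg5amcCpu
#endif
{
  // ... code compiled once for the GPU build and once for the CPU build ...
}
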
// GCC toolchain version inside CLANG std::string tchainout; std::string tchaincmd = "readelf -p .comment $(${CXX} -print-libgcc-file-name) |& grep 'GCC: (GNU)' | grep -v Warning | sort -u | awk '{print $5}'"; @@ -644,6 +648,8 @@ namespace mg5amcCpu #else out << " (gcc " << tchainout << ")"; #endif + // === AV 26-Jan-2024 DISABLE THIS CODE (END) + */ #endif #else out << "clang UNKNOWKN"; diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/check_sa.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/check_sa.cc index 7cac5ab47b..aab490dc5b 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/check_sa.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/check_sa.cc @@ -76,7 +76,7 @@ usage( char* argv0, int ret = 1 ) return ret; } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -84,7 +84,7 @@ namespace mg5amcCpu { inline void FPEhandler( int sig ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL std::cerr << "Floating Point Exception (GPU)" << std::endl; #else std::cerr << "Floating Point Exception (CPU)" << std::endl; diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/CPPProcess.cc index b37df5d33f..222171c5cc 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/CPPProcess.cc @@ -631,6 +631,10 @@ namespace mg5amcCpu out << "Apple clang " << __clang_major__ << "." << __clang_minor__ << "." << __clang_patchlevel__; #else out << "clang " << __clang_major__ << "." << __clang_minor__ << "." << __clang_patchlevel__; + /* + // === AV 26-Jan-2024 DISABLE THIS CODE (START) + // === AV 26-Jan-2024 First, it is totally wrong to assume that the CXX environment variable is used in the build! + // === AV 26-Jan-2024 Second and worse, here we need build time values, while CXX in this code is evaluated at runtime! // GCC toolchain version inside CLANG std::string tchainout; std::string tchaincmd = "readelf -p .comment $(${CXX} -print-libgcc-file-name) |& grep 'GCC: (GNU)' | grep -v Warning | sort -u | awk '{print $5}'"; @@ -644,6 +648,8 @@ namespace mg5amcCpu #else out << " (gcc " << tchainout << ")"; #endif + // === AV 26-Jan-2024 DISABLE THIS CODE (END) + */ #endif #else out << "clang UNKNOWKN"; diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/check_sa.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/check_sa.cc index 7cac5ab47b..aab490dc5b 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/check_sa.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/check_sa.cc @@ -76,7 +76,7 @@ usage( char* argv0, int ret = 1 ) return ret; } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -84,7 +84,7 @@ namespace mg5amcCpu { inline void FPEhandler( int sig ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL std::cerr << "Floating Point Exception (GPU)" << std::endl; #else std::cerr << "Floating Point Exception (CPU)" << std::endl; diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/CPPProcess.cc index b4df38fb35..c374ce3189 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/CPPProcess.cc @@ -2741,6 +2741,10 @@ namespace mg5amcCpu out << "Apple clang " << __clang_major__ << "." 
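
// For comparison with the MadgraphTest.h hunk earlier in this patch: the
// #803 bypass there keeps the full refFileName instead of reproducing
// std::filesystem::path( refFileName ).filename(). A string-based equivalent
// of filename(), shown only as a hypothetical alternative the patch did not
// take:
#include <string>
std::string baseName( const std::string& path )
{
  const std::string::size_type pos = path.find_last_of( '/' );
  return ( pos == std::string::npos ) ? path : path.substr( pos + 1 );
}
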
<< __clang_minor__ << "." << __clang_patchlevel__; #else out << "clang " << __clang_major__ << "." << __clang_minor__ << "." << __clang_patchlevel__; + /* + // === AV 26-Jan-2024 DISABLE THIS CODE (START) + // === AV 26-Jan-2024 First, it is totally wrong to assume that the CXX environment variable is used in the build! + // === AV 26-Jan-2024 Second and worse, here we need build time values, while CXX in this code is evaluated at runtime! // GCC toolchain version inside CLANG std::string tchainout; std::string tchaincmd = "readelf -p .comment $(${CXX} -print-libgcc-file-name) |& grep 'GCC: (GNU)' | grep -v Warning | sort -u | awk '{print $5}'"; @@ -2754,6 +2758,8 @@ namespace mg5amcCpu #else out << " (gcc " << tchainout << ")"; #endif + // === AV 26-Jan-2024 DISABLE THIS CODE (END) + */ #endif #else out << "clang UNKNOWKN"; diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/check_sa.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/check_sa.cc index 7cac5ab47b..aab490dc5b 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/check_sa.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/check_sa.cc @@ -76,7 +76,7 @@ usage( char* argv0, int ret = 1 ) return ret; } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -84,7 +84,7 @@ namespace mg5amcCpu { inline void FPEhandler( int sig ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL std::cerr << "Floating Point Exception (GPU)" << std::endl; #else std::cerr << "Floating Point Exception (CPU)" << std::endl; diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/CPPProcess.cc index bc38d1f109..6e93814e8e 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/CPPProcess.cc @@ -1136,6 +1136,10 @@ namespace mg5amcCpu out << "Apple clang " << __clang_major__ << "." << __clang_minor__ << "." << __clang_patchlevel__; #else out << "clang " << __clang_major__ << "." << __clang_minor__ << "." << __clang_patchlevel__; + /* + // === AV 26-Jan-2024 DISABLE THIS CODE (START) + // === AV 26-Jan-2024 First, it is totally wrong to assume that the CXX environment variable is used in the build! + // === AV 26-Jan-2024 Second and worse, here we need build time values, while CXX in this code is evaluated at runtime! 
// GCC toolchain version inside CLANG std::string tchainout; std::string tchaincmd = "readelf -p .comment $(${CXX} -print-libgcc-file-name) |& grep 'GCC: (GNU)' | grep -v Warning | sort -u | awk '{print $5}'"; @@ -1149,6 +1153,8 @@ namespace mg5amcCpu #else out << " (gcc " << tchainout << ")"; #endif + // === AV 26-Jan-2024 DISABLE THIS CODE (END) + */ #endif #else out << "clang UNKNOWKN"; diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/check_sa.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/check_sa.cc index 7cac5ab47b..aab490dc5b 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/check_sa.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/check_sa.cc @@ -76,7 +76,7 @@ usage( char* argv0, int ret = 1 ) return ret; } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -84,7 +84,7 @@ namespace mg5amcCpu { inline void FPEhandler( int sig ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL std::cerr << "Floating Point Exception (GPU)" << std::endl; #else std::cerr << "Floating Point Exception (CPU)" << std::endl; diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/CPPProcess.cc index a17bd3518e..ed55777a81 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/CPPProcess.cc @@ -1136,6 +1136,10 @@ namespace mg5amcCpu out << "Apple clang " << __clang_major__ << "." << __clang_minor__ << "." << __clang_patchlevel__; #else out << "clang " << __clang_major__ << "." << __clang_minor__ << "." << __clang_patchlevel__; + /* + // === AV 26-Jan-2024 DISABLE THIS CODE (START) + // === AV 26-Jan-2024 First, it is totally wrong to assume that the CXX environment variable is used in the build! + // === AV 26-Jan-2024 Second and worse, here we need build time values, while CXX in this code is evaluated at runtime! 
// GCC toolchain version inside CLANG std::string tchainout; std::string tchaincmd = "readelf -p .comment $(${CXX} -print-libgcc-file-name) |& grep 'GCC: (GNU)' | grep -v Warning | sort -u | awk '{print $5}'"; @@ -1149,6 +1153,8 @@ namespace mg5amcCpu #else out << " (gcc " << tchainout << ")"; #endif + // === AV 26-Jan-2024 DISABLE THIS CODE (END) + */ #endif #else out << "clang UNKNOWKN"; diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/check_sa.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/check_sa.cc index 7cac5ab47b..aab490dc5b 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/check_sa.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/check_sa.cc @@ -76,7 +76,7 @@ usage( char* argv0, int ret = 1 ) return ret; } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -84,7 +84,7 @@ namespace mg5amcCpu { inline void FPEhandler( int sig ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL std::cerr << "Floating Point Exception (GPU)" << std::endl; #else std::cerr << "Floating Point Exception (CPU)" << std::endl; diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/CPPProcess.cc index 6a53d09c8e..1e9d03033f 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/CPPProcess.cc @@ -1136,6 +1136,10 @@ namespace mg5amcCpu out << "Apple clang " << __clang_major__ << "." << __clang_minor__ << "." << __clang_patchlevel__; #else out << "clang " << __clang_major__ << "." << __clang_minor__ << "." << __clang_patchlevel__; + /* + // === AV 26-Jan-2024 DISABLE THIS CODE (START) + // === AV 26-Jan-2024 First, it is totally wrong to assume that the CXX environment variable is used in the build! + // === AV 26-Jan-2024 Second and worse, here we need build time values, while CXX in this code is evaluated at runtime! 
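
// Context for the GpuAbstraction.h hunk above (which only moves the
// hip_runtime include into mgOnGpuConfig.h): the header maps gpu-prefixed
// names onto the native runtime API. Only the HIP branch is visible in this
// patch; the CUDA branch below is an assumption inferred from the naming
// scheme, not quoted from the patch.
#if defined __CUDACC__
#define gpuError_t cudaError_t                 // assumed CUDA counterpart
#define gpuPeekAtLastError cudaPeekAtLastError // assumed CUDA counterpart
#define gpuGetErrorString cudaGetErrorString   // assumed CUDA counterpart
#elif defined __HIPCC__
#define gpuError_t hipError_t                  // verbatim in the hunk above
#define gpuPeekAtLastError hipPeekAtLastError
#define gpuGetErrorString hipGetErrorString
#endif
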
// GCC toolchain version inside CLANG std::string tchainout; std::string tchaincmd = "readelf -p .comment $(${CXX} -print-libgcc-file-name) |& grep 'GCC: (GNU)' | grep -v Warning | sort -u | awk '{print $5}'"; @@ -1149,6 +1153,8 @@ namespace mg5amcCpu #else out << " (gcc " << tchainout << ")"; #endif + // === AV 26-Jan-2024 DISABLE THIS CODE (END) + */ #endif #else out << "clang UNKNOWKN"; diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/check_sa.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/check_sa.cc index 7cac5ab47b..aab490dc5b 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/check_sa.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/check_sa.cc @@ -76,7 +76,7 @@ usage( char* argv0, int ret = 1 ) return ret; } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -84,7 +84,7 @@ namespace mg5amcCpu { inline void FPEhandler( int sig ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL std::cerr << "Floating Point Exception (GPU)" << std::endl; #else std::cerr << "Floating Point Exception (CPU)" << std::endl; diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/CPPProcess.cc index fedf955b6a..75930b65cd 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/CPPProcess.cc @@ -711,6 +711,10 @@ namespace mg5amcCpu out << "Apple clang " << __clang_major__ << "." << __clang_minor__ << "." << __clang_patchlevel__; #else out << "clang " << __clang_major__ << "." << __clang_minor__ << "." << __clang_patchlevel__; + /* + // === AV 26-Jan-2024 DISABLE THIS CODE (START) + // === AV 26-Jan-2024 First, it is totally wrong to assume that the CXX environment variable is used in the build! + // === AV 26-Jan-2024 Second and worse, here we need build time values, while CXX in this code is evaluated at runtime! 
// GCC toolchain version inside CLANG std::string tchainout; std::string tchaincmd = "readelf -p .comment $(${CXX} -print-libgcc-file-name) |& grep 'GCC: (GNU)' | grep -v Warning | sort -u | awk '{print $5}'"; @@ -724,6 +728,8 @@ namespace mg5amcCpu #else out << " (gcc " << tchainout << ")"; #endif + // === AV 26-Jan-2024 DISABLE THIS CODE (END) + */ #endif #else out << "clang UNKNOWKN"; diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/check_sa.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/check_sa.cc index 7cac5ab47b..aab490dc5b 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/check_sa.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/check_sa.cc @@ -76,7 +76,7 @@ usage( char* argv0, int ret = 1 ) return ret; } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -84,7 +84,7 @@ namespace mg5amcCpu { inline void FPEhandler( int sig ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL std::cerr << "Floating Point Exception (GPU)" << std::endl; #else std::cerr << "Floating Point Exception (CPU)" << std::endl; diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/CPPProcess.cc index fc99b3bfae..b12263362e 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/CPPProcess.cc @@ -717,6 +717,10 @@ namespace mg5amcCpu out << "Apple clang " << __clang_major__ << "." << __clang_minor__ << "." << __clang_patchlevel__; #else out << "clang " << __clang_major__ << "." << __clang_minor__ << "." << __clang_patchlevel__; + /* + // === AV 26-Jan-2024 DISABLE THIS CODE (START) + // === AV 26-Jan-2024 First, it is totally wrong to assume that the CXX environment variable is used in the build! + // === AV 26-Jan-2024 Second and worse, here we need build time values, while CXX in this code is evaluated at runtime! 
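
// The FPEhandler touched by every check_sa.cc hunk is a SIGFPE handler; the
// patch only changes which #ifdef guards its GPU/CPU message. For context, a
// minimal sketch of how such a handler might be installed; the feenableexcept
// call (a glibc extension) and the exit are assumptions for the sketch, not
// taken from this patch.
#include <csignal>
#include <fenv.h> // feenableexcept (glibc)
#include <cstdlib>
#include <iostream>
inline void FPEhandler( int sig )
{
  std::cerr << "Floating Point Exception (CPU)" << std::endl;
  std::exit( 1 );
}
void enableFPE()
{
  feenableexcept( FE_DIVBYZERO | FE_INVALID | FE_OVERFLOW ); // trap these FPEs
  std::signal( SIGFPE, FPEhandler );
}
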
// GCC toolchain version inside CLANG std::string tchainout; std::string tchaincmd = "readelf -p .comment $(${CXX} -print-libgcc-file-name) |& grep 'GCC: (GNU)' | grep -v Warning | sort -u | awk '{print $5}'"; @@ -730,6 +734,8 @@ namespace mg5amcCpu #else out << " (gcc " << tchainout << ")"; #endif + // === AV 26-Jan-2024 DISABLE THIS CODE (END) + */ #endif #else out << "clang UNKNOWKN"; diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/check_sa.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/check_sa.cc index 7cac5ab47b..aab490dc5b 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/check_sa.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/check_sa.cc @@ -76,7 +76,7 @@ usage( char* argv0, int ret = 1 ) return ret; } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -84,7 +84,7 @@ namespace mg5amcCpu { inline void FPEhandler( int sig ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL std::cerr << "Floating Point Exception (GPU)" << std::endl; #else std::cerr << "Floating Point Exception (CPU)" << std::endl; diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/CPPProcess.cc index 97912e5855..d55c7270e6 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/CPPProcess.cc @@ -821,6 +821,10 @@ namespace mg5amcCpu out << "Apple clang " << __clang_major__ << "." << __clang_minor__ << "." << __clang_patchlevel__; #else out << "clang " << __clang_major__ << "." << __clang_minor__ << "." << __clang_patchlevel__; + /* + // === AV 26-Jan-2024 DISABLE THIS CODE (START) + // === AV 26-Jan-2024 First, it is totally wrong to assume that the CXX environment variable is used in the build! + // === AV 26-Jan-2024 Second and worse, here we need build time values, while CXX in this code is evaluated at runtime! 
// GCC toolchain version inside CLANG std::string tchainout; std::string tchaincmd = "readelf -p .comment $(${CXX} -print-libgcc-file-name) |& grep 'GCC: (GNU)' | grep -v Warning | sort -u | awk '{print $5}'"; @@ -834,6 +838,8 @@ namespace mg5amcCpu #else out << " (gcc " << tchainout << ")"; #endif + // === AV 26-Jan-2024 DISABLE THIS CODE (END) + */ #endif #else out << "clang UNKNOWKN"; diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/check_sa.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/check_sa.cc index 7cac5ab47b..aab490dc5b 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/check_sa.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/check_sa.cc @@ -76,7 +76,7 @@ usage( char* argv0, int ret = 1 ) return ret; } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -84,7 +84,7 @@ namespace mg5amcCpu { inline void FPEhandler( int sig ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL std::cerr << "Floating Point Exception (GPU)" << std::endl; #else std::cerr << "Floating Point Exception (CPU)" << std::endl; diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/CPPProcess.cc index be2315b035..ed38fae08c 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/CPPProcess.cc @@ -717,6 +717,10 @@ namespace mg5amcCpu out << "Apple clang " << __clang_major__ << "." << __clang_minor__ << "." << __clang_patchlevel__; #else out << "clang " << __clang_major__ << "." << __clang_minor__ << "." << __clang_patchlevel__; + /* + // === AV 26-Jan-2024 DISABLE THIS CODE (START) + // === AV 26-Jan-2024 First, it is totally wrong to assume that the CXX environment variable is used in the build! + // === AV 26-Jan-2024 Second and worse, here we need build time values, while CXX in this code is evaluated at runtime! 
// GCC toolchain version inside CLANG std::string tchainout; std::string tchaincmd = "readelf -p .comment $(${CXX} -print-libgcc-file-name) |& grep 'GCC: (GNU)' | grep -v Warning | sort -u | awk '{print $5}'"; @@ -730,6 +734,8 @@ namespace mg5amcCpu #else out << " (gcc " << tchainout << ")"; #endif + // === AV 26-Jan-2024 DISABLE THIS CODE (END) + */ #endif #else out << "clang UNKNOWKN"; diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/check_sa.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/check_sa.cc index 7cac5ab47b..aab490dc5b 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/check_sa.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/check_sa.cc @@ -76,7 +76,7 @@ usage( char* argv0, int ret = 1 ) return ret; } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -84,7 +84,7 @@ namespace mg5amcCpu { inline void FPEhandler( int sig ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL std::cerr << "Floating Point Exception (GPU)" << std::endl; #else std::cerr << "Floating Point Exception (CPU)" << std::endl; diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/CPPProcess.cc index c83b7be449..b127281504 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/CPPProcess.cc @@ -1136,6 +1136,10 @@ namespace mg5amcCpu out << "Apple clang " << __clang_major__ << "." << __clang_minor__ << "." << __clang_patchlevel__; #else out << "clang " << __clang_major__ << "." << __clang_minor__ << "." << __clang_patchlevel__; + /* + // === AV 26-Jan-2024 DISABLE THIS CODE (START) + // === AV 26-Jan-2024 First, it is totally wrong to assume that the CXX environment variable is used in the build! + // === AV 26-Jan-2024 Second and worse, here we need build time values, while CXX in this code is evaluated at runtime! 
// GCC toolchain version inside CLANG std::string tchainout; std::string tchaincmd = "readelf -p .comment $(${CXX} -print-libgcc-file-name) |& grep 'GCC: (GNU)' | grep -v Warning | sort -u | awk '{print $5}'"; @@ -1149,6 +1153,8 @@ namespace mg5amcCpu #else out << " (gcc " << tchainout << ")"; #endif + // === AV 26-Jan-2024 DISABLE THIS CODE (END) + */ #endif #else out << "clang UNKNOWKN"; diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/check_sa.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/check_sa.cc index 7cac5ab47b..aab490dc5b 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/check_sa.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/check_sa.cc @@ -76,7 +76,7 @@ usage( char* argv0, int ret = 1 ) return ret; } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -84,7 +84,7 @@ namespace mg5amcCpu { inline void FPEhandler( int sig ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL std::cerr << "Floating Point Exception (GPU)" << std::endl; #else std::cerr << "Floating Point Exception (CPU)" << std::endl; diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/CPPProcess.cc index 3ecdb48914..80d2682458 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/CPPProcess.cc @@ -821,6 +821,10 @@ namespace mg5amcCpu out << "Apple clang " << __clang_major__ << "." << __clang_minor__ << "." << __clang_patchlevel__; #else out << "clang " << __clang_major__ << "." << __clang_minor__ << "." << __clang_patchlevel__; + /* + // === AV 26-Jan-2024 DISABLE THIS CODE (START) + // === AV 26-Jan-2024 First, it is totally wrong to assume that the CXX environment variable is used in the build! + // === AV 26-Jan-2024 Second and worse, here we need build time values, while CXX in this code is evaluated at runtime! 
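
// Why the hip_runtime include moves into mgOnGpuConfig.h (hunk further
// below): its comment notes it is "needed for blockDim, blockIdx, threadIdx".
// Unlike nvcc, hipcc only provides these kernel index variables through
// hip/hip_runtime.h, so any translation unit containing kernel code must see
// that header. An illustrative kernel (not from this patch) showing the
// dependency:
__global__ void setOne( double* out, int nevt )
{
  const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // global event index
  if( ievt < nevt ) out[ievt] = 1.;
}
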
// GCC toolchain version inside CLANG std::string tchainout; std::string tchaincmd = "readelf -p .comment $(${CXX} -print-libgcc-file-name) |& grep 'GCC: (GNU)' | grep -v Warning | sort -u | awk '{print $5}'"; @@ -834,6 +838,8 @@ namespace mg5amcCpu #else out << " (gcc " << tchainout << ")"; #endif + // === AV 26-Jan-2024 DISABLE THIS CODE (END) + */ #endif #else out << "clang UNKNOWKN"; diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/check_sa.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/check_sa.cc index 7cac5ab47b..aab490dc5b 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/check_sa.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/check_sa.cc @@ -76,7 +76,7 @@ usage( char* argv0, int ret = 1 ) return ret; } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -84,7 +84,7 @@ namespace mg5amcCpu { inline void FPEhandler( int sig ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL std::cerr << "Floating Point Exception (GPU)" << std::endl; #else std::cerr << "Floating Point Exception (CPU)" << std::endl; diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/CPPProcess.cc index e21d1f0c48..8dd445388c 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/CPPProcess.cc @@ -711,6 +711,10 @@ namespace mg5amcCpu out << "Apple clang " << __clang_major__ << "." << __clang_minor__ << "." << __clang_patchlevel__; #else out << "clang " << __clang_major__ << "." << __clang_minor__ << "." << __clang_patchlevel__; + /* + // === AV 26-Jan-2024 DISABLE THIS CODE (START) + // === AV 26-Jan-2024 First, it is totally wrong to assume that the CXX environment variable is used in the build! + // === AV 26-Jan-2024 Second and worse, here we need build time values, while CXX in this code is evaluated at runtime! 
// GCC toolchain version inside CLANG std::string tchainout; std::string tchaincmd = "readelf -p .comment $(${CXX} -print-libgcc-file-name) |& grep 'GCC: (GNU)' | grep -v Warning | sort -u | awk '{print $5}'"; @@ -724,6 +728,8 @@ namespace mg5amcCpu #else out << " (gcc " << tchainout << ")"; #endif + // === AV 26-Jan-2024 DISABLE THIS CODE (END) + */ #endif #else out << "clang UNKNOWKN"; diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/check_sa.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/check_sa.cc index 7cac5ab47b..aab490dc5b 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/check_sa.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/check_sa.cc @@ -76,7 +76,7 @@ usage( char* argv0, int ret = 1 ) return ret; } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -84,7 +84,7 @@ namespace mg5amcCpu { inline void FPEhandler( int sig ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL std::cerr << "Floating Point Exception (GPU)" << std::endl; #else std::cerr << "Floating Point Exception (CPU)" << std::endl; diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/CPPProcess.cc index 527b1d3c8f..08e9ed321f 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/CPPProcess.cc @@ -821,6 +821,10 @@ namespace mg5amcCpu out << "Apple clang " << __clang_major__ << "." << __clang_minor__ << "." << __clang_patchlevel__; #else out << "clang " << __clang_major__ << "." << __clang_minor__ << "." << __clang_patchlevel__; + /* + // === AV 26-Jan-2024 DISABLE THIS CODE (START) + // === AV 26-Jan-2024 First, it is totally wrong to assume that the CXX environment variable is used in the build! + // === AV 26-Jan-2024 Second and worse, here we need build time values, while CXX in this code is evaluated at runtime! 
// GCC toolchain version inside CLANG std::string tchainout; std::string tchaincmd = "readelf -p .comment $(${CXX} -print-libgcc-file-name) |& grep 'GCC: (GNU)' | grep -v Warning | sort -u | awk '{print $5}'"; @@ -834,6 +838,8 @@ namespace mg5amcCpu #else out << " (gcc " << tchainout << ")"; #endif + // === AV 26-Jan-2024 DISABLE THIS CODE (END) + */ #endif #else out << "clang UNKNOWKN"; diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/check_sa.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/check_sa.cc index 7cac5ab47b..aab490dc5b 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/check_sa.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/check_sa.cc @@ -76,7 +76,7 @@ usage( char* argv0, int ret = 1 ) return ret; } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -84,7 +84,7 @@ namespace mg5amcCpu { inline void FPEhandler( int sig ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL std::cerr << "Floating Point Exception (GPU)" << std::endl; #else std::cerr << "Floating Point Exception (CPU)" << std::endl; diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/cudacpp.mk index f2cfa349da..eefac8ff0d 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/cudacpp.mk @@ -139,13 +139,13 @@ endif # If CUDA_HOME is not set, try to set it from the path to nvcc ifndef CUDA_HOME - CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null)) + CUDA_HOME = $(patsubst %/bin/nvcc,%,$(shell which nvcc 2>/dev/null)) $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") endif # If HIP_HOME is not set, try to set it from the path to hipcc ifndef HIP_HOME - HIP_HOME = $(patsubst %bin/hipcc,%,$(HIP_COMPILER_PATH)) + HIP_HOME = $(patsubst %/bin/hipcc,%,$(shell which hipcc 2>/dev/null)) $(warning HIP_HOME was not set: using "$(HIP_HOME)") endif @@ -294,7 +294,9 @@ endif #=== Configure defaults and check if user-defined choices exist for OMPFLAGS, AVX, FPTYPE, HELINL, HRDCOD, RNDGEN # Set the default OMPFLAGS choice -ifneq ($(shell $(CXX) --version | egrep '^Intel'),) +ifneq ($(findstring hipcc,$(GPUCC)),) +override OMPFLAGS = # disable OpenMP MT when using hipcc #802 +else ifneq ($(shell $(CXX) --version | egrep '^Intel'),) override OMPFLAGS = -fopenmp ###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without GPUCC but not ok with GPUCC before #578) else ifneq ($(shell $(CXX) --version | egrep '^(clang)'),) @@ -652,6 +654,12 @@ $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: cu_objects_lib += $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cu_objects_lib) $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) +# Bypass std::filesystem completely to ease portability on LUMI #803 +#ifneq ($(findstring hipcc,$(GPUCC)),) +# $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -lstdc++fs +#else +# $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) +#endif endif #------------------------------------------------------------------------------- @@ -701,7 +709,11 @@ $(fcxx_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libg endif $(fcxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(fcxx_main): 
$(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) +ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 + $(FC) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) -lstdc++ +else $(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) +endif ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) @@ -713,8 +725,12 @@ $(fcu_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgf endif $(fcu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(fcu_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) +ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 + $(FC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) -lstdc++ -L$(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../../lib -lamdhip64 +else $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) endif +endif #------------------------------------------------------------------------------- @@ -779,6 +795,11 @@ $(testmain): LIBFLAGS += -lgomp endif endif +# Bypass std::filesystem completely to ease portability on LUMI #803 +#ifneq ($(findstring hipcc,$(GPUCC)),) +#$(testmain): LIBFLAGS += -lstdc++fs +#endif + ifeq ($(GPUCC),) # link only runTest.o $(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS) @@ -786,7 +807,11 @@ $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_object else # link both runTest.o and runTest_cu.o $(testmain): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) $(GTESTLIBS) - $(GPUCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS) +ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 + $(FC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS) -lstdc++ -lpthread -L$(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../../lib -lamdhip64 +else + $(GPUCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS) +endif endif # Use target gtestlibs to build only googletest diff --git a/epochX/cudacpp/pp_tt012j.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/pp_tt012j.mad/src/mgOnGpuConfig.h index 55d03f1252..69cee0085b 100644 --- a/epochX/cudacpp/pp_tt012j.mad/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/pp_tt012j.mad/src/mgOnGpuConfig.h @@ -15,6 +15,7 @@ #define MGONGPUCPP_GPUIMPL cuda #elif defined __HIPCC__ #define MGONGPUCPP_GPUIMPL hip +#include "hip/hip_runtime.h" // needed for blockDim, blockIdx, threadIdx: better in mgOnGpuConfig.h than in GpuAbstraction.h #else #undef MGONGPUCPP_GPUIMPL #endif diff --git a/epochX/cudacpp/pp_tt012j.mad/src/read_slha.cc b/epochX/cudacpp/pp_tt012j.mad/src/read_slha.cc index 055b19a779..f8e46f2e66 100644 --- 
a/epochX/cudacpp/pp_tt012j.mad/src/read_slha.cc +++ b/epochX/cudacpp/pp_tt012j.mad/src/read_slha.cc @@ -11,7 +11,11 @@ #include #include -#include +//#ifdef __HIPCC__ +//#include // see https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 +//#else +//#include // bypass this completely to ease portability on LUMI #803 +//#endif #include #include @@ -60,7 +64,14 @@ SLHAReader::read_slha_file( std::string file_name, bool verbose ) { std::cout << "WARNING! Card file '" << file_name << "' does not exist:" << " look for the file in directory $" << envpath << "='" << getenv( envpath ) << "'" << std::endl; + /* +#ifdef __HIPCC__ + const std::string file_name2 = std::experimental::filesystem::path( getenv( envpath ) ) / std::experimental::filesystem::path( file_name ).filename(); +#else const std::string file_name2 = std::filesystem::path( getenv( envpath ) ) / std::filesystem::path( file_name ).filename(); +#endif + */ + const std::string file_name2 = std::string( getenv( envpath ) ) + "/" + file_name; // bypass std::filesystem #803 param_card.open( file_name2.c_str(), std::ifstream::in ); if( param_card.good() ) { From 3615bb12ea6b76aa3a174abe3129b2b31b136daa Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Sat, 27 Jan 2024 16:15:36 +0200 Subject: [PATCH 68/96] [jt774] in tput/allTees.sh, add -makeonly to only build all tests instead of building and running them This is useful on LUMI to build on the login node, before spending the allocation on the GPU node to run them --- epochX/cudacpp/tput/allTees.sh | 29 +++++++++++++++++------------ 1 file changed, 17 insertions(+), 12 deletions(-) diff --git a/epochX/cudacpp/tput/allTees.sh b/epochX/cudacpp/tput/allTees.sh index f67fa5eccf..4d1599e547 100755 --- a/epochX/cudacpp/tput/allTees.sh +++ b/epochX/cudacpp/tput/allTees.sh @@ -5,8 +5,9 @@ scrdir=$(cd $(dirname $0); pwd) -# By default, use the madevent+cudacpp version of code and tee scripts -sa= +# By default, use the madevent+cudacpp version of code and tee scripts (use -sa to use the standalone version instead) +# By default, build and run all tests (use -makeonly to only build all tests) +opts= suff=".mad" # Parse command line arguments @@ -22,11 +23,15 @@ while [ "$1" != "" ]; do shift elif [ "$1" == "-sa" ]; then # Use standalone_cudacpp builds instead of madevent+cudacpp? - sa=-sa + opts+=" -sa" suff=".sa" shift + elif [ "$1" == "-makeonly" ]; then + # Only build all tests instead of building and running them? + opts+=" -makeonly" + shift else - echo "Usage: $0 [-short] [-e] [-sa]" + echo "Usage: $0 [-short] [-e] [-sa] [-makeonly]" exit 1 fi done @@ -40,7 +45,7 @@ started="STARTED AT $(date)" # (36/78) Six logs (double/float/mixed x hrd0/hrd1 x inl0) in each of the six processes \rm -rf gg_ttggg${suff}/lib/build.none_* -cmd="./tput/teeThroughputX.sh -mix -hrd -makej -eemumu -ggtt -ggttg -ggttgg -gqttq $ggttggg -makeclean ${sa}" +cmd="./tput/teeThroughputX.sh -mix -hrd -makej -eemumu -ggtt -ggttg -ggttgg -gqttq $ggttggg -makeclean ${opts}" $cmd; status=$? 
ended1="$cmd\nENDED(1) AT $(date) [Status=$status]" tmp1=$(mktemp) @@ -49,29 +54,29 @@ ls -ltr ee_mumu${suff}/lib/build.none_*_inl0_hrd* gg_tt${suff}/lib/build.none_*_ # (48/78) Four extra logs (double/float x hrd0/hrd1 x inl1) only in three of the six processes \rm -rf gg_ttg${suff}/lib/build.none_* \rm -rf gg_ttggg${suff}/lib/build.none_* -cmd="./tput/teeThroughputX.sh -flt -hrd -makej -eemumu -ggtt -ggttgg -inlonly -makeclean ${sa}" +cmd="./tput/teeThroughputX.sh -flt -hrd -makej -eemumu -ggtt -ggttgg -inlonly -makeclean ${opts}" $cmd; status=$? ended2="$cmd\nENDED(2) AT $(date) [Status=$status]" tmp2=$(mktemp) ls -ltr ee_mumu${suff}/lib/build.none_*_inl1_hrd* gg_tt${suff}/lib/build.none_*_inl1_hrd* gg_tt*g${suff}/lib/build.none_*_inl1_hrd* | egrep -v '(total|\./|\.build|_common|^$)' > $tmp2 # (60/78) Two extra logs (double/float x hrd0 x inl0 + bridge) in all six processes (rebuild from cache) -cmd="./tput/teeThroughputX.sh -makej -eemumu -ggtt -ggttg -gqttq -ggttgg $ggttggg -flt -bridge -makeclean ${sa}" +cmd="./tput/teeThroughputX.sh -makej -eemumu -ggtt -ggttg -gqttq -ggttgg $ggttggg -flt -bridge -makeclean ${opts}" $cmd; status=$? ended3="$cmd\nENDED(3) AT $(date) [Status=$status]" # (66/78) Two extra logs (double/float x hrd0 x inl0 + rmbhst) only in three of the six processes (no rebuild needed) -cmd="./tput/teeThroughputX.sh -eemumu -ggtt -ggttgg -flt -rmbhst ${sa}" +cmd="./tput/teeThroughputX.sh -eemumu -ggtt -ggttgg -flt -rmbhst ${opts}" $cmd; status=$? ended4="$cmd\nENDED(4) AT $(date) [Status=$status]" # (72/78) Two extra logs (double/float x hrd0 x inl0 + curhst) only in three of the six processes (no rebuild needed) -cmd="./tput/teeThroughputX.sh -eemumu -ggtt -ggttgg -flt -curhst ${sa}" +cmd="./tput/teeThroughputX.sh -eemumu -ggtt -ggttgg -flt -curhst ${opts}" $cmd; status=$? ended5="$cmd\nENDED(5) AT $(date) [Status=$status]" # (78/78) Two extra logs (double/float x hrd0 x inl0 + common) only in three of the six processes (no rebuild needed) -cmd="./tput/teeThroughputX.sh -eemumu -ggtt -ggttgg -flt -common ${sa}" +cmd="./tput/teeThroughputX.sh -eemumu -ggtt -ggttgg -flt -common ${opts}" $cmd; status=$? 
ended6="$cmd\nENDED(6) AT $(date) [Status=$status]" @@ -92,8 +97,8 @@ echo -e "$ended5" if [ "$ggttggg" == "" ]; then echo echo "To complete the test for ggttggg type:" - echo " ./tput/teeThroughputX.sh -flt -hrd -makej -ggttggg -makeclean ${sa}" - echo " ./tput/teeThroughputX.sh -makej -ggttggg -flt -bridge -makeclean ${sa}" + echo " ./tput/teeThroughputX.sh -flt -hrd -makej -ggttggg -makeclean ${opts}" + echo " ./tput/teeThroughputX.sh -makej -ggttggg -flt -bridge -makeclean ${opts}" fi # Print out any errors in the logs From fc091447f9b2f280e60f4d132c3ec5da4836a361 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Sat, 27 Jan 2024 16:32:19 +0200 Subject: [PATCH 69/96] [jt774] Fix Jorgen's name in CODEGEN COPYRIGHT --- .../CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/COPYRIGHT | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/COPYRIGHT b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/COPYRIGHT index 84a883fbb0..9036d9260a 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/COPYRIGHT +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/COPYRIGHT @@ -15,7 +15,7 @@ The full development team currently includes the following authors : Stephan Hageboeck (CERN) Olivier Mattelaer (Universite Catholique de Louvain, original author) Stefan Roiser (CERN, original author) - Joergen Teig (CERN) + Jorgen Teig (CERN) Andrea Valassi (CERN, original author) Zenny Wettersten (CERN) See https://github.com/madgraph5/madgraph4gpu for more details. For the full From bf4a53f1220775b4d06ea15cf457991804e4e140 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Sun, 28 Jan 2024 01:07:16 +0100 Subject: [PATCH 70/96] [jt744] rerun all 78 tput tests on itscrd90, all ok STARTED AT Sat Jan 27 03:09:21 PM CET 2024 ./tput/teeThroughputX.sh -mix -hrd -makej -eemumu -ggtt -ggttg -ggttgg -gqttq -ggttggg -makeclean ENDED(1) AT Sat Jan 27 06:47:50 PM CET 2024 [Status=0] ./tput/teeThroughputX.sh -flt -hrd -makej -eemumu -ggtt -ggttgg -inlonly -makeclean ENDED(2) AT Sat Jan 27 07:17:08 PM CET 2024 [Status=0] ./tput/teeThroughputX.sh -makej -eemumu -ggtt -ggttg -gqttq -ggttgg -ggttggg -flt -bridge -makeclean ENDED(3) AT Sat Jan 27 07:27:13 PM CET 2024 [Status=0] ./tput/teeThroughputX.sh -eemumu -ggtt -ggttgg -flt -rmbhst ENDED(4) AT Sat Jan 27 07:30:35 PM CET 2024 [Status=0] ./tput/teeThroughputX.sh -eemumu -ggtt -ggttgg -flt -curhst ENDED(5) AT Sat Jan 27 07:33:55 PM CET 2024 [Status=0] --- .../log_eemumu_mad_d_inl0_hrd0.txt | 86 +++---- .../log_eemumu_mad_d_inl0_hrd0_bridge.txt | 86 +++---- .../log_eemumu_mad_d_inl0_hrd0_common.txt | 86 +++---- .../log_eemumu_mad_d_inl0_hrd0_curhst.txt | 86 +++---- .../log_eemumu_mad_d_inl0_hrd0_rmbhst.txt | 86 +++---- .../log_eemumu_mad_d_inl0_hrd1.txt | 86 +++---- .../log_eemumu_mad_d_inl1_hrd0.txt | 86 +++---- .../log_eemumu_mad_d_inl1_hrd1.txt | 86 +++---- .../log_eemumu_mad_f_inl0_hrd0.txt | 86 +++---- .../log_eemumu_mad_f_inl0_hrd0_bridge.txt | 86 +++---- .../log_eemumu_mad_f_inl0_hrd0_common.txt | 86 +++---- .../log_eemumu_mad_f_inl0_hrd0_curhst.txt | 86 +++---- .../log_eemumu_mad_f_inl0_hrd0_rmbhst.txt | 86 +++---- .../log_eemumu_mad_f_inl0_hrd1.txt | 86 +++---- .../log_eemumu_mad_f_inl1_hrd0.txt | 86 +++---- .../log_eemumu_mad_f_inl1_hrd1.txt | 86 +++---- .../log_eemumu_mad_m_inl0_hrd0.txt | 86 +++---- .../log_eemumu_mad_m_inl0_hrd1.txt | 86 +++---- .../log_ggtt_mad_d_inl0_hrd0.txt 
| 229 +++++++++++------- .../log_ggtt_mad_d_inl0_hrd0_bridge.txt | 86 +++---- .../log_ggtt_mad_d_inl0_hrd0_common.txt | 86 +++---- .../log_ggtt_mad_d_inl0_hrd0_curhst.txt | 86 +++---- .../log_ggtt_mad_d_inl0_hrd0_rmbhst.txt | 86 +++---- .../log_ggtt_mad_d_inl0_hrd1.txt | 86 +++---- .../log_ggtt_mad_d_inl1_hrd0.txt | 86 +++---- .../log_ggtt_mad_d_inl1_hrd1.txt | 86 +++---- .../log_ggtt_mad_f_inl0_hrd0.txt | 86 +++---- .../log_ggtt_mad_f_inl0_hrd0_bridge.txt | 86 +++---- .../log_ggtt_mad_f_inl0_hrd0_common.txt | 86 +++---- .../log_ggtt_mad_f_inl0_hrd0_curhst.txt | 86 +++---- .../log_ggtt_mad_f_inl0_hrd0_rmbhst.txt | 86 +++---- .../log_ggtt_mad_f_inl0_hrd1.txt | 86 +++---- .../log_ggtt_mad_f_inl1_hrd0.txt | 86 +++---- .../log_ggtt_mad_f_inl1_hrd1.txt | 86 +++---- .../log_ggtt_mad_m_inl0_hrd0.txt | 86 +++---- .../log_ggtt_mad_m_inl0_hrd1.txt | 86 +++---- .../log_ggttg_mad_d_inl0_hrd0.txt | 100 ++++---- .../log_ggttg_mad_d_inl0_hrd0_bridge.txt | 100 ++++---- .../log_ggttg_mad_d_inl0_hrd1.txt | 100 ++++---- .../log_ggttg_mad_f_inl0_hrd0.txt | 100 ++++---- .../log_ggttg_mad_f_inl0_hrd0_bridge.txt | 100 ++++---- .../log_ggttg_mad_f_inl0_hrd1.txt | 100 ++++---- .../log_ggttg_mad_m_inl0_hrd0.txt | 100 ++++---- .../log_ggttg_mad_m_inl0_hrd1.txt | 100 ++++---- .../log_ggttgg_mad_d_inl0_hrd0.txt | 100 ++++---- .../log_ggttgg_mad_d_inl0_hrd0_bridge.txt | 100 ++++---- .../log_ggttgg_mad_d_inl0_hrd0_common.txt | 100 ++++---- .../log_ggttgg_mad_d_inl0_hrd0_curhst.txt | 100 ++++---- .../log_ggttgg_mad_d_inl0_hrd0_rmbhst.txt | 100 ++++---- .../log_ggttgg_mad_d_inl0_hrd1.txt | 100 ++++---- .../log_ggttgg_mad_d_inl1_hrd0.txt | 100 ++++---- .../log_ggttgg_mad_d_inl1_hrd1.txt | 100 ++++---- .../log_ggttgg_mad_f_inl0_hrd0.txt | 100 ++++---- .../log_ggttgg_mad_f_inl0_hrd0_bridge.txt | 100 ++++---- .../log_ggttgg_mad_f_inl0_hrd0_common.txt | 100 ++++---- .../log_ggttgg_mad_f_inl0_hrd0_curhst.txt | 100 ++++---- .../log_ggttgg_mad_f_inl0_hrd0_rmbhst.txt | 100 ++++---- .../log_ggttgg_mad_f_inl0_hrd1.txt | 100 ++++---- .../log_ggttgg_mad_f_inl1_hrd0.txt | 100 ++++---- .../log_ggttgg_mad_f_inl1_hrd1.txt | 100 ++++---- .../log_ggttgg_mad_m_inl0_hrd0.txt | 100 ++++---- .../log_ggttgg_mad_m_inl0_hrd1.txt | 100 ++++---- .../log_ggttggg_mad_d_inl0_hrd0.txt | 100 ++++---- .../log_ggttggg_mad_d_inl0_hrd0_bridge.txt | 100 ++++---- .../log_ggttggg_mad_d_inl0_hrd1.txt | 100 ++++---- .../log_ggttggg_mad_f_inl0_hrd0.txt | 100 ++++---- .../log_ggttggg_mad_f_inl0_hrd0_bridge.txt | 100 ++++---- .../log_ggttggg_mad_f_inl0_hrd1.txt | 100 ++++---- .../log_ggttggg_mad_m_inl0_hrd0.txt | 100 ++++---- .../log_ggttggg_mad_m_inl0_hrd1.txt | 100 ++++---- .../log_gqttq_mad_d_inl0_hrd0.txt | 100 ++++---- .../log_gqttq_mad_d_inl0_hrd0_bridge.txt | 100 ++++---- .../log_gqttq_mad_d_inl0_hrd1.txt | 100 ++++---- .../log_gqttq_mad_f_inl0_hrd0.txt | 100 ++++---- .../log_gqttq_mad_f_inl0_hrd0_bridge.txt | 100 ++++---- .../log_gqttq_mad_f_inl0_hrd1.txt | 100 ++++---- .../log_gqttq_mad_m_inl0_hrd0.txt | 100 ++++---- .../log_gqttq_mad_m_inl0_hrd1.txt | 100 ++++---- 78 files changed, 3742 insertions(+), 3697 deletions(-) diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt index 927b1eabba..4330c287c1 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-01-25_23:00:38 +DATE: 2024-01-27_18:27:57 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.493535e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.336114e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.321735e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.460884e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.590509e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.143394e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 0.844492 sec - 2,822,572,603 cycles # 3.011 GHz - 4,441,343,230 instructions # 1.57 insn per cycle - 1.182647833 seconds time elapsed +TOTAL : 0.806184 sec + 2,658,194,401 cycles # 2.838 GHz + 4,112,400,936 instructions # 1.55 insn per cycle + 1.130813294 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 166 @@ -77,14 +77,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.030733e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.195877e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.195877e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.039825e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.208009e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.208009e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 6.506266 sec - 19,500,432,361 cycles # 2.995 GHz - 46,932,851,007 instructions # 2.41 insn per cycle - 6.522692471 seconds time elapsed +TOTAL : 6.449240 sec + 19,506,969,820 cycles # 3.023 GHz + 46,933,193,939 instructions # 2.41 insn per cycle + 6.458664542 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 472) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest.exe @@ -104,14 +104,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.673180e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.188384e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.188384e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.633214e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.140541e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.140541e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.150845 sec - 12,810,023,668 cycles # 3.082 GHz - 31,183,348,105 instructions # 2.43 insn per cycle - 4.171384273 seconds time 
elapsed +TOTAL : 4.254028 sec + 12,841,479,967 cycles # 3.015 GHz + 31,185,544,346 instructions # 2.43 insn per cycle + 4.269731971 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1626) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest.exe @@ -131,14 +131,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.081752e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.922153e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.922153e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.986234e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.781810e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.781810e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.412467 sec - 10,047,988,446 cycles # 2.940 GHz - 19,479,896,421 instructions # 1.94 insn per cycle - 3.432947741 seconds time elapsed +TOTAL : 3.574760 sec + 10,055,647,274 cycles # 2.809 GHz + 19,481,127,196 instructions # 1.94 insn per cycle + 3.590049555 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1964) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest.exe @@ -158,14 +158,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.191230e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.159424e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.159424e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.155866e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.107363e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.107363e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.265089 sec - 9,604,174,484 cycles # 2.937 GHz - 18,943,995,091 instructions # 1.97 insn per cycle - 3.287100859 seconds time elapsed +TOTAL : 3.317920 sec + 9,570,235,391 cycles # 2.879 GHz + 18,943,127,668 instructions # 1.98 insn per cycle + 3.334912589 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1655) (512y: 161) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest.exe @@ -185,14 +185,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.012524e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.758193e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.758193e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.937915e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.653688e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.653688e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.517798 sec - 8,151,867,149 cycles # 2.314 GHz - 15,511,439,391 instructions # 1.90 
insn per cycle - 3.537817962 seconds time elapsed +TOTAL : 3.645707 sec + 8,178,608,809 cycles # 2.240 GHz + 15,512,146,730 instructions # 1.90 insn per cycle + 3.661596044 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 920) (512y: 59) (512z: 1220) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_bridge.txt index dc73944f81..53b275c90b 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_bridge.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-01-25_23:52:16 +DATE: 2024-01-27_19:20:43 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -54,14 +54,14 @@ WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublo Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.672962e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.549889e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.549889e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.561948e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.477751e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.477751e+07 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 2.223048 sec - 7,523,634,942 cycles # 3.040 GHz - 13,311,720,636 instructions # 1.77 insn per cycle - 2.532044795 seconds time elapsed +TOTAL : 2.278018 sec + 7,491,205,159 cycles # 2.971 GHz + 13,264,724,403 instructions # 1.77 insn per cycle + 2.579981356 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Bridge selected: cannot use RamboDevice, will use RamboHost @@ -86,14 +86,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.010764e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.168597e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.168597e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.004091e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.162639e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.162639e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 6.821589 sec - 20,757,853,710 cycles # 3.040 GHz - 47,159,545,853 instructions # 2.27 insn per cycle - 6.829251008 seconds time elapsed +TOTAL : 6.868532 sec + 20,782,502,647 cycles # 3.023 GHz + 47,159,656,287 instructions # 2.27 insn per cycle + 6.876357714 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 472) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest.exe @@ -114,14 +114,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.583280e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.036037e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.036037e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.539557e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.982485e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.982485e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.566197 sec - 14,096,396,378 cycles # 3.083 GHz - 32,025,240,520 instructions # 2.27 insn per cycle - 4.573676254 seconds time elapsed +TOTAL : 4.685255 sec + 14,050,607,376 cycles # 2.995 GHz + 32,025,794,268 instructions # 2.28 insn per cycle + 4.692691527 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1626) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest.exe @@ -142,14 +142,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.927660e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.635525e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.635525e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.907018e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.613110e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.613110e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.882075 sec - 11,344,329,644 cycles # 2.918 GHz - 20,845,338,644 instructions # 1.84 insn per cycle - 3.889536582 seconds time elapsed +TOTAL : 3.915539 sec + 11,287,161,118 cycles # 2.878 GHz + 20,844,870,086 instructions # 1.85 insn per cycle + 3.923278878 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1964) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest.exe @@ -170,14 +170,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.057635e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.879040e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.879040e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.003365e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.805935e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.805935e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.662402 sec - 10,895,943,390 cycles # 2.970 GHz - 20,302,146,711 instructions # 1.86 insn per cycle - 3.669985539 seconds time elapsed +TOTAL : 3.758349 sec + 10,888,847,708 cycles # 2.893 GHz + 20,303,188,846 instructions # 1.86 insn per cycle + 3.766056385 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1655) (512y: 161) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest.exe @@ -198,14 +198,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.870482e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.510766e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.510766e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.761057e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.355372e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.355372e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.977984 sec - 9,525,707,007 cycles # 2.392 GHz - 16,663,195,108 instructions # 1.75 insn per cycle - 3.985464256 seconds time elapsed +TOTAL : 4.206753 sec + 9,459,711,024 cycles # 2.252 GHz + 16,668,908,023 instructions # 1.76 insn per cycle + 4.214232398 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 920) (512y: 59) (512z: 1220) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_common.txt index c0050262b5..9ce6b6fe11 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_common.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-01-26_00:05:33 +DATE: 2024-01-27_19:34:14 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.496580e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.589305e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.145863e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.457550e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.577422e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.129022e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 1.312001 sec - 4,717,548,500 cycles # 3.046 GHz - 7,398,000,842 instructions # 1.57 insn per cycle - 1.605915075 seconds time elapsed +TOTAL : 1.335024 sec + 4,629,417,841 cycles # 2.946 GHz + 7,166,523,600 instructions # 1.55 insn per cycle + 1.629362314 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --common WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 166 @@ -77,14 +77,14 @@ Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.045667e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.214537e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.214537e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.039853e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.208997e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.208997e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 6.771874 sec - 20,598,272,389 cycles # 3.041 GHz - 47,038,103,422 instructions # 2.28 insn per cycle - 6.778069785 seconds time elapsed +TOTAL : 6.812043 sec + 20,582,176,590 cycles # 3.020 GHz + 47,035,833,730 instructions # 2.29 insn per cycle + 6.818449406 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 472) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest.exe @@ -104,14 +104,14 @@ Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.635596e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.140656e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.140656e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.629024e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.135452e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.135452e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 4.598785 sec - 13,886,226,766 cycles # 3.016 GHz - 31,185,926,736 instructions # 2.25 insn per cycle - 4.605167621 seconds time elapsed +TOTAL : 4.620925 sec + 13,913,704,075 cycles # 3.008 GHz + 31,185,324,761 instructions # 2.24 insn per cycle + 4.627444403 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1626) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest.exe @@ -131,14 +131,14 @@ Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.098367e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.936328e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.936328e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.989875e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.784851e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.784851e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 3.742666 sec - 11,104,986,234 cycles # 2.963 GHz - 19,380,316,908 instructions # 1.75 insn per cycle - 3.748889817 seconds time elapsed +TOTAL : 3.941176 sec + 11,133,646,215 cycles # 2.821 GHz + 19,381,056,872 instructions # 1.74 insn per cycle + 3.947558254 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1964) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest.exe @@ -158,14 +158,14 @@ Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.185227e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.148360e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.148360e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.141027e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.094342e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.094342e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 3.636571 sec - 10,725,486,467 cycles # 2.946 GHz - 18,644,027,543 instructions # 1.74 insn per cycle - 3.642813824 seconds time elapsed +TOTAL : 3.715294 sec + 10,784,366,987 cycles # 2.900 GHz + 18,647,015,795 instructions # 1.73 insn per cycle + 3.721294535 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1655) (512y: 161) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest.exe @@ -185,14 +185,14 @@ Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.992308e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.739374e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.739374e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.936713e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.650521e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.650521e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 3.918160 sec - 9,291,779,840 cycles # 2.370 GHz - 15,211,442,748 instructions # 1.64 insn per cycle - 3.924034280 seconds time elapsed +TOTAL : 4.022118 sec + 9,287,587,077 cycles # 2.307 GHz + 15,211,770,176 instructions # 1.64 insn per cycle + 4.028347644 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 920) (512y: 59) (512z: 1220) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_curhst.txt index c769e281e3..836acf7957 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_curhst.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_curhst.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-01-26_00:02:16 +DATE: 2024-01-27_19:30:54 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.512464e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.620923e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.182081e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.470668e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.588254e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.100392e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 0.962741 sec - 3,604,601,594 cycles # 3.032 GHz - 7,140,731,707 instructions # 1.98 insn per cycle - 1.248405434 seconds time elapsed +TOTAL : 0.979876 sec + 3,586,199,558 cycles # 2.963 GHz + 7,128,330,253 instructions # 1.99 insn per cycle + 1.267817011 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --curhst WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 166 @@ -77,14 +77,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.059764e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.231700e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.231700e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.014231e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.178496e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.178496e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 6.328275 sec - 19,517,485,230 cycles # 3.082 GHz - 46,932,064,028 instructions # 2.40 insn per cycle - 6.334453671 seconds time elapsed +TOTAL : 6.613489 sec + 19,532,347,142 cycles # 2.952 GHz + 46,936,306,541 instructions # 2.40 insn per cycle + 6.620356394 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 472) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest.exe @@ -104,14 +104,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.633059e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.135879e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.135879e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.633234e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.136276e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.136276e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.247361 sec - 12,856,431,984 cycles # 3.029 GHz - 31,187,091,315 instructions # 2.43 insn per cycle - 4.253402484 seconds time elapsed +TOTAL : 4.251400 sec + 12,835,866,498 cycles # 3.016 GHz + 31,183,814,813 instructions # 2.43 insn per cycle + 4.257984131 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1626) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest.exe @@ -131,14 +131,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.102059e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.944006e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.944006e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.037351e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.863431e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.863431e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.383211 sec - 10,030,209,499 cycles # 2.961 GHz - 19,480,283,878 instructions # 1.94 insn per cycle - 3.389400574 seconds time elapsed +TOTAL : 3.486720 sec + 10,078,775,566 cycles # 2.886 GHz + 19,479,100,383 instructions # 1.93 insn per cycle + 3.493126449 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1964) (512y: 0) (512z: 0) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest.exe @@ -158,14 +158,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.210848e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.176162e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.176162e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.154891e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.100039e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.100039e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.235982 sec - 9,574,277,596 cycles # 2.955 GHz - 18,943,319,025 instructions # 1.98 insn per cycle - 3.242098938 seconds time elapsed +TOTAL : 3.319080 sec + 9,600,829,018 cycles # 2.888 GHz + 18,941,966,116 instructions # 1.97 insn per cycle + 3.325208400 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1655) (512y: 161) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest.exe @@ -185,14 +185,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.933692e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.640454e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.640454e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.935914e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.655345e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.655345e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.654789 sec - 8,148,850,269 cycles # 2.227 GHz - 15,511,167,758 instructions # 1.90 insn per cycle - 3.660729879 seconds time elapsed +TOTAL : 3.654207 sec + 8,183,104,435 cycles # 2.238 GHz + 15,511,507,696 instructions # 1.90 insn per cycle + 3.660556748 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 920) (512y: 59) (512z: 1220) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_rmbhst.txt index 442e741920..14c7a5e7c6 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_rmbhst.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-01-25_23:58:58 +DATE: 2024-01-27_19:27:32 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -51,14 +51,14 @@ WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.003420e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.530385e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.003006e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.784353e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.530206e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.003918e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 1.888085 sec - 6,237,367,974 cycles # 2.946 GHz - 11,479,672,260 instructions # 1.84 insn per cycle - 2.175346114 seconds time elapsed +TOTAL : 1.937918 sec + 6,278,717,730 cycles # 2.894 GHz + 11,472,658,869 instructions # 1.83 insn per cycle + 2.226188088 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --rmbhst WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost @@ -79,14 +79,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.067013e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.239536e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.239536e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.039291e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.207939e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.207939e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 6.287610 sec - 19,515,724,381 cycles # 3.102 GHz - 46,934,256,653 instructions # 2.40 insn per cycle - 6.293745010 seconds time elapsed +TOTAL : 6.454604 sec + 19,532,999,538 cycles # 3.024 GHz + 46,935,481,953 instructions # 2.40 insn per cycle + 6.461056975 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 472) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest.exe @@ -106,14 +106,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.667300e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.175747e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.175747e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.631706e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.134724e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.134724e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.160869 sec - 12,818,829,014 cycles # 3.077 GHz - 31,182,932,996 instructions # 2.43 insn per cycle - 4.167214283 seconds time elapsed +TOTAL : 4.254773 sec + 12,810,681,861 cycles # 3.007 GHz + 31,183,005,971 instructions # 2.43 insn per cycle + 4.260937327 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1626) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest.exe @@ -133,14 +133,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.103000e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.940953e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.940953e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.047550e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.879066e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.879066e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.381858 sec - 10,006,830,004 cycles # 2.955 GHz - 19,478,708,944 instructions # 1.95 insn per cycle - 3.387652439 seconds time elapsed +TOTAL : 3.473336 sec + 10,042,042,727 cycles # 2.887 GHz + 19,480,063,297 instructions # 1.94 insn per cycle + 3.479649151 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1964) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest.exe @@ -160,14 +160,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.228776e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.203992e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.203992e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.111127e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.042846e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.042846e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.213881 sec - 9,515,842,256 cycles # 2.956 GHz - 18,941,625,353 instructions # 1.99 insn per cycle - 3.220095449 seconds time elapsed +TOTAL : 3.382998 sec + 9,611,186,335 cycles # 2.840 GHz + 18,944,470,603 instructions # 1.97 insn per cycle + 3.389579987 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1655) (512y: 161) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest.exe @@ -187,14 +187,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.018322e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.773276e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.773276e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.931654e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.635550e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.635550e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.510174 sec - 8,143,935,761 cycles # 2.317 GHz - 15,510,805,308 instructions # 1.90 insn per cycle - 3.516440454 seconds time elapsed +TOTAL : 3.662797 sec + 8,145,684,875 cycles # 2.221 GHz + 15,512,034,536 instructions # 1.90 insn per cycle + 3.669369470 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 920) (512y: 59) (512z: 1220) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd1.txt index 7e362de6ad..f52d4e00ed 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd1.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-01-25_23:01:13 +DATE: 2024-01-27_18:28:32 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.479753e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.302759e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.180666e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.471911e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.612195e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.193653e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 0.692528 sec - 2,765,261,426 cycles # 2.975 GHz - 4,347,441,638 instructions # 1.57 insn per cycle - 1.009702955 seconds time elapsed +TOTAL : 0.671427 sec + 2,669,234,037 cycles # 2.944 GHz + 4,137,486,194 instructions # 1.55 insn per cycle + 0.981732984 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/gcheck.exe -p 2048 256 1 WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 154 @@ -77,14 +77,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.112704e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.304972e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.304972e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.106630e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.298019e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.298019e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 6.053723 sec - 18,438,643,690 cycles # 3.044 GHz - 44,718,807,490 instructions # 2.43 insn per cycle - 6.067208412 seconds time elapsed +TOTAL : 6.087988 sec + 18,395,527,837 cycles # 3.019 GHz + 44,716,201,853 instructions # 2.43 insn per cycle + 6.096178198 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 486) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/runTest.exe @@ -104,14 +104,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.732933e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.291043e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.291043e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.696940e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.245448e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.245448e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.019211 sec - 12,403,068,676 cycles # 3.082 GHz - 30,106,222,055 instructions # 2.43 insn per cycle - 4.041294858 seconds time elapsed +TOTAL : 4.105272 sec + 12,425,142,934 cycles # 3.023 GHz + 30,107,655,240 instructions # 2.42 insn per cycle + 4.121731739 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1569) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd1/runTest.exe @@ -131,14 +131,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.073294e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.898621e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.898621e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.029464e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.852722e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.852722e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.428696 sec - 10,138,654,204 cycles # 2.954 GHz - 19,114,972,544 instructions # 1.89 insn per cycle - 3.446242769 seconds time elapsed +TOTAL : 3.502232 sec + 10,140,699,058 cycles # 2.893 GHz + 19,115,428,299 instructions # 1.89 insn per cycle + 3.516474054 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1902) (512y: 0) (512z: 0) 
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd1/runTest.exe
@@ -158,14 +158,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.266572e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.279723e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.279723e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.193891e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.198028e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.198028e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 3.167496 sec
- 9,403,210,390 cycles # 2.964 GHz
- 18,489,357,974 instructions # 1.97 insn per cycle
- 3.185727133 seconds time elapsed
+TOTAL : 3.264064 sec
+ 9,475,807,672 cycles # 2.898 GHz
+ 18,489,716,208 instructions # 1.95 insn per cycle
+ 3.281752367 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1576) (512y: 159) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd1/runTest.exe
@@ -185,14 +185,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.411079e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.590354e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.590354e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.330207e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.439946e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.439946e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 3.003146 sec
- 7,196,870,284 cycles # 2.392 GHz
- 13,864,300,606 instructions # 1.93 insn per cycle
- 3.027427459 seconds time elapsed
+TOTAL : 3.105240 sec
+ 7,166,370,615 cycles # 2.304 GHz
+ 13,864,882,407 instructions # 1.93 insn per cycle
+ 3.117484360 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 818) (512y: 57) (512z: 898)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd1/runTest.exe
diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd0.txt
index db90974050..2449bf3ae3 100644
--- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd0.txt
+++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd0.txt
@@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl1_hrd0'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
-DATE: 2024-01-25_23:41:18
+DATE: 2024-01-27_19:09:32
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=1] [hardcodePARAM=0]
 Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 6.485574e+07 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.629060e+08 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.159335e+08 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 6.458496e+07 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.569270e+08 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.116381e+08 ) sec^-1
 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 0.668284 sec
- 2,705,295,792 cycles # 3.008 GHz
- 4,249,177,557 instructions # 1.57 insn per cycle
- 0.961970052 seconds time elapsed
+TOTAL : 0.677068 sec
+ 2,678,926,612 cycles # 2.934 GHz
+ 4,161,327,353 instructions # 1.55 insn per cycle
+ 0.970928697 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/gcheck.exe -p 2048 256 1
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 166
@@ -77,14 +77,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.450369e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.787521e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.787521e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.410042e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.740847e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.740847e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 4.729727 sec
- 14,595,480,674 cycles # 3.083 GHz
- 36,696,587,452 instructions # 2.51 insn per cycle
- 4.736310903 seconds time elapsed
+TOTAL : 4.871448 sec
+ 14,605,683,453 cycles # 3.002 GHz
+ 36,698,911,392 instructions # 2.51 insn per cycle
+ 4.878202601 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 707) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/runTest.exe
@@ -104,14 +104,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.113447e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.010835e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.010835e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.076022e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.960696e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.960696e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 3.365946 sec
- 10,359,930,578 cycles # 3.073 GHz
- 24,752,972,702 instructions # 2.39 insn per cycle
- 3.372469625 seconds time elapsed
+TOTAL : 3.429016 sec
+ 10,365,417,662 cycles # 3.018 GHz
+ 24,752,936,476 instructions # 2.39 insn per cycle
+ 3.436031943 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 2334) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd0/runTest.exe
@@ -131,14 +131,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.409493e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.590679e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.590679e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.353664e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.514455e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.514455e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 3.003707 sec
- 8,879,004,825 cycles # 2.951 GHz
- 16,955,050,761 instructions # 1.91 insn per cycle
- 3.010173521 seconds time elapsed
+TOTAL : 3.070316 sec
+ 8,872,607,166 cycles # 2.884 GHz
+ 16,955,269,127 instructions # 1.91 insn per cycle
+ 3.077512547 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1604) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/runTest.exe
@@ -158,14 +158,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.598956e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.029022e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.029022e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.542456e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.959279e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.959279e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 2.810771 sec
- 8,370,695,203 cycles # 2.973 GHz
- 16,298,281,090 instructions # 1.95 insn per cycle
- 2.817168029 seconds time elapsed
+TOTAL : 2.874211 sec
+ 8,368,078,777 cycles # 2.906 GHz
+ 16,297,728,457 instructions # 1.95 insn per cycle
+ 2.880948455 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2403) (512y: 292) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd0/runTest.exe
@@ -185,14 +185,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.113771e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.134014e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.134014e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.132511e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.019948e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.019948e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 3.368912 sec
- 8,032,946,984 cycles # 2.381 GHz
- 14,352,398,094 instructions # 1.79 insn per cycle
- 3.375198940 seconds time elapsed
+TOTAL : 3.353312 sec
+ 7,653,501,619 cycles # 2.279 GHz
+ 13,879,899,813 instructions # 1.88 insn per cycle
+ 3.360009404 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 892) (512y: 63) (512z: 975)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd0/runTest.exe
diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd1.txt
index 730795b745..b4ec2d3a38 100644
--- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd1.txt
+++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd1.txt
@@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl1_hrd1'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
-DATE: 2024-01-25_23:41:48
+DATE: 2024-01-27_19:10:03
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=1] [hardcodePARAM=1]
 Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 6.486952e+07 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.636507e+08 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.196895e+08 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 6.469420e+07 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.602744e+08 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.150892e+08 ) sec^-1
 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 0.663966 sec
- 2,717,326,037 cycles # 3.017 GHz
- 4,137,165,948 instructions # 1.52 insn per cycle
- 0.963612691 seconds time elapsed
+TOTAL : 0.671847 sec
+ 2,663,633,770 cycles # 2.936 GHz
+ 4,118,051,741 instructions # 1.55 insn per cycle
+ 0.967574090 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/gcheck.exe -p 2048 256 1
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 154
@@ -77,14 +77,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.037231e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.765883e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.765883e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.977799e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.694572e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.694572e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 3.484428 sec
- 10,787,558,593 cycles # 3.093 GHz
- 28,354,504,883 instructions # 2.63 insn per cycle
- 3.490670595 seconds time elapsed
+TOTAL : 3.589012 sec
+ 10,801,142,359 cycles # 3.007 GHz
+ 28,357,477,472 instructions # 2.63 insn per cycle
+ 3.595542686 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 600) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/runTest.exe
@@ -104,14 +104,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.402256e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.621682e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.621682e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.349686e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.544629e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.544629e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 3.008147 sec
- 9,259,698,743 cycles # 3.075 GHz
- 21,586,269,761 instructions # 2.33 insn per cycle
- 3.014484526 seconds time elapsed
+TOTAL : 3.077598 sec
+ 9,269,407,224 cycles # 3.007 GHz
+ 21,586,225,370 instructions # 2.33 insn per cycle
+ 3.084358676 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 2117) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd1/runTest.exe
@@ -131,14 +131,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.585263e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.966687e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.966687e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.477292e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.836027e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.836027e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 2.822869 sec
- 8,394,492,587 cycles # 2.968 GHz
- 15,943,662,560 instructions # 1.90 insn per cycle
- 2.829238696 seconds time elapsed
+TOTAL : 2.942679 sec
+ 8,447,551,404 cycles # 2.865 GHz
+ 15,943,888,519 instructions # 1.89 insn per cycle
+ 2.949434950 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1497) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd1/runTest.exe
@@ -158,14 +158,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.835468e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.573667e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.573667e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.690819e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.403339e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.403339e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 2.606917 sec
- 7,795,616,577 cycles # 2.984 GHz
- 15,369,554,137 instructions # 1.97 insn per cycle
- 2.613116887 seconds time elapsed
+TOTAL : 2.736882 sec
+ 7,944,112,596 cycles # 2.897 GHz
+ 15,370,187,528 instructions # 1.93 insn per cycle
+ 2.743566336 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2179) (512y: 307) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd1/runTest.exe
@@ -185,14 +185,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.251100e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.274564e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.274564e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.268572e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.306657e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.306657e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 3.195437 sec
- 7,390,615,932 cycles # 2.316 GHz
- 13,884,358,996 instructions # 1.88 insn per cycle
- 3.201845767 seconds time elapsed
+TOTAL : 3.173602 sec
+ 7,340,865,622 cycles # 2.309 GHz
+ 13,879,899,813 instructions # 1.89 insn per cycle
+ 3.180576626 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 853) (512y: 69) (512z: 905)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd1/runTest.exe
diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt
index d2c9307113..6f2bced9e0 100644
--- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt
+++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt
@@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
-DATE: 2024-01-25_23:01:46
+DATE: 2024-01-27_18:29:06
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 1.094789e+08 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.088334e+09 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.276470e+09 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.298641e+08 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.194659e+09 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.296861e+09 ) sec^-1
 MeanMatrixElemValue = ( 1.371687e-02 +- 3.270220e-06 ) GeV^0
-TOTAL : 0.578518 sec
- 2,407,118,979 cycles # 2.997 GHz
- 3,725,510,179 instructions # 1.55 insn per cycle
- 0.886415117 seconds time elapsed
+TOTAL : 0.570967 sec
+ 2,334,592,448 cycles # 2.929 GHz
+ 3,632,363,421 instructions # 1.56 insn per cycle
+ 0.870945562 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 117
@@ -77,14 +77,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.111589e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.308134e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.308134e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.076073e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.264714e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.264714e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0
-TOTAL : 6.012216 sec
- 18,550,637,771 cycles # 3.083 GHz
- 47,045,980,285 instructions # 2.54 insn per cycle
- 6.027733550 seconds time elapsed
+TOTAL : 6.215762 sec
+ 18,583,152,636 cycles # 2.988 GHz
+ 47,046,423,322 instructions # 2.53 insn per cycle
+ 6.224321917 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 542) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest.exe
@@ -104,14 +104,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.328667e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.560006e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.560006e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.325879e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.548347e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.548347e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0
-TOTAL : 3.058903 sec
- 9,248,498,925 cycles # 3.019 GHz
- 22,092,783,027 instructions # 2.39 insn per cycle
- 3.077764087 seconds time elapsed
+TOTAL : 3.060589 sec
+ 9,220,984,727 cycles # 3.007 GHz
+ 22,092,674,966 instructions # 2.40 insn per cycle
+ 3.078385978 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 1883) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest.exe
@@ -131,14 +131,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.639087e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.085268e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.085268e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.559835e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.974343e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.974343e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0
-TOTAL : 2.736763 sec
- 8,158,081,223 cycles # 2.975 GHz
- 15,625,425,474 instructions # 1.92 insn per cycle
- 2.755087289 seconds time elapsed
+TOTAL : 2.818676 sec
+ 8,206,303,802 cycles # 2.906 GHz
+ 15,625,319,388 instructions # 1.90 insn per cycle
+ 2.834366031 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2619) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest.exe
@@ -158,14 +158,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.770493e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.424122e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.424122e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.684772e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.307693e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.307693e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0
-TOTAL : 2.620250 sec
- 7,845,741,820 cycles # 2.988 GHz
- 15,296,492,976 instructions # 1.95 insn per cycle
- 2.642414775 seconds time elapsed
+TOTAL : 2.703549 sec
+ 7,897,847,455 cycles # 2.916 GHz
+ 15,297,871,291 instructions # 1.94 insn per cycle
+ 2.719233537 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2414) (512y: 13) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest.exe
@@ -185,14 +185,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.780455e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.411688e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.411688e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.690237e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.251244e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.251244e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0
-TOTAL : 2.617695 sec
- 6,398,515,491 cycles # 2.441 GHz
- 12,623,800,106 instructions # 1.97 insn per cycle
- 2.637090511 seconds time elapsed
+TOTAL : 2.700056 sec
+ 6,406,470,129 cycles # 2.368 GHz
+ 12,623,925,199 instructions # 1.97 insn per cycle
+ 2.715604692 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1615) (512y: 12) (512z: 1404)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest.exe
diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_bridge.txt
index 311dfe7d07..f22799bcd8 100644
--- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_bridge.txt
+++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_bridge.txt
@@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
-DATE: 2024-01-25_23:52:54
+DATE: 2024-01-27_19:21:21
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -54,14 +54,14 @@ WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublo
 Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 7.271050e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.518439e+07 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.518439e+07 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 7.091110e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 6.370205e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.370205e+07 ) sec^-1
 MeanMatrixElemValue = ( 1.371710e-02 +- 3.270389e-06 ) GeV^0
-TOTAL : 1.663435 sec
- 5,761,082,129 cycles # 3.046 GHz
- 10,284,537,473 instructions # 1.79 insn per cycle
- 1.948147743 seconds time elapsed
+TOTAL : 1.693092 sec
+ 5,717,209,248 cycles # 2.967 GHz
+ 10,220,644,323 instructions # 1.79 insn per cycle
+ 1.983808985 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost
@@ -86,14 +86,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.077035e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.262740e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.262740e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.068065e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.251615e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.251615e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0
-TOTAL : 6.304919 sec
- 19,230,718,710 cycles # 3.049 GHz
- 47,196,758,833 instructions # 2.45 insn per cycle
- 6.311821481 seconds time elapsed
+TOTAL : 6.360320 sec
+ 19,222,492,344 cycles # 3.020 GHz
+ 47,195,059,630 instructions # 2.46 insn per cycle
+ 6.368180474 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 542) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest.exe
@@ -114,14 +114,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.274745e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.400943e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.400943e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.240125e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.345873e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.345873e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0
-TOTAL : 3.240921 sec
- 10,027,883,999 cycles # 3.090 GHz
- 23,431,081,239 instructions # 2.34 insn per cycle
- 3.247768707 seconds time elapsed
+TOTAL : 3.287034 sec
+ 9,979,773,014 cycles # 3.031 GHz
+ 23,429,352,506 instructions # 2.35 insn per cycle
+ 3.294344601 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 1883) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest.exe
@@ -142,14 +142,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.506974e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.804251e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.804251e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.465891e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.742130e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.742130e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0
-TOTAL : 2.990495 sec
- 8,913,858,036 cycles # 2.975 GHz
- 16,750,811,061 instructions # 1.88 insn per cycle
- 2.997326374 seconds time elapsed
+TOTAL : 3.037490 sec
+ 8,892,438,239 cycles # 2.922 GHz
+ 16,751,131,896 instructions # 1.88 insn per cycle
+ 3.044961686 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2619) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest.exe
@@ -170,14 +170,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.600687e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.056945e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.056945e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.544387e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.966168e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.966168e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0
-TOTAL : 2.895236 sec
- 8,649,844,179 cycles # 2.982 GHz
- 16,422,517,147 instructions # 1.90 insn per cycle
- 2.902094725 seconds time elapsed
+TOTAL : 2.957079 sec
+ 8,640,458,152 cycles # 2.916 GHz
+ 16,423,625,949 instructions # 1.90 insn per cycle
+ 2.964471548 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2414) (512y: 13) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest.exe
@@ -198,14 +198,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.596837e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.005170e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.005170e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.457706e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.744955e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.744955e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0
-TOTAL : 2.901194 sec
- 7,177,929,467 cycles # 2.469 GHz
- 13,849,422,404 instructions # 1.93 insn per cycle
- 2.908162890 seconds time elapsed
+TOTAL : 3.055377 sec
+ 7,136,558,861 cycles # 2.331 GHz
+ 13,849,895,297 instructions # 1.94 insn per cycle
+ 3.062598456 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1615) (512y: 12) (512z: 1404)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest.exe
diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_common.txt
index 5fd851a374..45389c409a 100644
--- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_common.txt
+++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_common.txt
@@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
-DATE: 2024-01-26_00:06:09
+DATE: 2024-01-27_19:34:51
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:FLT+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 1.306357e+08 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.179100e+09 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.261443e+09 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.299345e+08 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.174091e+09 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.247451e+09 ) sec^-1
 MeanMatrixElemValue = ( 1.371863e-02 +- 3.269951e-06 ) GeV^0
-TOTAL : 1.147708 sec
- 4,163,766,116 cycles # 3.042 GHz
- 6,626,945,716 instructions # 1.59 insn per cycle
- 1.426181632 seconds time elapsed
+TOTAL : 1.177271 sec
+ 4,143,091,386 cycles # 2.965 GHz
+ 6,559,856,203 instructions # 1.58 insn per cycle
+ 1.455622740 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --common
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 117
@@ -77,14 +77,14 @@ Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.110518e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.305248e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.305248e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.075887e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.265834e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.265834e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371887e-02 +- 3.270267e-06 ) GeV^0
-TOTAL : 6.337609 sec
- 19,555,432,676 cycles # 3.083 GHz
- 47,227,213,648 instructions # 2.42 insn per cycle
- 6.343559467 seconds time elapsed
+TOTAL : 6.556546 sec
+ 19,614,107,542 cycles # 2.990 GHz
+ 47,234,086,077 instructions # 2.41 insn per cycle
+ 6.562589182 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 542) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest.exe
@@ -104,14 +104,14 @@ Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.385516e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.648026e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.648026e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.322270e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.543513e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.543513e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371887e-02 +- 3.270266e-06 ) GeV^0
-TOTAL : 3.298500 sec
- 10,227,348,695 cycles # 3.096 GHz
- 22,172,051,154 instructions # 2.17 insn per cycle
- 3.304380352 seconds time elapsed
+TOTAL : 3.397249 sec
+ 10,249,413,020 cycles # 3.013 GHz
+ 22,172,282,512 instructions # 2.16 insn per cycle
+ 3.403512517 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 1883) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest.exe
@@ -131,14 +131,14 @@ Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.622613e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.085167e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.085167e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.554657e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.975014e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.975014e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0
-TOTAL : 3.075553 sec
- 9,160,418,803 cycles # 2.975 GHz
- 15,535,398,337 instructions # 1.70 insn per cycle
- 3.081224998 seconds time elapsed
+TOTAL : 3.159703 sec
+ 9,187,571,116 cycles # 2.903 GHz
+ 15,535,454,607 instructions # 1.69 insn per cycle
+ 3.165655811 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2619) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest.exe
@@ -158,14 +158,14 @@ Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.735922e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.391720e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.391720e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.670952e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.286921e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.286921e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0
-TOTAL : 2.980114 sec
- 8,899,453,850 cycles # 2.982 GHz
- 15,005,353,105 instructions # 1.69 insn per cycle
- 2.985888152 seconds time elapsed
+TOTAL : 3.060873 sec
+ 8,923,018,524 cycles # 2.911 GHz
+ 15,006,649,649 instructions # 1.68 insn per cycle
+ 3.067260470 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2414) (512y: 13) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest.exe
@@ -185,14 +185,14 @@ Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.766878e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.403077e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.403077e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.684276e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.241659e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.241659e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0
-TOTAL : 2.949550 sec
- 7,417,056,376 cycles # 2.511 GHz
- 12,332,428,558 instructions # 1.66 insn per cycle
- 2.955404265 seconds time elapsed
+TOTAL : 3.051374 sec
+ 7,450,470,409 cycles # 2.438 GHz
+ 12,333,404,291 instructions # 1.66 insn per cycle
+ 3.057604398 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1615) (512y: 12) (512z: 1404)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest.exe
diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_curhst.txt
index b6c30db0cb..e73503aded 100644
--- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_curhst.txt
+++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_curhst.txt
@@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
-DATE: 2024-01-26_00:02:50
+DATE: 2024-01-27_19:31:29
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:FLT+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 1.312381e+08 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.184733e+09 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.266952e+09 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.302768e+08 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.181644e+09 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.262369e+09 ) sec^-1
 MeanMatrixElemValue = ( 1.371687e-02 +- 3.270220e-06 ) GeV^0
-TOTAL : 0.835122 sec
- 3,197,729,804 cycles # 3.031 GHz
- 6,511,347,801 instructions # 2.04 insn per cycle
- 1.113725741 seconds time elapsed
+TOTAL : 0.866658 sec
+ 3,108,411,536 cycles # 2.855 GHz
+ 6,348,867,669 instructions # 2.04 insn per cycle
+ 1.146196691 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --curhst
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 117
@@ -77,14 +77,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.111142e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.305911e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.305911e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.088138e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.278620e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.278620e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0
-TOTAL : 6.017248 sec
- 18,556,545,570 cycles # 3.084 GHz
- 47,048,067,347 instructions # 2.54 insn per cycle
- 6.023231625 seconds time elapsed
+TOTAL : 6.146091 sec
+ 18,574,305,773 cycles # 3.020 GHz
+ 47,046,243,044 instructions # 2.53 insn per cycle
+ 6.152109127 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 542) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest.exe
@@ -104,14 +104,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.376972e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.665333e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.665333e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.274001e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.460512e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.460512e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0
-TOTAL : 2.995990 sec
- 9,272,557,506 cycles # 3.090 GHz
- 22,091,542,289 instructions # 2.38 insn per cycle
- 3.001967287 seconds time elapsed
+TOTAL : 3.132795 sec
+ 9,215,263,646 cycles # 2.939 GHz
+ 22,089,271,619 instructions # 2.40 insn per cycle
+ 3.138223391 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 1883) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest.exe
@@ -131,14 +131,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.607052e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.055871e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.055871e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.567359e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.986612e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.986612e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0
-TOTAL : 2.769502 sec
- 8,191,039,582 cycles # 2.953 GHz
- 15,625,047,168 instructions # 1.91 insn per cycle
- 2.775572911 seconds time elapsed
+TOTAL : 2.814203 sec
+ 8,177,091,662 cycles # 2.901 GHz
+ 15,624,445,850 instructions # 1.91 insn per cycle
+ 2.820203735 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2619) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest.exe
@@ -158,14 +158,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.750343e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.394522e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.394522e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.681978e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.294941e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.294941e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0
-TOTAL : 2.637756 sec
- 7,859,183,641 cycles # 2.974 GHz
- 15,295,681,899 instructions # 1.95 insn per cycle
- 2.643542638 seconds time elapsed
+TOTAL : 2.704822 sec
+ 7,875,784,738 cycles # 2.906 GHz
+ 15,295,996,369 instructions # 1.94 insn per cycle
+ 2.710825901 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2414) (512y: 13) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest.exe
@@ -185,14 +185,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.768753e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.394838e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.394838e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.674756e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.219292e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.219292e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0
-TOTAL : 2.627373 sec
- 6,404,305,674 cycles # 2.433 GHz
- 12,622,889,798 instructions # 1.97 insn per cycle
- 2.633130133 seconds time elapsed
+TOTAL : 2.714087 sec
+ 6,412,065,146 cycles # 2.358 GHz
+ 12,623,099,114 instructions # 1.97 insn per cycle
+ 2.720288344 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1615) (512y: 12) (512z: 1404)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest.exe
diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_rmbhst.txt
index a5f5742ed1..5d7294e83b 100644
--- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_rmbhst.txt
+++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_rmbhst.txt
@@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
-DATE: 2024-01-25_23:59:33
+DATE: 2024-01-27_19:28:08
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -51,14 +51,14 @@ WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
 Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:FLT+THX:CURHST+RMBHST+MESDEV/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 9.289424e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.155625e+09 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.176626e+09 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 8.932152e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.107082e+09 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.127630e+09 ) sec^-1
 MeanMatrixElemValue = ( 1.371710e-02 +- 3.270389e-06 ) GeV^0
-TOTAL : 1.442027 sec
- 5,072,798,131 cycles # 3.044 GHz
- 9,207,644,948 instructions # 1.82 insn per cycle
- 1.725918459 seconds time elapsed
+TOTAL : 1.482237 sec
+ 5,093,185,858 cycles # 2.977 GHz
+ 9,216,243,099 instructions # 1.81 insn per cycle
+ 1.767188248 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --rmbhst
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
@@ -79,14 +79,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.100030e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.293407e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.293407e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.079462e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.279176e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.279176e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0
-TOTAL : 6.076797 sec
- 18,544,861,337 cycles # 3.050 GHz
- 47,047,973,845 instructions # 2.54 insn per cycle
- 6.082802773 seconds time elapsed
+TOTAL : 6.194746 sec
+ 18,699,626,867 cycles # 3.016 GHz
+ 47,046,281,728 instructions # 2.52 insn per cycle
+ 6.200980585 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 542) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest.exe
@@ -106,14 +106,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.393613e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.639831e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.639831e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.311625e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.528131e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.528131e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0
-TOTAL : 2.972935 sec
- 9,226,107,763 cycles # 3.099 GHz
- 22,091,523,964 instructions # 2.39 insn per cycle
- 2.978809408 seconds time elapsed
+TOTAL : 3.084622 sec
+ 9,236,990,240 cycles # 2.992 GHz
+ 22,092,949,356 instructions # 2.39 insn per cycle
+ 3.090816103 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 1883) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest.exe
@@ -133,14 +133,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.634367e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.094236e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.094236e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.536730e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.937077e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.937077e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0
-TOTAL : 2.744672 sec
- 8,169,489,880 cycles # 2.974 GHz
- 15,625,708,382 instructions # 1.91 insn per cycle
- 2.750593449 seconds time elapsed
+TOTAL : 2.847164 sec
+ 8,175,263,272 cycles # 2.867 GHz
+ 15,626,202,398 instructions # 1.91 insn per cycle
+ 2.853472032 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2619) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest.exe
@@ -160,14 +160,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.770473e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.445882e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.445882e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.677071e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.284453e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.284453e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0
-TOTAL : 2.618890 sec
- 7,856,839,853 cycles # 2.995 GHz
- 15,295,667,717 instructions # 1.95 insn per cycle
- 2.624973280 seconds time elapsed
+TOTAL : 2.710590 sec
+ 7,878,414,326 cycles # 2.901 GHz
+ 15,295,945,032 instructions # 1.94 insn per cycle
+ 2.716499916 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2414) (512y: 13) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest.exe
@@ -187,14 +187,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.737654e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.348071e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.348071e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.677827e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.225134e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.225134e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0
-TOTAL : 2.652862 sec
- 6,406,807,661 cycles # 2.412 GHz
- 12,625,033,778 instructions # 1.97 insn per cycle
- 2.658673017 seconds time elapsed
+TOTAL : 2.710843 sec
+ 6,424,470,372 cycles # 2.366 GHz
+ 12,623,450,760 instructions # 1.96 insn per cycle
+ 2.716911235 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1615) (512y: 12) (512z: 1404)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest.exe
diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd1.txt
index c35cf61378..fb199146e4 100644
--- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd1.txt
+++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd1.txt
@@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd1'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
-DATE: 2024-01-25_23:02:16
+DATE: 2024-01-27_18:29:37
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 1.098241e+08 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.096988e+09 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.338544e+09 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.298570e+08 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.200393e+09 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.328660e+09 ) sec^-1
 MeanMatrixElemValue = ( 1.371687e-02 +- 3.270220e-06 ) GeV^0
-TOTAL : 0.574179 sec
- 2,430,883,292 cycles # 2.998 GHz
- 3,761,995,439 instructions # 1.55 insn per cycle
- 0.889968607 seconds time elapsed
+TOTAL : 0.567315 sec
+ 2,331,905,478 cycles # 2.937 GHz
+ 3,641,453,783 instructions # 1.56 insn per cycle
+ 0.866083617 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/gcheck.exe -p 2048 256 1
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 95
@@ -77,14 +77,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.166790e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.383742e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.383742e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.143460e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.358554e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.358554e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0
-TOTAL : 5.746400 sec
- 17,725,056,212 cycles # 3.082 GHz
- 43,885,367,704 instructions # 2.48 insn per cycle
- 5.762368432 seconds time elapsed
+TOTAL : 5.860932 sec
+ 17,746,278,890 cycles # 3.026 GHz
+ 43,887,716,368 instructions # 2.47 insn per cycle
+ 5.869103029 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 467) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/runTest.exe
@@ -104,14 +104,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.382827e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.685380e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.685380e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.385096e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.700933e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.700933e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0
-TOTAL : 2.997864 sec
- 9,048,620,342 cycles # 3.013 GHz
- 21,582,904,348 instructions # 2.39 insn per cycle
- 3.019845696 seconds time elapsed
+TOTAL : 2.994427 sec
+ 9,034,947,440 cycles # 3.012 GHz
+ 21,581,997,443 instructions # 2.39 insn per cycle
+ 3.009305656 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 1827) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd1/runTest.exe
@@ -131,14 +131,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.645891e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.121304e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.121304e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.569603e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.998364e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.998364e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0
-TOTAL : 2.727469 sec
- 8,091,878,793 cycles # 2.961 GHz
- 15,428,989,777 instructions # 1.91 insn per cycle
- 2.750857146 seconds time elapsed
+TOTAL : 2.814567 sec
+ 8,181,606,187 cycles # 2.903 GHz
+ 15,432,175,910 instructions # 1.89 insn per cycle
+ 2.829746470 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2542) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd1/runTest.exe
@@ -158,14 +158,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.751076e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.395522e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.395522e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.686786e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.309382e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.309382e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0
-TOTAL : 2.640357 sec
- 7,849,064,666 cycles # 2.967 GHz
- 15,086,994,011 instructions # 1.92 insn per cycle
- 2.659695567 seconds time elapsed
+TOTAL : 2.704117 sec
+ 7,855,075,052 cycles # 2.899 GHz
+ 15,087,119,018 instructions # 1.92 insn per cycle
+ 2.721017005 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2323) (512y: 15) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd1/runTest.exe
@@ -185,14 +185,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.940902e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.834293e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.834293e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.750668e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.464689e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.464689e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0
-TOTAL : 2.491118 sec
- 6,165,763,719 cycles # 2.470 GHz
- 12,244,016,638 instructions # 1.99 insn per cycle
- 2.510253485 seconds time elapsed
+TOTAL : 2.657551 sec
+ 6,188,907,055 cycles # 2.325 GHz
+ 12,247,038,736 instructions # 1.98 insn per cycle
+ 2.670267656 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1538) (512y: 8) (512z: 1258)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd1/runTest.exe
diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd0.txt
index e90a5e24b2..91bd8b8a95 100644
--- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd0.txt
+++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd0.txt
@@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl1_hrd0'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
-DATE: 2024-01-25_23:42:17
+DATE: 2024-01-27_19:10:32
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=1] [hardcodePARAM=0]
 Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 1.297942e+08 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.191141e+09 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.279372e+09 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.300007e+08 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.186316e+09 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.282085e+09 ) sec^-1
 MeanMatrixElemValue = ( 1.371687e-02 +- 3.270220e-06 ) GeV^0
-TOTAL : 0.561987 sec
- 2,359,900,397 cycles # 3.010 GHz
- 3,659,136,851 instructions # 1.55 insn per cycle
- 0.842862484 seconds time elapsed
+TOTAL : 0.569021 sec
+ 2,334,082,963 cycles # 2.936 GHz
+ 3,662,913,290 instructions # 1.57 insn per cycle
+ 0.853460362 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/gcheck.exe -p 2048 256 1
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 117
@@ -77,14 +77,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.483786e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.861169e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.861169e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.487465e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.868119e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.868119e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0
-TOTAL : 4.663846 sec
- 13,962,650,647 cycles # 2.991 GHz
- 37,848,484,392 instructions # 2.71 insn per cycle
- 4.670004365 seconds time elapsed
+TOTAL : 4.589418 sec
+ 13,761,744,502 cycles # 2.995 GHz
+ 37,848,018,392 instructions # 2.75 insn per cycle
+ 4.595713118 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 833) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/runTest.exe
@@ -104,14 +104,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.866159e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.882284e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.882284e+06 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.784278e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.740957e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.740957e+06 ) sec^-1
 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0
-TOTAL : 2.542363 sec
- 7,896,880,722 cycles # 3.100 GHz
- 18,602,696,675 instructions # 2.36 insn per cycle
- 2.548302756 seconds time elapsed
+TOTAL : 2.614438 sec
+ 7,924,064,298 cycles # 3.025 GHz
+ 18,603,596,851 instructions # 2.35 insn per cycle
+ 2.621027066 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 2808) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/runTest.exe @@ -131,14 +131,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.966287e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.951221e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.951221e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.819736e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.685571e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.685571e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.468293 sec - 7,420,929,664 cycles # 3.001 GHz - 14,338,774,343 instructions # 1.93 insn per cycle - 2.474190802 seconds time elapsed +TOTAL : 2.599024 sec + 7,413,580,157 cycles # 2.847 GHz + 14,339,699,168 instructions # 1.93 insn per cycle + 2.605472047 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2251) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/runTest.exe @@ -158,14 +158,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.030042e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.163232e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.163232e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.938769e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.994926e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.994926e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.425778 sec - 7,309,271,422 cycles # 3.007 GHz - 13,954,124,612 instructions # 1.91 insn per cycle - 2.432103848 seconds time elapsed +TOTAL : 2.497547 sec + 7,313,071,613 cycles # 2.922 GHz + 13,954,423,658 instructions # 1.91 insn per cycle + 2.504054684 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3875) (512y: 9) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd0/runTest.exe @@ -185,14 +185,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.883579e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.679418e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.679418e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.785331e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.502319e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.502319e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 -TOTAL : 2.534153 sec - 6,266,746,932 cycles # 2.468 GHz - 13,208,183,134 instructions # 2.11 insn per cycle - 2.540179172 seconds time elapsed +TOTAL : 2.620000 sec + 6,277,230,496 cycles # 2.391 GHz + 13,208,120,155 instructions # 2.10 insn per cycle + 2.626514632 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1734) (512y: 3) (512z: 1266) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd1.txt index 469aa8ffd2..8eea08f24e 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd1.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-01-25_23:42:45 +DATE: 2024-01-27_19:11:00 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.304170e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.209333e+09 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.342098e+09 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.295810e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.202482e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.333937e+09 ) sec^-1 MeanMatrixElemValue = ( 1.371687e-02 +- 3.270220e-06 ) GeV^0 -TOTAL : 0.557799 sec - 2,362,597,179 cycles # 3.024 GHz - 3,707,659,448 instructions # 1.57 insn per cycle - 0.839215126 seconds time elapsed +TOTAL : 0.565983 sec + 2,308,536,588 cycles # 2.913 GHz + 3,558,580,353 instructions # 1.54 insn per cycle + 0.849931283 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/gcheck.exe -p 2048 256 1 WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 95 @@ -77,14 +77,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.132330e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.983314e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.983314e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.086474e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.918622e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.918622e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 3.300471 sec - 10,112,740,962 cycles # 3.060 GHz - 28,398,804,011 instructions # 2.81 insn per cycle - 3.306398800 seconds time elapsed +TOTAL : 3.373319 sec + 10,128,204,715 cycles # 2.998 GHz + 28,399,238,885 instructions # 2.80 insn per cycle + 3.379817629 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 632) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/runTest.exe @@ -104,14 +104,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.109998e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.743632e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.743632e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.064441e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.600432e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.600432e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 2.369463 sec - 7,338,726,391 cycles # 3.091 GHz - 16,786,306,106 instructions # 2.29 insn per cycle - 2.375667430 seconds time elapsed +TOTAL : 2.411342 sec + 7,296,023,056 cycles # 3.019 GHz + 16,785,936,380 instructions # 2.30 insn per cycle + 2.418115810 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2463) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd1/runTest.exe @@ -131,14 +131,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.103514e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.383052e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.383052e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.033566e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.263699e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.263699e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.379197 sec - 7,096,164,424 cycles # 2.976 GHz - 13,728,890,269 instructions # 1.93 insn per cycle - 2.385413338 seconds time elapsed +TOTAL : 2.433739 sec + 7,119,919,933 cycles # 2.919 GHz + 13,729,208,550 instructions # 1.93 insn per cycle + 2.440430559 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2082) (512y: 0) (512z: 0) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/runTest.exe @@ -158,14 +158,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.095407e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.429923e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.429923e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.014449e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.261345e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.261345e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.384377 sec - 7,073,164,896 cycles # 2.961 GHz - 13,460,571,389 instructions # 1.90 insn per cycle - 2.390322761 seconds time elapsed +TOTAL : 2.447471 sec + 7,063,168,115 cycles # 2.883 GHz + 13,462,876,791 instructions # 1.91 insn per cycle + 2.454197899 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3649) (512y: 12) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd1/runTest.exe @@ -185,14 +185,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.020353e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.067711e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.067711e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.900977e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.825415e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.825415e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 -TOTAL : 2.437257 sec - 6,059,792,230 cycles # 2.481 GHz - 12,910,342,390 instructions # 2.13 insn per cycle - 2.443163164 seconds time elapsed +TOTAL : 2.531713 sec + 6,065,556,227 cycles # 2.391 GHz + 12,910,852,886 instructions # 2.13 insn per cycle + 2.538092940 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1671) (512y: 3) (512z: 1155) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd1/runTest.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt index ca059fc445..832212d518 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-01-25_23:02:46 +DATE: 2024-01-27_18:30:07 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.466288e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.337541e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.171849e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.454943e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.580243e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.121312e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 0.690524 sec - 2,792,013,447 cycles # 3.009 GHz - 4,251,755,468 instructions # 1.52 insn per cycle - 1.010200043 seconds time elapsed +TOTAL : 0.672615 sec + 2,659,516,695 cycles # 2.930 GHz + 4,189,600,657 instructions # 1.58 insn per cycle + 0.981568577 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/gcheck.exe -p 2048 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 166 @@ -77,14 +77,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.051297e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.219822e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.219822e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.017729e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.180513e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.180513e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 6.376369 sec - 19,701,785,180 cycles # 3.087 GHz - 46,970,070,471 instructions # 2.38 insn per cycle - 6.391151099 seconds time elapsed +TOTAL : 6.582975 sec + 19,745,039,576 cycles # 2.997 GHz + 46,971,043,157 instructions # 2.38 insn per cycle + 6.591430351 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 474) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/runTest.exe @@ -104,14 +104,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.700597e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.241163e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.241163e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.645408e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.170213e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.170213e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.092874 sec - 12,523,019,197 cycles # 3.056 GHz - 30,922,016,203 instructions # 2.47 insn per cycle - 4.113499563 seconds time elapsed +TOTAL : 4.230014 sec + 12,493,958,485 cycles # 2.949 GHz + 30,922,835,853 instructions # 2.48 insn per cycle + 4.249036641 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1667) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/runTest.exe @@ -131,14 +131,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.026909e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.810880e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.810880e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.995238e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.784735e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.784735e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.497994 sec - 10,190,173,371 cycles # 2.908 GHz - 19,546,489,451 instructions # 1.92 insn per cycle - 3.518050330 seconds time elapsed +TOTAL : 3.553540 sec + 10,262,583,129 cycles # 2.883 GHz + 19,549,264,327 instructions # 1.90 insn per cycle + 3.571464047 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2119) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/runTest.exe @@ -158,14 +158,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.184526e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.112783e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.112783e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.124505e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.036900e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.036900e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.268706 sec - 9,701,390,537 cycles # 2.963 GHz - 18,858,558,794 instructions # 1.94 insn per cycle - 3.287817502 seconds time elapsed +TOTAL : 3.360416 sec + 9,705,312,606 cycles # 2.883 GHz + 18,859,288,785 instructions # 1.94 insn per cycle + 3.374514912 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1850) (512y: 174) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd0/runTest.exe @@ -185,14 +185,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.037405e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.812464e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.812464e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.964752e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.703026e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.703026e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.479409 sec - 8,096,936,975 cycles # 2.323 GHz - 14,812,944,511 instructions # 1.83 insn per cycle - 3.497975564 seconds time elapsed +TOTAL : 3.605664 sec + 8,110,671,606 cycles # 2.246 GHz + 14,814,965,569 instructions # 1.83 insn per cycle + 3.618037534 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1023) (512y: 64) (512z: 1327) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd1.txt index 4f31d9a367..d9aa3524ae 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd1.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-01-25_23:03:20 +DATE: 2024-01-27_18:30:42 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.447758e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.281551e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.150988e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.450313e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.588685e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.109937e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 0.687736 sec - 2,747,370,286 cycles # 2.982 GHz - 4,268,924,942 instructions # 1.55 insn per cycle - 1.008068709 seconds time elapsed +TOTAL : 0.674384 sec + 2,670,081,298 cycles # 2.940 GHz + 4,170,136,902 instructions # 1.56 insn per cycle + 0.981068595 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/gcheck.exe -p 2048 256 1 WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 154 @@ -77,14 +77,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.125377e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.320065e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.320065e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.097800e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.287094e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.287094e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 5.983111 sec - 18,512,223,700 cycles # 3.092 GHz - 44,592,936,775 instructions # 2.41 insn per cycle - 5.995435786 seconds time elapsed +TOTAL : 6.133979 sec + 18,525,337,668 cycles # 3.018 GHz + 44,592,174,478 instructions # 2.41 insn per cycle + 6.142638569 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 498) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/runTest.exe @@ -104,14 +104,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.759698e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.343343e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.343343e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.703487e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.266234e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.266234e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.967957 sec - 12,217,647,849 cycles # 3.075 GHz - 30,216,214,748 instructions # 2.47 insn per cycle - 3.985701544 seconds time elapsed +TOTAL : 4.099663 sec + 12,180,659,451 cycles # 2.973 GHz + 30,220,479,220 instructions # 2.48 insn per cycle + 4.117142209 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1650) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd1/runTest.exe @@ -131,14 +131,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.022777e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.815435e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.815435e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.986943e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.775011e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.775011e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.505034 sec - 10,165,427,376 cycles # 2.895 GHz - 19,036,303,321 instructions # 1.87 insn per cycle - 3.526699091 seconds time elapsed +TOTAL : 3.571308 sec + 10,218,424,056 cycles # 2.857 GHz + 19,038,472,456 instructions # 1.86 insn per cycle + 3.585618618 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2072) (512y: 0) (512z: 0) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd1/runTest.exe @@ -158,14 +158,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.205707e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.176082e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.176082e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.155033e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.101436e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.101436e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.241628 sec - 9,598,562,455 cycles # 2.956 GHz - 18,451,811,313 instructions # 1.92 insn per cycle - 3.263864774 seconds time elapsed +TOTAL : 3.316742 sec + 9,589,461,377 cycles # 2.886 GHz + 18,452,385,566 instructions # 1.92 insn per cycle + 3.332340390 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1775) (512y: 174) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd1/runTest.exe @@ -185,14 +185,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.352591e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.472374e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.472374e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.319997e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.417969e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.417969e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.070635 sec - 7,200,806,955 cycles # 2.341 GHz - 13,242,495,009 instructions # 1.84 insn per cycle - 3.093967831 seconds time elapsed +TOTAL : 3.114149 sec + 7,202,313,446 cycles # 2.308 GHz + 13,242,868,760 instructions # 1.84 insn per cycle + 3.127979565 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 911) (512y: 56) (512z: 993) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd1/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt index bf7ba27714..1d57a488e8 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt @@ -1,164 +1,209 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasNoCurand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) +RNDGEN=hasCurand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-01-27_14:46:58 +DATE: 2024-01-27_18:31:15 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.404499e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.288002e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.325396e+07 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 1.637562 sec - 3,236,775,925 cycles:u # 1.957 GHz (74.95%) - 10,679,224 stalled-cycles-frontend:u # 0.33% frontend cycles idle (75.83%) - 1,160,637,440 stalled-cycles-backend:u # 35.86% backend cycles idle (75.18%) - 3,096,350,011 instructions:u # 0.96 insn per cycle - # 0.37 stalled cycles per insn (74.24%) - 2.326736373 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 4.551731e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.158111e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.273591e+08 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 0.525728 sec + 2,253,076,627 cycles # 2.922 GHz + 3,207,655,601 instructions # 1.42 insn per cycle + 0.846879808 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 2.028807e+00 -Avg ME (F77/CUDA) = 2.0288063388516817 -Relative difference = 3.258803416564443e-07 +Avg ME (F77/CUDA) = 2.0288063388516822 +Relative difference = 3.2588034143755247e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.518940e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.583706e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.583706e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 4.342627 sec - 14,964,849,700 cycles:u # 3.421 GHz (74.98%) - 9,503,935 stalled-cycles-frontend:u # 0.06% frontend cycles idle (74.95%) - 82,033,110 stalled-cycles-backend:u # 0.55% backend cycles idle (74.88%) - 38,338,800,684 instructions:u # 2.56 insn per cycle - # 0.00 stalled cycles per insn (74.97%) - 4.381488281 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 674) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.135550e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.199605e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.199605e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 5.018432 sec + 14,961,002,228 cycles # 2.978 GHz + 38,722,736,643 instructions # 2.59 insn per cycle + 5.027069823 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 719) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515649 Relative difference = 3.258803992249869e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.501722e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.730702e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.730702e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 2.508724 sec - 8,574,921,805 cycles:u # 3.374 GHz (74.91%) - 9,628,469 stalled-cycles-frontend:u # 0.11% frontend cycles idle (74.84%) - 638,187,821 stalled-cycles-backend:u # 7.44% backend cycles idle (74.87%) - 24,227,857,787 instructions:u # 2.83 insn per cycle - # 0.03 stalled cycles per insn (75.03%) - 2.549619793 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2003) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.629750e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.834785e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.834785e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 2.997221 sec + 8,956,048,148 cycles # 2.983 GHz + 24,430,777,255 instructions # 2.73 insn per cycle + 3.011580672 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 2067) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515654 Relative difference = 3.2588039900609506e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.751144e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.359339e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.359339e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 1.532068 sec - 5,123,766,171 cycles:u # 3.273 GHz (75.04%) - 8,652,731 stalled-cycles-frontend:u # 0.17% frontend cycles idle (75.04%) - 1,100,091,964 stalled-cycles-backend:u # 21.47% backend cycles idle (74.75%) - 11,390,920,751 instructions:u # 2.22 insn per cycle - # 0.10 stalled cycles per insn (74.76%) - 9.287713600 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2248) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 5.732472e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.230367e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.230367e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 1.937260 sec + 5,537,687,120 cycles # 2.850 GHz + 11,562,218,639 instructions # 2.09 insn per cycle + 1.954475286 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2396) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe -p 2048 256 2 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 6.676829e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.369329e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.369329e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 1.676387 sec + 4,810,331,929 cycles # 2.859 GHz + 10,339,255,035 instructions # 2.15 insn per cycle + 1.693288563 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1972) (512y: 131) (512z: 0) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028807e+00 +Avg ME (F77/C++) = 2.0288063388516204 +Relative difference = 3.2588037186351226e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe -p 2048 256 2 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 4.334499e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.612056e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.612056e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 2.527112 sec + 4,949,555,328 cycles # 1.954 GHz + 7,556,291,004 instructions # 1.53 insn per cycle + 2.542441624 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1212) (512y: 65) (512z: 1543) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028807e+00 +Avg ME (F77/C++) = 2.0288063388516204 +Relative difference = 3.2588037186351226e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_bridge.txt index 86ee3f4362..f32c05e165 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_bridge.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-01-25_23:53:26 +DATE: 2024-01-27_19:21:54 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -54,14 +54,14 @@ WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublo Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.509605e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.892994e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.892994e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.443181e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.836789e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.836789e+07 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.806428 sec - 3,162,222,779 cycles # 3.017 GHz - 4,862,998,171 instructions # 1.54 insn per cycle - 1.106788590 seconds time elapsed +TOTAL : 0.815552 sec + 3,152,223,253 cycles # 2.950 GHz + 4,839,428,338 instructions # 1.54 insn per cycle + 1.127787770 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost @@ -86,14 +86,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.170652e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.233557e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.233557e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.139565e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.202609e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.202609e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 5.012783 sec - 15,309,924,583 cycles # 3.051 GHz - 38,782,435,884 instructions # 2.53 insn per cycle - 5.020172650 seconds time elapsed +TOTAL : 5.087770 sec + 15,324,937,529 cycles # 3.010 GHz + 38,785,835,079 instructions # 2.53 insn per cycle + 5.095438900 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 719) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe @@ -114,14 +114,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.598372e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.795378e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.795378e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.642792e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.847282e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.847282e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.097874 sec - 9,295,060,871 cycles # 2.994 GHz - 24,612,123,964 instructions # 2.65 insn per cycle - 3.105584985 seconds time elapsed +TOTAL : 3.061852 sec + 9,297,134,773 cycles # 3.030 GHz + 24,611,929,147 instructions # 2.65 insn per cycle + 3.069748837 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2067) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe @@ -142,14 +142,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.795739e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.286479e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.286479e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.618551e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.100275e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.100275e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.992875 sec - 5,864,013,930 cycles # 2.933 GHz - 11,848,692,419 instructions # 2.02 insn per cycle - 2.000318144 seconds time elapsed +TOTAL : 2.052297 sec + 5,885,039,003 cycles # 2.858 GHz + 11,848,510,845 instructions # 2.01 insn per cycle + 2.060062165 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2396) (512y: 0) (512z: 0) 
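[Note on the cmpExe blocks above: they all end with the same acceptance test, comparing the average matrix element from the Fortran bridge (fcheck.exe) against the C++ one (check.exe) and reporting OK only if the relative difference stays below 5E-3. A minimal sketch of that comparison, assuming normalisation by the C++ average (it reproduces the 3.2588e-07 figure printed above); this is an illustration, not the repository's actual checking code:

#include <cmath>
#include <cstdio>

int main()
{
  const double avgMeCpp = 2.028807e+00;       // "Avg ME (C++/C++)" from the log above
  const double avgMeF77 = 2.0288063388516204; // "Avg ME (F77/C++)" from the log above
  const double relDiff = std::fabs( avgMeCpp - avgMeF77 ) / std::fabs( avgMeCpp );
  const double tolerance = 5E-3;
  std::printf( "Relative difference = %.16e\n", relDiff ); // ~3.2588e-07, as logged
  std::printf( relDiff <= tolerance ? "OK (relative difference <= 5E-3)\n"
                                    : "ERROR (relative difference > 5E-3)\n" );
  return relDiff <= tolerance ? 0 : 1;
}
]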
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe @@ -170,14 +170,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.666318e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.334357e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.334357e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.504273e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.161017e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.161017e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.756250 sec - 5,157,360,305 cycles # 2.926 GHz - 10,625,342,990 instructions # 2.06 insn per cycle - 1.763857171 seconds time elapsed +TOTAL : 1.799191 sec + 5,170,745,323 cycles # 2.863 GHz + 10,625,305,495 instructions # 2.05 insn per cycle + 1.807054488 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1972) (512y: 131) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest.exe @@ -198,14 +198,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.395537e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.676486e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.676486e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.143672e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.398027e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.398027e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.574747 sec - 5,305,772,734 cycles # 2.057 GHz - 7,798,848,434 instructions # 1.47 insn per cycle - 2.582142171 seconds time elapsed +TOTAL : 2.719298 sec + 5,299,827,945 cycles # 1.944 GHz + 7,799,359,597 instructions # 1.47 insn per cycle + 2.727100647 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1212) (512y: 65) (512z: 1543) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_common.txt index 8c7873c180..ac84fb1512 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_common.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-01-26_00:06:41 +DATE: 2024-01-27_19:35:24 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.579482e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.159880e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.276260e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.553160e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.155068e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.270145e+08 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 0.612727 sec - 2,531,994,631 cycles # 3.010 GHz - 3,691,767,465 instructions # 1.46 insn per cycle - 0.900330496 seconds time elapsed +TOTAL : 0.621735 sec + 2,489,043,210 cycles # 2.929 GHz + 3,562,840,484 instructions # 1.43 insn per cycle + 0.909390238 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --common WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 @@ -77,14 +77,14 @@ Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.169589e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.235370e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.235370e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.159128e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.223397e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.223397e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 5.000244 sec - 15,161,594,214 cycles # 3.030 GHz - 38,738,913,418 instructions # 2.56 insn per cycle - 5.006606783 seconds time elapsed +TOTAL : 5.024025 sec + 15,162,088,953 cycles # 3.015 GHz + 38,738,505,601 instructions # 2.55 insn per cycle + 5.030201813 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 719) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe @@ -104,14 +104,14 @@ Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.765286e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.975710e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.975710e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.533105e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.728807e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.728807e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 2.947798 sec - 9,138,741,572 cycles # 3.095 GHz - 24,428,627,016 instructions # 2.67 insn per cycle - 2.953976947 seconds time elapsed +TOTAL : 3.136715 sec + 9,148,263,180 cycles # 2.918 GHz + 24,432,145,400 instructions # 2.67 insn per cycle + 3.143009438 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2067) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe @@ -131,14 +131,14 @@ Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.856433e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.373968e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.373968e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.704073e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.211191e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.211191e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 1.953801 sec - 5,726,373,065 cycles # 2.924 GHz - 11,543,837,156 instructions # 2.02 insn per cycle - 1.960034141 seconds time elapsed +TOTAL : 2.008048 sec + 5,721,668,107 cycles # 2.843 GHz + 11,545,118,469 instructions # 2.02 insn per cycle + 2.014569715 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2396) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe @@ -158,14 +158,14 @@ Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.721784e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.406724e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.406724e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.616897e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.301045e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.301045e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 1.724743 sec - 5,007,833,498 cycles # 2.896 GHz - 10,287,819,259 instructions # 2.05 insn per cycle - 1.730885282 seconds time elapsed +TOTAL : 1.752962 sec + 5,013,464,915 cycles # 2.851 GHz + 10,287,485,495 instructions # 2.05 insn per cycle + 1.759537330 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1972) (512y: 131) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest.exe @@ -185,14 +185,14 @@ Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.285189e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.557330e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.557330e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.330900e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.607229e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.607229e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 2.612237 sec - 5,141,607,567 cycles # 1.965 GHz - 7,503,456,514 instructions # 1.46 insn per cycle - 2.618574526 seconds time elapsed +TOTAL : 2.590954 sec + 5,124,956,809 cycles # 1.974 GHz + 7,502,866,606 instructions # 1.46 insn per cycle + 2.597313801 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1212) (512y: 65) (512z: 1543) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_curhst.txt index 26275483e7..8419d20f23 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_curhst.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_curhst.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-01-26_00:03:20 +DATE: 2024-01-27_19:32:00 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.572661e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.157522e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.274817e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.546501e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.155022e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.270969e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.553883 sec - 2,339,299,389 cycles # 2.989 GHz - 3,658,021,436 instructions # 1.56 insn per cycle - 0.840596463 seconds time elapsed +TOTAL : 0.562633 sec + 2,295,587,413 cycles # 2.889 GHz + 3,520,473,490 instructions # 1.53 insn per cycle + 0.852379916 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --curhst WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 @@ -77,14 +77,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.215810e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.281977e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.281977e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.166522e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.231205e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.231205e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 4.837164 sec - 14,973,378,568 cycles # 3.093 GHz - 38,722,169,138 instructions # 2.59 insn per cycle - 4.843282809 seconds time elapsed +TOTAL : 4.947217 sec + 14,985,234,554 cycles # 3.027 GHz + 38,723,944,283 instructions # 2.58 insn per cycle + 4.953553340 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 719) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe @@ -104,14 +104,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.746637e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.962623e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.962623e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.688654e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.894194e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.894194e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.904163 sec - 8,949,658,023 cycles # 3.076 GHz - 24,428,241,097 instructions # 2.73 insn per cycle - 2.910306365 seconds time elapsed +TOTAL : 2.948182 sec + 8,953,378,937 cycles # 3.032 GHz + 24,428,439,372 instructions # 2.73 insn per cycle + 2.954516821 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2067) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe @@ -131,14 +131,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.861320e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.368611e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.368611e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.719036e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.215680e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.215680e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.893700 sec - 5,539,396,489 cycles # 2.918 GHz - 11,561,772,559 instructions # 2.09 insn per cycle - 1.899998499 seconds time elapsed +TOTAL : 1.940059 sec + 5,554,282,154 cycles # 2.855 GHz + 11,561,246,559 instructions # 2.08 insn per cycle + 1.946824789 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2396) (512y: 0) (512z: 0) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe @@ -158,14 +158,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.785716e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.473840e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.473840e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.429208e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.087522e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.087522e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.648994 sec - 4,833,459,743 cycles # 2.922 GHz - 10,338,331,915 instructions # 2.14 insn per cycle - 1.655414957 seconds time elapsed +TOTAL : 1.739475 sec + 4,812,378,358 cycles # 2.758 GHz + 10,338,594,579 instructions # 2.15 insn per cycle + 1.745816916 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1972) (512y: 131) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest.exe @@ -185,14 +185,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.483832e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.775524e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.775524e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.316145e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.588918e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.588918e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.443598 sec - 4,947,694,251 cycles # 2.021 GHz - 7,552,954,202 instructions # 1.53 insn per cycle - 2.449709143 seconds time elapsed +TOTAL : 2.536864 sec + 4,945,627,878 cycles # 1.945 GHz + 7,553,585,636 instructions # 1.53 insn per cycle + 2.543212075 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1212) (512y: 65) (512z: 1543) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_rmbhst.txt index e47f2f66a0..fafe86fb7f 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_rmbhst.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-01-26_00:00:04 +DATE: 2024-01-27_19:28:40 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -51,14 +51,14 @@ WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.896900e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.159088e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.275420e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.803191e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.152740e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.269734e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.701788 sec - 2,806,947,514 cycles # 3.016 GHz - 4,362,797,301 instructions # 1.55 insn per cycle - 0.987803789 seconds time elapsed +TOTAL : 0.709880 sec + 2,799,296,914 cycles # 2.939 GHz + 4,368,443,555 instructions # 1.56 insn per cycle + 1.010847274 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --rmbhst WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost @@ -79,14 +79,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.183367e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.247112e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.247112e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.160840e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.225195e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.225195e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 4.907373 sec - 14,988,749,160 cycles # 3.051 GHz - 38,722,833,257 instructions # 2.58 insn per cycle - 4.913467832 seconds time elapsed +TOTAL : 4.959059 sec + 14,967,483,942 cycles # 3.016 GHz + 38,722,165,482 instructions # 2.59 insn per cycle + 4.965370367 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 719) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe @@ -106,14 +106,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.768588e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.978375e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.978375e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.669279e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.872934e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.872934e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.886514 sec - 8,955,670,287 cycles # 3.097 GHz - 24,428,197,131 instructions # 2.73 insn per cycle - 2.892532610 seconds time elapsed +TOTAL : 2.964577 sec + 8,950,291,023 cycles # 3.014 GHz + 24,429,249,082 instructions # 2.73 insn per cycle + 2.971122655 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2067) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe 
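[Note on the FPE warnings: every run in these logs prints "WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions", i.e. the binaries trap on division by zero, invalid operations and overflow instead of silently propagating NaN/inf (which is also why the summaries report "NaN/abnormal=0"). A hedged sketch of how such an environment-gated switch is commonly wired on Linux/glibc; feenableexcept is a GNU extension, and this is an illustration rather than the repository's exact code:

#include <fenv.h> // feenableexcept (glibc extension)
#include <cstdio>
#include <cstdlib>

void enableFPEIfRequested()
{
  if( std::getenv( "CUDACPP_RUNTIME_ENABLEFPE" ) != nullptr )
  {
    std::printf( "WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions\n" );
    feenableexcept( FE_DIVBYZERO | FE_INVALID | FE_OVERFLOW ); // trap instead of producing NaN/inf
  }
}
]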
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe @@ -133,14 +133,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.846816e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.353978e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.353978e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.730432e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.235711e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.235711e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.898303 sec - 5,531,164,928 cycles # 2.907 GHz - 11,561,301,371 instructions # 2.09 insn per cycle - 1.904555752 seconds time elapsed +TOTAL : 1.938370 sec + 5,533,844,555 cycles # 2.848 GHz + 11,561,296,052 instructions # 2.09 insn per cycle + 1.944753850 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2396) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe @@ -160,14 +160,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.795208e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.491804e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.491804e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.564439e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.248886e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.248886e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.648055 sec - 4,815,642,876 cycles # 2.913 GHz - 10,338,431,439 instructions # 2.15 insn per cycle - 1.654388782 seconds time elapsed +TOTAL : 1.706782 sec + 4,816,337,213 cycles # 2.813 GHz + 10,338,424,480 instructions # 2.15 insn per cycle + 1.713243341 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1972) (512y: 131) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest.exe @@ -187,14 +187,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.486209e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.779482e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.779482e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.268560e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.543865e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.543865e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.442383 sec - 4,948,684,588 cycles # 2.023 GHz - 7,553,901,102 instructions # 1.53 insn per cycle - 2.448546666 seconds time elapsed +TOTAL : 2.565289 sec + 4,934,138,178 cycles # 1.920 GHz + 7,553,812,035 instructions # 1.53 insn per cycle + 2.571920195 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1212) (512y: 65) (512z: 1543) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd1.txt index bf7c906eee..a132a6cc9e 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd1.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-01-25_23:04:20 +DATE: 2024-01-27_18:31:43 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.041566e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.145424e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.281266e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.565902e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.160035e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.275649e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.533190 sec - 2,275,159,250 cycles # 2.956 GHz - 3,198,582,493 instructions # 1.41 insn per cycle - 0.846238143 seconds time elapsed +TOTAL : 0.524945 sec + 2,211,104,325 cycles # 2.917 GHz + 3,161,744,126 instructions # 1.43 insn per cycle + 0.831302328 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/gcheck.exe -p 2048 256 1 WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 208 @@ -77,14 +77,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.250213e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.320060e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.320060e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.211402e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.278629e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.278629e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 4.764896 sec - 14,707,028,108 cycles # 3.083 GHz - 39,543,998,084 instructions # 2.69 insn per cycle - 4.779287435 seconds time elapsed +TOTAL : 4.847715 sec + 14,695,905,192 cycles # 3.028 GHz + 39,546,427,226 instructions # 2.69 insn per cycle + 4.856366875 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 596) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/runTest.exe @@ -104,14 +104,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.824643e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.049537e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.049537e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.853804e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.080865e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.080865e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.847845 sec - 8,591,903,714 cycles # 3.011 GHz - 23,575,874,645 instructions # 2.74 insn per cycle - 2.868458782 seconds time elapsed +TOTAL : 2.826209 sec + 8,585,516,509 cycles # 3.032 GHz + 23,576,146,180 instructions # 2.75 insn per cycle + 2.839651136 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1948) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/runTest.exe @@ -131,14 +131,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.388113e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.817928e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.817928e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.280108e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.704463e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.704463e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.052030 sec - 5,980,210,912 cycles # 2.906 GHz - 13,193,706,510 instructions # 2.21 insn per cycle - 2.072456018 seconds time elapsed +TOTAL : 2.095729 sec + 5,966,190,094 cycles # 2.842 GHz + 13,193,303,338 instructions # 2.21 insn per cycle + 2.108681044 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2560) (512y: 0) (512z: 0) 
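[Note on the "-p 2048 256 2" arguments in the runExe lines above: they fix the work size, since 2048 GPU blocks (or C++ event pages) times 256 threads gives the 524288 events per iteration quoted in the Bridge warnings, and two iterations double that. The EvtsPerSec figures are then events divided by the time spent in the corresponding phase. A small sketch of that bookkeeping; the phase timing below is a made-up placeholder, not a number from these logs:

#include <cstdio>

int main()
{
  const long gpublocks = 2048, gputhreads = 256, iterations = 2; // "-p 2048 256 2"
  const long nevtPerIter = gpublocks * gputhreads; // 524288, as in "Set grid in Bridge (nevt=524288, ...)"
  const long nevtTotal = nevtPerIter * iterations; // 1048576 events overall
  const double phaseSeconds = 1.57; // hypothetical time spent in the measured phase
  std::printf( "EvtsPerSec ~ ( %e ) sec^-1 for %ld events\n", nevtTotal / phaseSeconds, nevtTotal );
}
]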
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/runTest.exe @@ -158,14 +158,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.853930e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.365435e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.365435e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.635750e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.133552e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.133552e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.897340 sec - 5,529,135,380 cycles # 2.906 GHz - 12,102,256,477 instructions # 2.19 insn per cycle - 1.916025222 seconds time elapsed +TOTAL : 1.968095 sec + 5,545,071,065 cycles # 2.809 GHz + 12,102,600,869 instructions # 2.18 insn per cycle + 1.994477360 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2030) (512y: 278) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/runTest.exe @@ -185,14 +185,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.111951e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.359090e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.359090e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.941444e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.175192e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.175192e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.655651 sec - 5,371,905,800 cycles # 2.018 GHz - 9,380,836,259 instructions # 1.75 insn per cycle - 2.675871938 seconds time elapsed +TOTAL : 2.769335 sec + 5,356,977,226 cycles # 1.931 GHz + 9,382,516,259 instructions # 1.75 insn per cycle + 2.796011726 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1350) (512y: 88) (512z: 1989) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd0.txt index b9120fbb5d..fe3b97e60f 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd0.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-01-25_23:43:11 +DATE: 2024-01-27_19:11:26 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.554592e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.155746e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.271514e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.556891e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.156782e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.272118e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.519357 sec - 2,253,363,966 cycles # 3.002 GHz - 3,205,544,962 instructions # 1.42 insn per cycle - 0.810112146 seconds time elapsed +TOTAL : 0.523274 sec + 2,225,190,296 cycles # 2.920 GHz + 3,165,313,508 instructions # 1.42 insn per cycle + 0.819371043 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/gcheck.exe -p 2048 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 @@ -77,14 +77,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.387914e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.465509e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.465509e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.338474e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.413557e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.413557e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 4.495702 sec - 13,904,869,897 cycles # 3.089 GHz - 35,848,783,358 instructions # 2.58 insn per cycle - 4.502182318 seconds time elapsed +TOTAL : 4.590948 sec + 13,903,491,071 cycles # 3.025 GHz + 35,849,286,374 instructions # 2.58 insn per cycle + 4.597844545 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1078) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/runTest.exe @@ -104,14 +104,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.096318e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.348620e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.348620e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.033937e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.283303e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.283303e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.664466 sec - 8,202,926,102 cycles # 3.072 GHz - 21,906,089,636 instructions # 2.67 insn per cycle - 2.670916957 seconds time elapsed +TOTAL : 2.705956 sec + 8,203,204,697 cycles # 3.025 GHz + 21,906,275,135 instructions # 2.67 insn per cycle + 2.712679780 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2334) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd0/runTest.exe @@ -131,14 +131,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.559059e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.026865e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.026865e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.611264e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.105261e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.105261e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.993840 sec - 5,528,651,014 cycles # 2.765 GHz - 12,074,880,280 instructions # 2.18 insn per cycle - 2.000267421 seconds time elapsed +TOTAL : 1.979671 sec + 5,534,366,412 cycles # 2.793 GHz + 12,076,831,406 instructions # 2.18 insn per cycle + 1.986665078 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3062) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/runTest.exe @@ -158,14 +158,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.391303e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.001905e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.001905e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.207475e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.811915e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.811915e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.746221 sec - 5,115,562,208 cycles # 2.921 GHz - 11,141,356,804 instructions # 2.18 insn per cycle - 1.752659105 seconds time elapsed +TOTAL : 1.796434 sec + 5,140,536,788 cycles # 2.853 GHz + 11,141,735,276 instructions # 2.17 insn per cycle + 1.803318165 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2527) (512y: 224) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd0/runTest.exe @@ -185,14 +185,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.660691e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.975604e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.975604e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.467257e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.761466e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.761466e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.354649 sec - 4,796,247,367 cycles # 2.033 GHz - 8,840,506,379 instructions # 1.84 insn per cycle - 2.361069291 seconds time elapsed +TOTAL : 2.454036 sec + 4,810,701,100 cycles # 1.956 GHz + 8,841,217,398 instructions # 1.84 insn per cycle + 2.460702395 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1821) (512y: 97) (512z: 2034) 
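[Note on the counter lines in each TOTAL block: they are hardware-counter summaries in the style of perf stat. "insn per cycle" is instructions divided by cycles, while the "GHz" annotation is cycles divided by the CPU time actually consumed (task-clock, which these logs do not print, so wall-clock elapsed only approximates it for CPU-bound runs). A worked check against one triple copied from an avx2 block above:

#include <cstdio>

int main()
{
  const double cycles = 5534366412.0;        // "5,534,366,412 cycles"
  const double instructions = 12076831406.0; // "12,076,831,406 instructions"
  const double elapsedSeconds = 1.986665078; // "seconds time elapsed"
  std::printf( "insn per cycle ~ %.2f\n", instructions / cycles );          // ~2.18, as logged
  std::printf( "effective GHz  ~ %.3f\n", cycles / elapsedSeconds * 1e-9 ); // ~2.79 vs the logged 2.793
}
]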
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd1.txt index 78525f174f..a9f7002915 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd1.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-01-25_23:43:37 +DATE: 2024-01-27_19:11:53 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.552254e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.157318e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.273752e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.566602e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.157557e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.274796e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.522178 sec - 2,237,011,467 cycles # 2.943 GHz - 3,197,331,844 instructions # 1.43 insn per cycle - 0.817931849 seconds time elapsed +TOTAL : 0.526133 sec + 2,269,985,735 cycles # 2.938 GHz + 3,227,777,224 instructions # 1.42 insn per cycle + 0.830677919 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/gcheck.exe -p 2048 256 1 WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 208 @@ -77,14 +77,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.662142e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.760003e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.760003e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.558795e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.650149e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.650149e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 4.044119 sec - 12,507,685,859 cycles # 3.089 GHz - 35,729,677,729 instructions # 2.86 insn per cycle - 4.050458069 seconds time elapsed +TOTAL : 4.204720 sec + 12,508,246,419 cycles # 2.972 GHz + 35,732,210,405 instructions # 2.86 insn per cycle + 4.211518798 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 469) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/runTest.exe @@ -104,14 +104,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.223471e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.492032e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.492032e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.001813e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.253552e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.253552e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.586847 sec - 8,038,523,807 cycles # 3.101 GHz - 21,259,861,765 instructions # 2.64 insn per cycle - 2.593509811 seconds time elapsed +TOTAL : 2.727801 sec + 8,032,446,977 cycles # 2.939 GHz + 21,259,935,359 instructions # 2.65 insn per cycle + 2.734713005 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2088) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd1/runTest.exe @@ -131,14 +131,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.924128e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.452749e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.452749e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.970376e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.518278e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.518278e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.876172 sec - 5,308,473,428 cycles # 2.821 GHz - 11,405,363,581 instructions # 2.15 insn per cycle - 1.882750968 seconds time elapsed +TOTAL : 1.863938 sec + 5,333,333,807 cycles # 2.853 GHz + 11,406,492,896 instructions # 2.14 insn per cycle + 1.870797127 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2370) (512y: 0) (512z: 0) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd1/runTest.exe @@ -158,14 +158,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.605702e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.254707e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.254707e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.390644e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.024648e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.024648e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.691812 sec - 4,996,879,417 cycles # 2.944 GHz - 10,598,375,872 instructions # 2.12 insn per cycle - 1.698475640 seconds time elapsed +TOTAL : 1.748196 sec + 4,995,883,688 cycles # 2.848 GHz + 10,598,736,895 instructions # 2.12 insn per cycle + 1.755165718 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1970) (512y: 162) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd1/runTest.exe @@ -185,14 +185,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.756890e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.090329e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.090329e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.515233e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.832010e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.832010e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.313369 sec - 4,723,638,871 cycles # 2.039 GHz - 8,568,244,906 instructions # 1.81 insn per cycle - 2.319888543 seconds time elapsed +TOTAL : 2.434876 sec + 4,705,444,696 cycles # 1.931 GHz + 8,568,550,279 instructions # 1.82 insn per cycle + 2.441557167 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1392) (512y: 70) (512z: 1630) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd1/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt index b9b0cde3c0..f704509ce3 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-01-25_23:04:47 +DATE: 2024-01-27_18:32:10 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.239567e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.582837e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.963220e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.371378e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.647289e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.968239e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086718e+00 +- 3.413389e-03 ) GeV^0 -TOTAL : 0.478940 sec - 2,112,985,286 cycles # 2.999 GHz - 2,978,090,529 instructions # 1.41 insn per cycle - 0.778737361 seconds time elapsed +TOTAL : 0.481545 sec + 2,062,266,198 cycles # 2.917 GHz + 2,949,886,970 instructions # 1.43 insn per cycle + 0.779421283 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 128 @@ -77,14 +77,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.379568e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.458926e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.458926e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.329952e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.407106e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.407106e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 -TOTAL : 4.490589 sec - 13,909,019,522 cycles # 3.094 GHz - 37,078,836,915 instructions # 2.67 insn per cycle - 4.499210329 seconds time elapsed +TOTAL : 4.585250 sec + 13,896,605,295 cycles # 3.027 GHz + 37,077,674,283 instructions # 2.67 insn per cycle + 4.593480894 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 578) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest.exe @@ -104,14 +104,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.322182e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.773170e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.773170e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.194608e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.645421e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.645421e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 -TOTAL : 2.053265 sec - 6,163,905,946 cycles # 2.994 GHz - 15,211,835,383 instructions # 2.47 insn per cycle - 2.068387035 seconds time elapsed +TOTAL : 2.104955 sec + 6,163,876,669 cycles # 2.922 GHz + 15,212,935,053 instructions # 2.47 insn per cycle + 2.119735474 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2459) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest.exe @@ -131,14 +131,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.346993e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.070180e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.070180e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.463230e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.088896e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.088896e+06 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414230e-03 ) GeV^0 -TOTAL : 1.207020 sec - 3,445,603,929 cycles # 2.841 GHz - 7,715,625,143 instructions # 2.24 insn per cycle - 1.223685202 seconds time elapsed +TOTAL : 1.193160 sec + 3,445,241,392 cycles # 2.874 GHz + 7,715,704,867 instructions # 2.24 insn per cycle + 1.208683669 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3071) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest.exe @@ -158,14 +158,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.048691e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.225026e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.225026e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.031770e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.205353e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.205353e+06 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414230e-03 ) GeV^0 -TOTAL : 1.085664 sec - 3,179,509,142 cycles # 2.914 GHz - 7,110,367,783 instructions # 2.24 insn per cycle - 1.099965100 seconds time elapsed +TOTAL : 1.101345 sec + 3,175,112,198 cycles # 2.868 GHz + 7,109,521,586 instructions # 2.24 insn per cycle + 1.114551636 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2733) (512y: 13) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest.exe @@ -185,14 +185,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.789161e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.697174e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.697174e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.234175e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.048165e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.048165e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.430626 sec - 2,983,504,587 cycles # 2.078 GHz - 5,763,932,231 instructions # 1.93 insn per cycle - 1.444194360 seconds time elapsed +TOTAL : 1.535385 sec + 2,985,815,567 cycles # 1.938 GHz + 5,764,090,445 instructions # 1.93 insn per cycle + 1.547903228 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2088) (512y: 20) (512z: 1914) 
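An aside on the "Internal loops fptype_sv" lines in these logs: the lane count is simply the SIMD register width divided by the size of the floating-point type, which is why the float builds above report VECTOR[4] for sse4 (128 bit), VECTOR[8] for avx2 and 512y (256 bit) and VECTOR[16] for 512z (512 bit), while the double builds report half those widths. A minimal sketch of that arithmetic (the helper name is illustrative, not an identifier from the plugin):

  #include <cstdio>

  // Hypothetical helper: SIMD lanes = register bits / ( 8 * sizeof( fptype ) ).
  template<typename fptype>
  constexpr int lanes( int registerBits ) { return registerBits / ( 8 * sizeof( fptype ) ); }

  int main()
  {
    std::printf( "float : sse4=%d avx2=%d 512z=%d\n", lanes<float>( 128 ), lanes<float>( 256 ), lanes<float>( 512 ) );    // 4 8 16
    std::printf( "double: sse4=%d avx2=%d 512z=%d\n", lanes<double>( 128 ), lanes<double>( 256 ), lanes<double>( 512 ) ); // 2 4 8
    return 0;
  }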
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_bridge.txt index 6981bfe44c..6f10c4e596 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_bridge.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-01-25_23:53:54 +DATE: 2024-01-27_19:22:22 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -54,14 +54,14 @@ WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublo Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.095576e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.509874e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.509874e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.965218e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.420236e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.420236e+07 ) sec^-1 MeanMatrixElemValue = ( 2.086805e+00 +- 3.414078e-03 ) GeV^0 -TOTAL : 0.662144 sec - 2,681,924,672 cycles # 3.020 GHz - 4,117,977,384 instructions # 1.54 insn per cycle - 0.946162262 seconds time elapsed +TOTAL : 0.672742 sec + 2,699,076,990 cycles # 2.952 GHz + 4,122,510,277 instructions # 1.53 insn per cycle + 0.972058483 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Bridge selected: cannot use RamboDevice, will use RamboHost @@ -86,14 +86,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.346899e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.425707e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.425707e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.308184e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.383926e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.383926e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 -TOTAL : 4.594846 sec - 14,071,961,860 cycles # 3.059 GHz - 37,120,495,552 instructions # 2.64 insn per cycle - 4.601599482 seconds time elapsed +TOTAL : 4.671374 sec + 14,083,951,451 cycles # 3.011 GHz + 37,120,772,584 instructions # 2.64 insn per cycle + 4.679096629 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 578) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest.exe @@ -114,14 +114,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.433551e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.901399e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.901399e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.315538e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.768348e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.768348e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 -TOTAL : 2.060355 sec - 6,363,040,481 cycles # 3.080 GHz - 15,491,502,720 instructions # 2.43 insn per cycle - 2.067385124 seconds time elapsed +TOTAL : 2.105430 sec + 6,354,554,621 cycles # 3.011 GHz + 15,492,019,285 instructions # 2.44 insn per cycle + 2.112526903 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2459) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest.exe @@ -142,14 +142,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.463492e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.087540e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.087540e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.249872e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.061367e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.061367e+06 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414230e-03 ) GeV^0 -TOTAL : 1.236762 sec - 3,641,436,941 cycles # 2.930 GHz - 7,953,430,490 instructions # 2.18 insn per cycle - 1.243962608 seconds time elapsed +TOTAL : 1.265664 sec + 3,645,033,176 cycles # 2.866 GHz + 7,953,463,690 instructions # 2.18 insn per cycle + 1.273185269 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3071) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest.exe @@ -170,14 +170,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.030058e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.201172e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.201172e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.008124e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.175608e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.175608e+06 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414230e-03 ) GeV^0 -TOTAL : 1.146983 sec - 3,383,857,761 cycles # 2.935 GHz - 7,347,779,825 instructions # 2.17 insn per cycle - 1.154126862 seconds time elapsed +TOTAL : 1.171580 sec + 3,374,627,804 cycles # 2.865 GHz + 7,347,172,592 instructions # 2.18 insn per cycle + 1.179048762 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2733) (512y: 13) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest.exe @@ -198,14 +198,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.701950e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.600616e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.600616e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.385113e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.212336e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.212336e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.492167 sec - 3,194,827,132 cycles # 2.133 GHz - 6,021,969,878 instructions # 1.88 insn per cycle - 1.499129213 seconds time elapsed +TOTAL : 1.552093 sec + 3,187,803,861 cycles # 2.046 GHz + 6,021,486,201 instructions # 1.89 insn per cycle + 1.559571454 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2088) (512y: 20) (512z: 1914) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_common.txt index b4d637a0a9..14a879576f 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_common.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-01-26_00:07:08 +DATE: 2024-01-27_19:35:51 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.419910e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.637559e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.958244e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.393831e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.627983e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.948882e+08 ) sec^-1 MeanMatrixElemValue = ( 2.079446e+00 +- 3.403306e-03 ) GeV^0 -TOTAL : 0.560146 sec - 2,374,838,659 cycles # 3.002 GHz - 3,427,021,490 instructions # 1.44 insn per cycle - 0.848554044 seconds time elapsed +TOTAL : 0.563905 sec + 2,307,075,319 cycles # 2.940 GHz + 3,388,511,891 instructions # 1.47 insn per cycle + 0.842336969 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --common WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 128 @@ -77,14 +77,14 @@ Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.365915e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.444126e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.444126e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.310833e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.387688e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.387688e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079572e+00 +- 3.404712e-03 ) GeV^0 -TOTAL : 4.568004 sec - 14,161,375,602 cycles # 3.097 GHz - 37,106,664,478 instructions # 2.62 insn per cycle - 4.574110970 seconds time elapsed +TOTAL : 4.679354 sec + 14,057,950,577 cycles # 3.001 GHz + 37,107,834,585 instructions # 2.64 insn per cycle + 4.685514940 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 578) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest.exe @@ -104,14 +104,14 @@ Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.461790e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.938444e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.938444e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.376525e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.841873e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.841873e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079572e+00 +- 3.404711e-03 ) GeV^0 -TOTAL : 2.056520 sec - 6,336,772,154 cycles # 3.074 GHz - 15,224,230,200 instructions # 2.40 insn per cycle - 2.062537942 seconds time elapsed +TOTAL : 2.090400 sec + 6,325,242,283 cycles # 3.019 GHz + 15,223,298,660 instructions # 2.41 insn per cycle + 2.096577762 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2459) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest.exe @@ -131,14 +131,14 @@ Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.613041e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.107028e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.107028e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.372411e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.078800e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.078800e+06 ) sec^-1 MeanMatrixElemValue = ( 2.079550e+00 +- 3.404207e-03 ) GeV^0 -TOTAL : 1.230312 sec - 3,608,595,652 cycles # 2.921 GHz - 7,698,744,843 instructions # 2.13 insn per cycle - 1.236387039 seconds time elapsed +TOTAL : 1.259927 sec + 3,617,133,063 cycles # 2.859 GHz + 7,699,481,453 instructions # 2.13 insn per cycle + 1.266476529 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3071) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest.exe @@ -158,14 +158,14 @@ Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.053770e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.232215e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.232215e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.024833e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.196928e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.196928e+06 ) sec^-1 MeanMatrixElemValue = ( 2.079550e+00 +- 3.404207e-03 ) GeV^0 -TOTAL : 1.133298 sec - 3,342,033,785 cycles # 2.936 GHz - 7,059,045,782 instructions # 2.11 insn per cycle - 1.139229993 seconds time elapsed +TOTAL : 1.164722 sec + 3,345,597,755 cycles # 2.860 GHz + 7,059,028,825 instructions # 2.11 insn per cycle + 1.171001818 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2733) (512y: 13) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest.exe @@ -185,14 +185,14 @@ Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.847509e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.768714e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.768714e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.526851e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.397649e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.397649e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079550e+00 +- 3.404208e-03 ) GeV^0 -TOTAL : 1.474424 sec - 3,144,945,594 cycles # 2.126 GHz - 5,713,078,392 instructions # 1.82 insn per cycle - 1.480358562 seconds time elapsed +TOTAL : 1.535907 sec + 3,151,943,893 cycles # 2.047 GHz + 5,714,706,075 instructions # 1.81 insn per cycle + 1.542090891 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2088) (512y: 20) (512z: 1914) 
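The "# ... GHz" and "# ... insn per cycle" annotations in these perf summaries are derived ratios: effective clock is roughly cycles / elapsed time and IPC is instructions / cycles. Taking the avx2 run in this file as a worked example (3,617,133,063 cycles, 7,699,481,453 instructions, 1.266 s elapsed) reproduces about 2.86 GHz and 2.13 insn/cycle; perf presumably normalizes by task clock rather than wall time, hence the small difference from the printed 2.859 GHz. A small sketch:

  #include <cstdio>

  int main()
  {
    // Numbers copied from the avx2 run above (common, float, inl0 hrd0).
    const double cycles = 3617133063.;
    const double instructions = 7699481453.;
    const double elapsed = 1.266476529; // seconds
    std::printf( "clock ~ %.3f GHz\n", cycles / elapsed / 1e9 );       // ~2.86 (perf prints 2.859)
    std::printf( "IPC   ~ %.2f insn/cycle\n", instructions / cycles ); // ~2.13
    return 0;
  }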
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_curhst.txt index 55dee8f4ac..c7f0d3b000 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_curhst.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_curhst.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-01-26_00:03:47 +DATE: 2024-01-27_19:32:27 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.403572e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.634845e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.961526e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.419343e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.634429e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.957416e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086718e+00 +- 3.413389e-03 ) GeV^0 -TOTAL : 0.503705 sec - 2,164,758,906 cycles # 2.993 GHz - 3,340,655,205 instructions # 1.54 insn per cycle - 0.781352769 seconds time elapsed +TOTAL : 0.510130 sec + 2,137,942,145 cycles # 2.934 GHz + 3,348,306,775 instructions # 1.57 insn per cycle + 0.788342368 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --curhst WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 128 @@ -77,14 +77,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.387687e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.466623e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.466623e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.320811e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.398671e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.398671e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 -TOTAL : 4.477648 sec - 13,882,329,105 cycles # 3.098 GHz - 37,077,716,863 instructions # 2.67 insn per cycle - 4.483747727 seconds time elapsed +TOTAL : 4.604425 sec + 13,892,293,847 cycles # 3.014 GHz + 37,077,656,533 instructions # 2.67 insn per cycle + 4.610650729 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 578) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest.exe @@ -104,14 +104,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.497536e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.975863e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.975863e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.374425e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.841836e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.841836e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 -TOTAL : 1.989964 sec - 6,164,501,349 cycles # 3.090 GHz - 15,212,436,330 instructions # 2.47 insn per cycle - 1.996006252 seconds time elapsed +TOTAL : 2.035974 sec + 6,175,209,269 cycles # 3.027 GHz + 15,211,533,732 instructions # 2.46 insn per cycle + 2.042362317 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2459) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest.exe @@ -131,14 +131,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.422535e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.084358e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.084358e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.409429e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.081523e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.081523e+06 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414230e-03 ) GeV^0 -TOTAL : 1.197986 sec - 3,445,911,992 cycles # 2.865 GHz - 7,714,670,946 instructions # 2.24 insn per cycle - 1.203790737 seconds time elapsed +TOTAL : 1.200285 sec + 3,444,552,271 cycles # 2.857 GHz + 7,714,694,129 instructions # 2.24 insn per cycle + 1.206466922 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3071) (512y: 0) (512z: 0) ------------------------------------------------------------------------- 
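The three EvtsPerSec counters measure progressively narrower scopes: sampling plus matrix elements (23), the matrix-element stage (3) and the matrix-element calculation alone (3a), so (3a) >= (3) >= (23) throughout these logs. Assuming the timers nest additively (which the logs do not state explicitly), an implied non-ME rate can be backed out from the GPU run above by a harmonic difference:

  #include <cstdio>

  int main()
  {
    // EvtsPerSec counters from the GPU run above (curhst, float).
    const double r23 = 9.419343e7; // Rmb+ME
    const double r3 = 2.634429e8;  // MatrixElems
    // Assuming the timers nest additively (an interpretation, not stated in
    // the logs), the implied sampling-plus-overhead rate is:
    const double rRmb = 1. / ( 1. / r23 - 1. / r3 );
    std::printf( "implied non-ME rate ~ %.3e events/s\n", rRmb ); // ~1.47e8
    return 0;
  }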
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest.exe @@ -158,14 +158,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.050826e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.231385e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.231385e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.031458e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.204817e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.204817e+06 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414230e-03 ) GeV^0 -TOTAL : 1.081579 sec - 3,176,085,947 cycles # 2.924 GHz - 7,108,143,250 instructions # 2.24 insn per cycle - 1.087328908 seconds time elapsed +TOTAL : 1.101185 sec + 3,173,622,936 cycles # 2.869 GHz + 7,108,508,585 instructions # 2.24 insn per cycle + 1.107582757 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2733) (512y: 13) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest.exe @@ -185,14 +185,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.813979e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.730921e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.730921e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.603863e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.500334e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.500334e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.425356 sec - 2,984,093,084 cycles # 2.087 GHz - 5,763,176,682 instructions # 1.93 insn per cycle - 1.431417075 seconds time elapsed +TOTAL : 1.465047 sec + 2,989,253,718 cycles # 2.034 GHz + 5,763,096,085 instructions # 1.93 insn per cycle + 1.471313886 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2088) (512y: 20) (512z: 1914) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_rmbhst.txt index 3920589722..efbd8ab8d1 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_rmbhst.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-01-26_00:00:31 +DATE: 2024-01-27_19:29:07 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -51,14 +51,14 @@ WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.971445e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.633028e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.951519e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.782075e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.635958e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.962133e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086805e+00 +- 3.414078e-03 ) GeV^0 -TOTAL : 0.605308 sec - 2,490,356,687 cycles # 3.020 GHz - 3,879,865,044 instructions # 1.56 insn per cycle - 0.883165296 seconds time elapsed +TOTAL : 0.616379 sec + 2,457,237,369 cycles # 2.941 GHz + 3,790,082,335 instructions # 1.54 insn per cycle + 0.895160860 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --rmbhst WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost @@ -79,14 +79,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.371842e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.450351e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.450351e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.321377e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.398534e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.398534e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 -TOTAL : 4.505383 sec - 13,893,504,068 cycles # 3.081 GHz - 37,078,758,053 instructions # 2.67 insn per cycle - 4.511346462 seconds time elapsed +TOTAL : 4.603077 sec + 13,891,133,529 cycles # 3.015 GHz + 37,077,670,179 instructions # 2.67 insn per cycle + 4.609272116 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 578) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest.exe @@ -106,14 +106,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.454645e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.940983e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.940983e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.362616e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.828214e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.828214e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 -TOTAL : 2.007427 sec - 6,155,242,843 cycles # 3.058 GHz - 15,210,679,421 instructions # 2.47 insn per cycle - 2.013794020 seconds time elapsed +TOTAL : 2.039889 sec + 6,158,523,818 cycles # 3.012 GHz + 15,211,006,796 instructions # 2.47 insn per cycle + 2.046105279 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2459) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest.exe @@ -133,14 +133,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.721991e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.117822e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.117822e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.276982e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.065857e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.065857e+06 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414230e-03 ) GeV^0 -TOTAL : 1.161483 sec - 3,441,215,224 cycles # 2.950 GHz - 7,714,556,442 instructions # 2.24 insn per cycle - 1.167386391 seconds time elapsed +TOTAL : 1.216623 sec + 3,449,112,422 cycles # 2.826 GHz + 7,715,526,788 instructions # 2.24 insn per cycle + 1.222716671 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3071) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest.exe @@ -160,14 +160,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.065655e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.245864e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.245864e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.011316e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.180119e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.180119e+06 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414230e-03 ) GeV^0 -TOTAL : 1.066868 sec - 3,168,142,767 cycles # 2.956 GHz - 7,108,351,170 instructions # 2.24 insn per cycle - 1.072831056 seconds time elapsed +TOTAL : 1.123119 sec + 3,188,964,800 cycles # 2.827 GHz + 7,109,363,220 instructions # 2.23 insn per cycle + 1.129107814 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2733) (512y: 13) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest.exe @@ -187,14 +187,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.818298e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.735774e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.735774e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.275437e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.094175e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.094175e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.424324 sec - 2,976,782,899 cycles # 2.083 GHz - 5,762,338,734 instructions # 1.94 insn per cycle - 1.430396985 seconds time elapsed +TOTAL : 1.528710 sec + 2,977,974,799 cycles # 1.942 GHz + 5,762,709,816 instructions # 1.94 insn per cycle + 1.534960548 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2088) (512y: 20) (512z: 1914) 
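The "RamboHost selected: cannot use CurandDevice, will use CurandHost" warning above reflects a simple dependency: if phase-space sampling runs on the host, it cannot consume device-resident random numbers, so the random-number source is downgraded to the host as well. A hedged sketch of that selection rule, with hypothetical names (the logs do not show the actual code):

  #include <cstdio>

  // Hypothetical sketch: host-side Rambo sampling cannot consume
  // device-resident random numbers, so CurandDevice is downgraded.
  enum class Rng { CurandDevice, CurandHost };
  enum class Sampler { RamboDevice, RamboHost };

  Rng chooseRng( Sampler sampler, Rng requested )
  {
    if( sampler == Sampler::RamboHost && requested == Rng::CurandDevice )
      return Rng::CurandHost; // "cannot use CurandDevice, will use CurandHost"
    return requested;
  }

  int main()
  {
    const bool fellBack = ( chooseRng( Sampler::RamboHost, Rng::CurandDevice ) == Rng::CurandHost );
    std::printf( "RamboHost + CurandDevice -> CurandHost: %s\n", fellBack ? "yes" : "no" );
    return 0;
  }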
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd1.txt index 43203bd7a4..b3cfffed0a 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd1.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-01-25_23:05:11 +DATE: 2024-01-27_18:32:34 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.282862e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.624392e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.015642e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.432312e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.695963e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.031521e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086718e+00 +- 3.413389e-03 ) GeV^0 -TOTAL : 0.480823 sec - 2,102,693,968 cycles # 2.972 GHz - 2,999,740,623 instructions # 1.43 insn per cycle - 0.778832349 seconds time elapsed +TOTAL : 0.479202 sec + 2,063,006,228 cycles # 2.917 GHz + 2,917,318,985 instructions # 1.41 insn per cycle + 0.776497427 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/gcheck.exe -p 2048 256 1 WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 127 @@ -77,14 +77,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.386604e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.466058e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.466058e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.339008e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.417276e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.417276e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 -TOTAL : 4.477679 sec - 13,798,927,191 cycles # 3.078 GHz - 37,479,360,958 instructions # 2.72 insn per cycle - 4.487052345 seconds time elapsed +TOTAL : 4.568333 sec + 13,810,912,026 cycles # 3.020 GHz + 37,479,722,319 instructions # 2.71 insn per cycle + 4.576499144 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 503) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/runTest.exe @@ -104,14 +104,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.180823e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.797776e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.797776e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.912563e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.481847e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.481847e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 -TOTAL : 1.779733 sec - 5,483,084,854 cycles # 3.071 GHz - 15,244,773,755 instructions # 2.78 insn per cycle - 1.794293430 seconds time elapsed +TOTAL : 1.857535 sec + 5,469,534,249 cycles # 2.935 GHz + 15,245,119,579 instructions # 2.79 insn per cycle + 1.874458797 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2330) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/runTest.exe @@ -131,14 +131,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.879556e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.587809e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.587809e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.695444e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.379016e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.379016e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414230e-03 ) GeV^0 -TOTAL : 1.608086 sec - 4,709,912,951 cycles # 2.918 GHz - 9,849,751,420 instructions # 2.09 insn per cycle - 1.622779833 seconds time elapsed +TOTAL : 1.651281 sec + 4,710,630,291 cycles # 2.843 GHz + 9,850,049,828 instructions # 2.09 insn per cycle + 1.667735017 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3750) (512y: 0) (512z: 0) ------------------------------------------------------------------------- 
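ncu reports 127-128 registers per thread for sigmaKin in these float builds (and 214 in the mixed-precision build further down). On a V100, with 65536 32-bit registers and at most 2048 resident threads per SM, a 128-register kernel is capped at 512 threads per SM, i.e. about 25% theoretical occupancy. A back-of-envelope sketch (register allocation granularity ignored):

  #include <cstdio>

  int main()
  {
    // Occupancy bound for the 128-register sigmaKin launches above,
    // using V100 per-SM limits; allocation granularity is ignored.
    const int regsPerSM = 65536, maxThreadsPerSM = 2048, regsPerThread = 128;
    const int threadsBound = regsPerSM / regsPerThread; // 512
    std::printf( "<= %d threads/SM, ~%d%% occupancy\n", threadsBound, 100 * threadsBound / maxThreadsPerSM );
    return 0;
  }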
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/runTest.exe @@ -158,14 +158,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.215676e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.011410e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.011410e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.060982e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.844113e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.844113e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414230e-03 ) GeV^0 -TOTAL : 1.537749 sec - 4,488,857,938 cycles # 2.908 GHz - 9,201,595,147 instructions # 2.05 insn per cycle - 1.552559223 seconds time elapsed +TOTAL : 1.570550 sec + 4,488,154,546 cycles # 2.847 GHz + 9,201,957,327 instructions # 2.05 insn per cycle + 1.584001028 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3384) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/runTest.exe @@ -185,14 +185,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.585161e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.226147e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.226147e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.363474e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.970097e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.970097e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.676187 sec - 3,452,216,154 cycles # 2.052 GHz - 6,874,474,606 instructions # 1.99 insn per cycle - 1.693691111 seconds time elapsed +TOTAL : 1.734074 sec + 3,450,778,166 cycles # 1.984 GHz + 6,874,943,117 instructions # 1.99 insn per cycle + 1.747504108 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2257) (512y: 8) (512z: 2261) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd0.txt index f4ee00fe61..d13cada649 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd0.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-01-25_23:44:02 +DATE: 2024-01-27_19:12:19 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.367806e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.645974e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.965500e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.400890e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.630288e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.950964e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086718e+00 +- 3.413389e-03 ) GeV^0 -TOTAL : 0.472161 sec - 2,109,738,277 cycles # 3.018 GHz - 2,952,634,070 instructions # 1.40 insn per cycle - 0.756674168 seconds time elapsed +TOTAL : 0.479613 sec + 2,055,675,343 cycles # 2.919 GHz + 2,963,853,527 instructions # 1.44 insn per cycle + 0.762433330 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/gcheck.exe -p 2048 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 128 @@ -77,14 +77,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.689089e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.789055e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.789055e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.584030e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.679206e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.679206e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 -TOTAL : 3.983379 sec - 12,403,709,488 cycles # 3.110 GHz - 34,216,718,460 instructions # 2.76 insn per cycle - 3.989754240 seconds time elapsed +TOTAL : 4.144569 sec + 12,413,620,811 cycles # 2.992 GHz + 34,216,317,298 instructions # 2.76 insn per cycle + 4.151106330 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 768) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/runTest.exe @@ -104,14 +104,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.382450e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.036660e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.036660e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.211764e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.840756e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.840756e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 -TOTAL : 1.727265 sec - 5,355,354,456 cycles # 3.092 GHz - 14,586,803,552 instructions # 2.72 insn per cycle - 1.733316914 seconds time elapsed +TOTAL : 1.773327 sec + 5,361,205,653 cycles # 3.014 GHz + 14,586,960,101 instructions # 2.72 insn per cycle + 1.779952229 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2947) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/runTest.exe @@ -131,14 +131,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.057917e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.061528e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.061528e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.860060e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.831807e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.831807e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414230e-03 ) GeV^0 -TOTAL : 1.385709 sec - 4,060,564,696 cycles # 2.919 GHz - 9,087,730,757 instructions # 2.24 insn per cycle - 1.391973829 seconds time elapsed +TOTAL : 1.420044 sec + 4,064,581,884 cycles # 2.853 GHz + 9,088,361,630 instructions # 2.24 insn per cycle + 1.426753997 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4501) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/runTest.exe @@ -158,14 +158,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.660623e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.845013e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.845013e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.459229e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.605357e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.605357e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414230e-03 ) GeV^0 -TOTAL : 1.294701 sec - 3,804,324,661 cycles # 2.927 GHz - 8,440,322,737 instructions # 2.22 insn per cycle - 1.301108243 seconds time elapsed +TOTAL : 1.325444 sec + 3,806,668,537 cycles # 2.860 GHz + 8,440,473,835 instructions # 2.22 insn per cycle + 1.332061166 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4043) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd0/runTest.exe @@ -185,14 +185,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.034946e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.574893e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.574893e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.862576e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.375949e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.375949e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.822240 sec - 3,731,963,828 cycles # 2.043 GHz - 7,571,756,363 instructions # 2.03 insn per cycle - 1.828579723 seconds time elapsed +TOTAL : 1.874251 sec + 3,732,915,412 cycles # 1.986 GHz + 7,571,540,074 instructions # 2.03 insn per cycle + 1.881121521 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3646) (512y: 1) (512z: 2853) 
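The recurring "CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions" warning means the harness arms IEEE floating-point traps when that variable is set, so the NaN/abnormal conditions tracked in the "FP precision" lines fail loudly rather than silently. On glibc this is typically done with feenableexcept; a minimal sketch, assuming that mechanism (the actual implementation is not shown in these logs):

  #include <fenv.h> // feenableexcept is a GNU extension (g++ defines _GNU_SOURCE on Linux)
  #include <cstdlib>

  // A minimal sketch, assuming the mechanism behind the warning is glibc's
  // feenableexcept; an assumption, not code quoted from the plugin.
  void enableFPEifRequested()
  {
    if( std::getenv( "CUDACPP_RUNTIME_ENABLEFPE" ) != nullptr )
      feenableexcept( FE_INVALID | FE_DIVBYZERO | FE_OVERFLOW ); // trap instead of propagating NaNs
  }

  int main()
  {
    enableFPEifRequested();
    return 0;
  }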
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd1.txt index 4e1115173b..f03dfd549c 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd1.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-01-25_23:44:27 +DATE: 2024-01-27_19:12:43 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.456016e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.710724e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.042438e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.450874e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.693366e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.019706e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086718e+00 +- 3.413389e-03 ) GeV^0 -TOTAL : 0.480867 sec - 2,014,288,944 cycles # 2.853 GHz - 2,823,952,881 instructions # 1.40 insn per cycle - 0.764919690 seconds time elapsed +TOTAL : 0.479144 sec + 2,065,328,354 cycles # 2.934 GHz + 2,937,355,449 instructions # 1.42 insn per cycle + 0.762525562 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/gcheck.exe -p 2048 256 1 WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 127 @@ -77,14 +77,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.800465e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.909661e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.909661e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.656538e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.759877e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.759877e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 -TOTAL : 3.827874 sec - 11,937,612,158 cycles # 3.115 GHz - 35,406,088,073 instructions # 2.97 insn per cycle - 3.834430542 seconds time elapsed +TOTAL : 4.035436 sec + 11,950,269,072 cycles # 2.958 GHz + 35,406,289,389 instructions # 2.96 insn per cycle + 4.042104786 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 469) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/runTest.exe @@ -104,14 +104,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.762722e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.495247e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.495247e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.593818e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.308688e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.308688e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 -TOTAL : 1.633478 sec - 5,067,221,535 cycles # 3.092 GHz - 14,044,637,271 instructions # 2.77 insn per cycle - 1.639618429 seconds time elapsed +TOTAL : 1.677045 sec + 5,074,113,057 cycles # 3.016 GHz + 14,044,617,920 instructions # 2.77 insn per cycle + 1.683531971 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2487) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/runTest.exe @@ -131,14 +131,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.247761e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.298779e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.298779e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.984398e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.987290e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.987290e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414230e-03 ) GeV^0 -TOTAL : 1.353889 sec - 3,995,635,648 cycles # 2.940 GHz - 8,628,877,597 instructions # 2.16 insn per cycle - 1.360027429 seconds time elapsed +TOTAL : 1.399748 sec + 4,006,669,403 cycles # 2.852 GHz + 8,630,101,532 instructions # 2.15 insn per cycle + 1.406553471 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3422) (512y: 0) (512z: 0) ------------------------------------------------------------------------- 
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/runTest.exe @@ -158,14 +158,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.956843e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.022194e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.022194e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.705281e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.911157e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.911157e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414230e-03 ) GeV^0 -TOTAL : 1.253159 sec - 3,697,308,146 cycles # 2.938 GHz - 8,101,240,276 instructions # 2.19 insn per cycle - 1.259582611 seconds time elapsed +TOTAL : 1.292210 sec + 3,693,940,962 cycles # 2.847 GHz + 8,100,406,211 instructions # 2.19 insn per cycle + 1.298750765 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3105) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd1/runTest.exe @@ -185,14 +185,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.976795e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.519153e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.519153e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.984579e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.516367e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.516367e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.839809 sec - 3,582,305,411 cycles # 1.942 GHz - 7,373,472,849 instructions # 2.06 insn per cycle - 1.845929573 seconds time elapsed +TOTAL : 1.839178 sec + 3,581,181,804 cycles # 1.942 GHz + 7,373,471,148 instructions # 2.06 insn per cycle + 1.845549534 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2803) (512y: 1) (512z: 2230) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd1/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt index 62e73e56f9..f40e579459 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-01-25_23:05:35 +DATE: 2024-01-27_18:32:59 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.042238e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.143081e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.277579e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.534144e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.154422e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.270192e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.529983 sec - 2,284,372,825 cycles # 2.981 GHz - 3,253,251,346 instructions # 1.42 insn per cycle - 0.837040452 seconds time elapsed +TOTAL : 0.530968 sec + 2,216,247,658 cycles # 2.886 GHz + 3,159,617,774 instructions # 1.43 insn per cycle + 0.841223938 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/gcheck.exe -p 2048 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 @@ -77,14 +77,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.177107e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.241482e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.241482e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.131579e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.194174e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.194174e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 4.922213 sec - 15,202,220,905 cycles # 3.086 GHz - 39,294,318,145 instructions # 2.58 insn per cycle - 4.931576421 seconds time elapsed +TOTAL : 5.026343 sec + 15,226,407,193 cycles # 3.026 GHz + 39,292,878,837 instructions # 2.58 insn per cycle + 5.034989309 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 740) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/runTest.exe @@ -104,14 +104,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.753886e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.970015e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.970015e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.716282e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.924245e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.924245e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.899375 sec - 8,841,932,542 cycles # 3.043 GHz - 24,092,972,952 instructions # 2.72 insn per cycle - 2.913268497 seconds time elapsed +TOTAL : 2.928277 sec + 8,842,003,385 cycles # 3.013 GHz + 24,093,000,203 instructions # 2.72 insn per cycle + 2.944933034 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2102) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/runTest.exe @@ -131,14 +131,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.715445e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.218183e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.218183e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.756859e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.271313e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.271313e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.941407 sec - 5,504,561,337 cycles # 2.827 GHz - 11,449,312,759 instructions # 2.08 insn per cycle - 1.955081745 seconds time elapsed +TOTAL : 1.927658 sec + 5,479,999,914 cycles # 2.834 GHz + 11,449,005,560 instructions # 2.09 insn per cycle + 1.943367878 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2467) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/runTest.exe @@ -158,14 +158,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.882817e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.597475e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.597475e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.680868e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.379667e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.379667e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.628926 sec - 4,780,392,865 cycles # 2.924 GHz - 10,317,148,509 instructions # 2.16 insn per cycle - 1.646633592 seconds time elapsed +TOTAL : 1.676141 sec + 4,795,916,206 cycles # 2.851 GHz + 10,317,620,181 instructions # 2.15 insn per cycle + 1.692913460 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2076) (512y: 133) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/runTest.exe @@ -185,14 +185,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.601015e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.909270e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.909270e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.384618e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.665609e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.665609e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.383696 sec - 4,853,543,919 cycles # 2.031 GHz - 7,366,655,182 instructions # 1.52 insn per cycle - 2.397229409 seconds time elapsed +TOTAL : 2.498959 sec + 4,846,181,062 cycles # 1.935 GHz + 7,366,355,573 instructions # 1.52 insn per cycle + 2.513668299 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1366) (512y: 69) (512z: 1611) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd1.txt index dbc018b658..d579c4f0fa 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd1.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-01-25_23:06:02 +DATE: 2024-01-27_18:33:26 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.027996e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.136763e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.273770e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.549566e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.157441e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.275364e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.529349 sec - 2,275,344,800 cycles # 2.980 GHz - 3,194,805,823 instructions # 1.40 insn per cycle - 0.834615249 seconds time elapsed +TOTAL : 0.529385 sec + 2,259,213,903 cycles # 2.923 GHz + 3,197,943,416 instructions # 1.42 insn per cycle + 0.839811434 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/gcheck.exe -p 2048 256 1 WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 208 @@ -77,14 +77,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.197645e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.262543e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.262543e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.148050e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.211542e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.211542e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 4.876786 sec - 15,083,175,183 cycles # 3.091 GHz - 40,115,300,077 instructions # 2.66 insn per cycle - 4.885756664 seconds time elapsed +TOTAL : 4.988633 sec + 15,083,182,010 cycles # 3.021 GHz + 40,116,301,091 instructions # 2.66 insn per cycle + 4.997454221 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 630) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/runTest.exe @@ -104,14 +104,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.836140e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.061195e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.061195e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.806171e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.025471e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.025471e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.843009 sec - 8,673,476,327 cycles # 3.047 GHz - 23,533,368,418 instructions # 2.71 insn per cycle - 2.936629019 seconds time elapsed +TOTAL : 2.859908 sec + 8,680,023,988 cycles # 3.031 GHz + 23,533,588,214 instructions # 2.71 insn per cycle + 2.872184850 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1993) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/runTest.exe @@ -131,14 +131,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.262735e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.669813e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.669813e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.076921e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.467987e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.467987e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.096332 sec - 6,163,295,525 cycles # 2.932 GHz - 13,102,492,963 instructions # 2.13 insn per cycle - 2.114571383 seconds time elapsed +TOTAL : 2.172334 sec + 6,181,256,235 cycles # 2.838 GHz + 13,103,124,082 instructions # 2.12 insn per cycle + 2.189317684 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2711) (512y: 0) (512z: 0) ------------------------------------------------------------------------- 
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/runTest.exe @@ -158,14 +158,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.546010e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.002876e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.002876e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.497425e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.959043e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.959043e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.996809 sec - 5,749,993,886 cycles # 2.871 GHz - 12,209,923,535 instructions # 2.12 insn per cycle - 2.011417030 seconds time elapsed +TOTAL : 2.013879 sec + 5,750,603,441 cycles # 2.847 GHz + 12,210,900,339 instructions # 2.12 insn per cycle + 2.027278919 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2201) (512y: 282) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/runTest.exe @@ -185,14 +185,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.204857e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.459679e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.459679e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.050220e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.288891e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.288891e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.599675 sec - 5,257,659,489 cycles # 2.018 GHz - 8,448,298,827 instructions # 1.61 insn per cycle - 2.614773289 seconds time elapsed +TOTAL : 2.695806 sec + 5,252,673,350 cycles # 1.944 GHz + 8,448,932,017 instructions # 1.61 insn per cycle + 2.713329625 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1324) (512y: 84) (512z: 1919) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt index fa8caa938e..7a5cc5c1da 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2024-01-25_23:06:29 +DATE: 2024-01-27_18:33:54 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.554064e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.054520e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.071264e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.710988e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.043633e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.058136e+07 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 0.465030 sec - 2,048,991,704 cycles # 2.999 GHz - 2,945,132,376 instructions # 1.44 insn per cycle - 0.753228173 seconds time elapsed +TOTAL : 0.465052 sec + 2,015,443,684 cycles # 2.917 GHz + 2,859,484,489 instructions # 1.42 insn per cycle + 0.769131816 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -68,14 +68,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.042508e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.318801e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.335462e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.082561e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.322561e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.336982e+07 ) sec^-1 MeanMatrixElemValue = ( 6.734461e+02 +- 4.775415e+02 ) GeV^-2 -TOTAL : 0.608387 sec - 2,548,936,715 cycles # 3.003 GHz - 3,858,101,121 instructions # 1.51 insn per cycle - 0.909984749 seconds time elapsed +TOTAL : 0.605622 sec + 2,468,411,647 cycles # 2.919 GHz + 3,739,161,994 instructions # 1.51 insn per cycle + 0.904488043 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 @@ -91,14 +91,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.617017e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.629887e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.629887e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.537137e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.550087e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.550087e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 6.283295 sec - 19,499,856,239 cycles # 3.102 GHz - 57,919,782,342 instructions # 2.97 insn per cycle - 6.290250953 seconds time elapsed +TOTAL : 6.481814 sec + 19,510,102,704 cycles # 3.008 GHz + 57,920,963,917 instructions # 2.97 insn per cycle + 6.498270794 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1134) (avx2: 0) (512y: 0) (512z: 0) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/runTest.exe @@ -118,14 +118,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.909036e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.954610e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.954610e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.882301e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.929920e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.929920e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 3.360323 sec - 10,199,661,444 cycles # 3.037 GHz - 29,947,464,065 instructions # 2.94 insn per cycle - 3.374239300 seconds time elapsed +TOTAL : 3.379513 sec + 10,206,931,044 cycles # 3.016 GHz + 29,943,639,229 instructions # 2.93 insn per cycle + 3.392509806 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 4742) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/runTest.exe @@ -145,14 +145,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.833419e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.001345e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.001345e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.552526e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.739093e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.739093e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.688645 sec - 4,909,126,113 cycles # 2.900 GHz - 11,211,204,678 instructions # 2.28 insn per cycle - 1.701281092 seconds time elapsed +TOTAL : 1.738955 sec + 4,925,861,013 cycles # 2.825 GHz + 11,211,258,499 instructions # 2.28 insn per cycle + 1.750398674 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4396) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/runTest.exe @@ -172,14 +172,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.121742e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.145009e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.145009e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.094289e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.118334e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.118334e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.483031 sec - 4,297,587,774 cycles # 2.889 GHz - 10,187,383,914 instructions # 2.37 insn per cycle - 1.498495737 seconds time elapsed +TOTAL : 1.520671 sec + 4,316,683,210 cycles # 2.832 GHz + 10,188,546,360 instructions # 2.36 insn per cycle + 1.533592671 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 
3895) (512y: 81) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/runTest.exe @@ -199,14 +199,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.229760e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.353025e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.353025e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.872940e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.000632e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.000632e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 2.015215 sec - 3,913,493,676 cycles # 1.938 GHz - 5,708,818,352 instructions # 1.46 insn per cycle - 2.028873157 seconds time elapsed +TOTAL : 2.107587 sec + 3,916,109,587 cycles # 1.854 GHz + 5,709,747,998 instructions # 1.46 insn per cycle + 2.121465703 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1258) (512y: 74) (512z: 3396) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0_bridge.txt index 096dd99876..26e40e50e6 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0_bridge.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2024-01-25_23:54:18 +DATE: 2024-01-27_19:22:46 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -54,14 +54,14 @@ WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.637551e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.758962e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.758962e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.570600e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.729427e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.729427e+06 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 0.489837 sec - 2,121,332,082 cycles # 3.002 GHz - 3,184,466,633 instructions # 1.50 insn per cycle - 0.765192095 seconds time elapsed +TOTAL : 0.497423 sec + 2,072,640,631 cycles # 2.921 GHz + 3,121,041,261 instructions # 1.51 insn per cycle + 0.767886427 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost @@ -80,14 +80,14 @@ WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublo Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.708039e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.519148e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.519148e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.687406e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.440630e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.440630e+06 ) sec^-1 MeanMatrixElemValue = ( 6.734461e+02 +- 4.775415e+02 ) GeV^-2 -TOTAL : 0.826215 sec - 3,226,959,594 cycles # 2.979 GHz - 5,155,103,256 instructions # 1.60 insn per cycle - 1.143617843 seconds time elapsed +TOTAL : 0.831609 sec + 3,177,887,444 cycles # 2.935 GHz + 5,073,989,018 instructions # 1.60 insn per cycle + 1.143844413 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 @@ -104,14 +104,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.596028e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.608949e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.608949e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.536676e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.549688e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.549688e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 6.338921 sec - 19,540,873,038 cycles # 3.081 GHz - 57,925,079,271 instructions # 2.96 insn per cycle - 6.344190271 seconds time elapsed +TOTAL : 6.489507 sec + 19,599,964,283 cycles # 3.018 GHz + 57,925,734,957 instructions # 2.96 insn per cycle + 6.495090644 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1134) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/runTest.exe @@ -132,14 +132,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.992522e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.039048e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.039048e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.771475e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.819214e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.819214e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 3.310489 sec - 10,222,193,867 cycles # 3.084 GHz - 29,992,757,087 instructions # 2.93 insn per cycle - 3.315734573 seconds time elapsed +TOTAL : 3.464872 sec + 10,246,076,982 cycles # 2.953 GHz + 29,991,327,277 instructions # 2.93 insn per cycle + 3.470312959 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 4742) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- 
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/runTest.exe @@ -160,14 +160,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.748259e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.928510e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.928510e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.481732e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.670422e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.670422e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.709995 sec - 4,943,239,952 cycles # 2.885 GHz - 11,259,850,029 instructions # 2.28 insn per cycle - 1.714904879 seconds time elapsed +TOTAL : 1.761036 sec + 4,976,542,493 cycles # 2.821 GHz + 11,262,030,802 instructions # 2.26 insn per cycle + 1.766514699 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4396) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/runTest.exe @@ -188,14 +188,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.066367e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.090143e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.090143e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.087761e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.112018e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.112018e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.568862 sec - 4,349,115,858 cycles # 2.765 GHz - 10,237,621,087 instructions # 2.35 insn per cycle - 1.574339542 seconds time elapsed +TOTAL : 1.537342 sec + 4,353,684,327 cycles # 2.825 GHz + 10,235,838,456 instructions # 2.35 insn per cycle + 1.542582778 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3895) (512y: 81) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/runTest.exe @@ -216,14 +216,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.164835e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.291624e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.291624e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.820472e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.946042e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.946042e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 2.037518 sec - 3,945,267,341 cycles # 1.933 GHz - 5,747,633,925 instructions # 1.46 insn per cycle - 2.042653419 seconds time elapsed +TOTAL : 2.130009 sec + 3,961,323,367 cycles # 1.856 GHz + 5,747,731,832 instructions # 1.45 insn per cycle + 2.135393810 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1258) (512y: 74) (512z: 3396) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd1.txt index 25c18afcf9..82f8f0c137 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd1.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2024-01-25_23:06:58 +DATE: 2024-01-27_18:34:24 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.433352e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.037630e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.053848e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.648982e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.034609e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.048969e+07 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 0.463370 sec - 2,015,424,656 cycles # 2.959 GHz - 2,929,966,501 instructions # 1.45 insn per cycle - 0.751673536 seconds time elapsed +TOTAL : 0.469943 sec + 1,939,557,622 cycles # 2.824 GHz + 2,799,432,511 instructions # 1.44 insn per cycle + 0.759940517 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/gcheck.exe -p 64 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -68,14 +68,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.036681e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.308916e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.325873e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.073086e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.309469e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.323399e+07 ) sec^-1 MeanMatrixElemValue = ( 6.734461e+02 +- 4.775415e+02 ) GeV^-2 -TOTAL : 0.603072 sec - 2,515,534,303 cycles # 2.981 GHz - 3,732,014,009 instructions # 1.48 insn per cycle - 0.904884829 seconds time elapsed +TOTAL : 0.603676 sec + 2,470,216,480 cycles # 2.932 GHz + 3,659,633,330 instructions # 1.48 insn per cycle + 0.901834911 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/fgcheck.exe 2 64 2 @@ -91,14 +91,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.611397e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.624145e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.624145e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.558200e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.571366e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.571366e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 6.297061 sec - 19,501,945,017 cycles # 3.096 GHz - 57,747,982,567 instructions # 2.96 insn per cycle - 6.304284821 seconds time elapsed +TOTAL : 6.427967 sec + 19,465,588,306 cycles # 3.027 GHz + 57,749,577,513 instructions # 2.97 insn per cycle + 6.435332564 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1087) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/runTest.exe @@ -118,14 +118,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.963950e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.009246e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.009246e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.855358e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.902213e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.902213e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 3.323112 sec - 10,252,760,514 cycles # 3.082 GHz - 30,333,563,638 instructions # 2.96 insn per cycle - 3.334682355 seconds time elapsed +TOTAL : 3.397645 sec + 10,269,894,933 cycles # 3.019 GHz + 30,334,003,333 instructions # 2.95 insn per cycle + 3.409134445 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 4806) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd1/runTest.exe @@ -145,14 +145,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.543659e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.713856e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.713856e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.024480e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.196412e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.196412e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.739150 sec - 5,054,703,175 cycles # 2.901 GHz - 11,664,593,433 instructions # 2.31 insn per cycle - 1.752926203 seconds time elapsed +TOTAL : 1.839038 sec + 5,070,636,702 cycles # 2.750 GHz + 11,664,223,400 instructions # 2.30 insn per cycle + 1.852089106 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4489) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/runTest.exe @@ -172,14 +172,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.038420e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.058455e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.058455e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.018318e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.039419e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.039419e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.600620 sec - 4,609,440,056 cycles # 2.871 GHz - 10,805,674,859 instructions # 2.34 insn per cycle - 1.612255696 seconds time elapsed +TOTAL : 1.632266 sec + 4,623,527,919 cycles # 2.824 GHz + 10,805,823,321 instructions # 2.34 insn per cycle + 1.647095967 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3988) (512y: 237) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd1/runTest.exe @@ -199,14 +199,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.858559e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.976491e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.976491e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.787268e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.911321e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.911321e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 2.109622 sec - 3,949,478,547 cycles # 1.868 GHz - 5,998,434,731 instructions # 1.52 insn per cycle - 2.131507197 seconds time elapsed +TOTAL : 2.129389 sec + 3,963,647,711 cycles # 1.858 GHz + 5,999,337,334 instructions # 1.51 insn per cycle + 2.148549810 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1241) (512y: 81) (512z: 3500) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd1/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt index 0b74a76420..dbb3bf021d 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2024-01-25_23:07:28 +DATE: 2024-01-27_18:34:54 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.397686e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.380368e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.494164e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.479786e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.361442e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.464103e+07 ) sec^-1 MeanMatrixElemValue = ( 1.008472e+02 +- 5.002447e+01 ) GeV^-2 -TOTAL : 0.443485 sec - 1,988,977,095 cycles # 2.997 GHz - 2,794,054,594 instructions # 1.40 insn per cycle - 0.738229663 seconds time elapsed +TOTAL : 0.448422 sec + 1,932,993,574 cycles # 2.916 GHz + 2,719,457,311 instructions # 1.41 insn per cycle + 0.741279046 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 254 @@ -68,14 +68,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.039325e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.377418e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.475776e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.217603e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.410482e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.500434e+07 ) sec^-1 MeanMatrixElemValue = ( 6.630099e+02 +- 4.770719e+02 ) GeV^-2 -TOTAL : 0.495765 sec - 2,163,616,138 cycles # 2.985 GHz - 3,081,644,651 instructions # 1.42 insn per cycle - 0.782078599 seconds time elapsed +TOTAL : 0.496288 sec + 2,128,992,550 cycles # 2.942 GHz + 3,069,969,108 instructions # 1.44 insn per cycle + 0.782011398 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 @@ -91,14 +91,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.800506e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.815684e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.815684e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.704875e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.719869e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.719869e+04 ) sec^-1 MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2 -TOTAL : 5.871499 sec - 18,162,391,724 cycles # 3.091 GHz - 55,237,317,005 instructions # 3.04 insn per cycle - 5.878377310 seconds time elapsed +TOTAL : 6.082847 sec + 18,191,693,462 cycles # 2.990 GHz + 55,241,232,857 instructions # 3.04 insn per cycle + 6.090107876 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1229) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/runTest.exe @@ -118,14 +118,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.026560e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.187110e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.187110e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.792245e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.953433e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.953433e+04 ) sec^-1 MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2 -TOTAL : 1.835796 sec - 5,681,014,790 cycles # 3.088 GHz - 16,127,858,668 instructions # 2.84 insn per cycle - 1.846414018 seconds time elapsed +TOTAL : 1.884910 sec + 5,695,859,688 cycles # 3.015 GHz + 16,129,136,780 instructions # 2.83 insn per cycle + 1.899358612 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 5205) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/runTest.exe @@ -145,14 +145,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.894717e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.962920e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.962920e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.841113e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.909434e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.909434e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008855e+02 +- 5.002467e+01 ) GeV^-2 -TOTAL : 0.885139 sec - 2,584,367,657 cycles # 2.906 GHz - 6,085,782,007 instructions # 2.35 insn per cycle - 0.899021976 seconds time elapsed +TOTAL : 0.911679 sec + 2,595,428,617 cycles # 2.832 GHz + 6,086,094,913 instructions # 2.34 insn per cycle + 0.926601135 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4878) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/runTest.exe @@ -172,14 +172,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.135210e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.222813e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.222813e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.076614e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.164725e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.164725e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008855e+02 +- 5.002467e+01 ) GeV^-2 -TOTAL : 0.788022 sec - 2,287,899,951 cycles # 2.888 GHz - 5,552,661,648 instructions # 2.43 insn per cycle - 0.799868307 seconds time elapsed +TOTAL : 0.810804 sec + 2,297,569,115 cycles # 2.817 GHz + 5,552,672,846 instructions # 2.42 insn per cycle + 0.829591110 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4415) (512y: 30) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/runTest.exe @@ -199,14 +199,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.622321e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.674923e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.674923e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.552666e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.602558e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.602558e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008856e+02 +- 5.002468e+01 ) GeV^-2 -TOTAL : 1.032955 sec - 2,017,528,772 cycles # 1.944 GHz - 3,285,966,058 instructions # 1.63 insn per cycle - 1.048446410 seconds time elapsed +TOTAL : 1.079184 sec + 2,022,265,688 cycles # 1.866 GHz + 3,286,565,352 instructions # 1.63 insn per cycle + 1.092911477 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1905) (512y: 28) (512z: 3597) 
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/runTest.exe
diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0_bridge.txt
index dc0596d15b..9e8745f87b 100644
--- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0_bridge.txt
+++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0_bridge.txt
@@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'

-DATE: 2024-01-25_23:54:47
+DATE: 2024-01-27_19:23:16

 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -54,14 +54,14 @@ WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks
 Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 5.062167e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.079545e+07 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.079545e+07 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 4.995078e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.153881e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.153881e+07 ) sec^-1
 MeanMatrixElemValue = ( 1.009071e+02 +- 5.002295e+01 ) GeV^-2
-TOTAL : 0.452726 sec
- 1,977,358,341 cycles # 2.985 GHz
- 2,923,970,350 instructions # 1.48 insn per cycle
- 0.720026423 seconds time elapsed
+TOTAL : 0.457490 sec
+ 1,962,472,775 cycles # 2.931 GHz
+ 2,879,319,483 instructions # 1.47 insn per cycle
+ 0.727241404 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost
@@ -80,14 +80,14 @@ WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublo
 Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 4.811088e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.588064e+07 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.588064e+07 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 4.577440e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.549481e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.549481e+07 ) sec^-1
 MeanMatrixElemValue = ( 6.737500e+02 +- 4.776370e+02 ) GeV^-2
-TOTAL : 0.632525 sec
- 2,597,542,173 cycles # 3.015 GHz
- 3,997,383,655 instructions # 1.54 insn per cycle
- 0.921189224 seconds time elapsed
+TOTAL : 0.647454 sec
+ 2,501,078,986 cycles # 2.832 GHz
+ 3,834,912,584 instructions # 1.53 insn per cycle
+ 0.940994508 seconds time elapsed
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2
@@ -104,14 +104,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.787761e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.803517e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.803517e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.735414e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.750975e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.750975e+04 ) sec^-1
 MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2
-TOTAL : 5.900987 sec
- 18,196,746,451 cycles # 3.082 GHz
- 55,241,449,113 instructions # 3.04 insn per cycle
- 5.905790765 seconds time elapsed
+TOTAL : 6.016121 sec
+ 18,199,864,686 cycles # 3.023 GHz
+ 55,241,753,508 instructions # 3.04 insn per cycle
+ 6.021157634 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 1229) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/runTest.exe
@@ -132,14 +132,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 8.748340e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.909538e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.909538e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 8.783564e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 8.948787e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.948787e+04 ) sec^-1
 MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2
-TOTAL : 1.898315 sec
- 5,708,476,501 cycles # 3.002 GHz
- 16,175,955,794 instructions # 2.83 insn per cycle
- 1.903472906 seconds time elapsed
+TOTAL : 1.891930 sec
+ 5,721,918,716 cycles # 3.018 GHz
+ 16,176,155,673 instructions # 2.83 insn per cycle
+ 1.897169792 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 5205) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/runTest.exe
@@ -160,14 +160,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.884738e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.952981e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.952981e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.824815e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.894169e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.894169e+05 ) sec^-1
 MeanMatrixElemValue = ( 1.008855e+02 +- 5.002467e+01 ) GeV^-2
-TOTAL : 0.893745 sec
- 2,602,515,487 cycles # 2.899 GHz
- 6,121,386,159 instructions # 2.35 insn per cycle
- 0.898505248 seconds time elapsed
+TOTAL : 0.924503 sec
+ 2,621,240,795 cycles # 2.822 GHz
+ 6,121,586,144 instructions # 2.34 insn per cycle
+ 0.929727468 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4878) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/runTest.exe
@@ -188,14 +188,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.141965e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.229819e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.229819e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.077557e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.168993e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.168993e+05 ) sec^-1
 MeanMatrixElemValue = ( 1.008855e+02 +- 5.002467e+01 ) GeV^-2
-TOTAL : 0.789242 sec
- 2,305,479,442 cycles # 2.907 GHz
- 5,588,939,300 instructions # 2.42 insn per cycle
- 0.794159548 seconds time elapsed
+TOTAL : 0.815038 sec
+ 2,326,750,327 cycles # 2.840 GHz
+ 5,589,464,486 instructions # 2.40 insn per cycle
+ 0.820459860 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4415) (512y: 30) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/runTest.exe
@@ -216,14 +216,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.633816e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.685769e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.685769e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.563585e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.614955e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.614955e+05 ) sec^-1
 MeanMatrixElemValue = ( 1.008856e+02 +- 5.002468e+01 ) GeV^-2
-TOTAL : 1.030236 sec
- 2,031,108,987 cycles # 1.966 GHz
- 3,327,076,453 instructions # 1.64 insn per cycle
- 1.034960601 seconds time elapsed
+TOTAL : 1.075997 sec
+ 2,047,720,481 cycles # 1.896 GHz
+ 3,327,332,623 instructions # 1.62 insn per cycle
+ 1.081428956 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1905) (512y: 28) (512z: 3597)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/runTest.exe
diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd1.txt
index 71738afd73..10091edee9 100644
--- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd1.txt
+++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd1.txt
@@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd1'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'

-DATE: 2024-01-25_23:07:52
+DATE: 2024-01-27_18:35:19

 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 2.345447e+07 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.296112e+07 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.402388e+07 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.400668e+07 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.203981e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.294099e+07 ) sec^-1
 MeanMatrixElemValue = ( 1.008472e+02 +- 5.002447e+01 ) GeV^-2
-TOTAL : 0.448536 sec
- 1,925,765,115 cycles # 2.904 GHz
- 2,794,221,799 instructions # 1.45 insn per cycle
- 0.739665941 seconds time elapsed
+TOTAL : 0.449645 sec
+ 1,928,359,102 cycles # 2.901 GHz
+ 2,715,522,194 instructions # 1.41 insn per cycle
+ 0.732501667 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/gcheck.exe -p 64 256 1
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 248
@@ -68,14 +68,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 3.063360e+07 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.429159e+07 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.528806e+07 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.220081e+07 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.406699e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.488271e+07 ) sec^-1
 MeanMatrixElemValue = ( 6.630099e+02 +- 4.770719e+02 ) GeV^-2
-TOTAL : 0.496316 sec
- 2,170,960,632 cycles # 2.997 GHz
- 3,070,673,937 instructions # 1.41 insn per cycle
- 0.782694833 seconds time elapsed
+TOTAL : 0.495964 sec
+ 2,117,808,383 cycles # 2.926 GHz
+ 3,044,007,413 instructions # 1.44 insn per cycle
+ 0.782283779 seconds time elapsed
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/gcheck.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/fgcheck.exe 2 64 2
@@ -91,14 +91,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.784835e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.799936e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.799936e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.698857e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.713684e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.713684e+04 ) sec^-1
 MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2
-TOTAL : 5.904353 sec
- 18,127,930,758 cycles # 3.068 GHz
- 54,990,203,516 instructions # 3.03 insn per cycle
- 5.911564510 seconds time elapsed
+TOTAL : 6.092382 sec
+ 18,143,037,567 cycles # 2.976 GHz
+ 54,990,857,470 instructions # 3.03 insn per cycle
+ 6.099315799 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 1171) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/runTest.exe
@@ -118,14 +118,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 9.265549e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 9.433505e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 9.433505e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 9.027267e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 9.200862e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 9.200862e+04 ) sec^-1
 MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2
-TOTAL : 1.788383 sec
- 5,530,075,500 cycles # 3.085 GHz
- 16,222,672,133 instructions # 2.93 insn per cycle
- 1.800758208 seconds time elapsed
+TOTAL : 1.835907 sec
+ 5,542,210,713 cycles # 3.011 GHz
+ 16,222,683,207 instructions # 2.93 insn per cycle
+ 1.848191835 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 5136) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/runTest.exe
@@ -145,14 +145,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.641224e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.691486e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.691486e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.590968e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.642236e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.642236e+05 ) sec^-1
 MeanMatrixElemValue = ( 1.008855e+02 +- 5.002467e+01 ) GeV^-2
-TOTAL : 1.018368 sec
- 2,972,289,589 cycles # 2.906 GHz
- 6,708,049,819 instructions # 2.26 insn per cycle
- 1.031379068 seconds time elapsed
+TOTAL : 1.051037 sec
+ 2,983,610,010 cycles # 2.826 GHz
+ 6,707,954,290 instructions # 2.25 insn per cycle
+ 1.063017203 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 5430) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/runTest.exe
@@ -172,14 +172,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.801339e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.862952e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.862952e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.752738e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.814984e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.814984e+05 ) sec^-1
 MeanMatrixElemValue = ( 1.008855e+02 +- 5.002467e+01 ) GeV^-2
-TOTAL : 0.930483 sec
- 2,705,842,308 cycles # 2.895 GHz
- 6,222,888,546 instructions # 2.30 insn per cycle
- 0.943738964 seconds time elapsed
+TOTAL : 0.956581 sec
+ 2,712,532,749 cycles # 2.822 GHz
+ 6,222,719,641 instructions # 2.29 insn per cycle
+ 0.972410805 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 5056) (512y: 24) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd1/runTest.exe
@@ -199,14 +199,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.521231e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.565310e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.565310e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.465208e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.508714e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.508714e+05 ) sec^-1
 MeanMatrixElemValue = ( 1.008856e+02 +- 5.002468e+01 ) GeV^-2
-TOTAL : 1.099011 sec
- 2,151,243,651 cycles # 1.950 GHz
- 3,642,085,891 instructions # 1.69 insn per cycle
- 1.111518081 seconds time elapsed
+TOTAL : 1.141622 sec
+ 2,158,724,317 cycles # 1.883 GHz
+ 3,642,405,179 instructions # 1.69 insn per cycle
+ 1.154915737 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2070) (512y: 21) (512z: 3922)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd1/runTest.exe
diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt
index 763ed418db..af01fc4dc2 100644
--- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt
+++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt
@@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'

-DATE: 2024-01-25_23:08:17
+DATE: 2024-01-27_18:35:44

 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 8.452662e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.040665e+07 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.056694e+07 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 8.670889e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.038424e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.052798e+07 ) sec^-1
 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2
-TOTAL : 0.465162 sec
- 2,037,507,117 cycles # 2.995 GHz
- 2,884,960,920 instructions # 1.42 insn per cycle
- 0.752471370 seconds time elapsed
+TOTAL : 0.464949 sec
+ 2,022,736,340 cycles # 2.936 GHz
+ 2,867,853,283 instructions # 1.42 insn per cycle
+ 0.765717373 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/gcheck.exe -p 64 256 1
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
@@ -68,14 +68,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 1.039801e+07 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.309901e+07 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.326601e+07 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.075008e+07 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.311358e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.325319e+07 ) sec^-1
 MeanMatrixElemValue = ( 6.734461e+02 +- 4.775415e+02 ) GeV^-2
-TOTAL : 0.609431 sec
- 2,555,228,465 cycles # 3.005 GHz
- 3,858,150,049 instructions # 1.51 insn per cycle
- 0.910318180 seconds time elapsed
+TOTAL : 0.613502 sec
+ 2,484,545,734 cycles # 2.920 GHz
+ 3,754,293,361 instructions # 1.51 insn per cycle
+ 0.912823393 seconds time elapsed
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/gcheck.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/fgcheck.exe 2 64 2
@@ -91,14 +91,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.552046e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.564457e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.564457e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.455745e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.468100e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.468100e+04 ) sec^-1
 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2
-TOTAL : 6.443119 sec
- 19,943,249,308 cycles # 3.094 GHz
- 59,159,929,401 instructions # 2.97 insn per cycle
- 6.450554622 seconds time elapsed
+TOTAL : 6.696763 sec
+ 19,991,114,547 cycles # 2.984 GHz
+ 59,158,816,657 instructions # 2.96 insn per cycle
+ 6.703698004 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 1149) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/runTest.exe
@@ -118,14 +118,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 4.970824e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.018053e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 5.018053e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 4.903362e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.951737e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.951737e+04 ) sec^-1
 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2
-TOTAL : 3.318511 sec
- 10,090,639,333 cycles # 3.042 GHz
- 29,766,389,850 instructions # 2.95 insn per cycle
- 3.349269539 seconds time elapsed
+TOTAL : 3.365217 sec
+ 10,110,454,300 cycles # 3.000 GHz
+ 29,763,982,937 instructions # 2.94 insn per cycle
+ 3.380160698 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 4873) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/runTest.exe
@@ -145,14 +145,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 9.888460e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.006978e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.006978e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 9.586442e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 9.775935e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 9.775935e+04 ) sec^-1
 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2
-TOTAL : 1.679372 sec
- 4,872,134,956 cycles # 2.894 GHz
- 11,200,375,088 instructions # 2.30 insn per cycle
- 1.692620147 seconds time elapsed
+TOTAL : 1.732951 sec
+ 4,886,607,168 cycles # 2.812 GHz
+ 11,200,637,765 instructions # 2.29 insn per cycle
+ 1.744637675 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4581) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/runTest.exe
@@ -172,14 +172,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.144061e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.168103e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.168103e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.111734e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.136544e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.136544e+05 ) sec^-1
 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2
-TOTAL : 1.455389 sec
- 4,227,854,530 cycles # 2.898 GHz
- 10,145,979,716 instructions # 2.40 insn per cycle
- 1.472361150 seconds time elapsed
+TOTAL : 1.500096 sec
+ 4,242,297,740 cycles # 2.822 GHz
+ 10,146,067,744 instructions # 2.39 insn per cycle
+ 1.512418724 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4064) (512y: 73) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd0/runTest.exe
@@ -199,14 +199,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 8.055174e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.172416e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.172416e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 7.683269e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.802496e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.802496e+04 ) sec^-1
 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2
-TOTAL : 2.058389 sec
- 3,995,721,738 cycles # 1.937 GHz
- 5,838,323,837 instructions # 1.46 insn per cycle
- 2.074873076 seconds time elapsed
+TOTAL : 2.157833 sec
+ 4,010,657,164 cycles # 1.855 GHz
+ 5,838,670,469 instructions # 1.46 insn per cycle
+ 2.171817278 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1778) (512y: 97) (512z: 3502)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd0/runTest.exe
diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd1.txt
index d3b29c0fa7..0a96473f8d 100644
--- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd1.txt
+++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd1.txt
@@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_m_inl0_hrd1'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'

-DATE: 2024-01-25_23:08:46
+DATE: 2024-01-27_18:36:14

 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 8.426102e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.039267e+07 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.056306e+07 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 8.706357e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.039489e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.053162e+07 ) sec^-1
 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2
-TOTAL : 0.462722 sec
- 2,075,753,099 cycles # 3.015 GHz
- 2,960,136,278 instructions # 1.43 insn per cycle
- 0.765500799 seconds time elapsed
+TOTAL : 0.466869 sec
+ 2,024,358,514 cycles # 2.916 GHz
+ 2,905,232,876 instructions # 1.44 insn per cycle
+ 0.763744307 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/gcheck.exe -p 64 256 1
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
@@ -68,14 +68,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 1.031468e+07 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.301962e+07 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.318830e+07 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.070573e+07 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.306663e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.320777e+07 ) sec^-1
 MeanMatrixElemValue = ( 6.734461e+02 +- 4.775415e+02 ) GeV^-2
-TOTAL : 0.601863 sec
- 2,564,202,637 cycles # 3.038 GHz
- 3,829,571,666 instructions # 1.49 insn per cycle
- 0.903658002 seconds time elapsed
+TOTAL : 0.604637 sec
+ 2,476,191,893 cycles # 2.932 GHz
+ 3,698,353,250 instructions # 1.49 insn per cycle
+ 0.904013775 seconds time elapsed
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/gcheck.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/fgcheck.exe 2 64 2
@@ -91,14 +91,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.538356e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.550809e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.550809e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.506662e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.519241e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.519241e+04 ) sec^-1
 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2
-TOTAL : 6.478032 sec
- 19,734,523,883 cycles # 3.045 GHz
- 58,710,623,892 instructions # 2.98 insn per cycle
- 6.485434165 seconds time elapsed
+TOTAL : 6.560428 sec
+ 19,794,820,917 cycles # 3.016 GHz
+ 58,706,436,643 instructions # 2.97 insn per cycle
+ 6.568655504 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 1026) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/runTest.exe
@@ -118,14 +118,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 4.902025e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.948991e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.948991e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 4.814384e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.862259e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.862259e+04 ) sec^-1
 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2
-TOTAL : 3.364695 sec
- 10,117,380,544 cycles # 3.006 GHz
- 30,160,733,453 instructions # 2.98 insn per cycle
- 3.381394083 seconds time elapsed
+TOTAL : 3.426889 sec
+ 10,132,275,681 cycles # 2.953 GHz
+ 30,161,186,267 instructions # 2.98 insn per cycle
+ 3.442826711 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 4944) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd1/runTest.exe
@@ -145,14 +145,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 9.506132e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 9.674497e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 9.674497e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 8.985614e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 9.154939e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 9.154939e+04 ) sec^-1
 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2
-TOTAL : 1.745813 sec
- 5,023,045,767 cycles # 2.870 GHz
- 11,663,258,678 instructions # 2.32 insn per cycle
- 1.757870215 seconds time elapsed
+TOTAL : 1.847199 sec
+ 5,039,612,145 cycles # 2.722 GHz
+ 11,663,713,946 instructions # 2.31 insn per cycle
+ 1.860008244 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4685) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd1/runTest.exe
@@ -172,14 +172,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.061888e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.082266e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.082266e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.036649e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.058621e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.058621e+05 ) sec^-1
 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2
-TOTAL : 1.564942 sec
- 4,539,640,714 cycles # 2.894 GHz
- 10,787,225,686 instructions # 2.38 insn per cycle
- 1.577778818 seconds time elapsed
+TOTAL : 1.604013 sec
+ 4,559,317,270 cycles # 2.834 GHz
+ 10,788,553,583 instructions # 2.37 insn per cycle
+ 1.619683290 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4159) (512y: 233) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd1/runTest.exe
@@ -199,14 +199,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 7.937035e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.053909e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.053909e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 7.650902e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.770775e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.770775e+04 ) sec^-1
 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2
-TOTAL : 2.088743 sec
- 4,049,955,561 cycles # 1.935 GHz
- 6,072,727,098 instructions # 1.50 insn per cycle
- 2.103091451 seconds time elapsed
+TOTAL : 2.167007 sec
+ 4,062,674,429 cycles # 1.871 GHz
+ 6,072,986,165 instructions # 1.49 insn per cycle
+ 2.180347983 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1725) (512y: 104) (512z: 3609)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd1/runTest.exe
diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt
index 1b975999f0..8748ec80e8 100644
--- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt
+++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt
@@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'

-DATE: 2024-01-25_23:09:15
+DATE: 2024-01-27_18:36:44

 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 3.469665e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.503353e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.505822e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.464977e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.493570e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.496338e+05 ) sec^-1
 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 0.525454 sec
- 2,304,049,489 cycles # 3.008 GHz
- 3,585,827,144 instructions # 1.56 insn per cycle
- 0.838808541 seconds time elapsed
+TOTAL : 0.528543 sec
+ 2,270,184,097 cycles # 2.943 GHz
+ 3,425,872,249 instructions # 1.51 insn per cycle
+ 0.844253845 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
@@ -68,14 +68,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 4.130028e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.171392e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.173120e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 4.124349e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.158647e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.160156e+05 ) sec^-1
 MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4
-TOTAL : 3.036213 sec
- 10,025,778,991 cycles # 3.041 GHz
- 20,025,933,583 instructions # 2.00 insn per cycle
- 3.353883863 seconds time elapsed
+TOTAL : 3.035919 sec
+ 9,807,716,592 cycles # 2.977 GHz
+ 21,076,351,164 instructions # 2.15 insn per cycle
+ 3.350689954 seconds time elapsed
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2
@@ -91,14 +91,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.903450e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.904314e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.904314e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.874609e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.875505e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.875505e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 8.626486 sec
- 26,438,113,457 cycles # 3.066 GHz
- 81,756,024,237 instructions # 3.09 insn per cycle
- 8.633880208 seconds time elapsed
+TOTAL : 8.759617 sec
+ 26,426,534,581 cycles # 3.016 GHz
+ 81,752,523,728 instructions # 3.09 insn per cycle
+ 8.766841308 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 6614) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest.exe
@@ -118,14 +118,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 3.782602e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.786067e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.786067e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.754471e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.757856e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.757856e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 4.345078 sec
- 12,949,115,165 cycles # 2.978 GHz
- 39,243,323,450 instructions # 3.03 insn per cycle
- 4.361137024 seconds time elapsed
+TOTAL : 4.380696 sec
+ 12,879,948,356 cycles # 2.938 GHz
+ 39,242,098,075 instructions # 3.05 insn per cycle
+ 4.396802628 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4:12814) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest.exe
@@ -145,14 +145,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 8.583787e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.601455e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.601455e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 8.416832e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 8.434424e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.434424e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 1.920616 sec
- 5,564,908,521 cycles # 2.891 GHz
- 13,789,734,974 instructions # 2.48 insn per cycle
- 1.936475869 seconds time elapsed
+TOTAL : 1.958579 sec
+ 5,557,464,463 cycles # 2.831 GHz
+ 13,789,270,718 instructions # 2.48 insn per cycle
+ 1.973620354 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2:11059) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest.exe
@@ -172,14 +172,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 9.734883e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 9.757336e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 9.757336e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 9.535315e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 9.557539e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 9.557539e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 1.694453 sec
- 4,890,498,518 cycles # 2.879 GHz
- 12,318,491,698 instructions # 2.52 insn per cycle
- 1.710599834 seconds time elapsed
+TOTAL : 1.730276 sec
+ 4,897,094,506 cycles # 2.823 GHz
+ 12,318,633,778 instructions # 2.52 insn per cycle
+ 1.748142491 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 9762) (512y: 94) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest.exe
@@ -199,14 +199,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 7.797734e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.812832e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.812832e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 7.350731e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.364351e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.364351e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 2.113063 sec
- 4,049,846,106 cycles # 1.913 GHz
- 6,286,795,753 instructions # 1.55 insn per cycle
- 2.127195558 seconds time elapsed
+TOTAL : 2.242637 sec
+ 4,057,109,602 cycles # 1.806 GHz
+ 6,286,690,411 instructions # 1.55 insn per cycle
+ 2.253700963 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1516) (512y: 94) (512z: 9019)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest.exe
diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_bridge.txt
index 27f7e20ca2..fd89ab8868 100644
--- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_bridge.txt
+++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_bridge.txt
@@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'

-DATE: 2024-01-25_23:55:47
+DATE: 2024-01-27_19:24:17

 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -54,14 +54,14 @@ WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks
 Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 3.145488e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.497122e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.497122e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.109503e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.454645e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.454645e+05 ) sec^-1
 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 0.509145 sec
- 2,222,116,978 cycles # 3.014 GHz
- 3,541,651,897 instructions # 1.59 insn per cycle
- 0.797289210 seconds time elapsed
+TOTAL : 0.516589 sec
+ 2,186,982,677 cycles # 2.925 GHz
+ 3,445,591,612 instructions # 1.58 insn per cycle
+ 0.807789349 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost
@@ -80,14 +80,14 @@ WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublo
 Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 3.636792e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.102264e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.102264e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.604275e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.097164e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.097164e+05 ) sec^-1
 MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4
-TOTAL : 3.311683 sec
- 10,993,373,303 cycles # 3.071 GHz
- 22,632,188,932 instructions # 2.06 insn per cycle
- 3.639416589 seconds time elapsed
+TOTAL : 3.331099 sec
+ 10,692,447,995 cycles # 2.965 GHz
+ 23,347,460,347 instructions # 2.18 insn per cycle
+ 3.661893165 seconds time elapsed
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2
@@ -104,14 +104,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.907265e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.908117e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.908117e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.878015e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.878871e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.878871e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 8.612943 sec
- 26,477,322,279 cycles # 3.073 GHz
- 81,758,457,566 instructions # 3.09 insn per cycle
- 8.618373177 seconds time elapsed
+TOTAL : 8.747004 sec
+ 26,454,331,369 cycles # 3.023 GHz
+ 81,758,459,626 instructions # 3.09 insn per cycle
+ 8.752513277 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 6614) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest.exe
@@ -132,14 +132,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 3.736047e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.739396e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.739396e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.775861e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.779543e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.779543e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 4.404587 sec
- 12,949,218,628 cycles # 2.939 GHz
- 39,253,017,453 instructions # 3.03 insn per cycle
- 4.409630473 seconds time elapsed
+TOTAL : 4.357884 sec
+ 12,894,661,106 cycles # 2.956 GHz
+ 39,253,566,522 instructions # 3.04 insn per cycle
+ 4.363419681 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4:12814) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest.exe
@@ -160,14 +160,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 8.544040e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.561904e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.561904e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 8.412414e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 8.430210e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.430210e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 1.933255 sec
- 5,589,610,637 cycles # 2.885 GHz
- 13,799,041,950 instructions # 2.47 insn per cycle
- 1.938578158 seconds time elapsed
+TOTAL : 1.963936 sec
+ 5,573,295,220 cycles # 2.832 GHz
+ 13,799,743,853 instructions # 2.48 insn per cycle
+ 1.969915875 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2:11059) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest.exe
@@ -188,14 +188,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 9.759915e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 9.783050e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 9.783050e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 9.515821e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 9.539804e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 9.539804e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 1.694288 sec
- 4,905,478,806 cycles # 2.889 GHz
- 12,328,602,147 instructions # 2.51 insn per cycle
- 1.699386329 seconds time elapsed
+TOTAL : 1.737334 sec
+ 4,912,657,678 cycles # 2.821 GHz
+ 12,328,208,012 instructions # 2.51 insn per cycle
+ 1.742657060 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 9762) (512y: 94) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest.exe
@@ -216,14 +216,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 7.754851e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.770279e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.770279e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 7.174374e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.188369e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.188369e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 2.128740 sec
- 4,078,758,332 cycles # 1.912 GHz
- 6,296,587,445 instructions # 1.54 insn per cycle
- 2.133849808 seconds time elapsed
+TOTAL : 2.300281 sec
+ 4,068,031,162 cycles # 1.766 GHz
+ 6,296,983,262 instructions # 1.55 insn per cycle
+ 2.305872630 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1516) (512y: 94) (512z: 9019)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest.exe
diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_common.txt
index cd95d96bbf..ba3e5a6d39 100644
--- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_common.txt
+++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_common.txt
@@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'

-DATE: 2024-01-26_00:07:32
+DATE: 2024-01-27_19:36:16

 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:DBL+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 3.492316e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.521271e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.523609e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.502781e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.530729e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.533274e+05 ) sec^-1
 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4
-TOTAL : 0.507003 sec
- 2,206,717,561 cycles # 3.001 GHz
- 3,448,725,330 instructions # 1.56 insn per cycle
- 0.796732185 seconds time elapsed
+TOTAL : 0.511281 sec
+ 2,189,845,313 cycles # 2.932 GHz
+ 3,420,225,928 instructions # 1.56 insn per cycle
+ 0.809530054 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --common
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
@@ -68,14 +68,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:DBL+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 4.143192e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.177337e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.178848e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 4.147307e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.181790e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.183262e+05 ) sec^-1
 MeanMatrixElemValue = ( 1.252232e+02 +- 1.234346e+02 ) GeV^-4
-TOTAL : 3.127410 sec
- 10,316,808,786 cycles # 3.054 GHz
- 22,423,079,541 instructions # 2.17 insn per cycle
- 3.438326791 seconds time elapsed
+TOTAL : 3.129728 sec
+ 10,090,248,576 cycles # 2.983 GHz
+ 21,741,294,339 instructions # 2.15 insn per cycle
+ 3.440954058 seconds time elapsed
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2
@@ -91,14 +91,14 @@ Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.906516e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.907436e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.907436e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.865476e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.866367e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.866367e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4
-TOTAL : 8.613443 sec
- 26,433,384,953 cycles # 3.068 GHz
- 81,751,741,374 instructions # 3.09 insn per cycle
- 8.618094360 seconds time elapsed
+TOTAL : 8.805569 sec
+ 26,448,557,229 cycles # 3.003 GHz
+ 81,752,712,519 instructions # 3.09 insn per cycle
+ 8.810814165 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 6614) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest.exe
@@ -118,14 +118,14 @@ Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 3.802273e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.805901e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.805901e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.674042e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.677471e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.677471e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4
-TOTAL : 4.325072 sec
- 12,937,290,700 cycles # 2.989 GHz
- 39,240,011,387 instructions # 3.03 insn per cycle
- 4.329794435 seconds time elapsed
+TOTAL : 4.475227 sec
+ 12,907,401,540 cycles # 2.882 GHz
+ 39,240,721,411 instructions # 3.04 insn per cycle
+ 4.480266384 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4:12814) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest.exe
@@ -145,14 +145,14 @@ Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 8.589458e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.608319e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.608319e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 8.337116e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 8.355239e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.355239e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4
-TOTAL : 1.920532 sec
- 5,571,487,226 cycles # 2.896 GHz
- 13,788,001,887 instructions # 2.47 insn per cycle
- 1.925207623 seconds time elapsed
+TOTAL : 1.978873 sec
+ 5,565,688,850 cycles # 2.807 GHz
+ 13,787,538,268 instructions # 2.48 insn per cycle
+ 1.983941242 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2:11059) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest.exe
@@ -172,14 +172,14 @@ Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 9.699010e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 9.723517e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 9.723517e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 9.506300e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 9.530161e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 9.530161e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4
-TOTAL : 1.701918 sec
- 4,894,734,524 cycles # 2.870 GHz
- 12,315,493,193 instructions # 2.52 insn per cycle
- 1.706709534 seconds time elapsed
+TOTAL : 1.736969 sec
+ 4,905,247,441 cycles # 2.818 GHz
+ 12,316,173,805 instructions # 2.51 insn per cycle
+ 1.741977898 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 9762) (512y: 94) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest.exe
@@ -199,14 +199,14 @@ Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 7.766135e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.780590e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.780590e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 7.394634e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.409308e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.409308e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4
-TOTAL : 2.122969 sec
- 4,050,408,054 cycles # 1.905 GHz
- 6,283,219,779 instructions # 1.55 insn per cycle
- 2.127795422 seconds time elapsed
+TOTAL : 2.230744 sec
+ 4,063,689,523 cycles # 1.819 GHz
+ 6,283,713,565 instructions # 1.55 insn per cycle
+ 2.235926812 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1516) (512y: 94) (512z: 9019)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest.exe
diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_curhst.txt
index e8c507cdee..4fc77c5f3b 100644
--- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_curhst.txt
+++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_curhst.txt
@@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'

-DATE: 2024-01-26_00:04:11
+DATE: 2024-01-27_19:32:51

 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:DBL+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 3.478378e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.505566e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.508071e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.474359e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.503184e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.505889e+05 ) sec^-1
 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 0.504525 sec
- 2,215,801,883 cycles # 2.980 GHz
- 3,455,759,890 instructions # 1.56 insn per cycle
- 0.808082050 seconds time elapsed
+TOTAL : 0.506872 sec
+ 2,190,591,729 cycles # 2.928 GHz
+ 3,363,193,614 instructions # 1.54 insn per cycle
+ 0.813143037 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --curhst
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
@@ -68,14 +68,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:DBL+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 4.137368e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.171679e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.173067e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 4.142083e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.176424e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.177917e+05 ) sec^-1
 MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4
-TOTAL : 3.063157 sec
- 10,144,716,887 cycles # 3.058 GHz
- 21,058,700,369 instructions # 2.08 insn per cycle
- 3.374375541 seconds time elapsed
+TOTAL : 3.068584 sec
+ 9,750,888,418 cycles # 2.933 GHz
+ 22,498,978,506 instructions # 2.31 insn per cycle
+ 3.380971345 seconds time elapsed
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2
@@ -91,14 +91,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.912677e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.913545e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.913545e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.865848e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.866726e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.866726e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 8.585286 sec
- 26,459,308,315 cycles # 3.081 GHz
- 81,751,492,157 instructions # 3.09 insn per cycle
- 8.590088729 seconds time elapsed
+TOTAL : 8.799928 sec
+ 26,485,507,873 cycles # 3.009 GHz
+ 81,753,130,978 instructions # 3.09 insn per cycle
+ 8.805031066 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 6614) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest.exe
@@ -118,14 +118,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 3.797442e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.801061e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.801061e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.783072e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.786617e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.786617e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 4.329041 sec
- 12,934,718,239 cycles # 2.986 GHz
- 39,240,868,450 instructions # 3.03 insn per cycle
- 4.333889790 seconds time elapsed
+TOTAL : 4.345759 sec
+ 12,887,960,701 cycles # 2.964 GHz
+ 39,240,971,185 instructions # 3.04 insn per cycle
+ 4.350680718 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4:12814) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest.exe @@ -145,14 +145,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.580291e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.598151e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.598151e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.401098e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.418172e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.418172e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.920867 sec - 5,567,876,605 cycles # 2.893 GHz - 13,788,026,591 instructions # 2.48 insn per cycle - 1.925509830 seconds time elapsed +TOTAL : 1.962097 sec + 5,557,008,460 cycles # 2.826 GHz + 13,788,303,853 instructions # 2.48 insn per cycle + 1.967376308 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:11059) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest.exe @@ -172,14 +172,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.717286e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.740439e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.740439e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.166638e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.188094e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.188094e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.697305 sec - 4,890,738,669 cycles # 2.875 GHz - 12,317,303,230 instructions # 2.52 insn per cycle - 1.702130241 seconds time elapsed +TOTAL : 1.799007 sec + 5,088,916,448 cycles # 2.823 GHz + 12,318,256,629 instructions # 2.42 insn per cycle + 1.804299971 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 9762) (512y: 94) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest.exe @@ -199,14 +199,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.800846e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.815791e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.815791e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.442905e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.457256e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.457256e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.112333 sec - 4,045,690,393 cycles # 1.912 GHz - 6,285,044,805 instructions # 1.55 insn per cycle - 2.116928860 seconds time elapsed +TOTAL : 2.213655 sec + 4,060,302,969 cycles # 1.831 GHz + 6,285,302,844 instructions # 1.55 insn per cycle + 2.218845333 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1516) (512y: 94) (512z: 9019) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_rmbhst.txt index 52fdd67f88..1d0a9ae11e 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_rmbhst.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-01-26_00:00:54 +DATE: 2024-01-27_19:29:31 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -51,14 +51,14 @@ WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.217958e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.512930e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.515214e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.200930e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.503716e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.506421e+05 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.506889 sec - 2,233,510,708 cycles # 3.038 GHz - 3,515,215,151 instructions # 1.57 insn per cycle - 0.795257948 seconds time elapsed +TOTAL : 0.511242 sec + 2,174,984,659 cycles # 2.945 GHz + 3,418,296,130 instructions # 1.57 insn per cycle + 0.801197579 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --rmbhst WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost @@ -71,14 +71,14 @@ WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.741390e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.169861e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.171330e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.739270e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.176656e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.178124e+05 ) sec^-1 MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.194603 sec - 10,409,856,467 cycles # 3.018 GHz - 23,237,283,726 instructions # 2.23 insn per cycle - 3.506487413 seconds time elapsed +TOTAL : 3.204802 sec + 10,389,625,524 cycles # 3.004 GHz + 21,552,317,551 instructions # 2.07 insn per cycle + 3.515594954 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 @@ -94,14 +94,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.920097e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.920994e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.920994e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.881683e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.882597e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.882597e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 8.551552 sec - 26,447,790,271 cycles # 3.092 GHz - 81,753,854,285 instructions # 3.09 insn per cycle - 8.556393713 seconds time elapsed +TOTAL : 8.725568 sec + 26,442,204,272 cycles # 3.029 GHz + 81,754,472,166 instructions # 3.09 insn per cycle + 8.731630799 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 6614) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest.exe @@ -121,14 +121,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.752850e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.756405e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.756405e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.719790e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.723385e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.723385e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 4.380066 sec - 12,930,382,850 cycles # 2.950 GHz - 39,242,708,718 instructions # 3.03 insn per cycle - 4.384871242 seconds time elapsed +TOTAL : 4.419535 sec + 12,903,853,279 cycles # 2.917 GHz + 39,240,792,929 instructions # 3.04 insn per cycle + 4.424473414 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:12814) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- 
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest.exe @@ -148,14 +148,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.576151e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.593998e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.593998e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.390147e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.408683e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.408683e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.922112 sec - 5,560,568,053 cycles # 2.887 GHz - 13,787,937,924 instructions # 2.48 insn per cycle - 1.926945796 seconds time elapsed +TOTAL : 1.965025 sec + 5,556,583,212 cycles # 2.822 GHz + 13,788,397,685 instructions # 2.48 insn per cycle + 1.970292377 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:11059) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest.exe @@ -175,14 +175,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.812534e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.836405e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.836405e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.343307e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.365779e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.365779e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.681289 sec - 4,888,148,053 cycles # 2.901 GHz - 12,317,061,785 instructions # 2.52 insn per cycle - 1.685971625 seconds time elapsed +TOTAL : 1.765569 sec + 4,897,889,150 cycles # 2.769 GHz + 12,318,083,328 instructions # 2.51 insn per cycle + 1.770720541 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 9762) (512y: 94) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest.exe @@ -202,14 +202,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.789889e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.805147e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.805147e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.419663e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.434340e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.434340e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.115068 sec - 4,046,960,669 cycles # 1.910 GHz - 6,285,467,635 instructions # 1.55 insn per cycle - 2.119996341 seconds time elapsed +TOTAL : 2.220750 sec + 4,055,797,181 cycles # 1.823 GHz + 6,285,553,359 instructions # 1.55 insn per cycle + 2.225912970 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1516) (512y: 94) (512z: 9019) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd1.txt index 4f02b865b7..6ec4da2ebd 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd1.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-01-25_23:09:52 +DATE: 2024-01-27_18:37:21 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.471206e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.504777e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.507351e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.443994e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.472121e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.475106e+05 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.523241 sec - 2,294,977,837 cycles # 2.990 GHz - 3,355,752,268 instructions # 1.46 insn per cycle - 0.839300855 seconds time elapsed +TOTAL : 0.529621 sec + 2,179,867,827 cycles # 2.817 GHz + 3,377,723,181 instructions # 1.55 insn per cycle + 0.844194090 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/gcheck.exe -p 64 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -68,14 +68,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.146612e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.187782e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.189484e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.133949e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.168641e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.170113e+05 ) sec^-1 MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.024342 sec - 10,116,132,856 cycles # 3.082 GHz - 21,831,790,705 instructions # 2.16 insn per cycle - 3.341530087 seconds time elapsed +TOTAL : 3.040771 sec + 9,449,850,956 cycles # 2.864 GHz + 21,228,312,642 instructions # 2.25 insn per cycle + 3.355844186 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/fgcheck.exe 2 64 2 @@ -91,14 +91,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.904926e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.905851e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.905851e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.871145e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.872027e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.872027e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 8.620332 sec - 26,463,781,685 cycles # 3.069 GHz - 81,778,380,367 instructions # 3.09 insn per cycle - 8.627766180 seconds time elapsed +TOTAL : 8.775522 sec + 26,467,836,514 cycles # 3.015 GHz + 81,778,710,330 instructions # 3.09 insn per cycle + 8.783029262 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 6589) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/runTest.exe @@ -118,14 +118,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.778600e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.781982e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.781982e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.712170e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.715556e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.715556e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 4.350007 sec - 12,914,528,246 cycles # 2.966 GHz - 39,248,322,763 instructions # 3.04 insn per cycle - 4.364307061 seconds time elapsed +TOTAL : 4.427464 sec + 12,911,058,764 cycles # 2.914 GHz + 39,248,548,650 instructions # 3.04 insn per cycle + 4.443782704 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:12771) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- 
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd1/runTest.exe @@ -145,14 +145,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.584212e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.601630e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.601630e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.404510e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.421856e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.421856e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.921217 sec - 5,553,864,604 cycles # 2.887 GHz - 13,804,830,516 instructions # 2.49 insn per cycle - 1.935543419 seconds time elapsed +TOTAL : 1.961012 sec + 5,550,780,310 cycles # 2.824 GHz + 13,804,627,273 instructions # 2.49 insn per cycle + 1.973888367 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:11048) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/runTest.exe @@ -172,14 +172,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.656517e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.679828e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.679828e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.427377e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.450136e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.450136e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.708041 sec - 4,906,794,461 cycles # 2.867 GHz - 12,330,114,831 instructions # 2.51 insn per cycle - 1.722730362 seconds time elapsed +TOTAL : 1.749655 sec + 4,882,694,724 cycles # 2.783 GHz + 12,329,545,304 instructions # 2.53 insn per cycle + 1.763737429 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 9736) (512y: 94) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd1/runTest.exe @@ -199,14 +199,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.785044e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.799232e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.799232e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.530178e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.544294e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.544294e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.116475 sec - 4,044,956,882 cycles # 1.907 GHz - 6,292,510,947 instructions # 1.56 insn per cycle - 2.132272886 seconds time elapsed +TOTAL : 2.188783 sec + 4,047,765,395 cycles # 1.846 GHz + 6,292,916,815 instructions # 1.55 insn per cycle + 2.202791681 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1497) (512y: 94) (512z: 9019) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd1/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd0.txt index a61b82479f..719bba46fb 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd0.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-01-25_23:44:49 +DATE: 2024-01-27_19:13:06 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.222012e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.246324e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.248217e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.222007e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.246519e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.249113e+05 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.530629 sec - 2,302,571,140 cycles # 3.026 GHz - 3,539,966,337 instructions # 1.54 insn per cycle - 0.818475146 seconds time elapsed +TOTAL : 0.532665 sec + 2,258,388,500 cycles # 2.939 GHz + 3,394,739,067 instructions # 1.50 insn per cycle + 0.825914000 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/gcheck.exe -p 64 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -68,14 +68,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.766260e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.794821e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.795979e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.769378e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.798106e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.799313e+05 ) sec^-1 MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.302210 sec - 10,996,851,827 cycles # 3.096 GHz - 24,989,138,728 instructions # 2.27 insn per cycle - 3.612162016 seconds time elapsed +TOTAL : 3.307089 sec + 10,656,119,044 cycles # 2.993 GHz + 24,479,944,936 instructions # 2.30 insn per cycle + 3.619320255 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/fgcheck.exe 2 64 2 @@ -91,14 +91,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.459096e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.459599e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.459599e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.367139e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.367639e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.367639e+02 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 36.792656 sec - 113,070,573,577 cycles # 3.074 GHz - 141,524,951,428 instructions # 1.25 insn per cycle - 36.797608137 seconds time elapsed +TOTAL : 37.563805 sec + 113,035,005,873 cycles # 3.009 GHz + 141,513,387,724 instructions # 1.25 insn per cycle + 37.569095568 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:21365) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/runTest.exe @@ -118,14 +118,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.314974e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.317691e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.317691e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.215278e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.217898e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.217898e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 4.957393 sec - 14,921,471,789 cycles # 3.008 GHz - 37,531,537,779 instructions # 2.52 insn per cycle - 4.962426661 seconds time elapsed +TOTAL : 5.110819 sec + 14,991,773,237 cycles # 2.931 GHz + 37,532,307,514 instructions # 2.50 insn per cycle + 5.116400936 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:68052) (avx2: 0) (512y: 0) (512z: 0) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/runTest.exe @@ -145,14 +145,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.938204e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.954119e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.954119e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.673563e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.688591e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.688591e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.076037 sec - 6,032,491,286 cycles # 2.901 GHz - 12,947,967,770 instructions # 2.15 insn per cycle - 2.080892058 seconds time elapsed +TOTAL : 2.147516 sec + 6,037,066,805 cycles # 2.806 GHz + 12,947,337,518 instructions # 2.14 insn per cycle + 2.152800643 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:46593) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/runTest.exe @@ -172,14 +172,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.541807e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.563199e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.563199e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.216341e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.238951e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.238951e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.728667 sec - 5,007,253,350 cycles # 2.890 GHz - 11,363,228,557 instructions # 2.27 insn per cycle - 1.733486835 seconds time elapsed +TOTAL : 1.789912 sec + 5,003,070,633 cycles # 2.789 GHz + 11,363,542,829 instructions # 2.27 insn per cycle + 1.795109385 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:40158) (512y: 279) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd0/runTest.exe @@ -199,14 +199,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.060598e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.076855e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.076855e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.733648e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.748517e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.748517e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.044567 sec - 3,894,310,656 cycles # 1.901 GHz - 5,853,526,554 instructions # 1.50 insn per cycle - 2.049442336 seconds time elapsed +TOTAL : 2.131221 sec + 3,906,829,492 cycles # 1.830 GHz + 5,854,010,142 instructions # 1.50 insn per cycle + 2.136494313 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) 
(avx2: 2112) (512y: 142) (512z:39211) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd1.txt index 3f81898e30..513e439bd0 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd1.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-01-25_23:45:57 +DATE: 2024-01-27_19:14:16 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.225970e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.250939e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.252804e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.223781e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.250560e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.253073e+05 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.527705 sec - 2,271,360,893 cycles # 3.011 GHz - 3,483,071,698 instructions # 1.53 insn per cycle - 0.811309280 seconds time elapsed +TOTAL : 0.534925 sec + 2,229,964,752 cycles # 2.924 GHz + 3,438,029,140 instructions # 1.54 insn per cycle + 0.823050876 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/gcheck.exe -p 64 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -68,14 +68,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.787575e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.816403e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.817588e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.793172e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.822087e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.823326e+05 ) sec^-1 MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.294681 sec - 10,793,582,886 cycles # 3.053 GHz - 24,009,551,599 instructions # 2.22 insn per cycle - 3.606172358 seconds time elapsed +TOTAL : 3.282737 sec + 10,537,359,774 cycles # 2.978 GHz + 22,975,518,499 instructions # 2.18 insn per cycle + 3.596154660 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/fgcheck.exe 2 64 2 @@ -91,14 +91,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.434610e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.435103e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.435103e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.315697e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.316172e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.316172e+02 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 36.995548 sec - 113,902,374,231 cycles # 3.079 GHz - 141,701,166,601 instructions # 1.24 insn per cycle - 37.000316422 seconds time elapsed +TOTAL : 38.011955 sec + 114,307,951,290 cycles # 3.007 GHz + 141,699,174,548 instructions # 1.24 insn per cycle + 38.017244555 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:21615) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/runTest.exe @@ -118,14 +118,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.309979e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.312621e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.312621e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.178101e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.180722e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.180722e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 4.964493 sec - 14,906,936,554 cycles # 3.001 GHz - 37,593,495,183 instructions # 2.52 insn per cycle - 4.969424279 seconds time elapsed +TOTAL : 5.170239 sec + 14,906,390,957 cycles # 2.881 GHz + 37,595,371,004 instructions # 2.52 insn per cycle + 5.175326172 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:68056) (avx2: 0) (512y: 0) (512z: 0) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/runTest.exe @@ -145,14 +145,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.063731e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.079641e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.079641e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.835494e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.851184e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.851184e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.043547 sec - 5,936,293,527 cycles # 2.899 GHz - 12,831,131,379 instructions # 2.16 insn per cycle - 2.048529346 seconds time elapsed +TOTAL : 2.106511 sec + 5,931,759,794 cycles # 2.813 GHz + 12,831,713,311 instructions # 2.16 insn per cycle + 2.111663205 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:45663) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/runTest.exe @@ -172,14 +172,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.611190e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.634726e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.634726e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.356622e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.379915e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.379915e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.715814 sec - 4,981,290,818 cycles # 2.897 GHz - 11,359,270,596 instructions # 2.28 insn per cycle - 1.720629844 seconds time elapsed +TOTAL : 1.763132 sec + 4,989,376,638 cycles # 2.823 GHz + 11,359,472,763 instructions # 2.28 insn per cycle + 1.768435525 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:39855) (512y: 212) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd1/runTest.exe @@ -199,14 +199,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.987877e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.004071e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.004071e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.751418e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.767332e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.767332e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.062896 sec - 3,892,536,672 cycles # 1.884 GHz - 5,843,276,595 instructions # 1.50 insn per cycle - 2.067877562 seconds time elapsed +TOTAL : 2.126080 sec + 3,898,010,783 cycles # 1.830 GHz + 5,843,135,780 instructions # 1.50 insn per cycle + 2.131353524 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) 
(avx2: 1687) (512y: 116) (512z:38946) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd1/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt index 169ba41d04..05c77ff41e 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-01-25_23:10:29 +DATE: 2024-01-27_18:37:59 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.321241e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.384306e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.390636e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.317460e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.368942e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.376232e+05 ) sec^-1 MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4 -TOTAL : 0.478569 sec - 2,081,351,055 cycles # 2.995 GHz - 3,104,629,593 instructions # 1.49 insn per cycle - 0.781758350 seconds time elapsed +TOTAL : 0.485177 sec + 2,060,153,436 cycles # 2.906 GHz + 3,002,003,985 instructions # 1.46 insn per cycle + 0.795438508 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -68,14 +68,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.504629e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.593370e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.597094e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.516597e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.590115e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.593568e+05 ) sec^-1 MeanMatrixElemValue = ( 6.664703e+00 +- 5.072736e+00 ) GeV^-4 -TOTAL : 1.724357 sec - 6,014,180,976 cycles # 3.072 GHz - 12,790,134,988 instructions # 2.13 insn per cycle - 2.014643886 seconds time elapsed +TOTAL : 1.727811 sec + 5,842,248,421 cycles # 2.980 GHz + 11,578,097,409 instructions # 1.98 insn per cycle + 2.017768230 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 @@ -91,14 +91,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.098989e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.100057e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.100057e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.052118e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.053165e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.053165e+03 ) sec^-1 MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4 -TOTAL : 7.822875 sec - 24,208,205,325 cycles # 3.093 GHz - 75,876,008,404 instructions # 3.13 insn per cycle - 7.829655847 seconds time elapsed +TOTAL : 8.003896 sec + 24,216,644,613 cycles # 3.025 GHz + 75,876,682,144 instructions # 3.13 insn per cycle + 8.010960986 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 3898) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest.exe @@ -118,14 +118,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.608318e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.622050e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.622050e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.403824e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.417702e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.417702e+03 ) sec^-1 MeanMatrixElemValue = ( 4.060119e+00 +- 2.367901e+00 ) GeV^-4 -TOTAL : 2.165057 sec - 6,486,423,740 cycles # 2.992 GHz - 20,116,074,983 instructions # 3.10 insn per cycle - 2.179423883 seconds time elapsed +TOTAL : 2.224003 sec + 6,486,236,444 cycles # 2.911 GHz + 20,115,514,239 instructions # 3.10 insn per cycle + 2.239579450 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:13237) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest.exe @@ -145,14 +145,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.707797e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.714735e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.714735e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.590551e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.597032e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.597032e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 0.968913 sec - 2,827,321,383 cycles # 2.906 GHz - 7,038,162,946 instructions # 2.49 insn per cycle - 0.982725511 seconds time elapsed +TOTAL : 1.040535 sec + 2,821,593,769 cycles # 2.701 GHz + 7,038,300,357 instructions # 2.49 insn per cycle + 1.056476360 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:11604) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest.exe @@ -172,14 +172,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.948915e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.957872e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.957872e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.875158e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.883769e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.883769e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 0.850636 sec - 2,475,963,051 cycles # 2.899 GHz - 6,280,012,775 instructions # 2.54 insn per cycle - 0.866712833 seconds time elapsed +TOTAL : 0.883644 sec + 2,479,108,367 cycles # 2.791 GHz + 6,280,326,617 instructions # 2.53 insn per cycle + 0.896889444 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:10320) (512y: 50) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest.exe @@ -199,14 +199,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.572704e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.578550e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.578550e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.414115e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.419249e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.419249e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4 -TOTAL : 1.053199 sec - 2,034,931,877 cycles # 1.927 GHz - 3,248,490,915 instructions # 1.60 insn per cycle - 1.063051290 seconds time elapsed +TOTAL : 1.168827 sec + 2,036,775,596 cycles # 1.736 GHz + 3,248,885,824 instructions # 1.60 insn per cycle + 1.182247107 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2165) (512y: 48) (512z: 9219) 
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest.exe
diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_bridge.txt
index 4a07905533..ec70d3a329 100644
--- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_bridge.txt
+++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_bridge.txt
@@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 
-DATE: 2024-01-25_23:56:24
+DATE: 2024-01-27_19:24:55
 
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -54,14 +54,14 @@ WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks
 Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 5.650204e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.344236e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.344236e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 5.606611e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 6.289869e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.289869e+05 ) sec^-1
 MeanMatrixElemValue = ( 4.048178e+00 +- 2.364571e+00 ) GeV^-4
-TOTAL : 0.464619 sec
- 2,032,299,547 cycles # 3.010 GHz
- 3,040,740,721 instructions # 1.50 insn per cycle
- 0.733214185 seconds time elapsed
+TOTAL : 0.470000 sec
+ 2,011,017,737 cycles # 2.923 GHz
+ 2,948,669,744 instructions # 1.47 insn per cycle
+ 0.746672929 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost
@@ -80,14 +80,14 @@ WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublo
 Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 7.245584e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.458669e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.458669e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 7.237746e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 8.479275e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.479275e+05 ) sec^-1
 MeanMatrixElemValue = ( 6.641710e+00 +- 4.994249e+00 ) GeV^-4
-TOTAL : 1.895523 sec
- 6,515,109,485 cycles # 3.057 GHz
- 13,866,028,547 instructions # 2.13 insn per cycle
- 2.190470399 seconds time elapsed
+TOTAL : 1.902902 sec
+ 6,321,067,258 cycles # 2.957 GHz
+ 12,132,072,792 instructions # 1.92 insn per cycle
+ 2.198373975 seconds time elapsed
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2
@@ -104,14 +104,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.080299e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.081327e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.081327e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.050984e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.052020e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.052020e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4
-TOTAL : 7.894694 sec
- 24,207,279,883 cycles # 3.068 GHz
- 75,883,942,743 instructions # 3.13 insn per cycle
- 7.899865359 seconds time elapsed
+TOTAL : 8.008133 sec
+ 24,231,473,098 cycles # 3.025 GHz
+ 75,883,434,879 instructions # 3.13 insn per cycle
+ 8.013586367 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 3898) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest.exe
@@ -132,14 +132,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 7.558107e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.572924e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.572924e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 7.414640e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.429427e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.429427e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.060119e+00 +- 2.367901e+00 ) GeV^-4
-TOTAL : 2.181134 sec
- 6,508,175,197 cycles # 2.979 GHz
- 20,123,398,381 instructions # 3.09 insn per cycle
- 2.186113667 seconds time elapsed
+TOTAL : 2.223220 sec
+ 6,514,807,099 cycles # 2.925 GHz
+ 20,124,720,070 instructions # 3.09 insn per cycle
+ 2.228499201 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4:13237) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest.exe
@@ -160,14 +160,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.689000e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.696260e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.696260e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.662841e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.670032e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.670032e+04 ) sec^-1
 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4
-TOTAL : 0.981991 sec
- 2,821,624,681 cycles # 2.862 GHz
- 7,046,855,732 instructions # 2.50 insn per cycle
- 0.986872343 seconds time elapsed
+TOTAL : 0.997374 sec
+ 2,830,956,139 cycles # 2.827 GHz
+ 7,047,558,426 instructions # 2.49 insn per cycle
+ 1.002648676 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2:11604) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest.exe
@@ -188,14 +188,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.866719e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.875603e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.875603e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.894795e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.904037e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.904037e+04 ) sec^-1
 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4
-TOTAL : 0.889374 sec
- 2,585,713,262 cycles # 2.895 GHz
- 6,289,546,575 instructions # 2.43 insn per cycle
- 0.894321875 seconds time elapsed
+TOTAL : 0.876913 sec
+ 2,489,066,136 cycles # 2.825 GHz
+ 6,289,333,997 instructions # 2.53 insn per cycle
+ 0.882075649 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2:10320) (512y: 50) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest.exe
@@ -216,14 +216,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.551570e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.557637e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.557637e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.500045e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.505997e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.505997e+04 ) sec^-1
 MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4
-TOTAL : 1.068223 sec
- 2,044,230,196 cycles # 1.906 GHz
- 3,257,615,420 instructions # 1.59 insn per cycle
- 1.073232409 seconds time elapsed
+TOTAL : 1.107321 sec
+ 2,047,079,556 cycles # 1.844 GHz
+ 3,258,091,397 instructions # 1.59 insn per cycle
+ 1.112788129 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2165) (512y: 48) (512z: 9219)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest.exe
diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_common.txt
index ffde6a98bb..ea9c8935ca 100644
--- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_common.txt
+++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_common.txt
@@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 
-DATE: 2024-01-26_00:08:08
+DATE: 2024-01-27_19:36:54
 
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:FLT+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 6.325797e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.377030e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.382308e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 6.345633e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 6.400482e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.406925e+05 ) sec^-1
 MeanMatrixElemValue = ( 4.159397e-01 +- 3.238804e-01 ) GeV^-4
-TOTAL : 0.463350 sec
- 2,011,391,423 cycles # 2.993 GHz
- 3,025,832,528 instructions # 1.50 insn per cycle
- 0.731209834 seconds time elapsed
+TOTAL : 0.467633 sec
+ 1,992,809,075 cycles # 2.918 GHz
+ 2,979,298,487 instructions # 1.50 insn per cycle
+ 0.742813268 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --common
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
@@ -68,14 +68,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:FLT+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 8.551198e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.624622e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.627836e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 8.572479e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 8.646363e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.649847e+05 ) sec^-1
 MeanMatrixElemValue = ( 1.094367e+02 +- 1.071509e+02 ) GeV^-4
-TOTAL : 1.798766 sec
- 6,178,989,401 cycles # 3.055 GHz
- 12,447,116,980 instructions # 2.01 insn per cycle
- 2.088422920 seconds time elapsed
+TOTAL : 1.799926 sec
+ 6,037,109,549 cycles # 2.977 GHz
+ 12,003,912,176 instructions # 1.99 insn per cycle
+ 2.085917224 seconds time elapsed
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2
@@ -91,14 +91,14 @@ Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.079376e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.080406e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.080406e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.024041e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.025057e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.025057e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.208458e-01 +- 3.253446e-01 ) GeV^-4
-TOTAL : 7.897135 sec
- 24,196,179,399 cycles # 3.063 GHz
- 75,879,578,983 instructions # 3.14 insn per cycle
- 7.902000758 seconds time elapsed
+TOTAL : 8.112508 sec
+ 24,227,662,663 cycles # 2.986 GHz
+ 75,875,626,096 instructions # 3.13 insn per cycle
+ 8.117324848 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 3898) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest.exe
@@ -118,14 +118,14 @@ Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 7.608061e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.622592e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.622592e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 7.392521e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.406919e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.406919e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.208458e-01 +- 3.253446e-01 ) GeV^-4
-TOTAL : 2.164790 sec
- 6,497,805,412 cycles # 2.997 GHz
- 20,112,493,788 instructions # 3.10 insn per cycle
- 2.169281994 seconds time elapsed
+TOTAL : 2.228816 sec
+ 6,503,950,010 cycles # 2.914 GHz
+ 20,114,583,883 instructions # 3.09 insn per cycle
+ 2.233561370 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4:13237) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest.exe
@@ -145,14 +145,14 @@ Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.717181e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.724647e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.724647e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.648999e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.656276e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.656276e+04 ) sec^-1
 MeanMatrixElemValue = ( 4.214979e-01 +- 3.255522e-01 ) GeV^-4
-TOTAL : 0.963993 sec
- 2,817,463,010 cycles # 2.912 GHz
- 7,034,615,846 instructions # 2.50 insn per cycle
- 0.968756498 seconds time elapsed
+TOTAL : 1.004280 sec
+ 2,822,831,236 cycles # 2.800 GHz
+ 7,034,381,066 instructions # 2.49 insn per cycle
+ 1.009123591 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2:11604) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest.exe
@@ -172,14 +172,14 @@ Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.938377e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.947760e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.947760e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.891851e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.900995e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.900995e+04 ) sec^-1
 MeanMatrixElemValue = ( 4.214979e-01 +- 3.255522e-01 ) GeV^-4
-TOTAL : 0.856285 sec
- 2,482,222,517 cycles # 2.887 GHz
- 6,277,367,771 instructions # 2.53 insn per cycle
- 0.860711348 seconds time elapsed
+TOTAL : 0.876735 sec
+ 2,481,593,013 cycles # 2.816 GHz
+ 6,275,566,075 instructions # 2.53 insn per cycle
+ 0.882601647 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2:10320) (512y: 50) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest.exe
@@ -199,14 +199,14 @@ Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.566131e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.572297e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.572297e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.497990e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.503806e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.503806e+04 ) sec^-1
 MeanMatrixElemValue = ( 4.214981e-01 +- 3.255523e-01 ) GeV^-4
-TOTAL : 1.056318 sec
- 2,035,165,266 cycles # 1.920 GHz
- 3,243,579,392 instructions # 1.59 insn per cycle
- 1.060894396 seconds time elapsed
+TOTAL : 1.104775 sec
+ 2,038,939,507 cycles # 1.839 GHz
+ 3,244,114,954 instructions # 1.59 insn per cycle
+ 1.109760491 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2165) (512y: 48) (512z: 9219)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest.exe
diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_curhst.txt
index b15d42b6e4..9df3fb8320 100644
--- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_curhst.txt
+++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_curhst.txt
@@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 
-DATE: 2024-01-26_00:04:48
+DATE: 2024-01-27_19:33:28
 
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:FLT+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 6.331075e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.388445e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.393715e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 6.341989e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 6.394056e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.399853e+05 ) sec^-1
 MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4
-TOTAL : 0.460547 sec
- 2,003,713,802 cycles # 2.992 GHz
- 2,989,875,393 instructions # 1.49 insn per cycle
- 0.728329211 seconds time elapsed
+TOTAL : 0.464721 sec
+ 1,978,581,147 cycles # 2.928 GHz
+ 2,992,519,083 instructions # 1.51 insn per cycle
+ 0.734249164 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --curhst
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
@@ -68,14 +68,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:FLT+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 8.563961e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.637851e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.641070e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 8.563822e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 8.637597e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.640892e+05 ) sec^-1
 MeanMatrixElemValue = ( 6.664703e+00 +- 5.072736e+00 ) GeV^-4
-TOTAL : 1.744057 sec
- 6,014,413,547 cycles # 3.054 GHz
- 13,039,651,018 instructions # 2.17 insn per cycle
- 2.026578738 seconds time elapsed
+TOTAL : 1.748604 sec
+ 5,868,128,408 cycles # 2.968 GHz
+ 11,897,078,752 instructions # 2.03 insn per cycle
+ 2.034152439 seconds time elapsed
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2
@@ -91,14 +91,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.079528e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.080547e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.080547e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.036622e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.037632e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.037632e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4
-TOTAL : 7.898001 sec
- 24,218,623,224 cycles # 3.066 GHz
- 75,876,649,489 instructions # 3.13 insn per cycle
- 7.902833708 seconds time elapsed
+TOTAL : 8.062439 sec
+ 24,226,666,628 cycles # 3.004 GHz
+ 75,876,656,478 instructions # 3.13 insn per cycle
+ 8.067552425 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 3898) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest.exe
@@ -118,14 +118,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 7.357606e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.371431e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.371431e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 7.436843e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.451693e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.451693e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.060119e+00 +- 2.367901e+00 ) GeV^-4
-TOTAL : 2.237158 sec
- 6,497,866,331 cycles # 2.900 GHz
- 20,114,366,949 instructions # 3.10 insn per cycle
- 2.241831657 seconds time elapsed
+TOTAL : 2.213994 sec
+ 6,497,793,089 cycles # 2.930 GHz
+ 20,114,232,625 instructions # 3.10 insn per cycle
+ 2.218999244 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4:13237) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest.exe
@@ -145,14 +145,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.707623e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.714979e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.714979e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.651046e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.657956e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.657956e+04 ) sec^-1
 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4
-TOTAL : 0.968582 sec
- 2,812,838,354 cycles # 2.893 GHz
- 7,037,078,857 instructions # 2.50 insn per cycle
- 0.973180280 seconds time elapsed
+TOTAL : 1.002210 sec
+ 2,828,228,234 cycles # 2.811 GHz
+ 7,037,152,943 instructions # 2.49 insn per cycle
+ 1.006951321 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2:11604) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest.exe
@@ -172,14 +172,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.914327e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.923385e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.923385e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.896408e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.905353e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.905353e+04 ) sec^-1
 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4
-TOTAL : 0.865359 sec
- 2,481,582,764 cycles # 2.857 GHz
- 6,280,640,305 instructions # 2.53 insn per cycle
- 0.870009069 seconds time elapsed
+TOTAL : 0.873460 sec
+ 2,478,765,611 cycles # 2.825 GHz
+ 6,279,360,894 instructions # 2.53 insn per cycle
+ 0.878454117 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2:10320) (512y: 50) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest.exe
@@ -199,14 +199,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.556930e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.563015e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.563015e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.514471e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.520302e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.520302e+04 ) sec^-1
 MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4
-TOTAL : 1.061662 sec
- 2,034,090,458 cycles # 1.910 GHz
- 3,247,290,037 instructions # 1.60 insn per cycle
- 1.066231690 seconds time elapsed
+TOTAL : 1.091799 sec
+ 2,035,517,373 cycles # 1.858 GHz
+ 3,247,329,615 instructions # 1.60 insn per cycle
+ 1.096813214 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2165) (512y: 48) (512z: 9219)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest.exe
diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_rmbhst.txt
index c1d0387cb2..bb3df27b99 100644
--- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_rmbhst.txt
+++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_rmbhst.txt
@@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 
-DATE: 2024-01-26_00:01:31
+DATE: 2024-01-27_19:30:08
 
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -51,14 +51,14 @@ WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
 Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:FLT+THX:CURHST+RMBHST+MESDEV/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 5.756369e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.376369e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.382275e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 5.731672e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 6.356045e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.362013e+05 ) sec^-1
 MeanMatrixElemValue = ( 4.048178e+00 +- 2.364571e+00 ) GeV^-4
-TOTAL : 0.463063 sec
- 2,022,094,477 cycles # 3.012 GHz
- 3,000,110,742 instructions # 1.48 insn per cycle
- 0.730659620 seconds time elapsed
+TOTAL : 0.466576 sec
+ 2,012,354,844 cycles # 2.943 GHz
+ 2,978,053,730 instructions # 1.48 insn per cycle
+ 0.741085715 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --rmbhst
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
@@ -71,14 +71,14 @@ WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
 Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:FLT+THX:CURHST+RMBHST+MESDEV/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 7.489937e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.624029e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.627434e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 7.444891e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 8.622956e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.626310e+05 ) sec^-1
 MeanMatrixElemValue = ( 6.641710e+00 +- 4.994249e+00 ) GeV^-4
-TOTAL : 1.821843 sec
- 6,284,899,485 cycles # 3.070 GHz
- 12,329,882,702 instructions # 1.96 insn per cycle
- 2.114503584 seconds time elapsed
+TOTAL : 1.833515 sec
+ 6,145,723,719 cycles # 2.985 GHz
+ 13,015,613,657 instructions # 2.12 insn per cycle
+ 2.128449848 seconds time elapsed
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2
@@ -94,14 +94,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.076757e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.077790e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.077790e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.048991e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.050067e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.050067e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4
-TOTAL : 7.905540 sec
- 24,208,958,642 cycles # 3.061 GHz
- 75,876,487,136 instructions # 3.13 insn per cycle
- 7.910104097 seconds time elapsed
+TOTAL : 8.012669 sec
+ 24,208,162,061 cycles # 3.020 GHz
+ 75,876,396,395 instructions # 3.13 insn per cycle
+ 8.017590721 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 3898) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest.exe
@@ -121,14 +121,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 7.331082e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.344446e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.344446e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 7.438541e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.453434e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.453434e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.060119e+00 +- 2.367901e+00 ) GeV^-4
-TOTAL : 2.244902 sec
- 6,498,402,067 cycles # 2.890 GHz
- 20,114,181,970 instructions # 3.10 insn per cycle
- 2.249807684 seconds time elapsed
+TOTAL : 2.213920 sec
+ 6,495,629,253 cycles # 2.929 GHz
+ 20,114,944,757 instructions # 3.10 insn per cycle
+ 2.219064819 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4:13237) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest.exe
@@ -148,14 +148,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.700366e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.707524e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.707524e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.587815e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.594524e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.594524e+04 ) sec^-1
 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4
-TOTAL : 0.972557 sec
- 2,812,510,471 cycles # 2.881 GHz
- 7,036,988,126 instructions # 2.50 insn per cycle
- 0.977084151 seconds time elapsed
+TOTAL : 1.042010 sec
+ 2,820,248,202 cycles # 2.696 GHz
+ 7,037,051,518 instructions # 2.50 insn per cycle
+ 1.047085469 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2:11604) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest.exe
@@ -175,14 +175,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.940115e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.949570e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.949570e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.892048e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.901381e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.901381e+04 ) sec^-1
 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4
-TOTAL : 0.853172 sec
- 2,477,642,132 cycles # 2.892 GHz
- 6,279,053,352 instructions # 2.53 insn per cycle
- 0.857874216 seconds time elapsed
+TOTAL : 0.875630 sec
+ 2,480,888,017 cycles # 2.821 GHz
+ 6,279,220,836 instructions # 2.53 insn per cycle
+ 0.880698893 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2:10320) (512y: 50) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest.exe
@@ -202,14 +202,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.554146e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.560290e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.560290e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.502673e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.508461e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.508461e+04 ) sec^-1
 MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4
-TOTAL : 1.063738 sec
- 2,032,569,225 cycles # 1.904 GHz
- 3,247,369,313 instructions # 1.60 insn per cycle
- 1.068352560 seconds time elapsed
+TOTAL : 1.100164 sec
+ 2,036,801,052 cycles # 1.845 GHz
+ 3,247,412,725 instructions # 1.59 insn per cycle
+ 1.105126246 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2165) (512y: 48) (512z: 9219)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest.exe
diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd1.txt
index 343059fb0d..c584ebcc69 100644
--- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd1.txt
+++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd1.txt
@@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd1'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 
-DATE: 2024-01-25_23:10:59
+DATE: 2024-01-27_18:38:29
 
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 6.293388e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.353811e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.360577e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 6.305087e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 6.356788e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.363636e+05 ) sec^-1
 MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4
-TOTAL : 0.482578 sec
- 2,126,384,282 cycles # 2.996 GHz
- 3,122,872,646 instructions # 1.47 insn per cycle
- 0.797709837 seconds time elapsed
+TOTAL : 0.488004 sec
+ 2,004,585,151 cycles # 2.823 GHz
+ 2,949,271,873 instructions # 1.47 insn per cycle
+ 0.792976067 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/gcheck.exe -p 64 256 1
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
@@ -68,14 +68,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 8.535930e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.625556e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.629357e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 8.500853e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 8.574463e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.577809e+05 ) sec^-1
 MeanMatrixElemValue = ( 6.664703e+00 +- 5.072736e+00 ) GeV^-4
-TOTAL : 1.724570 sec
- 5,989,683,103 cycles # 3.065 GHz
- 11,335,601,102 instructions # 1.89 insn per cycle
- 2.013935385 seconds time elapsed
+TOTAL : 1.725709 sec
+ 5,827,437,776 cycles # 2.978 GHz
+ 11,866,403,549 instructions # 2.04 insn per cycle
+ 2.016529689 seconds time elapsed
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/gcheck.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/fgcheck.exe 2 64 2
@@ -91,14 +91,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.101149e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.102221e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.102221e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.046853e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.047911e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.047911e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4
-TOTAL : 7.814813 sec
- 24,218,095,116 cycles # 3.098 GHz
- 75,804,256,539 instructions # 3.13 insn per cycle
- 7.821824558 seconds time elapsed
+TOTAL : 8.021570 sec
+ 24,208,728,571 cycles # 3.017 GHz
+ 75,801,225,088 instructions # 3.13 insn per cycle
+ 8.028618453 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 3848) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/runTest.exe
@@ -118,14 +118,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 7.575201e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.590213e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.590213e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 7.406347e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.420022e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.420022e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.060119e+00 +- 2.367901e+00 ) GeV^-4
-TOTAL : 2.176296 sec
- 6,498,546,102 cycles # 2.983 GHz
- 20,110,977,060 instructions # 3.09 insn per cycle
- 2.186978505 seconds time elapsed
+TOTAL : 2.223038 sec
+ 6,499,735,217 cycles # 2.919 GHz
+ 20,111,699,203 instructions # 3.09 insn per cycle
+ 2.235569341 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4:13231) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/runTest.exe
@@ -145,14 +145,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.705454e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.712850e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.712850e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.666569e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.673138e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.673138e+04 ) sec^-1
 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4
-TOTAL : 0.969758 sec
- 2,811,746,943 cycles # 2.886 GHz
- 7,037,673,997 instructions # 2.50 insn per cycle
- 0.981392691 seconds time elapsed
+TOTAL : 0.993136 sec
+ 2,812,239,791 cycles # 2.819 GHz
+ 7,037,687,814 instructions # 2.50 insn per cycle
+ 1.010358424 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2:11587) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/runTest.exe
@@ -172,14 +172,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.938719e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.947561e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.947561e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.868663e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.877654e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.877654e+04 ) sec^-1
 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4
-TOTAL : 0.855569 sec
- 2,473,527,129 cycles # 2.880 GHz
- 6,280,128,164 instructions # 2.54 insn per cycle
- 0.868118493 seconds time elapsed
+TOTAL : 0.886341 sec
+ 2,477,207,877 cycles # 2.780 GHz
+ 6,280,122,323 instructions # 2.54 insn per cycle
+ 0.899576679 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2:10302) (512y: 50) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd1/runTest.exe
@@ -199,14 +199,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.561075e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.567076e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.567076e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.506307e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.512102e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.512102e+04 ) sec^-1
 MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4
-TOTAL : 1.058909 sec
- 2,035,847,836 cycles # 1.915 GHz
- 3,247,475,409 instructions # 1.60 insn per cycle
- 1.072714857 seconds time elapsed
+TOTAL : 1.097481 sec
+ 2,036,096,996 cycles # 1.847 GHz
+ 3,247,608,048 instructions # 1.60 insn per cycle
+ 1.113849823 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2140) (512y: 48) (512z: 9219)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd1/runTest.exe
diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd0.txt
index 89d748e060..f3cd167bf2 100644
--- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd0.txt
+++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd0.txt
@@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl1_hrd0'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 
-DATE: 2024-01-25_23:47:05
+DATE: 2024-01-27_19:15:25
 
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=1] [hardcodePARAM=0]
 Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 5.574102e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.615260e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 5.619584e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 5.585722e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.624773e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.629910e+05 ) sec^-1
 MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4
-TOTAL : 0.484525 sec
- 2,126,112,707 cycles # 3.011 GHz
- 3,203,427,705 instructions # 1.51 insn per cycle
- 0.766056742 seconds time elapsed
+TOTAL : 0.491441 sec
+ 2,090,779,282 cycles # 2.923 GHz
+ 3,169,249,008 instructions # 1.52 insn per cycle
+ 0.776379492 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/gcheck.exe -p 64 256 1
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
@@ -68,14 +68,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=1] [hardcodePARAM=0]
 Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 7.692622e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.752719e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.755365e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 7.689405e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.750347e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.753010e+05 ) sec^-1
 MeanMatrixElemValue = ( 6.664703e+00 +- 5.072736e+00 ) GeV^-4
-TOTAL : 1.852475 sec
- 6,402,505,000 cycles # 3.066 GHz
- 13,202,162,532 instructions # 2.06 insn per cycle
- 2.145533834 seconds time elapsed
+TOTAL : 1.857673 sec
+ 6,257,073,662 cycles # 2.985 GHz
+ 12,393,758,755 instructions # 1.98 insn per cycle
+ 2.152860178 seconds time elapsed
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/gcheck.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/fgcheck.exe 2 64 2
@@ -91,14 +91,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 5.858562e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.859395e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 5.859395e+02 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 5.742749e+02 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.743562e+02 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.743562e+02 ) sec^-1
 MeanMatrixElemValue = ( 4.059968e+00 +- 2.367799e+00 ) GeV^-4
-TOTAL : 28.002108 sec
- 86,058,643,411 cycles # 3.074 GHz
- 133,995,703,913 instructions # 1.56 insn per cycle
- 28.007044175 seconds time elapsed
+TOTAL : 28.566959 sec
+ 86,081,750,957 cycles # 3.014 GHz
+ 133,992,813,628 instructions # 1.56 insn per cycle
+ 28.571947196 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4:16123) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/runTest.exe
@@ -118,14 +118,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 7.350588e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.364586e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.364586e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 7.185851e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.199098e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.199098e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.059961e+00 +- 2.367791e+00 ) GeV^-4
-TOTAL : 2.243608 sec
- 6,715,678,750 cycles # 2.991 GHz
- 19,163,488,382 instructions # 2.85 insn per cycle
- 2.248422734 seconds time elapsed
+TOTAL : 2.291474 sec
+ 6,721,144,820 cycles # 2.928 GHz
+ 19,164,169,698 instructions # 2.85 insn per cycle
+ 2.297207626 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4:68898) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/runTest.exe
@@ -145,14 +145,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.522127e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.527994e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.527994e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.422882e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.428217e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.428217e+04 ) sec^-1
 MeanMatrixElemValue = ( 4.060903e+00 +- 2.367376e+00 ) GeV^-4
-TOTAL : 1.086187 sec
- 3,143,488,591 cycles # 2.883 GHz
- 6,746,651,283 instructions # 2.15 insn per cycle
- 1.091333600 seconds time elapsed
+TOTAL : 1.161561 sec
+ 3,140,365,131 cycles # 2.695 GHz
+ 6,747,298,343 instructions # 2.15 insn per cycle
+ 1.166681407 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2:48625) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/runTest.exe
@@ -172,14 +172,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.848844e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.857285e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.857285e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.799831e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.808210e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.808210e+04 ) sec^-1
 MeanMatrixElemValue = ( 4.060903e+00 +- 2.367376e+00 ) GeV^-4
-TOTAL : 0.895639 sec
- 2,605,116,969 cycles # 2.896 GHz
- 5,931,025,046 instructions # 2.28 insn per cycle
- 0.900480284 seconds time elapsed
+TOTAL : 0.920308 sec
+ 2,609,208,209 cycles # 2.823 GHz
+ 5,931,137,835 instructions # 2.27 insn per cycle
+ 0.925334406 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2:42219) (512y: 24) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd0/runTest.exe
@@ -199,14 +199,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.544175e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.550163e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.550163e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.499022e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.504830e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.504830e+04 ) sec^-1
 MeanMatrixElemValue = ( 4.060905e+00 +- 2.367377e+00 ) GeV^-4
-TOTAL : 1.070662 sec
- 2,050,751,478 cycles # 1.909 GHz
- 3,435,534,795 instructions # 1.68 insn per cycle
- 1.075337528 seconds time elapsed
+TOTAL : 1.103107 sec
+ 2,050,125,820 cycles # 1.852 GHz
+ 3,435,619,830 instructions # 1.68 insn per cycle
+ 1.108237965 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4188) (512y: 9) (512z:44489)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd0/runTest.exe
diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd1.txt
index 2cddbacf89..a8fa5a4097 100644
--- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd1.txt
+++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd1.txt
@@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl1_hrd1'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 
-DATE: 2024-01-25_23:47:57
+DATE: 2024-01-27_19:16:18
 
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=1] [hardcodePARAM=1]
 Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 5.556980e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.597321e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 5.601884e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 5.542946e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.582854e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.588303e+05 ) sec^-1
 MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4
-TOTAL : 0.484078 sec
- 2,121,900,012 cycles # 3.005 GHz
- 3,151,336,456 instructions # 1.49 insn per cycle
- 0.765054667 seconds time elapsed
+TOTAL : 0.489439 sec
+ 2,081,558,468 cycles # 2.919 GHz
+ 3,114,146,189 instructions # 1.50 insn per cycle
+ 0.773580082 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/gcheck.exe -p 64 256 1
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
@@ -68,14 +68,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=1] [hardcodePARAM=1]
 Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 7.644272e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.702490e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.705102e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 7.658350e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.717978e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.720726e+05 ) sec^-1
 MeanMatrixElemValue = ( 6.664703e+00 +- 5.072736e+00 ) GeV^-4
-TOTAL : 1.859096 sec
- 6,341,012,252 cycles # 3.030 GHz
- 12,731,535,061 instructions # 2.01 insn per cycle
- 2.152397231 seconds time elapsed
+TOTAL : 1.868545 sec
+ 6,214,634,155 cycles # 2.941 GHz
+ 12,997,699,197 instructions # 2.09 insn per cycle
+ 2.169650397 seconds time elapsed
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/gcheck.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/fgcheck.exe 2 64 2
@@ -91,14 +91,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 5.881844e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.882693e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 5.882693e+02 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 5.785903e+02 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.786730e+02 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.786730e+02 ) sec^-1
 MeanMatrixElemValue = ( 4.059968e+00 +- 2.367799e+00 ) GeV^-4
-TOTAL : 27.890812 sec
- 86,057,677,015 cycles # 3.086 GHz
- 134,115,301,052 instructions # 1.56 insn per cycle
- 27.895583882 seconds time elapsed
+TOTAL : 28.353434 sec
+ 85,553,023,579 cycles # 3.017 GHz
+ 134,114,238,311 instructions # 1.57 insn per cycle
+ 28.358305784 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4:16109) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/runTest.exe
@@ -118,14 +118,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 7.177991e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.191361e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.191361e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 7.247256e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.260766e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.260766e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.059961e+00 +- 2.367791e+00 ) GeV^-4
-TOTAL : 2.293306 sec
- 6,709,038,006 cycles # 2.921 GHz
- 19,223,663,179 instructions # 2.87 insn per cycle
- 2.298517207 seconds time elapsed
+TOTAL : 2.271681 sec
+ 6,731,841,131 cycles # 2.958 GHz
+ 19,223,791,696 instructions # 2.86 insn per cycle
+ 2.277043695 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4:68882) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/runTest.exe @@ -145,14 +145,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.557084e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.563309e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.563309e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.513571e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.519969e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.519969e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060903e+00 +- 2.367376e+00 ) GeV^-4 -TOTAL : 1.061780 sec - 3,077,526,851 cycles # 2.889 GHz - 6,686,174,538 instructions # 2.17 insn per cycle - 1.066609744 seconds time elapsed +TOTAL : 1.092514 sec + 3,078,739,669 cycles # 2.808 GHz + 6,686,073,097 instructions # 2.17 insn per cycle + 1.097464200 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:47416) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/runTest.exe @@ -172,14 +172,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.838034e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.846640e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.846640e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.791355e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.799542e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.799542e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060903e+00 +- 2.367376e+00 ) GeV^-4 -TOTAL : 0.900730 sec - 2,602,110,456 cycles # 2.877 GHz - 5,935,535,699 instructions # 2.28 insn per cycle - 0.905541318 seconds time elapsed +TOTAL : 0.925307 sec + 2,608,678,296 cycles # 2.808 GHz + 5,935,673,927 instructions # 2.28 insn per cycle + 0.930643254 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:41564) (512y: 18) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd1/runTest.exe @@ -199,14 +199,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.543440e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.549672e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.549672e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.496032e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.501544e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.501544e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060905e+00 +- 2.367377e+00 ) GeV^-4 -TOTAL : 1.071317 sec - 2,054,664,150 cycles # 1.911 GHz - 3,422,654,664 instructions # 1.67 insn per cycle - 1.076197262 seconds time elapsed +TOTAL : 1.105011 sec + 2,047,507,476 cycles # 1.846 GHz + 3,422,770,262 instructions # 1.67 insn per cycle + 1.110043886 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3375) (512y: 11) (512z:43966) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd1/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt index 1ce895c12a..e5cbcb4d2f 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-01-25_23:11:28 +DATE: 2024-01-27_18:38:59 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.511599e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.546369e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.548884e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.451581e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.479933e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.482851e+05 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.520763 sec - 2,268,786,464 cycles # 3.011 GHz - 3,443,527,857 instructions # 1.52 insn per cycle - 0.825127284 seconds time elapsed +TOTAL : 0.526837 sec + 2,251,816,336 cycles # 2.931 GHz + 3,479,387,094 instructions # 1.55 insn per cycle + 0.840260948 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/gcheck.exe -p 64 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -68,14 +68,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.111766e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.145403e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.146782e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.122831e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.157050e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.158521e+05 ) sec^-1 MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.044517 sec - 9,929,651,043 cycles # 3.007 GHz - 21,357,445,786 instructions # 2.15 insn per cycle - 3.358909765 seconds time elapsed +TOTAL : 3.051761 sec + 9,894,820,462 cycles # 2.987 GHz + 21,881,954,164 instructions # 2.21 insn per cycle + 3.372451658 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/fgcheck.exe 2 64 2 @@ -91,14 +91,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.891705e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.892598e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.892598e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.855146e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.856019e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.856019e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 8.680046 sec - 26,811,670,006 cycles # 3.088 GHz - 82,457,346,695 instructions # 3.08 insn per cycle - 8.687362012 seconds time elapsed +TOTAL : 8.854089 sec + 26,804,815,526 cycles # 3.027 GHz + 82,458,196,081 instructions # 3.08 insn per cycle + 8.861301901 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 6623) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/runTest.exe @@ -118,14 +118,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.768742e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.772112e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.772112e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.669571e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.672901e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.672901e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 4.361579 sec - 12,624,387,353 cycles # 2.892 GHz - 38,536,661,841 instructions # 3.05 insn per cycle - 4.376083454 seconds time elapsed +TOTAL : 4.480009 sec + 12,632,329,369 cycles # 2.817 GHz + 38,536,772,938 instructions # 3.05 insn per cycle + 4.492192822 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:12755) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/runTest.exe @@ -145,14 +145,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.633153e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.651561e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.651561e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.441288e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.459099e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.459099e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.909280 sec - 5,542,414,967 cycles # 2.896 GHz - 13,582,418,288 instructions # 2.45 insn per cycle - 1.923129371 seconds time elapsed +TOTAL : 1.952916 sec + 5,537,101,197 cycles # 2.828 GHz + 13,582,628,802 instructions # 2.45 insn per cycle + 1.968634575 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:10944) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/runTest.exe @@ -172,14 +172,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.711541e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.734298e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.734298e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.211711e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.232819e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.232819e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.699762 sec - 4,836,144,559 cycles # 2.839 GHz - 12,109,325,012 instructions # 2.50 insn per cycle - 1.712753610 seconds time elapsed +TOTAL : 1.790592 sec + 4,855,490,852 cycles # 2.715 GHz + 12,114,990,747 instructions # 2.50 insn per cycle + 1.806801715 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 9682) (512y: 76) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/runTest.exe @@ -199,14 +199,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.559990e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.574675e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.574675e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.328139e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.341335e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.341335e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.179561 sec - 4,095,330,827 cycles # 1.875 GHz - 6,282,635,897 instructions # 1.53 insn per cycle - 2.192150470 seconds time elapsed +TOTAL : 2.249055 sec + 4,112,147,048 cycles # 1.825 GHz + 6,282,902,487 instructions # 1.53 insn per cycle + 2.263248047 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1528) (512y: 76) (512z: 9010) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd1.txt index 0c49affce5..67e828539c 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd1.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-01-25_23:12:05 +DATE: 2024-01-27_18:39:37 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.483687e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.517313e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.520337e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.479873e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.507682e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.510348e+05 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.518686 sec - 2,258,260,247 cycles # 3.007 GHz - 3,497,479,611 instructions # 1.55 insn per cycle - 0.820930555 seconds time elapsed +TOTAL : 0.527103 sec + 2,250,115,667 cycles # 2.914 GHz + 3,494,224,929 instructions # 1.55 insn per cycle + 0.843300527 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/gcheck.exe -p 64 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -68,14 +68,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.145573e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.179849e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.181256e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.150600e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.185411e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.186955e+05 ) sec^-1 MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.021058 sec - 10,039,919,108 cycles # 3.066 GHz - 22,491,054,377 instructions # 2.24 insn per cycle - 3.334343751 seconds time elapsed +TOTAL : 3.025723 sec + 9,795,236,195 cycles # 2.982 GHz + 22,232,844,164 instructions # 2.27 insn per cycle + 3.340401396 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/fgcheck.exe 2 64 2 @@ -91,14 +91,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.894529e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.895406e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.895406e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.849609e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.850477e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.850477e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 8.670860 sec - 26,766,644,461 cycles # 3.087 GHz - 82,359,315,759 instructions # 3.08 insn per cycle - 8.678080922 seconds time elapsed +TOTAL : 8.878906 sec + 26,785,691,422 cycles # 3.017 GHz + 82,362,112,849 instructions # 3.07 insn per cycle + 8.886184258 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 6491) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/runTest.exe @@ -118,14 +118,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.760878e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.764183e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.764183e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.594929e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.598195e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.598195e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 4.371292 sec - 12,657,966,246 cycles # 2.894 GHz - 38,556,895,723 instructions # 3.05 insn per cycle - 4.383442789 seconds time elapsed +TOTAL : 4.572287 sec + 12,660,755,558 cycles # 2.766 GHz + 38,557,577,434 instructions # 3.05 insn per cycle + 4.586940544 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:12729) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- 
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/runTest.exe @@ -145,14 +145,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.688710e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.706749e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.706749e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.414226e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.432226e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.432226e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.897561 sec - 5,506,035,803 cycles # 2.895 GHz - 13,595,753,802 instructions # 2.47 insn per cycle - 1.914923405 seconds time elapsed +TOTAL : 1.959907 sec + 5,496,433,543 cycles # 2.798 GHz + 13,598,067,886 instructions # 2.47 insn per cycle + 1.971121515 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:10926) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/runTest.exe @@ -172,14 +172,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.892225e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.917853e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.917853e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.638152e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.661196e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.661196e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.667514 sec - 4,829,490,829 cycles # 2.889 GHz - 12,121,531,237 instructions # 2.51 insn per cycle - 1.679762918 seconds time elapsed +TOTAL : 1.711613 sec + 4,833,583,956 cycles # 2.816 GHz + 12,121,571,130 instructions # 2.51 insn per cycle + 1.724774904 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 9659) (512y: 76) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd1/runTest.exe @@ -199,14 +199,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.714105e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.727872e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.727872e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.463124e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.476511e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.476511e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.135418 sec - 4,088,243,987 cycles # 1.911 GHz - 6,289,128,463 instructions # 1.54 insn per cycle - 2.153209406 seconds time elapsed +TOTAL : 2.207311 sec + 4,092,914,427 cycles # 1.851 GHz + 6,289,060,943 instructions # 1.54 insn per cycle + 2.223904287 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1508) (512y: 76) (512z: 9009) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd1/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt index 9bd9f58817..b35a15bb6b 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2024-01-25_23:14:28 +DATE: 2024-01-27_18:42:04 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.065787e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.066184e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.066303e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.066289e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.066701e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.066888e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 2.452875 sec - 8,393,388,566 cycles # 3.065 GHz - 18,463,733,578 instructions # 2.20 insn per cycle - 2.847470239 seconds time elapsed +TOTAL : 2.456687 sec + 8,215,788,812 cycles # 2.992 GHz + 18,531,874,132 instructions # 2.26 insn per cycle + 2.855724583 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe -p 1 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -68,14 +68,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.253046e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.255279e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.255484e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.233994e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.236219e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.236519e+03 ) sec^-1 MeanMatrixElemValue = ( 1.856249e-04 +- 8.329951e-05 ) GeV^-6 -TOTAL : 3.992694 sec - 13,259,807,293 cycles # 3.074 GHz - 28,358,770,018 instructions # 2.14 insn per cycle - 4.372097905 seconds time elapsed +TOTAL : 4.000973 sec + 12,930,442,379 cycles # 2.989 GHz + 29,717,455,006 instructions # 2.30 insn per cycle + 4.383843078 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 @@ -91,14 +91,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.459684e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.459918e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.459918e+01 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.766343e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.766570e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.766570e+01 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 6.248663 sec - 18,988,584,595 cycles # 3.040 GHz - 55,181,486,093 instructions # 2.91 insn per cycle - 6.255177410 seconds time elapsed +TOTAL : 6.792343 sec + 18,999,792,923 cycles # 2.797 GHz + 55,183,695,010 instructions # 2.90 insn per cycle + 6.800896289 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:44874) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/runTest.exe @@ -118,14 +118,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.644797e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.644886e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.644886e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.590532e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.590619e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.590619e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 3.219465 sec - 9,800,820,480 cycles # 3.043 GHz - 27,056,469,438 instructions # 2.76 insn per cycle - 3.238554302 seconds time elapsed +TOTAL : 3.327161 sec + 9,802,926,786 cycles # 2.947 GHz + 27,058,073,971 instructions # 2.76 insn per cycle + 3.343348188 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:97234) (avx2: 0) (512y: 0) (512z: 0) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/runTest.exe @@ -145,14 +145,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.604560e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.605011e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.605011e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.407792e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.408212e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.408212e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.472881 sec - 4,237,416,434 cycles # 2.874 GHz - 9,565,677,228 instructions # 2.26 insn per cycle - 1.486737544 seconds time elapsed +TOTAL : 1.558543 sec + 4,283,401,276 cycles # 2.745 GHz + 9,566,262,483 instructions # 2.23 insn per cycle + 1.572820589 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:84279) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/runTest.exe @@ -172,14 +172,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.144612e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.145182e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.145182e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.081323e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.081986e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.081986e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.281750 sec - 3,691,573,169 cycles # 2.879 GHz - 8,451,507,089 instructions # 2.29 insn per cycle - 1.295790607 seconds time elapsed +TOTAL : 1.302229 sec + 3,691,612,964 cycles # 2.831 GHz + 8,451,252,932 instructions # 2.29 insn per cycle + 1.313920154 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:79441) (512y: 90) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/runTest.exe @@ -199,14 +199,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.753048e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.753610e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.753610e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.573280e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.573851e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.573851e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.417368 sec - 2,687,064,932 cycles # 1.895 GHz - 4,249,631,483 instructions # 1.58 insn per cycle - 1.428587401 seconds time elapsed +TOTAL : 1.491172 sec + 2,701,853,841 cycles # 1.809 GHz + 4,249,729,716 instructions # 1.57 insn per cycle + 1.510680407 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) 
(avx2: 2166) (512y: 90) (512z:78318) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0_bridge.txt index d14df13526..397fb214c3 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0_bridge.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2024-01-25_23:56:53 +DATE: 2024-01-27_19:25:25 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -54,14 +54,14 @@ WARNING! Set grid in Bridge (nevt=256, gpublocks=1, gputhreads=256, gpublocks*gp Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.066232e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.067169e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.067169e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.063308e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.064264e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.064264e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 2.360921 sec - 8,190,424,340 cycles # 3.057 GHz - 18,278,004,261 instructions # 2.23 insn per cycle - 2.736136962 seconds time elapsed +TOTAL : 2.377552 sec + 7,911,457,318 cycles # 2.939 GHz + 17,776,798,693 instructions # 2.25 insn per cycle + 2.754050489 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe -p 1 256 1 --bridge WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost @@ -80,14 +80,14 @@ WARNING! 
Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.223953e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.258483e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.258483e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.220144e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.252068e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.252068e+03 ) sec^-1 MeanMatrixElemValue = ( 1.856249e-04 +- 8.329951e-05 ) GeV^-6 -TOTAL : 3.978912 sec - 13,124,273,934 cycles # 3.053 GHz - 28,270,976,701 instructions # 2.15 insn per cycle - 4.357372054 seconds time elapsed +TOTAL : 3.978881 sec + 12,946,564,273 cycles # 3.008 GHz + 28,230,768,091 instructions # 2.18 insn per cycle + 4.358810997 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 @@ -104,14 +104,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.439754e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.439992e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.439992e+01 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.364296e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.364564e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.364564e+01 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 6.260228 sec - 18,994,991,555 cycles # 3.033 GHz - 55,179,721,591 instructions # 2.90 insn per cycle - 6.264831927 seconds time elapsed +TOTAL : 6.317070 sec + 19,020,611,022 cycles # 3.010 GHz + 55,181,937,523 instructions # 2.90 insn per cycle + 6.322014476 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:44874) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/runTest.exe @@ -132,14 +132,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.667946e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.668036e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.668036e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.612783e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.612876e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.612876e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 3.170506 sec - 9,795,795,334 cycles # 3.086 GHz - 27,055,731,512 instructions # 2.76 insn per cycle - 3.175238264 seconds time elapsed +TOTAL : 3.279226 sec + 9,851,493,440 cycles # 3.001 GHz + 27,056,657,499 instructions # 2.75 insn per cycle + 3.284240204 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:97234) (avx2: 0) (512y: 0) (512z: 0) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/runTest.exe @@ -160,14 +160,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.606417e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.606863e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.606863e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.513583e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.514021e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.514021e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.470037 sec - 4,243,836,214 cycles # 2.880 GHz - 9,565,087,053 instructions # 2.25 insn per cycle - 1.474653489 seconds time elapsed +TOTAL : 1.509181 sec + 4,246,413,990 cycles # 2.806 GHz + 9,565,193,031 instructions # 2.25 insn per cycle + 1.514258491 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:84279) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/runTest.exe @@ -188,14 +188,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.145885e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.146454e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.146454e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.030244e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.030835e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.030835e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.280022 sec - 3,687,476,774 cycles # 2.873 GHz - 8,450,693,643 instructions # 2.29 insn per cycle - 1.284626579 seconds time elapsed +TOTAL : 1.315714 sec + 3,690,233,076 cycles # 2.797 GHz + 8,450,714,839 instructions # 2.29 insn per cycle + 1.320771644 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:79441) (512y: 90) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/runTest.exe @@ -216,14 +216,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.747174e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.747808e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.747808e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.617080e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.617634e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.617634e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.414790 sec - 2,682,388,196 cycles # 1.891 GHz - 4,248,704,675 instructions # 1.58 insn per cycle - 1.419412677 seconds time elapsed +TOTAL : 1.468596 sec + 2,687,627,626 cycles # 1.828 GHz + 4,249,748,844 instructions # 1.58 insn per cycle + 1.473675650 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) 
(avx2: 2166) (512y: 90) (512z:78318) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd1.txt index b9e0b80718..5295435e83 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd1.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2024-01-25_23:15:32 +DATE: 2024-01-27_18:43:09 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.069962e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.070348e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.070466e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.071223e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.071654e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.071839e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 2.452966 sec - 8,392,911,107 cycles # 3.064 GHz - 17,663,923,974 instructions # 2.10 insn per cycle - 2.847032927 seconds time elapsed +TOTAL : 2.430534 sec + 8,241,056,474 cycles # 2.996 GHz + 17,668,710,833 instructions # 2.14 insn per cycle + 2.817128458 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/gcheck.exe -p 1 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -68,14 +68,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.276690e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.279063e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.279281e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.234923e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.237215e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.237486e+03 ) sec^-1 MeanMatrixElemValue = ( 1.856249e-04 +- 8.329951e-05 ) GeV^-6 -TOTAL : 3.985081 sec - 13,287,476,031 cycles # 3.079 GHz - 30,163,648,592 instructions # 2.27 insn per cycle - 4.374483515 seconds time elapsed +TOTAL : 3.996420 sec + 12,989,143,492 cycles # 3.004 GHz + 30,886,524,025 instructions # 2.38 insn per cycle + 4.378619453 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/fgcheck.exe 2 64 2 @@ -91,14 +91,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.571320e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.571572e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.571572e+01 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.363577e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.363813e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.363813e+01 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 6.171909 sec - 18,889,408,426 cycles # 3.061 GHz - 55,158,086,845 instructions # 2.92 insn per cycle - 6.178617455 seconds time elapsed +TOTAL : 6.317358 sec + 18,899,007,109 cycles # 2.990 GHz + 55,157,851,644 instructions # 2.92 insn per cycle + 6.322297725 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:44747) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/runTest.exe @@ -118,14 +118,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.672519e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.672641e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.672641e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.616667e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.616762e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.616762e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 3.167587 sec - 9,791,706,585 cycles # 3.092 GHz - 27,064,728,203 instructions # 2.76 insn per cycle - 3.180382611 seconds time elapsed +TOTAL : 3.272043 sec + 9,878,457,696 cycles # 3.015 GHz + 27,063,242,954 instructions # 2.74 insn per cycle + 3.284292649 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:97230) (avx2: 0) (512y: 0) (512z: 0) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd1/runTest.exe @@ -145,14 +145,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.614879e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.615320e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.615320e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.539659e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.540140e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.540140e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.471483 sec - 4,258,635,939 cycles # 2.892 GHz - 9,569,764,673 instructions # 2.25 insn per cycle - 1.483574248 seconds time elapsed +TOTAL : 1.497660 sec + 4,224,561,051 cycles # 2.813 GHz + 9,568,643,351 instructions # 2.27 insn per cycle + 1.508422323 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:84249) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd1/runTest.exe @@ -172,14 +172,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.103182e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.103803e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.103803e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.989197e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.989799e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.989799e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.296986 sec - 3,734,622,853 cycles # 2.878 GHz - 8,454,833,333 instructions # 2.26 insn per cycle - 1.308768086 seconds time elapsed +TOTAL : 1.328783 sec + 3,741,136,636 cycles # 2.806 GHz + 8,454,541,804 instructions # 2.26 insn per cycle + 1.342771339 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:79386) (512y: 90) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd1/runTest.exe @@ -199,14 +199,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.743925e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.744483e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.744483e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.611038e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.611578e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.611578e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.417381 sec - 2,690,766,593 cycles # 1.895 GHz - 4,250,770,661 instructions # 1.58 insn per cycle - 1.429533524 seconds time elapsed +TOTAL : 1.470091 sec + 2,681,169,729 cycles # 1.819 GHz + 4,250,171,659 instructions # 1.59 insn per cycle + 1.480911575 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) 
(avx2: 2130) (512y: 90) (512z:78289) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd1/runTest.exe diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt index 8f44531fc7..c5db4e21ff 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2024-01-25_23:16:35 +DATE: 2024-01-27_18:44:13 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.764615e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.765451e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.765700e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.761759e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.762767e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.763202e+02 ) sec^-1 MeanMatrixElemValue = ( 1.186984e-05 +- 9.824899e-06 ) GeV^-6 -TOTAL : 1.693480 sec - 5,887,378,168 cycles # 3.045 GHz - 12,371,650,873 instructions # 2.10 insn per cycle - 2.055112195 seconds time elapsed +TOTAL : 1.665455 sec + 5,816,169,406 cycles # 2.981 GHz + 12,222,800,082 instructions # 2.10 insn per cycle + 2.007943438 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe -p 1 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -68,14 +68,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.347276e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.348063e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.348165e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.318123e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.318911e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.319041e+04 ) sec^-1 MeanMatrixElemValue = ( 1.856829e-04 +- 8.333435e-05 ) GeV^-6 -TOTAL : 1.911661 sec - 6,533,877,574 cycles # 2.984 GHz - 12,896,572,372 instructions # 1.97 insn per cycle - 2.245605774 seconds time elapsed +TOTAL : 1.920955 sec + 6,578,417,034 cycles # 2.994 GHz + 13,330,489,318 instructions # 2.03 insn per cycle + 2.257185780 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 @@ -91,14 +91,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.267727e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.268016e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.268016e+01 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.995520e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.995824e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.995824e+01 ) sec^-1 MeanMatrixElemValue = ( 1.187013e-05 +- 9.825040e-06 ) GeV^-6 -TOTAL : 5.704766 sec - 17,594,191,683 cycles # 3.083 GHz - 51,786,449,538 instructions # 2.94 insn per cycle - 5.712781183 seconds time elapsed +TOTAL : 5.875855 sec + 17,760,593,846 cycles # 3.021 GHz + 51,788,652,435 instructions # 2.92 insn per cycle + 5.880892508 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:27812) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/runTest.exe @@ -118,14 +118,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.581700e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.582144e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.582144e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.478616e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.479057e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.479057e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187013e-05 +- 9.825038e-06 ) GeV^-6 -TOTAL : 1.482036 sec - 4,536,858,867 cycles # 3.058 GHz - 13,760,139,415 instructions # 3.03 insn per cycle - 1.496097222 seconds time elapsed +TOTAL : 1.522886 sec + 4,542,905,565 cycles # 2.976 GHz + 13,759,097,003 instructions # 3.03 insn per cycle + 1.528290141 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:97762) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- 
runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/runTest.exe @@ -145,14 +145,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.282359e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.284137e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.284137e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.037647e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.039406e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.039406e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187187e-05 +- 9.826763e-06 ) GeV^-6 -TOTAL : 0.733254 sec - 2,135,927,022 cycles # 2.909 GHz - 4,827,375,326 instructions # 2.26 insn per cycle - 0.746092791 seconds time elapsed +TOTAL : 0.756451 sec + 2,139,969,692 cycles # 2.814 GHz + 4,825,813,699 instructions # 2.26 insn per cycle + 0.761605193 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:84831) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/runTest.exe @@ -172,14 +172,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.194354e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.196559e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.196559e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.942628e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.944917e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.944917e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187187e-05 +- 9.826763e-06 ) GeV^-6 -TOTAL : 0.653785 sec - 1,887,673,749 cycles # 2.887 GHz - 4,259,987,023 instructions # 2.26 insn per cycle - 0.666506032 seconds time elapsed +TOTAL : 0.670660 sec + 1,891,175,209 cycles # 2.803 GHz + 4,258,254,106 instructions # 2.25 insn per cycle + 0.675764794 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:80038) (512y: 46) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/runTest.exe @@ -199,14 +199,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.532413e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.534737e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.534737e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.220293e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.222452e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.222452e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187188e-05 +- 9.826770e-06 ) GeV^-6 -TOTAL : 0.709782 sec - 1,351,081,905 cycles # 1.902 GHz - 2,148,573,752 instructions # 1.59 insn per cycle - 0.723580289 seconds time elapsed +TOTAL : 0.738034 sec + 1,362,584,884 cycles # 1.836 GHz + 2,147,140,575 instructions # 1.58 insn per cycle + 0.743262931 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2820) (512y: 44) (512z:78510) 
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/runTest.exe
diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0_bridge.txt
index d30e3f5e8c..d666735ca2 100644
--- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0_bridge.txt
+++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0_bridge.txt
@@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
-DATE: 2024-01-25_23:57:56
+DATE: 2024-01-27_19:26:29
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -54,14 +54,14 @@ WARNING! Set grid in Bridge (nevt=256, gpublocks=1, gputhreads=256, gpublocks*gp
 Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 6.804220e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.806054e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.806054e+02 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 6.806421e+02 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 6.808416e+02 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.808416e+02 ) sec^-1
 MeanMatrixElemValue = ( 1.187094e-05 +- 9.825664e-06 ) GeV^-6
-TOTAL : 1.599978 sec
- 5,690,698,341 cycles # 3.038 GHz
- 11,869,605,294 instructions # 2.09 insn per cycle
- 1.930855145 seconds time elapsed
+TOTAL : 1.603735 sec
+ 5,618,263,113 cycles # 2.992 GHz
+ 11,482,646,019 instructions # 2.04 insn per cycle
+ 1.935006282 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe -p 1 256 1 --bridge
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost
@@ -80,14 +80,14 @@ WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks
 Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 2.357820e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.371717e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.371717e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.340764e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.354569e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.354569e+04 ) sec^-1
 MeanMatrixElemValue = ( 1.856441e-04 +- 8.331096e-05 ) GeV^-6
-TOTAL : 1.854511 sec
- 6,501,890,517 cycles # 3.055 GHz
- 13,932,068,655 instructions # 2.14 insn per cycle
- 2.184917483 seconds time elapsed
+TOTAL : 1.886823 sec
+ 6,496,931,993 cycles # 3.000 GHz
+ 13,298,151,535 instructions # 2.05 insn per cycle
+ 2.222816329 seconds time elapsed
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2
@@ -104,14 +104,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 9.275864e+01 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 9.276185e+01 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 9.276185e+01 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 9.077212e+01 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 9.077512e+01 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 9.077512e+01 ) sec^-1
 MeanMatrixElemValue = ( 1.187013e-05 +- 9.825040e-06 ) GeV^-6
-TOTAL : 5.697274 sec
- 17,573,288,349 cycles # 3.083 GHz
- 51,786,308,864 instructions # 2.95 insn per cycle
- 5.702038900 seconds time elapsed
+TOTAL : 5.822432 sec
+ 17,590,382,766 cycles # 3.020 GHz
+ 51,787,107,788 instructions # 2.94 insn per cycle
+ 5.827233952 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4:27812) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/runTest.exe
@@ -132,14 +132,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 3.423474e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.423884e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.423884e+02 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.522700e+02 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.523145e+02 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.523145e+02 ) sec^-1
 MeanMatrixElemValue = ( 1.187013e-05 +- 9.825038e-06 ) GeV^-6
-TOTAL : 1.547633 sec
- 4,547,457,091 cycles # 2.942 GHz
- 13,762,480,283 instructions # 3.03 insn per cycle
- 1.552657618 seconds time elapsed
+TOTAL : 1.505118 sec
+ 4,557,873,954 cycles # 3.020 GHz
+ 13,759,118,019 instructions # 3.02 insn per cycle
+ 1.510488242 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4:97762) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/runTest.exe
@@ -160,14 +160,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 7.199242e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.200979e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.200979e+02 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 7.001081e+02 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.002839e+02 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.002839e+02 ) sec^-1
 MeanMatrixElemValue = ( 1.187187e-05 +- 9.826763e-06 ) GeV^-6
-TOTAL : 0.738658 sec
- 2,137,172,645 cycles # 2.879 GHz
- 4,826,755,817 instructions # 2.26 insn per cycle
- 0.743342702 seconds time elapsed
+TOTAL : 0.760817 sec
+ 2,141,009,943 cycles # 2.800 GHz
+ 4,826,771,994 instructions # 2.25 insn per cycle
+ 0.765894845 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2:84831) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/runTest.exe
@@ -188,14 +188,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 8.219482e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.221665e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.221665e+02 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 8.013615e+02 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 8.015939e+02 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.015939e+02 ) sec^-1
 MeanMatrixElemValue = ( 1.187187e-05 +- 9.826763e-06 ) GeV^-6
-TOTAL : 0.647740 sec
- 1,878,162,742 cycles # 2.884 GHz
- 4,259,215,943 instructions # 2.27 insn per cycle
- 0.652438590 seconds time elapsed
+TOTAL : 0.664865 sec
+ 1,881,921,417 cycles # 2.814 GHz
+ 4,259,199,583 instructions # 2.26 insn per cycle
+ 0.669793712 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2:80038) (512y: 46) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/runTest.exe
@@ -216,14 +216,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 7.494885e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.497437e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.497437e+02 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 7.319978e+02 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.322349e+02 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.322349e+02 ) sec^-1
 MeanMatrixElemValue = ( 1.187188e-05 +- 9.826770e-06 ) GeV^-6
-TOTAL : 0.709876 sec
- 1,351,727,984 cycles # 1.894 GHz
- 2,148,012,718 instructions # 1.59 insn per cycle
- 0.714595281 seconds time elapsed
+TOTAL : 0.728687 sec
+ 1,353,147,876 cycles # 1.847 GHz
+ 2,147,964,410 instructions # 1.59 insn per cycle
+ 0.733659134 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2820) (512y: 44) (512z:78510)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/runTest.exe
diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd1.txt
index b474950e2a..1452fdeeca 100644
--- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd1.txt
+++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd1.txt
@@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd1'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
-DATE: 2024-01-25_23:17:22
+DATE: 2024-01-27_18:45:01
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 6.767740e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.768599e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.768840e+02 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 6.763283e+02 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 6.764164e+02 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.764726e+02 ) sec^-1
 MeanMatrixElemValue = ( 1.186984e-05 +- 9.824899e-06 ) GeV^-6
-TOTAL : 1.689539 sec
- 5,904,330,680 cycles # 3.057 GHz
- 12,131,099,993 instructions # 2.05 insn per cycle
- 2.040870008 seconds time elapsed
+TOTAL : 1.666806 sec
+ 5,766,686,507 cycles # 2.966 GHz
+ 11,530,148,505 instructions # 2.00 insn per cycle
+ 2.001410376 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/gcheck.exe -p 1 256 1
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
@@ -68,14 +68,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 2.326271e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.327058e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.327155e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.328340e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.329144e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.329276e+04 ) sec^-1
 MeanMatrixElemValue = ( 1.856829e-04 +- 8.333435e-05 ) GeV^-6
-TOTAL : 1.919073 sec
- 6,721,403,827 cycles # 3.061 GHz
- 13,909,706,030 instructions # 2.07 insn per cycle
- 2.252811447 seconds time elapsed
+TOTAL : 1.924938 sec
+ 6,578,277,944 cycles # 2.973 GHz
+ 13,443,420,241 instructions # 2.04 insn per cycle
+ 2.272640588 seconds time elapsed
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/gcheck.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/fgcheck.exe 2 64 2
@@ -91,14 +91,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 9.194831e+01 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 9.195117e+01 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 9.195117e+01 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 9.038426e+01 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 9.038714e+01 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 9.038714e+01 ) sec^-1
 MeanMatrixElemValue = ( 1.187013e-05 +- 9.825040e-06 ) GeV^-6
-TOTAL : 5.760749 sec
- 17,569,361,335 cycles # 3.054 GHz
- 51,761,329,976 instructions # 2.95 insn per cycle
- 5.767466141 seconds time elapsed
+TOTAL : 5.850564 sec
+ 17,645,414,933 cycles # 3.015 GHz
+ 51,759,467,370 instructions # 2.93 insn per cycle
+ 5.855310396 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4:27678) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/runTest.exe
@@ -118,14 +118,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 3.611039e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.611537e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.611537e+02 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.517615e+02 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.518094e+02 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.518094e+02 ) sec^-1
 MeanMatrixElemValue = ( 1.187013e-05 +- 9.825038e-06 ) GeV^-6
-TOTAL : 1.472926 sec
- 4,539,290,938 cycles # 3.083 GHz
- 13,757,920,525 instructions # 3.03 insn per cycle
- 1.488362352 seconds time elapsed
+TOTAL : 1.507049 sec
+ 4,549,371,148 cycles # 3.011 GHz
+ 13,756,628,094 instructions # 3.02 insn per cycle
+ 1.512083566 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4:97728) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/runTest.exe
@@ -145,14 +145,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 7.269368e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.271114e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.271114e+02 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 7.072859e+02 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.074649e+02 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.074649e+02 ) sec^-1
 MeanMatrixElemValue = ( 1.187187e-05 +- 9.826763e-06 ) GeV^-6
-TOTAL : 0.735827 sec
- 2,125,786,473 cycles # 2.890 GHz
- 4,826,595,787 instructions # 2.27 insn per cycle
- 0.748208117 seconds time elapsed
+TOTAL : 0.752766 sec
+ 2,125,890,254 cycles # 2.809 GHz
+ 4,825,175,981 instructions # 2.27 insn per cycle
+ 0.757781550 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2:84793) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/runTest.exe
@@ -172,14 +172,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 7.835657e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.837931e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.837931e+02 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 8.077641e+02 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 8.080007e+02 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.080007e+02 ) sec^-1
 MeanMatrixElemValue = ( 1.187187e-05 +- 9.826763e-06 ) GeV^-6
-TOTAL : 0.682597 sec
- 1,877,053,974 cycles # 2.749 GHz
- 4,259,114,516 instructions # 2.27 insn per cycle
- 0.697649288 seconds time elapsed
+TOTAL : 0.660009 sec
+ 1,859,970,750 cycles # 2.801 GHz
+ 4,257,370,977 instructions # 2.29 insn per cycle
+ 0.665023060 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2:79978) (512y: 46) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd1/runTest.exe
@@ -199,14 +199,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 7.527647e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.530226e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.530226e+02 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 6.821540e+02 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 6.823711e+02 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.823711e+02 ) sec^-1
 MeanMatrixElemValue = ( 1.187188e-05 +- 9.826770e-06 ) GeV^-6
-TOTAL : 0.709194 sec
- 1,355,297,540 cycles # 1.909 GHz
- 2,148,050,912 instructions # 1.58 insn per cycle
- 0.723531266 seconds time elapsed
+TOTAL : 0.780669 sec
+ 1,354,191,609 cycles # 1.726 GHz
+ 2,146,469,836 instructions # 1.59 insn per cycle
+ 0.785741588 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2776) (512y: 44) (512z:78501)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd1/runTest.exe
diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt
index 8d2448ca7c..d5b3f2a192 100644
--- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt
+++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt
@@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
-DATE: 2024-01-25_23:18:09
+DATE: 2024-01-27_18:45:49
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 4.696404e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.696917e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.697059e+02 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 4.695527e+02 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.696068e+02 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.696270e+02 ) sec^-1
 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6
-TOTAL : 2.203037 sec
- 7,518,198,042 cycles # 3.016 GHz
- 16,792,821,670 instructions # 2.23 insn per cycle
- 2.605923217 seconds time elapsed
+TOTAL : 2.173361 sec
+ 7,464,389,868 cycles # 2.992 GHz
+ 15,636,784,035 instructions # 2.09 insn per cycle
+ 2.552006424 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/gcheck.exe -p 1 256 1
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
@@ -68,14 +68,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 1.109293e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.109610e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.109640e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.114011e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.114348e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.114390e+04 ) sec^-1
 MeanMatrixElemValue = ( 1.856249e-04 +- 8.329951e-05 ) GeV^-6
-TOTAL : 3.398117 sec
- 11,465,669,090 cycles # 3.081 GHz
- 24,712,533,321 instructions # 2.16 insn per cycle
- 3.778631492 seconds time elapsed
+TOTAL : 3.404734 sec
+ 11,172,869,025 cycles # 2.990 GHz
+ 24,656,254,293 instructions # 2.21 insn per cycle
+ 3.795433378 seconds time elapsed
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/gcheck.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/fgcheck.exe 2 64 2
@@ -91,14 +91,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 8.507104e+01 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.507413e+01 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.507413e+01 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 8.240396e+01 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 8.240663e+01 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.240663e+01 ) sec^-1
 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6
-TOTAL : 6.220283 sec
- 19,249,910,006 cycles # 3.096 GHz
- 55,389,631,331 instructions # 2.88 insn per cycle
- 6.227350768 seconds time elapsed
+TOTAL : 6.423863 sec
+ 19,317,508,811 cycles # 3.006 GHz
+ 55,390,994,540 instructions # 2.87 insn per cycle
+ 6.429032251 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4:44898) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/runTest.exe
@@ -118,14 +118,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.636577e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.636668e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.636668e+02 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.584594e+02 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.584688e+02 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.584688e+02 ) sec^-1
 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6
-TOTAL : 3.235386 sec
- 9,345,666,755 cycles # 2.888 GHz
- 25,874,626,472 instructions # 2.77 insn per cycle
- 3.246263387 seconds time elapsed
+TOTAL : 3.340379 sec
+ 9,375,006,047 cycles # 2.805 GHz
+ 25,873,963,058 instructions # 2.76 insn per cycle
+ 3.345266262 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4:96804) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/runTest.exe
@@ -145,14 +145,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 3.840519e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.841057e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.841057e+02 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.735273e+02 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.735783e+02 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.735783e+02 ) sec^-1
 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6
-TOTAL : 1.382711 sec
- 3,999,150,321 cycles # 2.888 GHz
- 9,120,253,087 instructions # 2.28 insn per cycle
- 1.394370004 seconds time elapsed
+TOTAL : 1.419524 sec
+ 4,002,951,295 cycles # 2.813 GHz
+ 9,118,794,093 instructions # 2.28 insn per cycle
+ 1.424775624 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2:83820) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/runTest.exe
@@ -172,14 +172,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 4.372720e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.373367e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.373367e+02 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 4.294481e+02 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.295107e+02 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.295107e+02 ) sec^-1
 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6
-TOTAL : 1.214906 sec
- 3,512,659,413 cycles # 2.887 GHz
- 8,030,227,038 instructions # 2.29 insn per cycle
- 1.227544699 seconds time elapsed
+TOTAL : 1.235904 sec
+ 3,506,973,693 cycles # 2.829 GHz
+ 8,028,693,608 instructions # 2.29 insn per cycle
+ 1.240980401 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2:79028) (512y: 70) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd0/runTest.exe
@@ -199,14 +199,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 3.939019e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.939632e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.939632e+02 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.758822e+02 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.759478e+02 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.759478e+02 ) sec^-1
 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6
-TOTAL : 1.349746 sec
- 2,597,586,119 cycles # 1.922 GHz
- 4,075,915,870 instructions # 1.57 insn per cycle
- 1.364260117 seconds time elapsed
+TOTAL : 1.411257 sec
+ 2,599,112,782 cycles # 1.836 GHz
+ 4,074,815,519 instructions # 1.57 insn per cycle
+ 1.416720153 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1903) (512y: 70) (512z:78042)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd0/runTest.exe
diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd1.txt
index 0ea566946e..d966c21cae 100644
--- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd1.txt
+++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd1.txt
@@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_m_inl0_hrd1'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
-DATE: 2024-01-25_23:19:10
+DATE: 2024-01-27_18:46:50
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 4.694468e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.695070e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.695213e+02 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 4.691341e+02 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.691858e+02 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.692072e+02 ) sec^-1
 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6
-TOTAL : 2.171307 sec
- 7,653,950,084 cycles # 3.075 GHz
- 15,514,206,763 instructions # 2.03 insn per cycle
- 2.552702695 seconds time elapsed
+TOTAL : 2.174115 sec
+ 7,484,543,519 cycles # 2.999 GHz
+ 16,704,419,809 instructions # 2.23 insn per cycle
+ 2.552906187 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/gcheck.exe -p 1 256 1
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
@@ -68,14 +68,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 1.105262e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.105578e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.105614e+04 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.104165e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.104486e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.104527e+04 ) sec^-1
 MeanMatrixElemValue = ( 1.856249e-04 +- 8.329951e-05 ) GeV^-6
-TOTAL : 3.405437 sec
- 11,415,976,705 cycles # 3.063 GHz
- 24,971,465,951 instructions # 2.19 insn per cycle
- 3.786382890 seconds time elapsed
+TOTAL : 3.414511 sec
+ 11,184,338,121 cycles # 2.992 GHz
+ 24,773,735,470 instructions # 2.22 insn per cycle
+ 3.797778444 seconds time elapsed
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/gcheck.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/fgcheck.exe 2 64 2
@@ -91,14 +91,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 8.498313e+01 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.498549e+01 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.498549e+01 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 7.732242e+01 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.732454e+01 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.732454e+01 ) sec^-1
 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6
-TOTAL : 6.221756 sec
- 19,207,086,158 cycles # 3.086 GHz
- 55,417,722,841 instructions # 2.89 insn per cycle
- 6.226328389 seconds time elapsed
+TOTAL : 6.829477 sec
+ 19,209,542,583 cycles # 2.812 GHz
+ 55,420,380,527 instructions # 2.89 insn per cycle
+ 6.834484266 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4:44806) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/runTest.exe
@@ -118,14 +118,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.599487e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.599575e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.599575e+02 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.609123e+02 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.609214e+02 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.609214e+02 ) sec^-1
 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6
-TOTAL : 3.303650 sec
- 9,295,885,471 cycles # 2.810 GHz
- 25,822,717,776 instructions # 2.78 insn per cycle
- 3.314418026 seconds time elapsed
+TOTAL : 3.289973 sec
+ 9,309,252,956 cycles # 2.828 GHz
+ 25,822,376,754 instructions # 2.77 insn per cycle
+ 3.295080287 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4:96765) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/runTest.exe
@@ -145,14 +145,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 3.826926e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.827419e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.827419e+02 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.752434e+02 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.752989e+02 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.752989e+02 ) sec^-1
 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6
-TOTAL : 1.384736 sec
- 3,999,912,467 cycles # 2.881 GHz
- 9,099,410,094 instructions # 2.27 insn per cycle
- 1.394578954 seconds time elapsed
+TOTAL : 1.414298 sec
+ 3,995,880,255 cycles # 2.817 GHz
+ 9,098,295,505 instructions # 2.28 insn per cycle
+ 1.419531270 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2:83378) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/runTest.exe
@@ -172,14 +172,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 4.408309e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.408938e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.408938e+02 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 4.317543e+02 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.318199e+02 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.318199e+02 ) sec^-1
 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6
-TOTAL : 1.203548 sec
- 3,478,041,980 cycles # 2.880 GHz
- 8,009,965,268 instructions # 2.30 insn per cycle
- 1.214113762 seconds time elapsed
+TOTAL : 1.229231 sec
+ 3,482,074,372 cycles # 2.824 GHz
+ 8,009,633,949 instructions # 2.30 insn per cycle
+ 1.234207236 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2:78540) (512y: 70) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd1/runTest.exe
@@ -199,14 +199,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 3.878573e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.879207e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.879207e+02 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.719747e+02 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.720359e+02 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.720359e+02 ) sec^-1
 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6
-TOTAL : 1.367393 sec
- 2,595,773,085 cycles # 1.893 GHz
- 4,065,040,917 instructions # 1.57 insn per cycle
- 1.378860552 seconds time elapsed
+TOTAL : 1.425822 sec
+ 2,595,022,817 cycles # 1.815 GHz
+ 4,064,590,341 instructions # 1.57 insn per cycle
+ 1.430892684 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1420) (512y: 70) (512z:78026)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd1/runTest.exe
diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt
index 190eac2ebb..4462f3455a 100644
--- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt
+++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt
@@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
-DATE: 2024-01-25_23:12:42
+DATE: 2024-01-27_18:40:14
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 2.660405e+07 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.252171e+07 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 5.620562e+07 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.639719e+07 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.238275e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.606410e+07 ) sec^-1
 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2
-TOTAL : 0.445028 sec
- 1,996,271,131 cycles # 3.002 GHz
- 2,802,769,990 instructions # 1.40 insn per cycle
- 0.740314258 seconds time elapsed
+TOTAL : 0.450396 sec
+ 1,947,310,059 cycles # 2.928 GHz
+ 2,755,033,645 instructions # 1.41 insn per cycle
+ 0.740603136 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
@@ -68,14 +68,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 3.248695e+07 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.090608e+07 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.502306e+07 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.241115e+07 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.110220e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.531234e+07 ) sec^-1
 MeanMatrixElemValue = ( 2.602505e+02 +- 2.116328e+02 ) GeV^-2
-TOTAL : 0.528546 sec
- 2,313,689,489 cycles # 3.026 GHz
- 3,291,661,091 instructions # 1.42 insn per cycle
- 0.824068617 seconds time elapsed
+TOTAL : 0.533761 sec
+ 2,265,806,755 cycles # 2.927 GHz
+ 3,221,324,415 instructions # 1.42 insn per cycle
+ 0.832968038 seconds time elapsed
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2
@@ -91,14 +91,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.058020e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.079223e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.079223e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.020352e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.042032e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.042032e+05 ) sec^-1
 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2
-TOTAL : 1.570332 sec
- 4,875,303,116 cycles # 3.097 GHz
- 13,800,130,515 instructions # 2.83 insn per cycle
- 1.577794250 seconds time elapsed
+TOTAL : 1.629272 sec
+ 4,894,825,472 cycles # 2.998 GHz
+ 13,801,188,692 instructions # 2.82 insn per cycle
+ 1.636458250 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 1166) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/runTest.exe
@@ -118,14 +118,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.033034e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.112454e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.112454e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.977778e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.057292e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.057292e+05 ) sec^-1
 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2
-TOTAL : 0.827517 sec
- 2,561,602,187 cycles # 3.079 GHz
- 7,401,027,646 instructions # 2.89 insn per cycle
- 0.842056783 seconds time elapsed
+TOTAL : 0.851626 sec
+ 2,574,234,654 cycles # 3.006 GHz
+ 7,401,126,330 instructions # 2.88 insn per cycle
+ 0.868169150 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 2895) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/runTest.exe
@@ -145,14 +145,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 3.412853e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.634825e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.634825e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.313825e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.541459e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.541459e+05 ) sec^-1
 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2
-TOTAL : 0.501571 sec
- 1,470,508,906 cycles # 2.907 GHz
- 3,136,855,793 instructions # 2.13 insn per cycle
- 0.517188475 seconds time elapsed
+TOTAL : 0.517367 sec
+ 1,479,430,994 cycles # 2.833 GHz
+ 3,136,844,595 instructions # 2.12 insn per cycle
+ 0.530727828 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2890) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/runTest.exe
@@ -172,14 +172,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 3.868351e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.162767e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.162767e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.745507e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.030695e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.030695e+05 ) sec^-1
 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2
-TOTAL : 0.445204 sec
- 1,306,234,962 cycles # 2.906 GHz
- 2,923,536,957 instructions # 2.24 insn per cycle
- 0.456330410 seconds time elapsed
+TOTAL : 0.460291 sec
+ 1,313,909,527 cycles # 2.825 GHz
+ 2,923,525,061 instructions # 2.23 insn per cycle
+ 0.475368269 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2543) (512y: 93) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/runTest.exe
@@ -199,14 +199,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.674321e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.813205e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.813205e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.584731e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.722315e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.722315e+05 ) sec^-1
 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2
-TOTAL : 0.636437 sec
- 1,265,525,690 cycles # 1.974 GHz
- 1,899,766,661 instructions # 1.50 insn per cycle
- 0.650714621 seconds time elapsed
+TOTAL : 0.658817 sec
+ 1,274,030,369 cycles # 1.920 GHz
+ 1,899,828,951 instructions # 1.49 insn per cycle
+ 0.672755043 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1135) (512y: 62) (512z: 2165)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/runTest.exe
diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0_bridge.txt
index dcc832b3ed..fa3b42477f 100644
--- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0_bridge.txt
+++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0_bridge.txt
@@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
-DATE: 2024-01-25_23:55:12
+DATE: 2024-01-27_19:23:42
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -54,14 +54,14 @@ WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks
 Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 3.666168e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.155134e+07 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.155134e+07 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.534750e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.106106e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.106106e+07 ) sec^-1
 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2
-TOTAL : 0.470776 sec
- 2,054,044,928 cycles # 2.998 GHz
- 3,046,852,590 instructions # 1.48 insn per cycle
- 0.744657602 seconds time elapsed
+TOTAL : 0.477418 sec
+ 2,013,690,496 cycles # 2.928 GHz
+ 3,000,584,425 instructions # 1.49 insn per cycle
+ 0.747869861 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost
@@ -80,14 +80,14 @@ WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublo
 Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 3.291770e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.278677e+07 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.278677e+07 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.202690e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.270294e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.270294e+07 ) sec^-1
 MeanMatrixElemValue = ( 2.602505e+02 +- 2.116328e+02 ) GeV^-2
-TOTAL : 0.751856 sec
- 3,008,653,088 cycles # 3.008 GHz
- 4,577,018,873 instructions # 1.52 insn per cycle
- 1.059330773 seconds time elapsed
+TOTAL : 0.760124 sec
+ 2,966,783,718 cycles # 2.930 GHz
+ 4,516,593,875 instructions # 1.52 insn per cycle
+ 1.070193173 seconds time elapsed
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2
@@ -104,14 +104,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.050038e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.071441e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.071441e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.021921e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.043570e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.043570e+05 ) sec^-1
 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2
-TOTAL : 1.587680 sec
- 4,905,788,321 cycles # 3.082 GHz
- 13,807,026,481 instructions # 2.81 insn per cycle
- 1.592794582 seconds time elapsed
+TOTAL : 1.633650 sec
+ 4,930,350,150 cycles # 3.012 GHz
+ 13,805,734,405 instructions # 2.80 insn per cycle
+ 1.639181796 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 1166) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/runTest.exe
@@ -132,14 +132,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.015012e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.094006e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.094006e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.966319e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.048340e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.048340e+05 ) sec^-1
 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2
-TOTAL : 0.842101 sec
- 2,593,834,262 cycles # 3.065 GHz
- 7,449,872,135 instructions # 2.87 insn per cycle
- 0.847204065 seconds time elapsed
+TOTAL : 0.863129 sec
+ 2,612,967,702 cycles # 3.012 GHz
+ 7,448,123,812 instructions # 2.85 insn per cycle
+ 0.868429334 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 2895) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/runTest.exe
@@ -160,14 +160,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 3.388938e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.615019e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.615019e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.268267e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.501838e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.501838e+05 ) sec^-1
 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2
-TOTAL : 0.512487 sec
- 1,503,564,380 cycles # 2.909 GHz
- 3,186,663,979 instructions # 2.12 insn per cycle
- 0.517541584 seconds time elapsed
+TOTAL : 0.533513 sec
+ 1,524,823,949 cycles # 2.836 GHz
+ 3,186,924,139 instructions # 2.09 insn per cycle
+ 0.538790954 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2890) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/runTest.exe
@@ -188,14 +188,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 3.812650e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.096476e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.096476e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.690302e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.973964e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.973964e+05 ) sec^-1
 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2
-TOTAL : 0.458891 sec
- 1,341,696,556 cycles # 2.900 GHz
- 2,973,096,768 instructions # 2.22 insn per cycle
- 0.464005147 seconds time elapsed
+TOTAL : 0.474007 sec
+ 1,354,698,256 cycles # 2.832 GHz
+ 2,971,821,426 instructions # 2.19 insn per cycle
+ 0.479578735 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2543) (512y: 93) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/runTest.exe
@@ -216,14 +216,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.676739e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.814197e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.814197e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.556123e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.696455e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.696455e+05 ) sec^-1
 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2
-TOTAL : 0.642150 sec
- 1,294,550,001 cycles # 2.004 GHz
- 1,936,842,258 instructions # 1.50 insn per cycle
- 0.647156528 seconds time elapsed
+TOTAL : 0.674208 sec
+ 1,321,719,887 cycles # 1.949 GHz
+ 1,936,933,525 instructions # 1.47 insn per cycle
+ 0.679606999 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1135) (512y: 62) (512z: 2165)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/runTest.exe
diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd1.txt
index c7ee729841..f8687d6f23 100644
--- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd1.txt
+++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd1.txt
@@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd1'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
-DATE: 2024-01-25_23:13:00
+DATE: 2024-01-27_18:40:33
 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
@@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 2.661110e+07 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.191439e+07 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 5.542241e+07 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.642634e+07 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.223079e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.583744e+07 ) sec^-1
 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2
-TOTAL : 0.447354 sec
- 1,962,226,451 cycles # 2.919 GHz
- 2,807,157,696 instructions # 1.43 insn per cycle
- 0.740877930 seconds time elapsed
+TOTAL : 0.452291 sec
+ 1,896,382,083 cycles # 2.843 GHz
+ 2,657,764,456 instructions # 1.40 insn per cycle
+ 0.741104609 seconds time elapsed
 runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/gcheck.exe -p 64 256 1
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
@@ -68,14 +68,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1]
 Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 3.268430e+07 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.027340e+07 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.439288e+07 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.219834e+07 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 6.980842e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.384273e+07 ) sec^-1
 MeanMatrixElemValue = ( 2.602505e+02 +- 2.116328e+02 ) GeV^-2
-TOTAL : 0.527954 sec
- 2,295,849,167 cycles # 2.992 GHz
- 3,276,865,701 instructions # 1.43 insn per cycle
- 0.824712620 seconds time elapsed
+TOTAL : 0.535658 sec
+ 2,270,552,244 cycles # 2.929 GHz
+ 3,223,240,823 instructions # 1.42 insn per cycle
+ 0.833108106 seconds time elapsed
 -------------------------------------------------------------------------
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/gcheck.exe --common -p 2 64 2
 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/fgcheck.exe 2 64 2
@@ -91,14 +91,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.058942e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.080298e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.080298e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.030327e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.051785e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.051785e+05 ) sec^-1
 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2
-TOTAL : 1.568275 sec
- 4,868,810,341 cycles # 3.097 GHz
- 13,807,135,682 instructions # 2.84 insn per cycle
- 1.575280536 seconds time elapsed
+TOTAL : 1.612977 sec
+ 4,885,882,007 cycles # 3.021 GHz
+ 13,807,686,866 instructions # 2.83 insn per cycle
+ 1.620453863 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 1161) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/runTest.exe
@@ -118,14 +118,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.039912e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.118197e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.118197e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.984699e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.065832e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.065832e+05 ) sec^-1
 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2
-TOTAL : 0.824293 sec
- 2,558,754,803 cycles # 3.088 GHz
- 7,406,533,816 instructions # 2.89 insn per cycle
- 0.837018986 seconds time elapsed
+TOTAL : 0.848511 sec
+ 2,571,840,623 cycles # 3.014 GHz
+ 7,406,641,051 instructions # 2.88 insn per cycle
+ 0.865785135 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 2892) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd1/runTest.exe
@@ -145,14 +145,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 3.376928e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.600038e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.600038e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.247727e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.466178e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.466178e+05 ) sec^-1
 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2
-TOTAL : 0.507142 sec
- 1,477,740,773 cycles # 2.889 GHz
- 3,137,433,888 instructions # 2.12 insn per cycle
- 0.518098986 seconds time elapsed
+TOTAL : 0.527497 sec
+ 1,486,490,353 cycles # 2.792 GHz
+ 3,137,529,820 instructions # 2.11 insn per cycle
+ 0.543813521 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2875) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd1/runTest.exe
@@ -172,14 +172,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 3.840459e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.127687e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.127687e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 3.763651e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.053882e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.053882e+05 ) sec^-1
 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2
-TOTAL : 0.448212 sec
- 1,306,019,831 cycles # 2.885 GHz
- 2,925,357,996 instructions # 2.24 insn per cycle
- 0.463433458 seconds time elapsed
+TOTAL : 0.458301 sec
+ 1,313,225,777 cycles # 2.836 GHz
+ 2,925,437,911 instructions # 2.23 insn per cycle
+ 0.472801819 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2527) (512y: 93) (512z: 0)
 -------------------------------------------------------------------------
 runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd1/runTest.exe
@@ -199,14 +199,14 @@ Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
 OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.679673e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.821382e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.821382e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.598173e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.735760e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.735760e+05 ) sec^-1
 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2
-TOTAL : 0.635102 sec
- 1,265,111,451 cycles # 1.978 GHz
- 1,899,601,810 instructions # 1.50 insn per cycle
- 0.647364895 seconds time elapsed
+TOTAL : 0.655276 sec
+ 1,273,182,235 cycles # 1.928 GHz
+ 1,899,785,192 instructions # 1.49 insn per cycle
+ 0.668056369 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1118) (512y: 62) (512z: 2165)
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd1/runTest.exe diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt index c7d9af7104..255696cfca 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2024-01-25_23:13:18 +DATE: 2024-01-27_18:40:52 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.458768e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.221749e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.363529e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.276030e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.197789e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.334861e+08 ) sec^-1 MeanMatrixElemValue = ( 2.018174e+01 +- 1.429492e+01 ) GeV^-2 -TOTAL : 0.437838 sec - 1,957,049,768 cycles # 2.996 GHz - 2,769,299,997 instructions # 1.42 insn per cycle - 0.724675379 seconds time elapsed +TOTAL : 0.446424 sec + 1,957,975,833 cycles # 2.902 GHz + 2,724,877,606 instructions # 1.39 insn per cycle + 0.752539197 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 167 @@ -68,14 +68,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.255427e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.817568e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.965420e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.177695e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.803706e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.957397e+08 ) sec^-1 MeanMatrixElemValue = ( 2.571361e+02 +- 2.114021e+02 ) GeV^-2 -TOTAL : 0.480738 sec - 2,090,911,516 cycles # 2.954 GHz - 2,970,490,097 instructions # 1.42 insn per cycle - 0.765001287 seconds time elapsed +TOTAL : 0.477570 sec + 2,061,739,652 cycles # 2.925 GHz + 2,934,253,634 instructions # 1.42 insn per cycle + 0.762376186 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 @@ -91,14 +91,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.184289e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.211974e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.211974e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.156011e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.183795e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.183795e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018563e+01 +- 1.429902e+01 ) GeV^-2 -TOTAL : 1.403480 sec - 4,337,769,666 cycles # 3.083 GHz - 12,596,317,616 instructions # 2.90 insn per cycle - 1.410558942 seconds time elapsed +TOTAL : 1.439630 sec + 4,349,945,055 cycles # 3.014 GHz + 12,597,057,594 instructions # 2.90 insn per cycle + 1.446297084 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 773) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/runTest.exe @@ -118,14 +118,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.321037e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.548912e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.548912e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.236464e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.466077e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.466077e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018563e+01 +- 1.429902e+01 ) GeV^-2 -TOTAL : 0.513015 sec - 1,589,233,522 cycles # 3.072 GHz - 4,246,299,525 instructions # 2.67 insn per cycle - 0.528221565 seconds time elapsed +TOTAL : 0.527817 sec + 1,596,321,277 cycles # 2.995 GHz + 4,246,542,438 instructions # 2.66 insn per cycle + 0.539092243 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 3265) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/runTest.exe @@ -145,14 +145,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.129047e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.905203e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.905203e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.734911e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.446877e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.446877e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2 -TOTAL : 0.287410 sec - 849,542,990 cycles # 2.910 GHz - 1,915,700,197 instructions # 2.25 insn per cycle - 0.298653599 seconds time elapsed +TOTAL : 0.307736 sec + 852,622,477 cycles # 2.728 GHz + 1,915,871,925 instructions # 2.25 insn per cycle + 0.320337070 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3488) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/runTest.exe @@ -172,14 +172,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.711177e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.657175e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.657175e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.511200e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.444345e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.444345e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2 -TOTAL : 0.264085 sec - 779,082,359 cycles # 2.903 GHz - 1,797,571,945 instructions # 2.31 insn per cycle - 0.277627025 seconds time elapsed +TOTAL : 0.272725 sec + 782,291,543 cycles # 2.820 GHz + 1,797,558,112 instructions # 2.30 insn per cycle + 0.287215058 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3186) (512y: 15) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/runTest.exe @@ -199,14 +199,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.019451e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.552029e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.552029e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.868013e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.379295e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.379295e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018829e+01 +- 1.429922e+01 ) GeV^-2 -TOTAL : 0.348741 sec - 718,379,002 cycles # 2.035 GHz - 1,287,825,813 instructions # 1.79 insn per cycle - 0.361391527 seconds time elapsed +TOTAL : 0.359796 sec + 720,473,569 cycles # 1.977 GHz + 1,287,790,296 instructions # 1.79 insn per cycle + 0.372596873 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1730) (512y: 24) (512z: 2387) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0_bridge.txt index b3562e3bb2..8680fe2d29 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0_bridge.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2024-01-25_23:55:30 +DATE: 2024-01-27_19:24:00 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -54,14 +54,14 @@ WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.713802e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.058372e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.058372e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.329665e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.932243e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.932243e+07 ) sec^-1 MeanMatrixElemValue = ( 2.017654e+01 +- 1.429184e+01 ) GeV^-2 -TOTAL : 0.450219 sec - 1,998,063,586 cycles # 3.001 GHz - 2,936,698,123 instructions # 1.47 insn per cycle - 0.725182400 seconds time elapsed +TOTAL : 0.458470 sec + 1,932,748,670 cycles # 2.884 GHz + 2,880,300,848 instructions # 1.49 insn per cycle + 0.747483766 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost @@ -80,14 +80,14 @@ WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublo Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.246442e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.591827e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.591827e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.128345e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.590470e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.590470e+07 ) sec^-1 MeanMatrixElemValue = ( 2.609942e+02 +- 2.115590e+02 ) GeV^-2 -TOTAL : 0.614457 sec - 2,546,512,744 cycles # 3.012 GHz - 3,859,457,418 instructions # 1.52 insn per cycle - 0.903348815 seconds time elapsed +TOTAL : 0.625994 sec + 2,512,334,849 cycles # 2.925 GHz + 3,779,505,778 instructions # 1.50 insn per cycle + 0.916809729 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 @@ -104,14 +104,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.185133e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.213037e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.213037e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.152331e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.180767e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.180767e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018563e+01 +- 1.429902e+01 ) GeV^-2 -TOTAL : 1.405427 sec - 4,352,574,944 cycles # 3.089 GHz - 12,601,058,264 instructions # 2.90 insn per cycle - 1.410092171 seconds time elapsed +TOTAL : 1.447405 sec + 4,369,343,989 cycles # 3.011 GHz + 12,600,604,701 instructions # 2.88 insn per cycle + 1.452464327 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 773) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/runTest.exe @@ -132,14 +132,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.310454e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.536044e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.536044e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.208026e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.437849e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.437849e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018563e+01 +- 1.429902e+01 ) GeV^-2 -TOTAL : 0.518604 sec - 1,608,829,031 cycles # 3.079 GHz - 4,293,781,748 instructions # 2.67 insn per cycle - 0.523462288 seconds time elapsed +TOTAL : 0.537081 sec + 1,624,227,785 cycles # 3.000 GHz + 4,293,772,611 instructions # 2.64 insn per cycle + 0.542420637 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 3265) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/runTest.exe @@ -160,14 +160,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.559224e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.260897e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.260897e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.879547e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.646923e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.646923e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2 -TOTAL : 0.321312 sec - 876,133,160 cycles # 2.693 GHz - 1,951,948,663 instructions # 2.23 insn per cycle - 0.326491770 seconds time elapsed +TOTAL : 0.304262 sec + 874,283,993 cycles # 2.836 GHz + 1,951,967,000 instructions # 2.23 insn per cycle + 0.309391709 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3488) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/runTest.exe @@ -188,14 +188,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.592091e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.515294e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.515294e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.999359e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.842417e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.842417e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2 -TOTAL : 0.272487 sec - 796,404,987 cycles # 2.881 GHz - 1,834,071,112 instructions # 2.30 insn per cycle - 0.277371242 seconds time elapsed +TOTAL : 0.300020 sec + 806,296,956 cycles # 2.668 GHz + 1,834,830,287 instructions # 2.28 insn per cycle + 0.305527883 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3186) (512y: 15) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/runTest.exe @@ -216,14 +216,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.062069e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.591594e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.591594e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.823582e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.321298e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.321298e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018829e+01 +- 1.429922e+01 ) GeV^-2 -TOTAL : 0.349762 sec - 735,881,550 cycles # 2.080 GHz - 1,329,099,019 instructions # 1.81 insn per cycle - 0.354618508 seconds time elapsed +TOTAL : 0.367386 sec + 744,117,486 cycles # 2.002 GHz + 1,329,029,664 instructions # 1.79 insn per cycle + 0.372710510 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1730) (512y: 24) (512z: 2387) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd1.txt index 6336756753..490050e744 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd1.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2024-01-25_23:13:35 +DATE: 2024-01-27_18:41:10 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.370025e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.209260e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.344655e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.294931e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.189738e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.329663e+08 ) sec^-1 MeanMatrixElemValue = ( 2.018174e+01 +- 1.429492e+01 ) GeV^-2 -TOTAL : 0.438443 sec - 1,955,337,784 cycles # 2.997 GHz - 2,770,402,702 instructions # 1.42 insn per cycle - 0.721946226 seconds time elapsed +TOTAL : 0.445614 sec + 1,926,682,589 cycles # 2.902 GHz + 2,707,428,922 instructions # 1.41 insn per cycle + 0.733083595 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/gcheck.exe -p 64 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 167 @@ -68,14 +68,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.164313e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.773172e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.912434e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.149926e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.777474e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.917051e+08 ) sec^-1 MeanMatrixElemValue = ( 2.571361e+02 +- 2.114021e+02 ) GeV^-2 -TOTAL : 0.474687 sec - 2,104,150,807 cycles # 2.998 GHz - 2,986,785,400 instructions # 1.42 insn per cycle - 0.760875003 seconds time elapsed +TOTAL : 0.479694 sec + 2,060,682,910 cycles # 2.910 GHz + 2,931,090,456 instructions # 1.42 insn per cycle + 0.766292085 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/fgcheck.exe 2 64 2 @@ -91,14 +91,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.186035e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.213662e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.213662e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.155890e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.183617e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.183617e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018563e+01 +- 1.429902e+01 ) GeV^-2 -TOTAL : 1.401162 sec - 4,337,750,504 cycles # 3.088 GHz - 12,587,552,850 instructions # 2.90 insn per cycle - 1.407863482 seconds time elapsed +TOTAL : 1.441481 sec + 4,347,023,152 cycles # 3.011 GHz + 12,588,009,166 instructions # 2.90 insn per cycle + 1.448735748 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 759) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/runTest.exe @@ -118,14 +118,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.324278e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.556123e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.556123e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.253491e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.483392e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.483392e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018563e+01 +- 1.429902e+01 ) GeV^-2 -TOTAL : 0.512613 sec - 1,585,533,498 cycles # 3.068 GHz - 4,241,255,263 instructions # 2.67 insn per cycle - 0.528427282 seconds time elapsed +TOTAL : 0.524401 sec + 1,588,950,296 cycles # 3.004 GHz + 4,240,918,979 instructions # 2.67 insn per cycle + 0.538341289 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 3248) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd1/runTest.exe @@ -145,14 +145,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.124568e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.922590e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.922590e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.961223e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.731117e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.731117e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2 -TOTAL : 0.287324 sec - 845,792,185 cycles # 2.901 GHz - 1,913,732,633 instructions # 2.26 insn per cycle - 0.297586352 seconds time elapsed +TOTAL : 0.295877 sec + 848,904,121 cycles # 2.823 GHz + 1,913,779,976 instructions # 2.25 insn per cycle + 0.307372896 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3463) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd1/runTest.exe @@ -172,14 +172,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.758792e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.702266e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.702266e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.485658e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.411446e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.411446e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2 -TOTAL : 0.261995 sec - 775,948,492 cycles # 2.911 GHz - 1,795,856,064 instructions # 2.31 insn per cycle - 0.275868058 seconds time elapsed +TOTAL : 0.275232 sec + 779,943,899 cycles # 2.803 GHz + 1,795,594,447 instructions # 2.30 insn per cycle + 0.289346704 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3164) (512y: 15) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd1/runTest.exe @@ -199,14 +199,14 @@ Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.998817e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.512870e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.512870e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.833560e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.329755e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.329755e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018829e+01 +- 1.429922e+01 ) GeV^-2 -TOTAL : 0.349663 sec - 716,382,396 cycles # 2.024 GHz - 1,286,477,839 instructions # 1.80 insn per cycle - 0.363209331 seconds time elapsed +TOTAL : 0.362099 sec + 720,251,915 cycles # 1.963 GHz + 1,286,597,647 instructions # 1.79 insn per cycle + 0.375243448 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1709) (512y: 24) (512z: 2387) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd1/runTest.exe diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt index ab23ae8079..864c5b4dac 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2024-01-25_23:13:52 +DATE: 2024-01-27_18:41:27 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.673057e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.328050e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.712475e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.679353e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.341805e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.729214e+07 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.442690 sec - 1,984,783,264 cycles # 3.005 GHz - 2,793,428,342 instructions # 1.41 insn per cycle - 0.734598388 seconds time elapsed +TOTAL : 0.448080 sec + 1,940,047,979 cycles # 2.920 GHz + 2,742,937,870 instructions # 1.41 insn per cycle + 0.735393353 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/gcheck.exe -p 64 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -68,14 +68,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.268703e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.159984e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.588880e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.226696e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.100972e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.538337e+07 ) sec^-1 MeanMatrixElemValue = ( 2.602505e+02 +- 2.116328e+02 ) GeV^-2 -TOTAL : 0.529813 sec - 2,295,133,443 cycles # 3.001 GHz - 3,278,190,514 instructions # 1.43 insn per cycle - 0.824550304 seconds time elapsed +TOTAL : 0.540123 sec + 2,295,280,774 cycles # 2.915 GHz + 3,248,742,569 instructions # 1.42 insn per cycle + 0.847439533 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/fgcheck.exe 2 64 2 @@ -91,14 +91,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.925762e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.013036e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.013036e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.025332e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.046638e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.046638e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 1.674491 sec - 4,902,858,967 cycles # 2.922 GHz - 13,824,814,632 instructions # 2.82 insn per cycle - 1.682421523 seconds time elapsed +TOTAL : 1.621376 sec + 4,906,088,168 cycles # 3.018 GHz + 13,824,588,217 instructions # 2.82 insn per cycle + 1.628687956 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1135) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/runTest.exe @@ -118,14 +118,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.985930e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.061989e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.061989e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.920741e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.996876e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.996876e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.846811 sec - 2,593,448,297 cycles # 3.047 GHz - 7,349,153,746 instructions # 2.83 insn per cycle - 0.862989336 seconds time elapsed +TOTAL : 0.875767 sec + 2,604,975,465 cycles # 2.959 GHz + 7,349,296,033 instructions # 2.82 insn per cycle + 0.890044499 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2967) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd0/runTest.exe @@ -145,14 +145,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.415408e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.638451e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.638451e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.212351e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.429946e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.429946e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.501362 sec - 1,467,201,166 cycles # 2.901 GHz - 3,084,117,020 instructions # 2.10 insn per cycle - 0.513174865 seconds time elapsed +TOTAL : 0.533775 sec + 1,473,910,132 cycles # 2.737 GHz + 3,084,284,378 instructions # 2.09 insn per cycle + 0.547758581 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3008) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd0/runTest.exe @@ -172,14 +172,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.707680e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.990043e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.990043e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.852658e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.155181e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.155181e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.464354 sec - 1,278,148,137 cycles # 2.726 GHz - 2,873,006,643 instructions # 2.25 insn per cycle - 0.476021326 seconds time elapsed +TOTAL : 0.448502 sec + 1,285,781,151 cycles # 2.837 GHz + 2,873,225,261 instructions # 2.23 insn per cycle + 0.462355639 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2653) (512y: 96) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd0/runTest.exe @@ -199,14 +199,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.601061e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.728349e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.728349e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.496967e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.624427e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.624427e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.653676 sec - 1,303,858,267 cycles # 1.981 GHz - 1,914,918,168 instructions # 1.47 insn per cycle - 0.665372239 seconds time elapsed +TOTAL : 0.681772 sec + 1,314,048,592 cycles # 1.914 GHz + 1,914,956,895 instructions # 1.46 insn per cycle + 0.694401401 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1493) (512y: 70) (512z: 2164) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd0/runTest.exe diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd1.txt index a6c3b7ce72..4ce5d2d103 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd1.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2024-01-25_23:14:10 +DATE: 2024-01-27_18:41:45 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -50,14 +50,14 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.648457e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.178614e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.521473e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.642077e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.178835e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.540093e+07 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.444086 sec - 1,991,914,568 cycles # 2.998 GHz - 2,768,343,966 instructions # 1.39 insn per cycle - 0.739601436 seconds time elapsed +TOTAL : 0.449788 sec + 1,941,972,721 cycles # 2.918 GHz + 2,738,636,267 instructions # 1.41 insn per cycle + 0.738186725 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/gcheck.exe -p 64 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -68,14 +68,14 @@ WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.227832e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.974511e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.384988e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.213581e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.983987e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.394706e+07 ) sec^-1 MeanMatrixElemValue = ( 2.602505e+02 +- 2.116328e+02 ) GeV^-2 -TOTAL : 0.526758 sec - 2,298,196,221 cycles # 2.993 GHz - 3,278,717,607 instructions # 1.43 insn per cycle - 0.825350264 seconds time elapsed +TOTAL : 0.539583 sec + 2,245,163,074 cycles # 2.866 GHz + 3,192,763,021 instructions # 1.42 insn per cycle + 0.840841647 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/fgcheck.exe 2 64 2 @@ -91,14 +91,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.051083e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.072032e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.072032e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.021933e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.043806e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.043806e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 1.580057 sec - 4,897,834,610 cycles # 3.092 GHz - 13,831,057,972 instructions # 2.82 insn per cycle - 1.587486400 seconds time elapsed +TOTAL : 1.626260 sec + 4,912,245,932 cycles # 3.013 GHz + 13,831,784,678 instructions # 2.82 insn per cycle + 1.633425184 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1130) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/runTest.exe @@ -118,14 +118,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.991090e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.066032e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.066032e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.952614e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.030072e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.030072e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.844299 sec - 2,597,084,036 cycles # 3.060 GHz - 7,352,270,683 instructions # 2.83 insn per cycle - 0.857215046 seconds time elapsed +TOTAL : 0.861728 sec + 2,610,889,602 cycles # 3.013 GHz + 7,352,459,897 instructions # 2.82 insn per cycle + 0.877694167 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2957) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd1/runTest.exe @@ -145,14 +145,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.410530e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.636744e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.636744e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.319451e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.546025e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.546025e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.502125 sec - 1,464,778,292 cycles # 2.890 GHz - 3,085,005,315 instructions # 2.11 insn per cycle - 0.516998713 seconds time elapsed +TOTAL : 0.516383 sec + 1,474,990,397 cycles # 2.829 GHz + 3,084,581,300 instructions # 2.09 insn per cycle + 0.532151264 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2986) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd1/runTest.exe @@ -172,14 +172,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.928508e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.233519e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.233519e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.830634e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.128730e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.128730e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.438651 sec - 1,279,923,095 cycles # 2.887 GHz - 2,874,967,565 instructions # 2.25 insn per cycle - 0.449214976 seconds time elapsed +TOTAL : 0.450724 sec + 1,286,646,934 cycles # 2.823 GHz + 2,874,817,797 instructions # 2.23 insn per cycle + 0.463104569 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2636) (512y: 96) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd1/runTest.exe @@ -199,14 +199,14 @@ Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.608751e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.738501e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.738501e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.498045e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.624903e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.624903e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.651858 sec - 1,302,154,592 cycles # 1.984 GHz - 1,915,377,243 instructions # 1.47 insn per cycle - 0.663461748 seconds time elapsed +TOTAL : 0.681753 sec + 1,315,133,016 cycles # 1.917 GHz + 1,915,783,936 instructions # 1.46 insn per cycle + 0.692521331 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1476) (512y: 70) (512z: 2164) 
------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd1/runTest.exe From f724e69515af91e12bae47b1313ce4622cd9e8ff Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Sun, 28 Jan 2024 01:08:01 +0100 Subject: [PATCH 71/96] [jt744] rerun 18 tmad tests on itscrd90, all ok STARTED AT Sat Jan 27 07:37:21 PM CET 2024 ENDED AT Sat Jan 27 11:56:46 PM CET 2024 Status=0 24 /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt 24 /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt 24 /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt 24 /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt 24 /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt 24 /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt 24 /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt 24 /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt 24 /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt 24 /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt 24 /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt 24 /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt 24 /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt 24 /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt 24 /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt 24 /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt 24 /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt 24 /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt --- .../log_eemumu_mad_d_inl0_hrd0.txt | 136 +++--- .../log_eemumu_mad_f_inl0_hrd0.txt | 132 +++--- .../log_eemumu_mad_m_inl0_hrd0.txt | 136 +++--- .../log_ggtt_mad_d_inl0_hrd0.txt | 392 ++++++++++++------ .../log_ggtt_mad_f_inl0_hrd0.txt | 136 +++--- .../log_ggtt_mad_m_inl0_hrd0.txt | 134 +++--- .../log_ggttg_mad_d_inl0_hrd0.txt | 136 +++--- .../log_ggttg_mad_f_inl0_hrd0.txt | 140 +++---- .../log_ggttg_mad_m_inl0_hrd0.txt | 136 +++--- .../log_ggttgg_mad_d_inl0_hrd0.txt | 138 +++--- .../log_ggttgg_mad_f_inl0_hrd0.txt | 136 +++--- .../log_ggttgg_mad_m_inl0_hrd0.txt | 134 +++--- .../log_ggttggg_mad_d_inl0_hrd0.txt | 138 +++--- .../log_ggttggg_mad_f_inl0_hrd0.txt | 136 +++--- .../log_ggttggg_mad_m_inl0_hrd0.txt | 136 +++--- .../log_gqttq_mad_d_inl0_hrd0.txt | 138 +++--- .../log_gqttq_mad_f_inl0_hrd0.txt | 134 +++--- .../log_gqttq_mad_m_inl0_hrd0.txt | 134 +++--- 18 files changed, 1425 insertions(+), 1277 deletions(-) diff --git a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt index 44df599739..2c75ea3bc5 100644 --- 
a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt @@ -3,10 +3,10 @@ CUDACPP_BUILDDIR='.' make USEBUILDDIR=1 AVX=none +make USEBUILDDIR=1 AVX=sse4 -make USEBUILDDIR=1 AVX=avx2 -make USEBUILDDIR=1 AVX=sse4 +make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512y make USEBUILDDIR=1 AVX=512z @@ -16,16 +16,16 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. @@ -33,7 +33,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2024-01-26_00:15:24 +DATE: 2024-01-27_19:44:10 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169064681776] fbridge_mode=0 [UNWEIGHT] Wrote 3893 events (found 7395 events) - [COUNTERS] PROGRAM TOTAL : 0.6077s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5995s - [COUNTERS] Fortran MEs ( 1 ) : 0.0082s for 8192 events => throughput is 1.00E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.6110s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6027s + [COUNTERS] Fortran MEs ( 1 ) : 0.0083s for 8192 events => throughput is 9.87E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169064681776] fbridge_mode=0 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1723s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1639s - [COUNTERS] Fortran MEs ( 1 ) : 0.0084s for 8192 events => throughput is 9.81E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.1731s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1650s + [COUNTERS] Fortran MEs ( 1 ) : 0.0080s for 8192 events => throughput is 1.02E+06 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x10_fortran > /tmp/a [XSECTION] 
ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501919904813669E-002] fbridge_mode=0 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.4110s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3235s - [COUNTERS] Fortran MEs ( 1 ) : 0.0875s for 90112 events => throughput is 1.03E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.4157s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3266s + [COUNTERS] Fortran MEs ( 1 ) : 0.0891s for 90112 events => throughput is 1.01E+06 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -134,9 +134,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169064681779] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1830s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1762s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0069s for 8192 events => throughput is 1.20E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1805s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1737s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0068s for 8192 events => throughput is 1.20E+06 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -167,9 +167,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501919904813669E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.4111s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3357s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0755s for 90112 events => throughput is 1.19E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.4118s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3360s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0758s for 90112 events => throughput is 1.19E+06 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -182,12 +182,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.137053e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.120640e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.138245e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.164378e+06 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -210,9 +210,9 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169064681779] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1762s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1720s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0041s for 8192 events => throughput is 1.98E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1740s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1700s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0040s for 8192 events => throughput is 2.04E+06 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -243,9 +243,9 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501919904813656E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.3745s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3311s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0434s for 90112 events => throughput is 2.08E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3753s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3312s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0441s for 90112 events => throughput is 2.04E+06 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -258,12 +258,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.954813e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.983144e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.021256e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.061610e+06 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -286,9 +286,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169064681776] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1716s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1687s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0029s for 8192 events => throughput is 2.82E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1752s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1722s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0029s for 8192 events => throughput is 2.79E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -319,9 +319,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501919904813669E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.3639s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3313s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0326s for 90112 events => throughput is 2.77E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3655s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3325s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0330s for 90112 events => throughput is 2.73E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -334,12 +334,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.643708e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.615407e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.780965e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.773647e+06 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -362,9 +362,9 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169064681776] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1714s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1687s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0027s for 8192 events => throughput is 3.01E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1713s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1685s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0027s for 8192 events => throughput is 3.02E+06 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -395,9 +395,9 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501919904813669E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.3612s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3308s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0304s for 90112 events => throughput is 2.96E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3653s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3344s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0309s for 90112 events => throughput is 2.92E+06 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -410,12 +410,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.919552e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.802242e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.939142e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.082983e+06 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -438,9 +438,9 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169064681776] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1763s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1732s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0032s for 8192 events => throughput is 2.59E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1758s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1723s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0036s for 8192 events => throughput is 2.30E+06 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -471,9 +471,9 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501919904813669E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.3723s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3369s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0354s for 90112 events => throughput is 2.54E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3730s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3371s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0359s for 90112 events => throughput is 2.51E+06 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -486,12 +486,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.350967e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.322806e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.506114e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.501885e+06 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -514,9 +514,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_eemumu_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169064681776] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.5873s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5868s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0005s for 8192 events => throughput is 1.64E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.6076s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6070s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0005s for 8192 events => throughput is 1.55E+07 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -547,9 +547,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_eemumu_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501919904813656E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.7561s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7513s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0049s for 90112 events => throughput is 1.85E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.7848s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7799s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0049s for 90112 events => throughput is 1.84E+07 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** @@ -562,41 +562,41 @@ OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.149558e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.938455e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.952056e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.994935e+08 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.699705e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.746082e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.444629e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.494025e+08 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.729024e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.728254e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.979970e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.067943e+08 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.739033e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.726308e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.144973e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.130699e+08 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt index bbf79e30e3..990ba27411 100644 --- a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt @@ -16,9 +16,8 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' 
make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' @@ -26,6 +25,7 @@ make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. @@ -33,7 +33,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2024-01-26_00:15:41 +DATE: 2024-01-27_19:44:27 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169064681776] fbridge_mode=0 [UNWEIGHT] Wrote 3893 events (found 7395 events) - [COUNTERS] PROGRAM TOTAL : 0.6033s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5953s - [COUNTERS] Fortran MEs ( 1 ) : 0.0080s for 8192 events => throughput is 1.02E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.6069s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5988s + [COUNTERS] Fortran MEs ( 1 ) : 0.0081s for 8192 events => throughput is 1.01E+06 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169064681776] fbridge_mode=0 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1722s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1642s - [COUNTERS] Fortran MEs ( 1 ) : 0.0080s for 8192 events => throughput is 1.02E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1733s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1651s + [COUNTERS] Fortran MEs ( 1 ) : 0.0082s for 8192 events => throughput is 1.00E+06 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x10_fortran > /tmp/a [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501919904813669E-002] fbridge_mode=0 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.4127s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3253s - [COUNTERS] Fortran MEs ( 1 ) : 0.0873s for 90112 events => throughput is 1.03E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.4192s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3300s + [COUNTERS] Fortran MEs ( 1 ) : 0.0892s for 90112 events => throughput is 1.01E+06 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -134,9 +134,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747165492032638] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1860s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1794s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0066s for 8192 events => throughput 
is 1.24E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1785s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1721s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0064s for 8192 events => throughput is 1.28E+06 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -167,9 +167,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501905274264717E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.4069s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3355s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0714s for 90112 events => throughput is 1.26E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.4078s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3356s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0722s for 90112 events => throughput is 1.25E+06 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -182,12 +182,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.219752e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.252721e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.242681e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.203917e+06 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -210,9 +210,9 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747165570339780] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1729s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1704s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0026s for 8192 events => throughput is 3.16E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1716s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1691s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0025s for 8192 events => throughput is 3.30E+06 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -243,9 +243,9 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501905322826635E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.3582s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3310s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0272s for 90112 events => throughput is 3.31E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3580s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3306s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0275s for 90112 events => throughput is 3.28E+06 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -258,12 +258,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.111730e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.171921e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.395043e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.403675e+06 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -286,9 +286,9 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747165593922979] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1717s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1695s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0023s for 8192 events => throughput is 3.61E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1713s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1689s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0023s for 8192 events => throughput is 3.54E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -319,9 +319,9 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501905316084181E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.3554s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3312s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0242s for 90112 events => throughput is 3.73E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3566s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3324s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0242s for 90112 events => throughput is 3.72E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -334,12 +334,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.590394e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.720332e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.759019e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.793683e+06 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -362,9 +362,9 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747165593922979] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1720s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1699s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0020s for 8192 events => throughput is 4.02E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1696s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1675s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0020s for 8192 events => throughput is 4.00E+06 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -395,9 +395,9 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501905316084181E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.3589s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3361s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0228s for 90112 events => throughput is 3.95E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3526s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3301s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0226s for 90112 events => throughput is 3.99E+06 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -410,12 +410,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.954153e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.903694e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.260132e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.107507e+06 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -438,9 +438,9 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747166446533123] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1746s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1725s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0021s for 8192 events => throughput is 3.90E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1730s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1710s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0021s for 8192 events => throughput is 3.93E+06 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -471,9 +471,9 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501908990866423E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.3577s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3338s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0239s for 90112 events => throughput is 3.77E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3583s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3343s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0241s for 90112 events => throughput is 3.74E+06 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -486,12 +486,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.760450e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.576234e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.994942e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.897428e+06 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -514,9 +514,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_eemumu_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747166823487174] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.5868s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5864s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0005s for 8192 events => throughput is 1.67E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.5913s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5909s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0005s for 8192 events => throughput is 1.73E+07 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -547,9 +547,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_eemumu_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501910542849674E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.7531s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7484s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0047s for 90112 events => throughput is 1.93E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.7560s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7514s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0045s for 90112 events => throughput is 1.99E+07 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** @@ -562,41 +562,41 @@ OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.617084e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.588173e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.934097e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.068280e+08 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.941825e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.025338e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.057960e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.843391e+09 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.888538e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.945293e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.240921e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.084064e+09 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.397983e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.406384e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.459127e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.442610e+08 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt index 7bcac1c93d..d8843cfcaf 100644 --- a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt @@ -2,10 +2,10 @@ Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/e CUDACPP_BUILDDIR='.' 
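[Note for orientation: the _d/_f/_m suffixes in these log names select double, float, and mixed floating-point builds, which the workflow summaries report as CPP:DBL, CPP:FLT and (in the diff below) CPP:MIX. A minimal sketch of such a compile-time switch, assuming a macro name like MGONGPU_FPTYPE_FLOAT; the real selection lives in the cudacpp fptype headers and may differ:

  // Illustrative precision switch for the _d/_f builds; the macro name is an assumption.
  #if defined MGONGPU_FPTYPE_FLOAT
  typedef float fptype;  // _f builds: CPP:FLT, xsec agrees with Fortran to ~2e-7 relative above
  #else
  typedef double fptype; // _d builds: CPP:DBL, xsec agrees with Fortran to machine precision above
  #endif

The _m (MIXED) builds sit in between, apparently keeping double precision for most of the calculation while doing part of the ME arithmetic in single precision; consistently, the cross sections in the diff below agree with Fortran to roughly 4e-10 relative rather than ~1e-16.]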
- make USEBUILDDIR=1 AVX=none - make USEBUILDDIR=1 AVX=sse4 + + make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512y @@ -15,25 +15,25 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' -CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' -CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' -make[1]: Nothing to be done for 'all'. +CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' +make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' OMP_NUM_THREADS= -DATE: 2024-01-26_00:15:58 +DATE: 2024-01-27_19:44:44 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169064681776] fbridge_mode=0 [UNWEIGHT] Wrote 3893 events (found 7395 events) - [COUNTERS] PROGRAM TOTAL : 0.6192s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6109s - [COUNTERS] Fortran MEs ( 1 ) : 0.0082s for 8192 events => throughput is 9.95E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.6055s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5973s + [COUNTERS] Fortran MEs ( 1 ) : 0.0083s for 8192 events => throughput is 9.92E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169064681776] fbridge_mode=0 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1748s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1660s - [COUNTERS] Fortran MEs ( 1 ) : 0.0088s for 8192 events => throughput is 9.34E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.1734s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1653s + [COUNTERS] Fortran MEs ( 1 ) : 0.0081s for 8192 events => throughput is 1.01E+06 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -109,8 +109,8 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x10_fortran > /tmp/a 
[XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501919904813669E-002] fbridge_mode=0 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.4153s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3282s + [COUNTERS] PROGRAM TOTAL : 0.4103s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3232s [COUNTERS] Fortran MEs ( 1 ) : 0.0871s for 90112 events => throughput is 1.03E+06 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** @@ -134,9 +134,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169074211736] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1832s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1762s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0069s for 8192 events => throughput is 1.18E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1792s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1723s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0070s for 8192 events => throughput is 1.17E+06 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -167,9 +167,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501919915927155E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.4303s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3512s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0791s for 90112 events => throughput is 1.14E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.4114s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3347s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0768s for 90112 events => throughput is 1.17E+06 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -182,12 +182,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.152102e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.105584e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.149985e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.117429e+06 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -210,8 +210,8 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169074211734] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1761s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1721s + [COUNTERS] PROGRAM TOTAL : 0.1775s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1736s [COUNTERS] CudaCpp MEs ( 2 ) : 0.0039s for 8192 events => throughput is 2.09E+06 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -243,9 +243,9 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501919915927155E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.3747s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3322s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0425s for 90112 events => throughput is 2.12E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3970s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3513s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0457s for 90112 events => throughput is 1.97E+06 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -258,12 +258,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.061680e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.981868e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.075365e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.039596e+06 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -286,9 +286,9 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169063975949] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1751s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1719s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0033s for 8192 events => throughput is 2.50E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1814s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1781s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0033s for 8192 events => throughput is 2.45E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -319,9 +319,9 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501919908700741E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.3650s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3310s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0340s for 90112 events => throughput is 2.65E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3748s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3394s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0355s for 90112 events => throughput is 2.54E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -334,12 +334,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.542279e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.559591e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.701939e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.712795e+06 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -362,9 +362,9 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169063975949] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1739s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1710s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0029s for 8192 events => throughput is 2.80E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1737s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1708s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0029s for 8192 events => throughput is 2.79E+06 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -395,9 +395,9 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501919908700741E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.3634s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3322s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0312s for 90112 events => throughput is 2.89E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3602s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3288s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0314s for 90112 events => throughput is 2.87E+06 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -410,12 +410,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.678475e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.762723e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.985776e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.892375e+06 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -438,9 +438,9 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169063975949] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1747s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1715s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0032s for 8192 events => throughput is 2.60E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1750s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1717s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0033s for 8192 events => throughput is 2.52E+06 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -471,9 +471,9 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501919908700741E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.3706s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3353s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0353s for 90112 events => throughput is 2.55E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3687s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3338s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0349s for 90112 events => throughput is 2.58E+06 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -486,12 +486,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.336735e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.336676e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.551256e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.591242e+06 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -514,8 +514,8 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_eemumu_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169066587257] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.5877s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5872s + [COUNTERS] PROGRAM TOTAL : 0.5862s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5857s [COUNTERS] CudaCpp MEs ( 2 ) : 0.0005s for 8192 events => throughput is 1.61E+07 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -547,9 +547,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_eemumu_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501919911173610E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.7586s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7537s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0049s for 90112 events => throughput is 1.83E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.7518s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7469s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0049s for 90112 events => throughput is 1.84E+07 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** @@ -562,41 +562,41 @@ OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.202928e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.180766e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.960653e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.997610e+08 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.733664e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.732650e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.450045e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.502916e+08 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.728190e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.713329e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.031495e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.957660e+08 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.704272e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.723587e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.151802e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.165534e+08 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt index 92fac99c28..e36af37e5f 100644 --- a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx CUDACPP_BUILDDIR='.' 
-make USEBUILDDIR=1 AVX=none -make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make USEBUILDDIR=1 AVX=avx2 +make USEBUILDDIR=1 AVX=none +make USEBUILDDIR=1 AVX=sse4 +make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' OMP_NUM_THREADS= -DATE: 2024-01-27_16:01:47 +DATE: 2024-01-27_19:45:01 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: -Working directory (run): /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -50,8 +50,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggtt_x1_fortran > /tmp/valassia/output_ggtt_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/avalassi/output_ggtt_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/valassia/input_ggtt_x1_fortran > /tmp/vala [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690708277600116] fbridge_mode=0 [UNWEIGHT] Wrote 420 events (found 1577 events) - [COUNTERS] PROGRAM TOTAL : 0.3288s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2931s - [COUNTERS] Fortran MEs ( 1 ) : 0.0356s for 8192 events => throughput is 2.30E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3611s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3195s + [COUNTERS] Fortran MEs ( 1 ) : 0.0416s for 8192 events => throughput is 1.97E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -75,8 +75,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggtt_x1_fortran > /tmp/valassia/output_ggtt_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/avalassi/output_ggtt_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/valassia/input_ggtt_x1_fortran > /tmp/vala [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690708277600116] fbridge_mode=0 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.2612s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2256s - [COUNTERS] Fortran MEs ( 1 ) : 0.0356s for 8192 events => throughput is 2.30E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3129s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2721s + [COUNTERS] Fortran MEs ( 1 ) : 0.0409s for 8192 events => throughput is 2.00E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -100,8 +100,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! 
Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggtt_x10_fortran > /tmp/valassia/output_ggtt_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x10_fortran > /tmp/avalassi/output_ggtt_x10_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/valassia/input_ggtt_x10_fortran > /tmp/val [XSECTION] ChannelId = 1 [XSECTION] Cross section = 46.22 [46.223782291775372] fbridge_mode=0 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.4043s - [COUNTERS] Fortran Overhead ( 0 ) : 1.0148s - [COUNTERS] Fortran MEs ( 1 ) : 0.3895s for 90112 events => throughput is 2.31E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.7588s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3044s + [COUNTERS] Fortran MEs ( 1 ) : 0.4544s for 90112 events => throughput is 1.98E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -125,8 +125,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x1_cudacpp > /tmp/valassia/output_ggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -134,9 +134,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690708277600102] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.3026s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2706s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0320s for 8192 events => throughput is 2.56E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3453s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3089s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0363s for 8192 events => throughput is 2.25E+05 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -158,8 +158,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x10_cudacpp > /tmp/valassia/output_ggtt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -167,9 +167,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 46.22 [46.223782291775379] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.4012s - [COUNTERS] Fortran Overhead ( 0 ) : 1.0493s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3518s for 90112 events => throughput is 2.56E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.7197s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3176s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.4021s for 90112 events => throughput is 2.24E+05 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -180,14 +180,14 @@ OK! xsec from fortran (46.223782291775372) and cpp (46.223782291775379) differ b OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.610898e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.232130e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.592454e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.251215e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -201,8 +201,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x1_cudacpp > /tmp/valassia/output_ggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -210,9 +210,9 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690708277600109] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.2670s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2493s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0178s for 8192 events => throughput is 4.61E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3154s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2942s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0212s for 8192 events => throughput is 3.86E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -234,8 +234,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x10_cudacpp > /tmp/valassia/output_ggtt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -243,9 +243,9 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 46.22 [46.223782291775379] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.2278s - [COUNTERS] Fortran Overhead ( 0 ) : 1.0323s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1955s for 90112 events => throughput is 4.61E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.5256s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2953s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2303s for 90112 events => throughput is 3.91E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -256,14 +256,14 @@ OK! xsec from fortran (46.223782291775372) and cpp (46.223782291775379) differ b OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.735718e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.814370e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.759032e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.785236e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -277,8 +277,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x1_cudacpp > /tmp/valassia/output_ggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -286,9 +286,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690708277600109] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.2506s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2403s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0102s for 8192 events => throughput is 8.01E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2969s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2840s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0130s for 8192 events => throughput is 6.32E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -310,8 +310,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x10_cudacpp > /tmp/valassia/output_ggtt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -319,9 +319,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 46.22 [46.223782291775393] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.1386s - [COUNTERS] Fortran Overhead ( 0 ) : 1.0262s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1124s for 90112 events => throughput is 8.02E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.4419s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2958s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1460s for 90112 events => throughput is 6.17E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -332,18 +332,166 @@ OK! xsec from fortran (46.223782291775372) and cpp (46.223782291775393) differ b OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.251333e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.858187e+05 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.010499e+05 ) sec^-1 + +*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 47.69 [47.690708277600109] fbridge_mode=1 + [UNWEIGHT] Wrote 434 events (found 1125 events) + [COUNTERS] PROGRAM TOTAL : 0.2943s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2828s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0115s for 8192 events => throughput is 7.12E+05 events/s + +*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! 
xsec from fortran (47.690708277600116) and cpp (47.690708277600109) differ by less than 2E-14 (1.1102230246251565e-16) + +*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 46.22 [46.223782291775393] fbridge_mode=1 + [UNWEIGHT] Wrote 1727 events (found 1732 events) + [COUNTERS] PROGRAM TOTAL : 1.4134s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2891s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1243s for 90112 events => throughput is 7.25E+05 events/s + +*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (46.223782291775372) and cpp (46.223782291775393) differ by less than 2E-14 (4.440892098500626e-16) + +*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.053396e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.283006e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.009768e+05 ) sec^-1 + +*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 47.69 [47.690708277600109] fbridge_mode=1 + [UNWEIGHT] Wrote 434 events (found 1125 events) + [COUNTERS] PROGRAM TOTAL : 0.3049s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2876s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0173s for 8192 events => throughput is 4.75E+05 events/s + +*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (47.690708277600116) and cpp (47.690708277600109) differ by less than 2E-14 (1.1102230246251565e-16) + +*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 46.22 [46.223782291775393] fbridge_mode=1 + [UNWEIGHT] Wrote 1727 events (found 1732 events) + [COUNTERS] PROGRAM TOTAL : 1.4908s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2991s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1917s for 90112 events => throughput is 4.70E+05 events/s -*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) *** +*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) *** +OK! xsec from fortran (46.223782291775372) and cpp (46.223782291775393) differ by less than 2E-14 (4.440892098500626e-16) + +*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.480612e+05 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.440487e+05 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -357,22 +505,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! 
Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggtt_x1_cudacpp > /tmp/valassia/output_ggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.69 [47.690708277600102] fbridge_mode=1 + [XSECTION] Cross section = 47.69 [47.690708277600109] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.8015s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8008s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.20E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.6959s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6953s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0006s for 8192 events => throughput is 1.48E+07 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.690708277600116) and cpp (47.690708277600102) differ by less than 2E-14 (3.3306690738754696e-16) +OK! xsec from fortran (47.690708277600116) and cpp (47.690708277600109) differ by less than 2E-14 (1.1102230246251565e-16) *** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -390,65 +538,65 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggtt_x10_cudacpp > /tmp/valassia/output_ggtt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 46.22 [46.223782291775379] fbridge_mode=1 + [XSECTION] Cross section = 46.22 [46.223782291775393] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.3066s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2989s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0077s for 90112 events => throughput is 1.18E+07 events/s + [COUNTERS] PROGRAM TOTAL : 1.7123s + [COUNTERS] Fortran Overhead ( 0 ) : 1.7059s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0064s for 90112 events => throughput is 1.41E+07 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (46.223782291775372) and cpp (46.223782291775379) differ by less than 2E-14 (2.220446049250313e-16) +OK! xsec from fortran (46.223782291775372) and cpp (46.223782291775393) differ by less than 2E-14 (4.440892098500626e-16) *** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.540459e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.071758e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.038013e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.732801e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.788135e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.007423e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.755971e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.069820e+08 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.786660e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.015390e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.950104e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.148579e+08 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.754388e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.007149e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.139605e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CUDA 
[nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.022635e+07 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt index 9213f67fa2..7cae848e03 100644 --- a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt @@ -3,9 +3,9 @@ CUDACPP_BUILDDIR='.' make USEBUILDDIR=1 AVX=none -make USEBUILDDIR=1 AVX=sse4 +make USEBUILDDIR=1 AVX=sse4 make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512y @@ -15,15 +15,15 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' -make[1]: Nothing to be done for 'all'. CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' +make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
@@ -33,7 +33,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2024-01-26_00:16:42 +DATE: 2024-01-27_19:45:27 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/aval [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690708277600116] fbridge_mode=0 [UNWEIGHT] Wrote 420 events (found 1577 events) - [COUNTERS] PROGRAM TOTAL : 0.3598s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3172s - [COUNTERS] Fortran MEs ( 1 ) : 0.0426s for 8192 events => throughput is 1.92E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3533s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3124s + [COUNTERS] Fortran MEs ( 1 ) : 0.0409s for 8192 events => throughput is 2.00E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/aval [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690708277600116] fbridge_mode=0 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.3118s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2712s - [COUNTERS] Fortran MEs ( 1 ) : 0.0406s for 8192 events => throughput is 2.02E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3125s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2713s + [COUNTERS] Fortran MEs ( 1 ) : 0.0411s for 8192 events => throughput is 1.99E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x10_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 46.22 [46.223782291775372] fbridge_mode=0 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.7504s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2981s - [COUNTERS] Fortran MEs ( 1 ) : 0.4524s for 90112 events => throughput is 1.99E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.7516s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3006s + [COUNTERS] Fortran MEs ( 1 ) : 0.4510s for 90112 events => throughput is 2.00E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -134,9 +134,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690703999052587] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.3540s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3175s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0365s for 8192 events => throughput is 2.24E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3394s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3051s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0343s for 8192 events => throughput is 2.39E+05 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -167,9 +167,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 46.22 [46.223780103711483] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.7108s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3324s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3784s for 90112 events => throughput is 2.38E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.7178s + [COUNTERS] Fortran Overhead ( 0 ) : 
1.3353s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3825s for 90112 events => throughput is 2.36E+05 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -182,12 +182,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.393568e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.325661e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.399982e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.297389e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -210,9 +210,9 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690699958440689] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.3034s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2896s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0138s for 8192 events => throughput is 5.92E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3049s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2907s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0142s for 8192 events => throughput is 5.75E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -243,9 +243,9 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 46.22 [46.223776162337749] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.4532s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2989s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1542s for 90112 events => throughput is 5.84E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.4512s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2955s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1558s for 90112 events => throughput is 5.79E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -258,12 +258,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.691481e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.252620e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.737589e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.683567e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -286,9 +286,9 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690691653203835] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.2902s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2825s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0076s for 8192 events => throughput is 1.08E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.2857s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2780s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0077s for 8192 events => throughput is 1.06E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -319,9 +319,9 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 46.22 [46.223773576247488] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.3754s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2908s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0846s for 90112 events => throughput is 1.07E+06 events/s + [COUNTERS] PROGRAM TOTAL : 1.3745s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2887s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0859s for 90112 events => throughput is 1.05E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -334,12 +334,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.029810e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.061022e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.032448e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.028900e+06 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -362,9 +362,9 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690691653203835] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.2868s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2800s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0068s for 8192 events => throughput is 1.20E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.2886s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2814s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0072s for 8192 events => throughput is 1.13E+06 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -395,9 +395,9 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 46.22 [46.223773576247488] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.4669s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3835s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0835s for 90112 events => throughput is 1.08E+06 events/s + [COUNTERS] PROGRAM TOTAL : 1.3627s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2856s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0770s for 90112 events => throughput is 1.17E+06 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -410,12 +410,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.125495e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.158067e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.131924e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.165706e+06 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -438,9 +438,9 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690698822141186] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.2972s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2876s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0096s for 8192 events => throughput is 8.51E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2918s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2821s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0097s for 8192 events => throughput is 8.41E+05 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -471,9 +471,9 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 46.22 [46.223780266165058] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.3978s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2918s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1060s for 90112 events => throughput is 8.50E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.4029s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2948s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1081s for 90112 events => throughput is 8.34E+05 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -486,12 +486,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.016243e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.810936e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.906782e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.099694e+05 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -514,9 +514,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690703397697987] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.6945s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6940s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0005s for 8192 events => throughput is 1.49E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.6987s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6982s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0005s for 8192 events => throughput is 1.50E+07 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -547,9 +547,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 46.22 [46.223786763175951] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.6975s - [COUNTERS] Fortran Overhead ( 0 ) : 1.6921s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0055s for 90112 events => throughput is 1.65E+07 events/s + [COUNTERS] PROGRAM TOTAL : 1.7137s + [COUNTERS] Fortran Overhead ( 0 ) : 1.7083s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0054s for 90112 events => throughput is 1.68E+07 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** @@ -562,41 +562,41 @@ OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.316183e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.269476e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.020050e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.034760e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.761145e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.781597e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.782582e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.784418e+08 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.827139e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.842537e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.877247e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.885569e+08 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.384483e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.378680e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.425158e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.411797e+07 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt index 9f96fafdb5..2815e7f120 100644 --- a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt @@ -4,9 +4,9 @@ CUDACPP_BUILDDIR='.' 
make USEBUILDDIR=1 AVX=none -make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=sse4 +make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512y make USEBUILDDIR=1 AVX=512z @@ -15,13 +15,13 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' -CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' -CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. @@ -33,7 +33,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2024-01-26_00:17:08 +DATE: 2024-01-27_19:45:53 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/aval [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690708277600116] fbridge_mode=0 [UNWEIGHT] Wrote 420 events (found 1577 events) - [COUNTERS] PROGRAM TOTAL : 0.3516s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3106s - [COUNTERS] Fortran MEs ( 1 ) : 0.0411s for 8192 events => throughput is 1.99E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3539s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3130s + [COUNTERS] Fortran MEs ( 1 ) : 0.0409s for 8192 events => throughput is 2.00E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/aval [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690708277600116] fbridge_mode=0 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.3117s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2709s - [COUNTERS] Fortran MEs ( 1 ) : 0.0409s for 8192 events => throughput is 2.01E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3146s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2734s + [COUNTERS] Fortran MEs ( 1 ) : 0.0412s for 8192 events => throughput is 1.99E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x10_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 46.22 [46.223782291775372] fbridge_mode=0 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.7545s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3018s - [COUNTERS] Fortran MEs ( 1 ) : 0.4527s for 90112 events => throughput is 1.99E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.7482s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2971s + [COUNTERS] Fortran MEs ( 1 ) : 0.4511s for 90112 events => throughput is 
2.00E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -134,9 +134,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690709601032019] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.3463s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3097s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0366s for 8192 events => throughput is 2.24E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3469s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3100s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0369s for 8192 events => throughput is 2.22E+05 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -167,9 +167,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 46.22 [46.223783635280974] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.7234s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3159s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.4075s for 90112 events => throughput is 2.21E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.7258s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3173s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.4085s for 90112 events => throughput is 2.21E+05 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -182,12 +182,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.206337e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.184678e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.189630e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.165824e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -210,9 +210,9 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690709601032026] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.3187s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2965s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0222s for 8192 events => throughput is 3.70E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3191s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2978s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0212s for 8192 events => throughput is 3.86E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -243,9 +243,9 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 46.22 [46.223783635280974] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.5255s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2996s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2260s for 90112 events => throughput is 3.99E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.5422s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3131s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2290s for 90112 events => throughput is 3.93E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -258,12 +258,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.918959e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.941267e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.840099e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.879138e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -286,9 +286,9 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690709643441508] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.2968s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2837s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0131s for 8192 events => throughput is 6.24E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3149s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3011s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0138s for 8192 events => throughput is 5.95E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -319,9 +319,9 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 46.22 [46.223783660238851] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.4387s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2942s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1446s for 90112 events => throughput is 6.23E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.4314s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2889s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1424s for 90112 events => throughput is 6.33E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -334,12 +334,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.099865e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.012934e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.108305e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.079905e+05 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -362,9 +362,9 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690709643441508] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.2943s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2829s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0114s for 8192 events => throughput is 7.21E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2965s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2852s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0112s for 8192 events => throughput is 7.30E+05 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -395,9 +395,9 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 46.22 [46.223783660238851] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.4171s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2934s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1237s for 90112 events => throughput is 7.28E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.4615s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3338s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1277s for 90112 events => throughput is 7.06E+05 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -410,12 +410,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.031927e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.121805e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.243269e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.148011e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -438,9 +438,9 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690709643441508] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.3125s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2951s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0174s for 8192 events => throughput is 4.70E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3039s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2870s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0170s for 8192 events => throughput is 4.83E+05 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -471,9 +471,9 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 46.22 [46.223783660238851] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.5025s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3112s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1913s for 90112 events => throughput is 4.71E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.4901s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3023s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1879s for 90112 events => throughput is 4.80E+05 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -486,12 +486,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.533015e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.564065e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.733679e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.610241e+05 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -514,9 +514,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690708266690706] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 1.0435s - [COUNTERS] Fortran Overhead ( 0 ) : 1.0430s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0006s for 8192 events => throughput is 1.45E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.6981s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6975s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0006s for 8192 events => throughput is 1.36E+07 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -547,9 +547,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 46.22 [46.223782303744791] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.7112s - [COUNTERS] Fortran Overhead ( 0 ) : 1.7048s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0064s for 90112 events => throughput is 1.42E+07 events/s + [COUNTERS] PROGRAM TOTAL : 1.7132s + [COUNTERS] Fortran Overhead ( 0 ) : 1.7068s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0064s for 90112 events => throughput is 1.41E+07 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** @@ -562,41 +562,41 @@ OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.076384e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.073063e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.666577e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.653827e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.000653e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.014949e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.068544e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.052887e+08 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.006931e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.019887e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.138201e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.139444e+08 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.024897e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.021175e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.036764e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.985737e+07 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt index 7b087a9357..850335f2f4 100644 --- a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt @@ -2,9 +2,9 @@ Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/g CUDACPP_BUILDDIR='.' 
- make USEBUILDDIR=1 AVX=none + make USEBUILDDIR=1 AVX=sse4 make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512y @@ -15,13 +15,13 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. @@ -33,7 +33,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2024-01-26_00:17:34 +DATE: 2024-01-27_19:46:20 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0972 [9.7196357922470764E-002] fbridge_mode=0 [UNWEIGHT] Wrote 42 events (found 469 events) - [COUNTERS] PROGRAM TOTAL : 0.5536s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2268s - [COUNTERS] Fortran MEs ( 1 ) : 0.3268s for 8192 events => throughput is 2.51E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.5540s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2283s + [COUNTERS] Fortran MEs ( 1 ) : 0.3257s for 8192 events => throughput is 2.51E+04 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0972 [9.7196357922470764E-002] fbridge_mode=0 [UNWEIGHT] Wrote 41 events (found 467 events) - [COUNTERS] PROGRAM TOTAL : 0.5482s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2237s - [COUNTERS] Fortran MEs ( 1 ) : 0.3244s for 8192 events => throughput is 2.52E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.5631s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2312s + [COUNTERS] Fortran MEs ( 1 ) : 0.3319s for 8192 events => throughput is 2.47E+04 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x10_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.08131 [8.1310872077655569E-002] fbridge_mode=0 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 5.0349s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4462s - [COUNTERS] Fortran MEs ( 1 ) : 3.5888s for 90112 events => throughput is 2.51E+04 events/s + [COUNTERS] PROGRAM TOTAL : 5.0352s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4496s + [COUNTERS] Fortran MEs ( 1 ) : 3.5856s for 90112 events 
=> throughput is 2.51E+04 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -134,9 +134,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0972 [9.7196357922470777E-002] fbridge_mode=1 [UNWEIGHT] Wrote 41 events (found 467 events) - [COUNTERS] PROGRAM TOTAL : 0.8619s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5404s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3215s for 8192 events => throughput is 2.55E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.8598s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5399s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3199s for 8192 events => throughput is 2.56E+04 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -167,9 +167,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.08131 [8.1310872077655597E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 5.3489s - [COUNTERS] Fortran Overhead ( 0 ) : 1.8053s - [COUNTERS] CudaCpp MEs ( 2 ) : 3.5436s for 90112 events => throughput is 2.54E+04 events/s + [COUNTERS] PROGRAM TOTAL : 5.3221s + [COUNTERS] Fortran Overhead ( 0 ) : 1.7954s + [COUNTERS] CudaCpp MEs ( 2 ) : 3.5267s for 90112 events => throughput is 2.56E+04 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -182,12 +182,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.641138e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.617618e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.627463e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.613965e+04 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -210,9 +210,9 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0972 [9.7196357922470777E-002] fbridge_mode=1 [UNWEIGHT] Wrote 41 events (found 467 events) - [COUNTERS] PROGRAM TOTAL : 0.5508s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3861s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1647s for 8192 events => throughput is 4.97E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.5563s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3905s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1658s for 8192 events => throughput is 4.94E+04 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -243,9 +243,9 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.08131 [8.1310872077655555E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 3.4428s - [COUNTERS] Fortran Overhead ( 0 ) : 1.6330s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.8098s for 90112 events => throughput is 4.98E+04 events/s + [COUNTERS] PROGRAM TOTAL : 3.4634s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6410s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.8224s for 90112 events => throughput is 4.94E+04 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ 
-258,12 +258,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.058143e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.996868e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.038892e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.053710e+04 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -286,9 +286,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0972 [9.7196357922470750E-002] fbridge_mode=1 [UNWEIGHT] Wrote 41 events (found 467 events) - [COUNTERS] PROGRAM TOTAL : 0.3925s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3083s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0842s for 8192 events => throughput is 9.73E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.3922s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3088s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0834s for 8192 events => throughput is 9.82E+04 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -319,9 +319,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.08131 [8.1310872077655555E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 2.6132s - [COUNTERS] Fortran Overhead ( 0 ) : 1.6361s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.9771s for 90112 events => throughput is 9.22E+04 events/s + [COUNTERS] PROGRAM TOTAL : 2.5000s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5700s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.9299s for 90112 events => throughput is 9.69E+04 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -334,12 +334,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.785412e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.422787e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.879871e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.441130e+04 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -362,9 +362,9 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0972 [9.7196357922470750E-002] fbridge_mode=1 [UNWEIGHT] Wrote 41 events (found 467 events) - [COUNTERS] PROGRAM TOTAL : 0.3859s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3121s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0738s for 8192 events => throughput is 1.11E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3732s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2998s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0733s for 8192 events => throughput is 1.12E+05 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -395,9 +395,9 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.08131 [8.1310872077655555E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 2.3674s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5551s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.8123s for 90112 events => throughput is 1.11E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.3727s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5593s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.8134s for 90112 events => throughput is 1.11E+05 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -410,12 +410,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.103525e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.138049e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.136073e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.127596e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -438,9 +438,9 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0972 [9.7196357922470750E-002] fbridge_mode=1 [UNWEIGHT] Wrote 41 events (found 467 events) - [COUNTERS] PROGRAM TOTAL : 0.4279s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3289s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0990s for 8192 events => throughput is 8.28E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.4269s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3276s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0993s for 8192 events => throughput is 8.25E+04 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -471,9 +471,9 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.08131 [8.1310872077655555E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 2.6797s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5881s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.0916s for 90112 events => throughput is 8.26E+04 events/s + [COUNTERS] PROGRAM TOTAL : 2.7063s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6040s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.1023s for 90112 events => throughput is 8.17E+04 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -486,12 +486,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.387308e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.583150e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.236833e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.778299e+04 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -514,9 +514,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttg_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0972 [9.7196357922470764E-002] fbridge_mode=1 [UNWEIGHT] Wrote 41 events (found 467 events) - [COUNTERS] PROGRAM TOTAL : 0.6620s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6566s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0054s for 8192 events => throughput is 1.51E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.6826s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6772s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0054s for 8192 events => throughput is 1.52E+06 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -547,9 +547,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttg_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.08131 [8.1310872077655610E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 1.9222s - [COUNTERS] Fortran Overhead ( 0 ) : 1.8995s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0227s for 90112 events => throughput is 3.97E+06 events/s + [COUNTERS] PROGRAM TOTAL : 1.9316s + [COUNTERS] Fortran Overhead ( 0 ) : 1.9088s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0228s for 90112 events => throughput is 3.95E+06 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** @@ -562,41 +562,41 @@ OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.601715e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.631849e+06 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.887746e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.320240e+06 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.653087e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.650532e+06 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.245673e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.243044e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.662678e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.668471e+06 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.253047e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.251938e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.646319e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.686782e+06 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.762775e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.760801e+06 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt index e703809b3b..c136750f78 100644 --- a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt @@ -1,11 +1,11 @@ Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg CUDACPP_BUILDDIR='.' 
-make USEBUILDDIR=1 AVX=none -make USEBUILDDIR=1 AVX=sse4 +make USEBUILDDIR=1 AVX=none +make USEBUILDDIR=1 AVX=sse4 make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512y @@ -15,12 +15,10 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' @@ -30,10 +28,12 @@ make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' OMP_NUM_THREADS= -DATE: 2024-01-26_00:18:17 +DATE: 2024-01-27_19:47:02 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0972 [9.7196357922470764E-002] fbridge_mode=0 [UNWEIGHT] Wrote 42 events (found 469 events) - [COUNTERS] PROGRAM TOTAL : 0.5495s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2234s - [COUNTERS] Fortran MEs ( 1 ) : 0.3261s for 8192 events => throughput is 2.51E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.5520s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2244s + [COUNTERS] Fortran MEs ( 1 ) : 0.3276s for 8192 events => throughput is 2.50E+04 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0972 [9.7196357922470764E-002] fbridge_mode=0 [UNWEIGHT] Wrote 41 events (found 467 events) - [COUNTERS] PROGRAM TOTAL : 0.5449s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2206s - [COUNTERS] Fortran MEs ( 1 ) : 0.3243s for 8192 events => throughput is 2.53E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.5508s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2222s + [COUNTERS] Fortran MEs ( 1 ) : 0.3287s for 8192 events => throughput is 2.49E+04 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x10_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.08131 [8.1310872077655569E-002] fbridge_mode=0 
[UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 5.0316s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4517s - [COUNTERS] Fortran MEs ( 1 ) : 3.5800s for 90112 events => throughput is 2.52E+04 events/s + [COUNTERS] PROGRAM TOTAL : 5.0147s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4381s + [COUNTERS] Fortran MEs ( 1 ) : 3.5766s for 90112 events => throughput is 2.52E+04 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -134,9 +134,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0972 [9.7196347758884971E-002] fbridge_mode=1 [UNWEIGHT] Wrote 41 events (found 467 events) - [COUNTERS] PROGRAM TOTAL : 0.8182s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5191s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2991s for 8192 events => throughput is 2.74E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.8287s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5252s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3035s for 8192 events => throughput is 2.70E+04 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -167,9 +167,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.08131 [8.1310858119443913E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 5.0612s - [COUNTERS] Fortran Overhead ( 0 ) : 1.7703s - [COUNTERS] CudaCpp MEs ( 2 ) : 3.2909s for 90112 events => throughput is 2.74E+04 events/s + [COUNTERS] PROGRAM TOTAL : 5.0655s + [COUNTERS] Fortran Overhead ( 0 ) : 1.7727s + [COUNTERS] CudaCpp MEs ( 2 ) : 3.2929s for 90112 events => throughput is 2.74E+04 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -182,12 +182,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.800786e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.841164e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.824381e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.820269e+04 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -210,9 +210,9 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0972 [9.7196323434217816E-002] fbridge_mode=1 [UNWEIGHT] Wrote 41 events (found 467 events) - [COUNTERS] PROGRAM TOTAL : 0.4071s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3164s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0907s for 8192 events => throughput is 9.03E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.4068s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3160s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0908s for 8192 events => throughput is 9.02E+04 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -243,9 +243,9 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.08131 [8.1310842598054087E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 2.6363s - [COUNTERS] Fortran Overhead ( 0 ) : 1.6051s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.0312s for 90112 events => throughput is 8.74E+04 events/s + [COUNTERS] PROGRAM TOTAL : 2.6104s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5891s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.0213s for 90112 events => throughput is 8.82E+04 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -258,12 +258,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.146202e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.971196e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.112833e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.175070e+04 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -286,9 +286,9 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0972 [9.7196325695161859E-002] fbridge_mode=1 [UNWEIGHT] Wrote 41 events (found 467 events) - [COUNTERS] PROGRAM TOTAL : 0.3139s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2705s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0434s for 8192 events => throughput is 1.89E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3141s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2702s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0439s for 8192 events => throughput is 1.87E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -319,9 +319,9 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.08131 [8.1310842393515825E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 1.9925s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5174s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.4751s for 90112 events => throughput is 1.90E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.0125s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5327s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.4797s for 90112 events => throughput is 1.88E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -334,12 +334,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.939208e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.933582e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.905454e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.916894e+05 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -362,9 +362,9 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0972 [9.7196325695161859E-002] fbridge_mode=1 [UNWEIGHT] Wrote 41 events (found 467 events) - [COUNTERS] PROGRAM TOTAL : 0.3016s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2639s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0378s for 8192 events => throughput is 2.17E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3014s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2634s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0380s for 8192 events => throughput is 2.16E+05 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -395,9 +395,9 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.08131 [8.1310842393515825E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 1.9295s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5118s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.4177s for 90112 events => throughput is 2.16E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.9418s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5246s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.4172s for 90112 events => throughput is 2.16E+05 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -410,12 +410,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.197972e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.157480e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.117265e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.150998e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -438,9 +438,9 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0972 [9.7196344080460087E-002] fbridge_mode=1 [UNWEIGHT] Wrote 41 events (found 467 events) - [COUNTERS] PROGRAM TOTAL : 0.3271s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2773s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0499s for 8192 events => throughput is 1.64E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3283s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2785s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0498s for 8192 events => throughput is 1.64E+05 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -471,9 +471,9 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.08131 [8.1310857813116089E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 2.0855s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5387s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.5468s for 90112 events => throughput is 1.65E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.0744s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5309s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.5435s for 90112 events => throughput is 1.66E+05 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -486,12 +486,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.649940e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.655429e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.622322e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.665602e+05 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -514,9 +514,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttg_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0972 [9.7196349366366022E-002] fbridge_mode=1 [UNWEIGHT] Wrote 41 events (found 467 events) - [COUNTERS] PROGRAM TOTAL : 0.6521s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6513s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0008s for 8192 events => throughput is 9.75E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.6500s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6492s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0008s for 8192 events => throughput is 9.77E+06 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -547,9 +547,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttg_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.08131 [8.1310864949473954E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 1.9153s - [COUNTERS] Fortran Overhead ( 0 ) : 1.9058s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0095s for 90112 events => throughput is 9.47E+06 events/s + [COUNTERS] PROGRAM TOTAL : 1.9168s + [COUNTERS] Fortran Overhead ( 0 ) : 1.9072s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0096s for 90112 events => throughput is 9.43E+06 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** @@ -562,41 +562,41 @@ OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.322135e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.318884e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.860681e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.859848e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.641206e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.636314e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.474666e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.386714e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.648721e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.667329e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.520248e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.485862e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.509475e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.522474e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.623003e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.621873e+07 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt index 26bc9b342a..b6c36d66b2 100644 --- a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt @@ -2,10 +2,10 @@ Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/g CUDACPP_BUILDDIR='.' 
- - make USEBUILDDIR=1 AVX=none make USEBUILDDIR=1 AVX=sse4 + + make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512y @@ -15,13 +15,13 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' -CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. @@ -33,7 +33,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2024-01-26_00:18:54 +DATE: 2024-01-27_19:47:40 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0972 [9.7196357922470764E-002] fbridge_mode=0 [UNWEIGHT] Wrote 42 events (found 469 events) - [COUNTERS] PROGRAM TOTAL : 0.5858s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2389s - [COUNTERS] Fortran MEs ( 1 ) : 0.3469s for 8192 events => throughput is 2.36E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.5608s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2342s + [COUNTERS] Fortran MEs ( 1 ) : 0.3266s for 8192 events => throughput is 2.51E+04 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0972 [9.7196357922470764E-002] fbridge_mode=0 [UNWEIGHT] Wrote 41 events (found 467 events) - [COUNTERS] PROGRAM TOTAL : 0.5858s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2386s - [COUNTERS] Fortran MEs ( 1 ) : 0.3472s for 8192 events => throughput is 2.36E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.5478s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2222s + [COUNTERS] Fortran MEs ( 1 ) : 0.3256s for 8192 events => throughput is 2.52E+04 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x10_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.08131 [8.1310872077655569E-002] fbridge_mode=0 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 5.0544s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4643s - [COUNTERS] Fortran MEs ( 1 ) : 3.5901s for 90112 events => throughput is 2.51E+04 events/s + [COUNTERS] PROGRAM TOTAL : 5.0324s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4446s + [COUNTERS] Fortran MEs ( 1 ) : 3.5877s for 90112 
events => throughput is 2.51E+04 events/s

 *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -134,9 +134,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1
 [XSECTION] ChannelId = 1
 [XSECTION] Cross section = 0.0972 [9.7196358763382021E-002] fbridge_mode=1
 [UNWEIGHT] Wrote 41 events (found 467 events)
- [COUNTERS] PROGRAM TOTAL : 0.8767s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.5495s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.3272s for 8192 events => throughput is 2.50E+04 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.8800s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.5505s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.3295s for 8192 events => throughput is 2.49E+04 events/s

 *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***

@@ -167,9 +167,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1
 [XSECTION] ChannelId = 1
 [XSECTION] Cross section = 0.08131 [8.1310872835011053E-002] fbridge_mode=1
 [UNWEIGHT] Wrote 679 events (found 1787 events)
- [COUNTERS] PROGRAM TOTAL : 5.3981s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.7975s
- [COUNTERS] CudaCpp MEs ( 2 ) : 3.6006s for 90112 events => throughput is 2.50E+04 events/s
+ [COUNTERS] PROGRAM TOTAL : 5.5826s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.8660s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 3.7166s for 90112 events => throughput is 2.42E+04 events/s

 *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***

@@ -182,12 +182,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical

 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.554031e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.565163e+04 ) sec^-1

 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.558500e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.555528e+04 ) sec^-1

 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -210,9 +210,9 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1
 [XSECTION] ChannelId = 1
 [XSECTION] Cross section = 0.0972 [9.7196358804670424E-002] fbridge_mode=1
 [UNWEIGHT] Wrote 41 events (found 467 events)
- [COUNTERS] PROGRAM TOTAL : 0.5755s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.4002s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.1753s for 8192 events => throughput is 4.67E+04 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.5536s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.3890s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.1646s for 8192 events => throughput is 4.98E+04 events/s

 *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***

@@ -243,9 +243,9 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1
 [XSECTION] ChannelId = 1
 [XSECTION] Cross section = 0.08131 [8.1310872836789727E-002] fbridge_mode=1
 [UNWEIGHT] Wrote 679 events (found 1787 events)
- [COUNTERS] PROGRAM TOTAL : 3.4487s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.6335s
- [COUNTERS] CudaCpp MEs ( 2 ) : 1.8152s for 90112 events => throughput is 4.96E+04 events/s
+ [COUNTERS] PROGRAM TOTAL : 3.4702s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.6500s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 1.8202s for 90112 events => throughput is 4.95E+04 events/s

 *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***

@@ -258,12 +258,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical

 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 5.133506e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.041058e+04 ) sec^-1

 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 5.159016e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.031890e+04 ) sec^-1

 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -286,9 +286,9 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1
 [XSECTION] ChannelId = 1
 [XSECTION] Cross section = 0.0972 [9.7196358586501386E-002] fbridge_mode=1
 [UNWEIGHT] Wrote 41 events (found 467 events)
- [COUNTERS] PROGRAM TOTAL : 0.3987s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.3156s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0831s for 8192 events => throughput is 9.85E+04 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.4037s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.3199s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0837s for 8192 events => throughput is 9.79E+04 events/s

 *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***

@@ -319,9 +319,9 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1
 [XSECTION] ChannelId = 1
 [XSECTION] Cross section = 0.08131 [8.1310872708918305E-002] fbridge_mode=1
 [UNWEIGHT] Wrote 679 events (found 1787 events)
- [COUNTERS] PROGRAM TOTAL : 2.4721s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.5556s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.9166s for 90112 events => throughput is 9.83E+04 events/s
+ [COUNTERS] PROGRAM TOTAL : 2.5175s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.5787s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.9387s for 90112 events => throughput is 9.60E+04 events/s

 *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***

@@ -334,12 +334,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical

 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.006516e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 9.991486e+04 ) sec^-1

 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.013910e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.002523e+05 ) sec^-1

 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -362,9 +362,9 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1
 [XSECTION] ChannelId = 1
 [XSECTION] Cross section = 0.0972 [9.7196358586501386E-002] fbridge_mode=1
 [UNWEIGHT] Wrote 41 events (found 467 events)
- [COUNTERS] PROGRAM TOTAL : 0.3689s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.2955s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0733s for 8192 events => throughput is 1.12E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.3693s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.2972s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0720s for 8192 events => throughput is 1.14E+05 events/s

 *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***

@@ -395,9 +395,9 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1
 [XSECTION] ChannelId = 1
 [XSECTION] Cross section = 0.08131 [8.1310872708918305E-002] fbridge_mode=1
 [UNWEIGHT] Wrote 679 events (found 1787 events)
- [COUNTERS] PROGRAM TOTAL : 2.3524s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.5518s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.8005s for 90112 events => throughput is 1.13E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 2.3486s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.5473s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.8012s for 90112 events => throughput is 1.12E+05 events/s

 *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***

@@ -410,12 +410,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical

 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.156908e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.143636e+05 ) sec^-1

 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.161672e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.161564e+05 ) sec^-1

 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -438,9 +438,9 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1
 [XSECTION] ChannelId = 1
 [XSECTION] Cross section = 0.0972 [9.7196358586501386E-002] fbridge_mode=1
 [UNWEIGHT] Wrote 41 events (found 467 events)
- [COUNTERS] PROGRAM TOTAL : 0.4316s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.3308s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.1008s for 8192 events => throughput is 8.13E+04 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.4314s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.3289s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.1025s for 8192 events => throughput is 7.99E+04 events/s

 *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***

@@ -471,9 +471,9 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1
 [XSECTION] ChannelId = 1
 [XSECTION] Cross section = 0.08131 [8.1310872708918305E-002] fbridge_mode=1
 [UNWEIGHT] Wrote 679 events (found 1787 events)
- [COUNTERS] PROGRAM TOTAL : 2.7119s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.5851s
- [COUNTERS] CudaCpp MEs ( 2 ) : 1.1268s for 90112 events => throughput is 8.00E+04 events/s
+ [COUNTERS] PROGRAM TOTAL : 2.7050s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.5842s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 1.1208s for 90112 events => throughput is 8.04E+04 events/s

 *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***

@@ -486,12 +486,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical

 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 8.071079e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.096015e+04 ) sec^-1

 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 8.063980e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.161407e+04 ) sec^-1

 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) ***
 --------------------
@@ -514,9 +514,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttg_x
 [XSECTION] ChannelId = 1
 [XSECTION] Cross section = 0.0972 [9.7196358102981231E-002] fbridge_mode=1
 [UNWEIGHT] Wrote 41 events (found 467 events)
- [COUNTERS] PROGRAM TOTAL : 0.6566s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.6512s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0054s for 8192 events => throughput is 1.50E+06 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.6648s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.6594s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0053s for 8192 events => throughput is 1.53E+06 events/s

 *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec ***

@@ -547,8 +547,8 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttg_x
 [XSECTION] ChannelId = 1
 [XSECTION] Cross section = 0.08131 [8.1310872068634160E-002] fbridge_mode=1
 [UNWEIGHT] Wrote 679 events (found 1787 events)
- [COUNTERS] PROGRAM TOTAL : 1.9224s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.8996s
+ [COUNTERS] PROGRAM TOTAL : 1.9297s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.9069s
 [COUNTERS] CudaCpp MEs ( 2 ) : 0.0227s for 90112 events => throughput is 3.96E+06 events/s

 *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec ***

@@ -562,41 +562,41 @@ OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical

 *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge ***
 Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.623843e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.616391e+06 ) sec^-1

 *** EXECUTE GCHECK(8192) -p 256 32 1 ***
 Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 8.126033e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.900480e+06 ) sec^-1

 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge ***
 Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 9.628988e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 9.604902e+06 ) sec^-1

 *** EXECUTE GCHECK(MAX) -p 16384 32 1 ***
 Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.232507e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.230519e+07 ) sec^-1

 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge ***
 Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 9.635974e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 9.596050e+06 ) sec^-1

 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 ***
 Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.243999e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.240430e+07 ) sec^-1

 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge ***
 Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 9.613847e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 9.614528e+06 ) sec^-1

 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 ***
 Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.712232e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.710810e+06 ) sec^-1

 TEST COMPLETED
diff --git a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt
index 305f74d40c..460087d609 100644
--- a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt
+++ b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt
@@ -1,8 +1,8 @@
 Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg
 CUDACPP_BUILDDIR='.'

-make USEBUILDDIR=1 AVX=none
+make USEBUILDDIR=1 AVX=none

 make USEBUILDDIR=1 AVX=sse4
@@ -15,15 +15,15 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp
 make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
-CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0'
-CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0'
 CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0'
-CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0'
+CUDACPP_BUILDDIR='build.none_d_inl0_hrd0'
+CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0'
+CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
-CUDACPP_BUILDDIR='build.none_d_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 make[1]: Nothing to be done for 'all'.
@@ -33,7 +33,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/
 OMP_NUM_THREADS=

-DATE: 2024-01-26_00:19:37
+DATE: 2024-01-27_19:48:23

 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg
@@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/av
 [XSECTION] ChannelId = 2
 [XSECTION] Cross section = 0.0003628 [3.6277277311352982E-004] fbridge_mode=0
 [UNWEIGHT] Wrote 48 events (found 439 events)
- [COUNTERS] PROGRAM TOTAL : 4.4231s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.2847s
- [COUNTERS] Fortran MEs ( 1 ) : 4.1385s for 8192 events => throughput is 1.98E+03 events/s
+ [COUNTERS] PROGRAM TOTAL : 4.5619s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.2948s
+ [COUNTERS] Fortran MEs ( 1 ) : 4.2671s for 8192 events => throughput is 1.92E+03 events/s

 *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) ***
 --------------------
@@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/av
 [XSECTION] ChannelId = 2
 [XSECTION] Cross section = 0.0003628 [3.6277277311352982E-004] fbridge_mode=0
 [UNWEIGHT] Wrote 59 events (found 420 events)
- [COUNTERS] PROGRAM TOTAL : 4.4823s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.2803s
- [COUNTERS] Fortran MEs ( 1 ) : 4.2021s for 8192 events => throughput is 1.95E+03 events/s
+ [COUNTERS] PROGRAM TOTAL : 4.5685s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.2929s
+ [COUNTERS] Fortran MEs ( 1 ) : 4.2755s for 8192 events => throughput is 1.92E+03 events/s

 *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) ***
 --------------------
@@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x10_fortran > /tmp/a
 [XSECTION] ChannelId = 2
 [XSECTION] Cross section = 0.000158 [1.5803725748421158E-004] fbridge_mode=0
 [UNWEIGHT] Wrote 207 events (found 1235 events)
- [COUNTERS] PROGRAM TOTAL : 47.7198s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.9740s
- [COUNTERS] Fortran MEs ( 1 ) : 45.7459s for 90112 events => throughput is 1.97E+03 events/s
+ [COUNTERS] PROGRAM TOTAL : 48.4190s
+ [COUNTERS] Fortran Overhead ( 0 ) : 2.0019s
+ [COUNTERS] Fortran MEs ( 1 ) : 46.4171s for 90112 events => throughput is 1.94E+03 events/s

 *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -134,9 +134,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x
 [XSECTION] ChannelId = 2
 [XSECTION] Cross section = 0.0003628 [3.6277277311352993E-004] fbridge_mode=1
 [UNWEIGHT] Wrote 59 events (found 420 events)
- [COUNTERS] PROGRAM TOTAL : 8.8779s
- [COUNTERS] Fortran Overhead ( 0 ) : 4.5273s
- [COUNTERS] CudaCpp MEs ( 2 ) : 4.3506s for 8192 events => throughput is 1.88E+03 events/s
+ [COUNTERS] PROGRAM TOTAL : 9.2836s
+ [COUNTERS] Fortran Overhead ( 0 ) : 4.6935s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 4.5901s for 8192 events => throughput is 1.78E+03 events/s

 *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***

@@ -167,9 +167,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x
 [XSECTION] ChannelId = 2
 [XSECTION] Cross section = 0.000158 [1.5803725748421150E-004] fbridge_mode=1
 [UNWEIGHT] Wrote 207 events (found 1235 events)
- [COUNTERS] PROGRAM TOTAL : 54.2755s
- [COUNTERS] Fortran Overhead ( 0 ) : 6.1909s
- [COUNTERS] CudaCpp MEs ( 2 ) : 48.0846s for 90112 events => throughput is 1.87E+03 events/s
+ [COUNTERS] PROGRAM TOTAL : 54.7612s
+ [COUNTERS] Fortran Overhead ( 0 ) : 6.3690s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 48.3922s for 90112 events => throughput is 1.86E+03 events/s

 *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***

@@ -182,12 +182,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical

 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.936301e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.924912e+03 ) sec^-1

 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.940526e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.925523e+03 ) sec^-1

 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -210,9 +210,9 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x
 [XSECTION] ChannelId = 2
 [XSECTION] Cross section = 0.0003628 [3.6277277311352998E-004] fbridge_mode=1
 [UNWEIGHT] Wrote 59 events (found 420 events)
- [COUNTERS] PROGRAM TOTAL : 4.6979s
- [COUNTERS] Fortran Overhead ( 0 ) : 2.4561s
- [COUNTERS] CudaCpp MEs ( 2 ) : 2.2418s for 8192 events => throughput is 3.65E+03 events/s
+ [COUNTERS] PROGRAM TOTAL : 4.7140s
+ [COUNTERS] Fortran Overhead ( 0 ) : 2.4722s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 2.2419s for 8192 events => throughput is 3.65E+03 events/s

 *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***

@@ -243,9 +243,9 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x
 [XSECTION] ChannelId = 2
 [XSECTION] Cross section = 0.000158 [1.5803725748421156E-004] fbridge_mode=1
 [UNWEIGHT] Wrote 207 events (found 1235 events)
- [COUNTERS] PROGRAM TOTAL : 28.8152s
- [COUNTERS] Fortran Overhead ( 0 ) : 4.1172s
- [COUNTERS] CudaCpp MEs ( 2 ) : 24.6980s for 90112 events => throughput is 3.65E+03 events/s
+ [COUNTERS] PROGRAM TOTAL : 28.9457s
+ [COUNTERS] Fortran Overhead ( 0 ) : 4.1382s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 24.8075s for 90112 events => throughput is 3.63E+03 events/s

 *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***

@@ -258,12 +258,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical

 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.802169e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.848056e+03 ) sec^-1

 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.823296e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.844500e+03 ) sec^-1

 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -286,9 +286,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x
 [XSECTION] ChannelId = 2
 [XSECTION] Cross section = 0.0003628 [3.6277277311353009E-004] fbridge_mode=1
 [UNWEIGHT] Wrote 59 events (found 420 events)
- [COUNTERS] PROGRAM TOTAL : 2.2047s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.2324s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.9723s for 8192 events => throughput is 8.43E+03 events/s
+ [COUNTERS] PROGRAM TOTAL : 2.2048s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.2272s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.9776s for 8192 events => throughput is 8.38E+03 events/s

 *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***

@@ -319,9 +319,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x
 [XSECTION] ChannelId = 2
 [XSECTION] Cross section = 0.000158 [1.5803725748421158E-004] fbridge_mode=1
 [UNWEIGHT] Wrote 207 events (found 1235 events)
- [COUNTERS] PROGRAM TOTAL : 13.6182s
- [COUNTERS] Fortran Overhead ( 0 ) : 2.8737s
- [COUNTERS] CudaCpp MEs ( 2 ) : 10.7445s for 90112 events => throughput is 8.39E+03 events/s
+ [COUNTERS] PROGRAM TOTAL : 13.6330s
+ [COUNTERS] Fortran Overhead ( 0 ) : 2.8910s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 10.7420s for 90112 events => throughput is 8.39E+03 events/s

 *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***

@@ -334,12 +334,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical

 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 8.612716e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.617417e+03 ) sec^-1

 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 8.609695e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.595180e+03 ) sec^-1

 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -362,9 +362,9 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x
 [XSECTION] ChannelId = 2
 [XSECTION] Cross section = 0.0003628 [3.6277277311353009E-004] fbridge_mode=1
 [UNWEIGHT] Wrote 59 events (found 420 events)
- [COUNTERS] PROGRAM TOTAL : 1.9770s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.1128s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.8641s for 8192 events => throughput is 9.48E+03 events/s
+ [COUNTERS] PROGRAM TOTAL : 1.9765s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.1174s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.8591s for 8192 events => throughput is 9.54E+03 events/s

 *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***

@@ -395,9 +395,9 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x
 [XSECTION] ChannelId = 2
 [XSECTION] Cross section = 0.000158 [1.5803725748421158E-004] fbridge_mode=1
 [UNWEIGHT] Wrote 207 events (found 1235 events)
- [COUNTERS] PROGRAM TOTAL : 12.1580s
- [COUNTERS] Fortran Overhead ( 0 ) : 2.7559s
- [COUNTERS] CudaCpp MEs ( 2 ) : 9.4022s for 90112 events => throughput is 9.58E+03 events/s
+ [COUNTERS] PROGRAM TOTAL : 12.2161s
+ [COUNTERS] Fortran Overhead ( 0 ) : 2.7678s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 9.4483s for 90112 events => throughput is 9.54E+03 events/s

 *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***

@@ -410,12 +410,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical

 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 9.847095e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 9.643052e+03 ) sec^-1

 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 9.837457e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 9.811387e+03 ) sec^-1

 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -438,9 +438,9 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x
 [XSECTION] ChannelId = 2
 [XSECTION] Cross section = 0.0003628 [3.6277277311353009E-004] fbridge_mode=1
 [UNWEIGHT] Wrote 59 events (found 420 events)
- [COUNTERS] PROGRAM TOTAL : 2.5594s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.4371s
- [COUNTERS] CudaCpp MEs ( 2 ) : 1.1223s for 8192 events => throughput is 7.30E+03 events/s
+ [COUNTERS] PROGRAM TOTAL : 2.4115s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.3312s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 1.0803s for 8192 events => throughput is 7.58E+03 events/s

 *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***

@@ -471,9 +471,9 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x
 [XSECTION] ChannelId = 2
 [XSECTION] Cross section = 0.000158 [1.5803725748421158E-004] fbridge_mode=1
 [UNWEIGHT] Wrote 207 events (found 1235 events)
- [COUNTERS] PROGRAM TOTAL : 14.6611s
- [COUNTERS] Fortran Overhead ( 0 ) : 2.9896s
- [COUNTERS] CudaCpp MEs ( 2 ) : 11.6715s for 90112 events => throughput is 7.72E+03 events/s
+ [COUNTERS] PROGRAM TOTAL : 14.8837s
+ [COUNTERS] Fortran Overhead ( 0 ) : 3.0662s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 11.8175s for 90112 events => throughput is 7.63E+03 events/s

 *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***

@@ -486,12 +486,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical

 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 7.812824e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.806363e+03 ) sec^-1

 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 7.833625e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.813477e+03 ) sec^-1

 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) ***
 --------------------
@@ -514,9 +514,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttgg_
 [XSECTION] ChannelId = 2
 [XSECTION] Cross section = 0.0003628 [3.6277277311352998E-004] fbridge_mode=1
 [UNWEIGHT] Wrote 59 events (found 420 events)
- [COUNTERS] PROGRAM TOTAL : 0.8067s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.7739s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0327s for 8192 events => throughput is 2.50E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.8100s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.7769s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0330s for 8192 events => throughput is 2.48E+05 events/s

 *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec ***

@@ -547,9 +547,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttgg_
 [XSECTION] ChannelId = 2
 [XSECTION] Cross section = 0.000158 [1.5803725748421166E-004] fbridge_mode=1
 [UNWEIGHT] Wrote 207 events (found 1235 events)
- [COUNTERS] PROGRAM TOTAL : 2.7969s
- [COUNTERS] Fortran Overhead ( 0 ) : 2.4368s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.3600s for 90112 events => throughput is 2.50E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 2.7903s
+ [COUNTERS] Fortran Overhead ( 0 ) : 2.4265s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.3638s for 90112 events => throughput is 2.48E+05 events/s

 *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec ***

@@ -562,41 +562,41 @@ OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical

 *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge ***
 Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.283605e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.291315e+05 ) sec^-1

 *** EXECUTE GCHECK(8192) -p 256 32 1 ***
 Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.518944e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.527796e+05 ) sec^-1

 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge ***
 Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 4.111225e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.105251e+05 ) sec^-1

 *** EXECUTE GCHECK(MAX) -p 16384 32 1 ***
 Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 4.157335e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.152979e+05 ) sec^-1

 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge ***
 Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 4.104619e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.106581e+05 ) sec^-1

 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 ***
 Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 4.145376e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.161275e+05 ) sec^-1

 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge ***
 Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 4.107697e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.111001e+05 ) sec^-1

 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 ***
 Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.434237e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.431708e+05 ) sec^-1

 TEST COMPLETED
diff --git a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt
index c44aa866bb..89beafa1ac 100644
--- a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt
+++ b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt
@@ -1,9 +1,9 @@
 Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg
 CUDACPP_BUILDDIR='.'

-make USEBUILDDIR=1 AVX=none
+make USEBUILDDIR=1 AVX=sse4

 make USEBUILDDIR=1 AVX=avx2
@@ -15,11 +15,8 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp
 make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
-CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0'
-CUDACPP_BUILDDIR='build.none_f_inl0_hrd0'
-make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0'
+CUDACPP_BUILDDIR='build.none_f_inl0_hrd0'
 CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0'
 CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
@@ -28,12 +25,15 @@ make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0'
+make[1]: Nothing to be done for 'all'.
+make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'

 OMP_NUM_THREADS=

-DATE: 2024-01-26_00:23:52
+DATE: 2024-01-27_19:52:40

 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg
@@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/av
 [XSECTION] ChannelId = 2
 [XSECTION] Cross section = 0.0003628 [3.6277277311352982E-004] fbridge_mode=0
 [UNWEIGHT] Wrote 48 events (found 439 events)
- [COUNTERS] PROGRAM TOTAL : 4.4312s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.2800s
- [COUNTERS] Fortran MEs ( 1 ) : 4.1512s for 8192 events => throughput is 1.97E+03 events/s
+ [COUNTERS] PROGRAM TOTAL : 4.4218s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.2799s
+ [COUNTERS] Fortran MEs ( 1 ) : 4.1419s for 8192 events => throughput is 1.98E+03 events/s

 *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) ***
 --------------------
@@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/av
 [XSECTION] ChannelId = 2
 [XSECTION] Cross section = 0.0003628 [3.6277277311352982E-004] fbridge_mode=0
 [UNWEIGHT] Wrote 59 events (found 420 events)
- [COUNTERS] PROGRAM TOTAL : 4.4878s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.2811s
- [COUNTERS] Fortran MEs ( 1 ) : 4.2067s for 8192 events => throughput is 1.95E+03 events/s
+ [COUNTERS] PROGRAM TOTAL : 4.4317s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.2764s
+ [COUNTERS] Fortran MEs ( 1 ) : 4.1552s for 8192 events => throughput is 1.97E+03 events/s

 *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) ***
 --------------------
@@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x10_fortran > /tmp/a
 [XSECTION] ChannelId = 2
 [XSECTION] Cross section = 0.000158 [1.5803725748421158E-004] fbridge_mode=0
 [UNWEIGHT] Wrote 207 events (found 1235 events)
- [COUNTERS] PROGRAM TOTAL : 47.5920s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.9624s
- [COUNTERS] Fortran MEs ( 1 ) : 45.6297s for 90112 events => throughput is 1.97E+03 events/s
+ [COUNTERS] PROGRAM TOTAL : 47.8157s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.9738s
+ [COUNTERS] Fortran MEs ( 1 ) : 45.8419s for 90112 events => throughput is 1.97E+03 events/s

 *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -134,9 +134,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x
 [XSECTION] ChannelId = 2
 [XSECTION] Cross section = 0.0003628 [3.6277396352122325E-004] fbridge_mode=1
 [UNWEIGHT] Wrote 59 events (found 420 events)
- [COUNTERS] PROGRAM TOTAL : 8.1703s
- [COUNTERS] Fortran Overhead ( 0 ) : 4.1746s
- [COUNTERS] CudaCpp MEs ( 2 ) : 3.9957s for 8192 events => throughput is 2.05E+03 events/s
+ [COUNTERS] PROGRAM TOTAL : 8.1660s
+ [COUNTERS] Fortran Overhead ( 0 ) : 4.1682s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 3.9978s for 8192 events => throughput is 2.05E+03 events/s

 *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***

@@ -167,9 +167,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x
 [XSECTION] ChannelId = 2
 [XSECTION] Cross section = 0.000158 [1.5803774048965294E-004] fbridge_mode=1
 [UNWEIGHT] Wrote 207 events (found 1235 events)
- [COUNTERS] PROGRAM TOTAL : 50.0883s
- [COUNTERS] Fortran Overhead ( 0 ) : 5.9292s
- [COUNTERS] CudaCpp MEs ( 2 ) : 44.1591s for 90112 events => throughput is 2.04E+03 events/s
+ [COUNTERS] PROGRAM TOTAL : 49.9973s
+ [COUNTERS] Fortran Overhead ( 0 ) : 5.8319s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 44.1654s for 90112 events => throughput is 2.04E+03 events/s

 *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***

@@ -182,12 +182,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical

 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.109904e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.110751e+03 ) sec^-1

 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.108623e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.113663e+03 ) sec^-1

 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -210,9 +210,9 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x
 [XSECTION] ChannelId = 2
 [XSECTION] Cross section = 0.0003628 [3.6277387698033752E-004] fbridge_mode=1
 [UNWEIGHT] Wrote 59 events (found 420 events)
- [COUNTERS] PROGRAM TOTAL : 2.4881s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.3708s
- [COUNTERS] CudaCpp MEs ( 2 ) : 1.1173s for 8192 events => throughput is 7.33E+03 events/s
+ [COUNTERS] PROGRAM TOTAL : 2.4883s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.3686s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 1.1197s for 8192 events => throughput is 7.32E+03 events/s

 *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***

@@ -243,9 +243,9 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x
 [XSECTION] ChannelId = 2
 [XSECTION] Cross section = 0.000158 [1.5803770691658365E-004] fbridge_mode=1
 [UNWEIGHT] Wrote 207 events (found 1235 events)
- [COUNTERS] PROGRAM TOTAL : 15.5148s
- [COUNTERS] Fortran Overhead ( 0 ) : 3.0458s
- [COUNTERS] CudaCpp MEs ( 2 ) : 12.4690s for 90112 events => throughput is 7.23E+03 events/s
+ [COUNTERS] PROGRAM TOTAL : 15.6256s
+ [COUNTERS] Fortran Overhead ( 0 ) : 3.0669s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 12.5587s for 90112 events => throughput is 7.18E+03 events/s

 *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***

@@ -258,12 +258,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical

 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 7.588414e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.593109e+03 ) sec^-1

 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 7.664199e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.603205e+03 ) sec^-1

 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -286,9 +286,9 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x
 [XSECTION] ChannelId = 2
 [XSECTION] Cross section = 0.0003628 [3.6277388844638422E-004] fbridge_mode=1
 [UNWEIGHT] Wrote 59 events (found 420 events)
- [COUNTERS] PROGRAM TOTAL : 1.2367s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.7493s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.4874s for 8192 events => throughput is 1.68E+04 events/s
+ [COUNTERS] PROGRAM TOTAL : 1.2497s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.7566s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.4931s for 8192 events => throughput is 1.66E+04 events/s

 *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***

@@ -319,9 +319,9 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x
 [XSECTION] ChannelId = 2
 [XSECTION] Cross section = 0.000158 [1.5803773310773457E-004] fbridge_mode=1
 [UNWEIGHT] Wrote 207 events (found 1235 events)
- [COUNTERS] PROGRAM TOTAL : 7.7878s
- [COUNTERS] Fortran Overhead ( 0 ) : 2.4039s
- [COUNTERS] CudaCpp MEs ( 2 ) : 5.3838s for 90112 events => throughput is 1.67E+04 events/s
+ [COUNTERS] PROGRAM TOTAL : 7.8293s
+ [COUNTERS] Fortran Overhead ( 0 ) : 2.4062s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 5.4231s for 90112 events => throughput is 1.66E+04 events/s

 *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***

@@ -334,12 +334,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical

 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.722525e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.703817e+04 ) sec^-1

 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.724823e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.707084e+04 ) sec^-1

 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -363,8 +363,8 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x
 [XSECTION] Cross section = 0.0003628 [3.6277388844638422E-004] fbridge_mode=1
 [UNWEIGHT] Wrote 59 events (found 420 events)
 [COUNTERS] PROGRAM TOTAL : 1.1297s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.6985s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.4312s for 8192 events => throughput is 1.90E+04 events/s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.6970s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.4327s for 8192 events => throughput is 1.89E+04 events/s

 *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***

@@ -395,9 +395,9 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x
 [XSECTION] ChannelId = 2
 [XSECTION] Cross section = 0.000158 [1.5803773310773457E-004] fbridge_mode=1
 [UNWEIGHT] Wrote 207 events (found 1235 events)
- [COUNTERS] PROGRAM TOTAL : 7.0756s
- [COUNTERS] Fortran Overhead ( 0 ) : 2.3360s
- [COUNTERS] CudaCpp MEs ( 2 ) : 4.7397s for 90112 events => throughput is 1.90E+04 events/s
+ [COUNTERS] PROGRAM TOTAL : 7.1100s
+ [COUNTERS] Fortran Overhead ( 0 ) : 2.3460s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 4.7640s for 90112 events => throughput is 1.89E+04 events/s

 *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***

@@ -410,12 +410,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical

 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.960961e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.947996e+04 ) sec^-1

 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.950503e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.958466e+04 ) sec^-1

 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -438,9 +438,9 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x
 [XSECTION] ChannelId = 2
 [XSECTION] Cross section = 0.0003628 [3.6277396133530942E-004] fbridge_mode=1
 [UNWEIGHT] Wrote 59 events (found 420 events)
- [COUNTERS] PROGRAM TOTAL : 1.3175s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.7930s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.5245s for 8192 events => throughput is 1.56E+04 events/s
+ [COUNTERS] PROGRAM TOTAL : 1.3291s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.8024s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.5268s for 8192 events => throughput is 1.56E+04 events/s

 *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***

@@ -471,9 +471,9 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x
 [XSECTION] ChannelId = 2
 [XSECTION] Cross section = 0.000158 [1.5803777739454609E-004] fbridge_mode=1
 [UNWEIGHT] Wrote 207 events (found 1235 events)
- [COUNTERS] PROGRAM TOTAL : 8.2280s
- [COUNTERS] Fortran Overhead ( 0 ) : 2.4440s
- [COUNTERS] CudaCpp MEs ( 2 ) : 5.7840s for 90112 events => throughput is 1.56E+04 events/s
+ [COUNTERS] PROGRAM TOTAL : 8.2491s
+ [COUNTERS] Fortran Overhead ( 0 ) : 2.4431s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 5.8060s for 90112 events => throughput is 1.55E+04 events/s

 *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***

@@ -486,12 +486,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical

 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.578889e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.565172e+04 ) sec^-1

 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.586434e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.569351e+04 ) sec^-1

 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) ***
 --------------------
@@ -514,9 +514,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttgg_
 [XSECTION] ChannelId = 2
 [XSECTION] Cross section = 0.0003628 [3.6277400478491265E-004] fbridge_mode=1
 [UNWEIGHT] Wrote 59 events (found 420 events)
- [COUNTERS] PROGRAM TOTAL : 0.7712s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.7499s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0213s for 8192 events => throughput is 3.85E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.7749s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.7535s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0215s for 8192 events => throughput is 3.82E+05 events/s

 *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec ***

@@ -547,9 +547,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttgg_
 [XSECTION] ChannelId = 2
 [XSECTION] Cross section = 0.000158 [1.5803779990154892E-004] fbridge_mode=1
 [UNWEIGHT] Wrote 207 events (found 1235 events)
- [COUNTERS] PROGRAM TOTAL : 2.6112s
- [COUNTERS] Fortran Overhead ( 0 ) : 2.3770s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.2343s for 90112 events => throughput is 3.85E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 2.6309s
+ [COUNTERS] Fortran Overhead ( 0 ) : 2.3945s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.2365s for 90112 events => throughput is 3.81E+05 events/s

 *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec ***

@@ -562,41 +562,41 @@ OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical

 *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge ***
 Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.599685e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.590982e+05 ) sec^-1

 *** EXECUTE GCHECK(8192) -p 256 32 1 ***
 Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.950677e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.929944e+05 ) sec^-1

 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge ***
 Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 8.500010e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.500320e+05 ) sec^-1

 *** EXECUTE GCHECK(MAX) -p 16384 32 1 ***
 Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 8.626742e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.738271e+05 ) sec^-1

 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge ***
 Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 8.496581e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.503432e+05 ) sec^-1

 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 ***
 Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 8.674704e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.635839e+05 ) sec^-1

 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge ***
 Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 8.461510e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.484746e+05 ) sec^-1

 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 ***
 Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.527679e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.527720e+05 ) sec^-1

 TEST COMPLETED
diff --git a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt
index f5978616e8..0666a67fd8 100644
--- a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt
+++ b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt
@@ -2,9 +2,9 @@ Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/g
 CUDACPP_BUILDDIR='.'

-make USEBUILDDIR=1 AVX=none
+make USEBUILDDIR=1 AVX=sse4

 make USEBUILDDIR=1 AVX=avx2
 make USEBUILDDIR=1 AVX=512y
@@ -15,13 +15,13 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp
 make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0'
+CUDACPP_BUILDDIR='build.none_m_inl0_hrd0'
 CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0'
 CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0'
-CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0'
-CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
-CUDACPP_BUILDDIR='build.none_m_inl0_hrd0'
+CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
 make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 make[1]: Nothing to be done for 'all'.
@@ -33,7 +33,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/
 OMP_NUM_THREADS=

-DATE: 2024-01-26_00:27:09
+DATE: 2024-01-27_19:55:58

 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg
@@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/av
 [XSECTION] ChannelId = 2
 [XSECTION] Cross section = 0.0003628 [3.6277277311352982E-004] fbridge_mode=0
 [UNWEIGHT] Wrote 48 events (found 439 events)
- [COUNTERS] PROGRAM TOTAL : 4.4106s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.2778s
- [COUNTERS] Fortran MEs ( 1 ) : 4.1329s for 8192 events => throughput is 1.98E+03 events/s
+ [COUNTERS] PROGRAM TOTAL : 4.4689s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.2805s
+ [COUNTERS] Fortran MEs ( 1 ) : 4.1884s for 8192 events => throughput is 1.96E+03 events/s

 *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) ***
 --------------------
@@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/av
 [XSECTION] ChannelId = 2
 [XSECTION] Cross section = 0.0003628 [3.6277277311352982E-004] fbridge_mode=0
 [UNWEIGHT] Wrote 59 events (found 420 events)
- [COUNTERS] PROGRAM TOTAL : 4.3910s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.2748s
- [COUNTERS] Fortran MEs ( 1 ) : 4.1162s for 8192 events => throughput is 1.99E+03 events/s
+ [COUNTERS] PROGRAM TOTAL : 4.4168s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.2810s
+ [COUNTERS] Fortran MEs ( 1 ) : 4.1358s for 8192 events => throughput is 1.98E+03 events/s

 *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) ***
 --------------------
@@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x10_fortran > /tmp/a
 [XSECTION] ChannelId = 2
 [XSECTION] Cross section = 0.000158 [1.5803725748421158E-004] fbridge_mode=0
 [UNWEIGHT] Wrote 207 events (found 1235 events)
- [COUNTERS] PROGRAM TOTAL : 47.7414s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.9612s
- [COUNTERS] Fortran MEs ( 1 ) : 45.7803s for 90112 events => throughput is 1.97E+03 events/s
+ [COUNTERS] PROGRAM TOTAL : 47.6464s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.9568s
+ [COUNTERS] Fortran MEs ( 1 ) : 45.6896s for 90112 events => throughput is 1.97E+03 events/s

 *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -134,9 +134,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x
 [XSECTION] ChannelId = 2
 [XSECTION] Cross section = 0.0003628 [3.6277277432965013E-004] fbridge_mode=1
 [UNWEIGHT] Wrote 59 events (found 420 events)
- [COUNTERS] PROGRAM TOTAL : 9.0398s
- [COUNTERS] Fortran Overhead ( 0 ) : 4.6040s
- [COUNTERS] CudaCpp MEs ( 2 ) : 4.4358s for 8192 events => throughput is 1.85E+03 events/s
+ [COUNTERS] PROGRAM TOTAL : 9.1405s
+ [COUNTERS] Fortran Overhead ( 0 ) : 4.7023s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 4.4382s for 8192 events => throughput is 1.85E+03 events/s

 *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***

@@ -167,9 +167,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x
 [XSECTION] ChannelId = 2
 [XSECTION] Cross section = 0.000158 [1.5803725813026107E-004] fbridge_mode=1
 [UNWEIGHT] Wrote 207 events (found 1235 events)
- [COUNTERS] PROGRAM TOTAL : 55.1214s
- [COUNTERS] Fortran Overhead ( 0 ) : 6.2405s
- [COUNTERS] CudaCpp MEs ( 2 ) : 48.8809s for 90112 events => throughput is 1.84E+03 events/s
+ [COUNTERS] PROGRAM TOTAL : 55.0635s
+ [COUNTERS] Fortran Overhead ( 0 ) : 6.2258s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 48.8377s for 90112 events => throughput is 1.85E+03 events/s

 *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***

@@ -182,12 +182,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical

 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.904356e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.896262e+03 ) sec^-1

 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.897196e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.907010e+03 ) sec^-1

 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -210,9 +210,9 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x
 [XSECTION] ChannelId = 2
 [XSECTION] Cross section = 0.0003628 [3.6277277430934459E-004] fbridge_mode=1
 [UNWEIGHT] Wrote 59 events (found 420 events)
- [COUNTERS] PROGRAM TOTAL : 4.6768s
- [COUNTERS] Fortran Overhead ( 0 ) : 2.4567s
- [COUNTERS] CudaCpp MEs ( 2 ) : 2.2201s for 8192 events => throughput is 3.69E+03 events/s
+ [COUNTERS] PROGRAM TOTAL : 4.6732s
+ [COUNTERS] Fortran Overhead ( 0 ) : 2.4394s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 2.2338s for 8192 events => throughput is 3.67E+03 events/s

 *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***

@@ -243,9 +243,9 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x
 [XSECTION] ChannelId = 2
 [XSECTION] Cross section = 0.000158 [1.5803725816246315E-004] fbridge_mode=1
 [UNWEIGHT] Wrote 207 events (found 1235 events)
- [COUNTERS] PROGRAM TOTAL : 28.5645s
- [COUNTERS] Fortran Overhead ( 0 ) : 4.0918s
- [COUNTERS] CudaCpp MEs ( 2 ) : 24.4727s for 90112 events => throughput is 3.68E+03 events/s
+ [COUNTERS] PROGRAM TOTAL : 28.6839s
+ [COUNTERS] Fortran Overhead ( 0 ) : 4.1051s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 24.5787s for 90112 events => throughput is 3.67E+03 events/s

 *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***

@@ -258,12 +258,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical

 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.796653e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.776392e+03 ) sec^-1

 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.797971e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.785912e+03 ) sec^-1

 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -286,9 +286,9 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x
 [XSECTION] ChannelId = 2
 [XSECTION] Cross section = 0.0003628 [3.6277277419683297E-004] fbridge_mode=1
 [UNWEIGHT] Wrote 59 events (found 420 events)
- [COUNTERS] PROGRAM TOTAL : 2.2233s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.2282s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.9951s for 8192 events => throughput is 8.23E+03 events/s
+ [COUNTERS] PROGRAM TOTAL : 2.1949s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.2253s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.9695s for 8192 events => throughput is 8.45E+03 events/s

 *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***

@@ -319,9 +319,9 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x
 [XSECTION] ChannelId = 2
 [XSECTION] Cross section = 0.000158 [1.5803725810769321E-004] fbridge_mode=1
 [UNWEIGHT] Wrote 207 events (found 1235 events)
- [COUNTERS] PROGRAM TOTAL : 13.6725s
- [COUNTERS] Fortran Overhead ( 0 ) : 2.9431s
- [COUNTERS] CudaCpp MEs ( 2 ) : 10.7294s for 90112 events => throughput is 8.40E+03 events/s
+ [COUNTERS] PROGRAM TOTAL : 13.6246s
+ [COUNTERS] Fortran Overhead ( 0 ) : 2.8910s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 10.7336s for 90112 events => throughput is 8.40E+03 events/s

 *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***

@@ -334,12 +334,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical

 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 8.734061e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.640527e+03 ) sec^-1

 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 8.702913e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.678863e+03 ) sec^-1

 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -362,9 +362,9 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x
 [XSECTION] ChannelId = 2
 [XSECTION] Cross section = 0.0003628 [3.6277277419683297E-004] fbridge_mode=1
 [UNWEIGHT] Wrote 59 events (found 420 events)
- [COUNTERS] PROGRAM TOTAL : 1.9431s
+ [COUNTERS] PROGRAM TOTAL : 1.9438s
 [COUNTERS] Fortran Overhead ( 0 ) : 1.0984s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.8447s for 8192 events => throughput is 9.70E+03 events/s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.8454s for 8192 events => throughput is 9.69E+03 events/s

 *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***

@@ -395,9 +395,9 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x
 [XSECTION] ChannelId = 2
 [XSECTION] Cross section = 0.000158 [1.5803725810769321E-004] fbridge_mode=1
 [UNWEIGHT] Wrote 207 events (found 1235 events)
- [COUNTERS] PROGRAM TOTAL : 12.1703s
- [COUNTERS] Fortran Overhead ( 0 ) : 2.7526s
- [COUNTERS] CudaCpp MEs ( 2 ) : 9.4177s for 90112 events => throughput is 9.57E+03 events/s
+ [COUNTERS] PROGRAM TOTAL : 12.1386s
+ [COUNTERS] Fortran Overhead ( 0 ) : 2.7687s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 9.3699s for 90112 events => throughput is 9.62E+03 events/s

 *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***

@@ -410,12 +410,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical

 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 9.930286e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 9.922714e+03 ) sec^-1

 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 9.925319e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 9.865641e+03 ) sec^-1

 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -438,9 +438,9 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x
 [XSECTION] ChannelId = 2
 [XSECTION] Cross section = 0.0003628 [3.6277277419683297E-004] fbridge_mode=1
 [UNWEIGHT] Wrote 59 events (found 420 events)
- [COUNTERS] PROGRAM TOTAL : 2.4040s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.3330s
- [COUNTERS] CudaCpp MEs ( 2 ) : 1.0710s for 8192 events => throughput is 7.65E+03 events/s
+ [COUNTERS] PROGRAM TOTAL : 2.4280s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.3589s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 1.0691s for 8192 events => throughput is 7.66E+03 events/s

 *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***

@@ -471,9 +471,9 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x
 [XSECTION] ChannelId = 2
 [XSECTION] Cross section = 0.000158 [1.5803725810769321E-004] fbridge_mode=1
 [UNWEIGHT] Wrote 207 events (found 1235 events)
- [COUNTERS] PROGRAM TOTAL : 14.7788s
- [COUNTERS] Fortran Overhead ( 0 ) : 2.9945s
- [COUNTERS] CudaCpp MEs ( 2 ) : 11.7843s for 90112 events => throughput is 7.65E+03 events/s
+ [COUNTERS] PROGRAM TOTAL : 14.7988s
+ [COUNTERS] Fortran Overhead ( 0 ) : 2.9921s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 11.8068s for 90112 events => throughput is 7.63E+03 events/s

 *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***

@@ -486,12 +486,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical

 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
 Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 7.732657e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.748515e+03 ) sec^-1

 *** EXECUTE CHECK(8192) -p 256 32 1 ***
 Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 7.743920e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.727003e+03 ) sec^-1

 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) ***
 --------------------
@@ -514,9 +514,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttgg_
 [XSECTION] ChannelId = 2
 [XSECTION] Cross section = 0.0003628 [3.6277277293084701E-004] fbridge_mode=1
 [UNWEIGHT] Wrote 59 events (found 420 events)
- [COUNTERS] PROGRAM TOTAL : 0.8094s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.7765s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0329s for 8192 events => throughput is 2.49E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.8110s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.7780s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0330s for 8192 events => throughput is 2.48E+05 events/s

 *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec ***

@@ -547,9 +547,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttgg_
 [XSECTION] ChannelId = 2
 [XSECTION] Cross section = 0.000158 [1.5803725738731039E-004] fbridge_mode=1
 [UNWEIGHT] Wrote 207 events (found 1235 events)
- [COUNTERS] PROGRAM TOTAL : 2.7913s
- [COUNTERS] Fortran Overhead ( 0 ) : 2.4278s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.3635s for 90112 events => throughput is 2.48E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 2.7891s
+ [COUNTERS] Fortran Overhead ( 0 ) : 2.4268s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.3624s for 90112 events => throughput is 2.49E+05 events/s

 *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec ***

@@ -562,41 +562,41 @@ OK!
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.296986e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.302267e+05 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.531398e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.526771e+05 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.106777e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.095977e+05 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.144518e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.154067e+05 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.100756e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.104340e+05 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.161493e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.167057e+05 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.110035e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.099706e+05 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.425529e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.436803e+05 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt index 429acdedda..52fdbbde9d 100644 --- a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt @@ -2,8 +2,8 @@ Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/g CUDACPP_BUILDDIR='.' 
-make USEBUILDDIR=1 AVX=none +make USEBUILDDIR=1 AVX=none make USEBUILDDIR=1 AVX=sse4 make USEBUILDDIR=1 AVX=avx2 @@ -15,17 +15,17 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. @@ -33,7 +33,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2024-01-26_00:32:53 +DATE: 2024-01-27_20:01:42 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/a [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.169e-06 [1.1693100945435806E-006] fbridge_mode=0 [UNWEIGHT] Wrote 1 events (found 166 events) - [COUNTERS] PROGRAM TOTAL : 95.5184s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4509s - [COUNTERS] Fortran MEs ( 1 ) : 95.0675s for 8192 events => throughput is 8.62E+01 events/s + [COUNTERS] PROGRAM TOTAL : 95.6185s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4524s + [COUNTERS] Fortran MEs ( 1 ) : 95.1661s for 8192 events => throughput is 8.61E+01 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/a [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.169e-06 [1.1693100945435806E-006] fbridge_mode=0 [UNWEIGHT] Wrote 15 events (found 163 events) - [COUNTERS] PROGRAM TOTAL : 95.4670s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4447s - [COUNTERS] Fortran MEs ( 1 ) : 95.0223s for 8192 events => throughput is 8.62E+01 events/s + [COUNTERS] PROGRAM TOTAL : 95.6420s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4530s + [COUNTERS] Fortran MEs ( 1 ) : 95.1890s for 8192 events => throughput is 8.61E+01 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x10_fortran > /tmp/ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.136e-07 
[2.1358436158813979E-007] fbridge_mode=0 [UNWEIGHT] Wrote 84 events (found 808 events) - [COUNTERS] PROGRAM TOTAL : 1051.7230s - [COUNTERS] Fortran Overhead ( 0 ) : 4.1339s - [COUNTERS] Fortran MEs ( 1 ) : 1047.5891s for 90112 events => throughput is 8.60E+01 events/s + [COUNTERS] PROGRAM TOTAL : 1050.9211s + [COUNTERS] Fortran Overhead ( 0 ) : 4.0917s + [COUNTERS] Fortran MEs ( 1 ) : 1046.8295s for 90112 events => throughput is 8.61E+01 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -134,9 +134,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.169e-06 [1.1693100945435831E-006] fbridge_mode=1 [UNWEIGHT] Wrote 15 events (found 163 events) - [COUNTERS] PROGRAM TOTAL : 211.8832s - [COUNTERS] Fortran Overhead ( 0 ) : 96.0026s - [COUNTERS] CudaCpp MEs ( 2 ) : 115.8806s for 8192 events => throughput is 7.07E+01 events/s + [COUNTERS] PROGRAM TOTAL : 214.0886s + [COUNTERS] Fortran Overhead ( 0 ) : 96.2530s + [COUNTERS] CudaCpp MEs ( 2 ) : 117.8356s for 8192 events => throughput is 6.95E+01 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -167,9 +167,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.136e-07 [2.1358436158813950E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 808 events) - [COUNTERS] PROGRAM TOTAL : 1351.3046s - [COUNTERS] Fortran Overhead ( 0 ) : 101.1941s - [COUNTERS] CudaCpp MEs ( 2 ) : 1250.1105s for 90112 events => throughput is 7.21E+01 events/s + [COUNTERS] PROGRAM TOTAL : 1364.9336s + [COUNTERS] Fortran Overhead ( 0 ) : 99.9906s + [COUNTERS] CudaCpp MEs ( 2 ) : 1264.9430s for 90112 events => throughput is 7.12E+01 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -182,12 +182,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.478531e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.423695e+01 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.393754e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.101960e+01 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -210,9 +210,9 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.169e-06 [1.1693100945435831E-006] fbridge_mode=1 [UNWEIGHT] Wrote 15 events (found 163 events) - [COUNTERS] PROGRAM TOTAL : 107.0674s - [COUNTERS] Fortran Overhead ( 0 ) : 49.6711s - [COUNTERS] CudaCpp MEs ( 2 ) : 57.3962s for 8192 events => throughput is 1.43E+02 events/s + [COUNTERS] PROGRAM TOTAL : 107.5716s + [COUNTERS] Fortran Overhead ( 0 ) : 49.8673s + [COUNTERS] CudaCpp MEs ( 2 ) : 57.7042s for 8192 events => throughput is 1.42E+02 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -243,9 +243,9 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.136e-07 [2.1358436158813958E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 808 events) - [COUNTERS] PROGRAM TOTAL : 689.5263s - [COUNTERS] Fortran Overhead ( 0 ) : 53.3356s - [COUNTERS] CudaCpp MEs ( 2 ) : 636.1907s for 90112 events => throughput is 1.42E+02 events/s + [COUNTERS] PROGRAM TOTAL : 690.6820s + [COUNTERS] Fortran Overhead ( 0 ) : 53.5916s + [COUNTERS] CudaCpp MEs ( 2 ) : 637.0904s for 90112 events => throughput is 1.41E+02 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -258,12 +258,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.669367e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.659541e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.654299e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.662982e+02 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -286,9 +286,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.169e-06 [1.1693100945435827E-006] fbridge_mode=1 [UNWEIGHT] Wrote 15 events (found 163 events) - [COUNTERS] PROGRAM TOTAL : 49.8853s - [COUNTERS] Fortran Overhead ( 0 ) : 23.2240s - [COUNTERS] CudaCpp MEs ( 2 ) : 26.6613s for 8192 events => throughput is 3.07E+02 events/s + [COUNTERS] PROGRAM TOTAL : 50.0765s + [COUNTERS] Fortran Overhead ( 0 ) : 23.3145s + [COUNTERS] CudaCpp MEs ( 2 ) : 26.7620s for 8192 events => throughput is 3.06E+02 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -319,9 +319,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.136e-07 [2.1358436158813958E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 808 events) - [COUNTERS] PROGRAM TOTAL : 320.8191s - [COUNTERS] Fortran Overhead ( 0 ) : 26.7578s - [COUNTERS] CudaCpp MEs ( 2 ) : 294.0612s for 90112 events => throughput is 3.06E+02 events/s + [COUNTERS] PROGRAM TOTAL : 320.2875s + [COUNTERS] Fortran Overhead ( 0 ) : 27.1185s + [COUNTERS] CudaCpp MEs ( 2 ) : 293.1690s for 90112 events => throughput is 3.07E+02 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -334,12 +334,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.617508e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.601129e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.625262e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.617049e+02 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -362,9 +362,9 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.169e-06 [1.1693100945435827E-006] fbridge_mode=1 [UNWEIGHT] Wrote 15 events (found 163 events) - [COUNTERS] PROGRAM TOTAL : 44.0733s - [COUNTERS] Fortran Overhead ( 0 ) : 20.2334s - [COUNTERS] CudaCpp MEs ( 2 ) : 23.8398s for 8192 events => throughput is 3.44E+02 events/s + [COUNTERS] PROGRAM TOTAL : 44.0775s + [COUNTERS] Fortran Overhead ( 0 ) : 20.2382s + [COUNTERS] CudaCpp MEs ( 2 ) : 23.8393s for 8192 events => throughput is 3.44E+02 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -395,9 +395,9 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.136e-07 [2.1358436158813958E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 808 events) - [COUNTERS] PROGRAM TOTAL : 284.5750s - [COUNTERS] Fortran Overhead ( 0 ) : 23.7496s - [COUNTERS] CudaCpp MEs ( 2 ) : 260.8255s for 90112 events => throughput is 3.45E+02 events/s + [COUNTERS] PROGRAM TOTAL : 288.1186s + [COUNTERS] Fortran Overhead ( 0 ) : 23.9142s + [COUNTERS] CudaCpp MEs ( 2 ) : 264.2044s for 90112 events => throughput is 3.41E+02 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -410,12 +410,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.130335e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.168880e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.157512e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.113271e+02 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -438,9 +438,9 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.169e-06 [1.1693100945435827E-006] fbridge_mode=1 [UNWEIGHT] Wrote 15 events (found 163 events) - [COUNTERS] PROGRAM TOTAL : 45.4574s - [COUNTERS] Fortran Overhead ( 0 ) : 22.3247s - [COUNTERS] CudaCpp MEs ( 2 ) : 23.1327s for 8192 events => throughput is 3.54E+02 events/s + [COUNTERS] PROGRAM TOTAL : 45.3217s + [COUNTERS] Fortran Overhead ( 0 ) : 22.3591s + [COUNTERS] CudaCpp MEs ( 2 ) : 22.9626s for 8192 events => throughput is 3.57E+02 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -471,9 +471,9 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.136e-07 [2.1358436158813958E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 808 events) - [COUNTERS] PROGRAM TOTAL : 280.8463s - [COUNTERS] Fortran Overhead ( 0 ) : 25.8922s - [COUNTERS] CudaCpp MEs ( 2 ) : 254.9542s for 90112 events => throughput is 3.53E+02 events/s + [COUNTERS] PROGRAM TOTAL : 280.5957s + [COUNTERS] Fortran Overhead ( 0 ) : 26.0901s + [COUNTERS] CudaCpp MEs ( 2 ) : 254.5056s for 90112 events => throughput is 3.54E+02 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -486,12 +486,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.776742e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.791116e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.801039e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.767212e+02 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -514,9 +514,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttggg [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.169e-06 [1.1693100945435829E-006] fbridge_mode=1 [UNWEIGHT] Wrote 15 events (found 163 events) - [COUNTERS] PROGRAM TOTAL : 4.1976s - [COUNTERS] Fortran Overhead ( 0 ) : 3.1190s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.0786s for 8192 events => throughput is 7.60E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.2015s + [COUNTERS] Fortran Overhead ( 0 ) : 3.1184s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.0831s for 8192 events => throughput is 7.56E+03 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -547,9 +547,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttggg [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.136e-07 [2.1358436158813960E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 808 events) - [COUNTERS] PROGRAM TOTAL : 18.6950s - [COUNTERS] Fortran Overhead ( 0 ) : 6.7713s - [COUNTERS] CudaCpp MEs ( 2 ) : 11.9237s for 90112 events => throughput is 7.56E+03 events/s + [COUNTERS] PROGRAM TOTAL : 18.6414s + [COUNTERS] Fortran Overhead ( 0 ) : 6.7249s + [COUNTERS] CudaCpp MEs ( 2 ) : 11.9165s for 90112 events => throughput is 7.56E+03 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** @@ -562,41 +562,41 @@ OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.531209e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.531134e+03 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.272145e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.257699e+03 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 512 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.199498e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.280873e+03 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 512 32 1 *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.555619e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.583682e+03 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 128 128 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.264388e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.232524e+03 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 128 128 1 *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.490561e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.452604e+03 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.258329e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.262155e+03 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.243688e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.246198e+03 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt index 3091cfced1..fe3eae6140 100644 --- a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt @@ -1,11 +1,11 @@ Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg CUDACPP_BUILDDIR='.' 
+ make USEBUILDDIR=1 AVX=none make USEBUILDDIR=1 AVX=sse4 - make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512y @@ -15,15 +15,15 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. @@ -33,7 +33,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2024-01-26_01:58:10 +DATE: 2024-01-27_21:27:23 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/a [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.169e-06 [1.1693100945435806E-006] fbridge_mode=0 [UNWEIGHT] Wrote 1 events (found 166 events) - [COUNTERS] PROGRAM TOTAL : 95.5916s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4454s - [COUNTERS] Fortran MEs ( 1 ) : 95.1462s for 8192 events => throughput is 8.61E+01 events/s + [COUNTERS] PROGRAM TOTAL : 95.7877s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4488s + [COUNTERS] Fortran MEs ( 1 ) : 95.3389s for 8192 events => throughput is 8.59E+01 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/a [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.169e-06 [1.1693100945435806E-006] fbridge_mode=0 [UNWEIGHT] Wrote 15 events (found 163 events) - [COUNTERS] PROGRAM TOTAL : 95.5628s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4434s - [COUNTERS] Fortran MEs ( 1 ) : 95.1194s for 8192 events => throughput is 8.61E+01 events/s + [COUNTERS] PROGRAM TOTAL : 96.0064s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4553s + [COUNTERS] Fortran MEs ( 1 ) : 95.5511s for 8192 events => throughput is 8.57E+01 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x10_fortran > /tmp/ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.136e-07 [2.1358436158813979E-007] fbridge_mode=0 [UNWEIGHT] Wrote 84 events (found 808 events) - [COUNTERS] PROGRAM TOTAL : 1054.9783s - [COUNTERS] Fortran Overhead ( 0 ) : 4.1232s - [COUNTERS] Fortran MEs ( 1 ) : 
1050.8551s for 90112 events => throughput is 8.58E+01 events/s + [COUNTERS] PROGRAM TOTAL : 1055.8027s + [COUNTERS] Fortran Overhead ( 0 ) : 4.1162s + [COUNTERS] Fortran MEs ( 1 ) : 1051.6865s for 90112 events => throughput is 8.57E+01 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -134,9 +134,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.169e-06 [1.1694768374083672E-006] fbridge_mode=1 [UNWEIGHT] Wrote 15 events (found 163 events) - [COUNTERS] PROGRAM TOTAL : 191.8272s - [COUNTERS] Fortran Overhead ( 0 ) : 88.5499s - [COUNTERS] CudaCpp MEs ( 2 ) : 103.2773s for 8192 events => throughput is 7.93E+01 events/s + [COUNTERS] PROGRAM TOTAL : 192.0455s + [COUNTERS] Fortran Overhead ( 0 ) : 88.8202s + [COUNTERS] CudaCpp MEs ( 2 ) : 103.2254s for 8192 events => throughput is 7.94E+01 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -167,9 +167,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.136e-07 [2.1361435710758843E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 808 events) - [COUNTERS] PROGRAM TOTAL : 1227.9199s - [COUNTERS] Fortran Overhead ( 0 ) : 92.2084s - [COUNTERS] CudaCpp MEs ( 2 ) : 1135.7115s for 90112 events => throughput is 7.93E+01 events/s + [COUNTERS] PROGRAM TOTAL : 1232.5237s + [COUNTERS] Fortran Overhead ( 0 ) : 92.7257s + [COUNTERS] CudaCpp MEs ( 2 ) : 1139.7980s for 90112 events => throughput is 7.91E+01 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -182,12 +182,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.299750e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.282100e+01 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.305787e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.305531e+01 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -210,9 +210,9 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.169e-06 [1.1694765360831655E-006] fbridge_mode=1 [UNWEIGHT] Wrote 15 events (found 163 events) - [COUNTERS] PROGRAM TOTAL : 48.8237s - [COUNTERS] Fortran Overhead ( 0 ) : 23.0435s - [COUNTERS] CudaCpp MEs ( 2 ) : 25.7802s for 8192 events => throughput is 3.18E+02 events/s + [COUNTERS] PROGRAM TOTAL : 49.0764s + [COUNTERS] Fortran Overhead ( 0 ) : 23.2911s + [COUNTERS] CudaCpp MEs ( 2 ) : 25.7852s for 8192 events => throughput is 3.18E+02 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -243,9 +243,9 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.136e-07 [2.1361429212586563E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 808 events) - [COUNTERS] PROGRAM TOTAL : 309.9872s - [COUNTERS] Fortran Overhead ( 0 ) : 26.8030s - [COUNTERS] CudaCpp MEs ( 2 ) : 283.1842s for 90112 events => throughput is 3.18E+02 
events/s + [COUNTERS] PROGRAM TOTAL : 312.6919s + [COUNTERS] Fortran Overhead ( 0 ) : 26.8578s + [COUNTERS] CudaCpp MEs ( 2 ) : 285.8342s for 90112 events => throughput is 3.15E+02 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -258,12 +258,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.631881e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.601653e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.602316e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.596532e+02 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -286,9 +286,9 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.169e-06 [1.1694764906356561E-006] fbridge_mode=1 [UNWEIGHT] Wrote 15 events (found 163 events) - [COUNTERS] PROGRAM TOTAL : 25.2458s - [COUNTERS] Fortran Overhead ( 0 ) : 11.9347s - [COUNTERS] CudaCpp MEs ( 2 ) : 13.3111s for 8192 events => throughput is 6.15E+02 events/s + [COUNTERS] PROGRAM TOTAL : 25.2888s + [COUNTERS] Fortran Overhead ( 0 ) : 11.8515s + [COUNTERS] CudaCpp MEs ( 2 ) : 13.4373s for 8192 events => throughput is 6.10E+02 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -319,9 +319,9 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.136e-07 [2.1361429111797059E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 808 events) - [COUNTERS] PROGRAM TOTAL : 162.2911s - [COUNTERS] Fortran Overhead ( 0 ) : 15.4533s - [COUNTERS] CudaCpp MEs ( 2 ) : 146.8378s for 90112 events => throughput is 6.14E+02 events/s + [COUNTERS] PROGRAM TOTAL : 162.4319s + [COUNTERS] Fortran Overhead ( 0 ) : 15.3978s + [COUNTERS] CudaCpp MEs ( 2 ) : 147.0342s for 90112 events => throughput is 6.13E+02 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -334,12 +334,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.251206e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.206264e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.245377e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.127912e+02 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -362,9 +362,9 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.169e-06 [1.1694764906356561E-006] fbridge_mode=1 [UNWEIGHT] Wrote 15 events (found 163 events) - [COUNTERS] PROGRAM TOTAL : 22.5039s - [COUNTERS] Fortran Overhead ( 0 ) : 10.4156s - [COUNTERS] CudaCpp MEs ( 2 ) : 12.0883s for 8192 events => throughput is 6.78E+02 events/s + [COUNTERS] PROGRAM TOTAL : 22.4910s + [COUNTERS] Fortran Overhead ( 0 ) : 10.5292s + [COUNTERS] CudaCpp MEs ( 2 ) : 11.9618s for 8192 events => throughput is 6.85E+02 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -395,9 +395,9 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.136e-07 [2.1361429111797059E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 808 events) - [COUNTERS] PROGRAM TOTAL : 146.3643s - [COUNTERS] Fortran Overhead ( 0 ) : 14.1002s - [COUNTERS] CudaCpp MEs ( 2 ) : 132.2641s for 90112 events => throughput is 6.81E+02 events/s + [COUNTERS] PROGRAM TOTAL : 145.3442s + [COUNTERS] Fortran Overhead ( 0 ) : 14.0492s + [COUNTERS] CudaCpp MEs ( 2 ) : 131.2950s for 90112 events => throughput is 6.86E+02 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -410,12 +410,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.100603e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.195828e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.238263e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.228494e+02 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -438,9 +438,9 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.169e-06 [1.1694768276769753E-006] fbridge_mode=1 [UNWEIGHT] Wrote 15 events (found 163 events) - [COUNTERS] PROGRAM TOTAL : 22.5776s - [COUNTERS] Fortran Overhead ( 0 ) : 11.2452s - [COUNTERS] CudaCpp MEs ( 2 ) : 11.3325s for 8192 events => throughput is 7.23E+02 events/s + [COUNTERS] PROGRAM TOTAL : 22.9016s + [COUNTERS] Fortran Overhead ( 0 ) : 11.4949s + [COUNTERS] CudaCpp MEs ( 2 ) : 11.4068s for 8192 events => throughput is 7.18E+02 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -471,9 +471,9 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.136e-07 [2.1361435948756818E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 808 events) - [COUNTERS] PROGRAM TOTAL : 140.6031s - [COUNTERS] Fortran Overhead ( 0 ) : 15.1098s - [COUNTERS] CudaCpp MEs ( 2 ) : 125.4933s for 90112 events => throughput is 7.18E+02 events/s + [COUNTERS] PROGRAM TOTAL : 141.4330s + [COUNTERS] Fortran Overhead ( 0 ) : 14.9090s + [COUNTERS] CudaCpp MEs ( 2 ) : 126.5241s for 90112 events => throughput is 7.12E+02 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -486,12 +486,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.608240e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.594821e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.521625e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.588228e+02 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -514,9 +514,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttggg [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.169e-06 [1.1694770708194997E-006] fbridge_mode=1 [UNWEIGHT] Wrote 15 events (found 163 events) - [COUNTERS] PROGRAM TOTAL : 2.4649s - [COUNTERS] Fortran Overhead ( 0 ) : 1.9743s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.4905s for 8192 events => throughput is 1.67E+04 events/s + [COUNTERS] PROGRAM TOTAL : 2.4748s + [COUNTERS] Fortran Overhead ( 0 ) : 1.9740s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.5008s for 8192 events => throughput is 1.64E+04 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -547,9 +547,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttggg [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.136e-07 [2.1361443477565656E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 808 events) - [COUNTERS] PROGRAM TOTAL : 11.0984s - [COUNTERS] Fortran Overhead ( 0 ) : 5.5613s - [COUNTERS] CudaCpp MEs ( 2 ) : 5.5371s for 90112 events => throughput is 1.63E+04 events/s + [COUNTERS] PROGRAM TOTAL : 10.9971s + [COUNTERS] Fortran Overhead ( 0 ) : 5.5646s + [COUNTERS] CudaCpp MEs ( 2 ) : 5.4326s for 90112 events => throughput is 1.66E+04 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** @@ -562,41 +562,41 @@ OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.642284e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.639284e+04 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.624354e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.641835e+04 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 512 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.288681e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.349279e+04 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 512 32 1 *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.340792e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.430829e+04 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 128 128 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.346593e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.310161e+04 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 128 128 1 *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.342416e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.351327e+04 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.314317e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.298097e+04 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.422033e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.390837e+03 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt index 63e196188b..e6abf766e6 100644 --- a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt @@ -3,9 +3,9 @@ CUDACPP_BUILDDIR='.' 
- make USEBUILDDIR=1 AVX=none make USEBUILDDIR=1 AVX=sse4 + make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512y @@ -15,15 +15,15 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' -CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. +CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. @@ -33,7 +33,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2024-01-26_03:02:40 +DATE: 2024-01-27_22:32:05 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/a [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.169e-06 [1.1693100945435806E-006] fbridge_mode=0 [UNWEIGHT] Wrote 1 events (found 166 events) - [COUNTERS] PROGRAM TOTAL : 95.5897s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4484s - [COUNTERS] Fortran MEs ( 1 ) : 95.1413s for 8192 events => throughput is 8.61E+01 events/s + [COUNTERS] PROGRAM TOTAL : 95.7215s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4454s + [COUNTERS] Fortran MEs ( 1 ) : 95.2761s for 8192 events => throughput is 8.60E+01 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/a [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.169e-06 [1.1693100945435806E-006] fbridge_mode=0 [UNWEIGHT] Wrote 15 events (found 163 events) - [COUNTERS] PROGRAM TOTAL : 95.9550s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4482s - [COUNTERS] Fortran MEs ( 1 ) : 95.5068s for 8192 events => throughput is 8.58E+01 events/s + [COUNTERS] PROGRAM TOTAL : 95.7236s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4504s + [COUNTERS] Fortran MEs ( 1 ) : 95.2731s for 8192 events => throughput is 8.60E+01 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x10_fortran > /tmp/ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.136e-07 [2.1358436158813979E-007] fbridge_mode=0 [UNWEIGHT] Wrote 84 events (found 808 events) - [COUNTERS] PROGRAM TOTAL : 1052.1674s - [COUNTERS] Fortran Overhead ( 0 ) : 4.1033s - [COUNTERS] Fortran MEs ( 1 ) : 
1048.0641s for 90112 events => throughput is 8.60E+01 events/s + [COUNTERS] PROGRAM TOTAL : 1054.9291s + [COUNTERS] Fortran Overhead ( 0 ) : 4.1216s + [COUNTERS] Fortran MEs ( 1 ) : 1050.8075s for 90112 events => throughput is 8.58E+01 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -134,9 +134,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.169e-06 [1.1693101016896844E-006] fbridge_mode=1 [UNWEIGHT] Wrote 15 events (found 163 events) - [COUNTERS] PROGRAM TOTAL : 211.1386s - [COUNTERS] Fortran Overhead ( 0 ) : 97.2926s - [COUNTERS] CudaCpp MEs ( 2 ) : 113.8460s for 8192 events => throughput is 7.20E+01 events/s + [COUNTERS] PROGRAM TOTAL : 210.1883s + [COUNTERS] Fortran Overhead ( 0 ) : 97.1779s + [COUNTERS] CudaCpp MEs ( 2 ) : 113.0104s for 8192 events => throughput is 7.25E+01 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -167,9 +167,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.136e-07 [2.1358436275882778E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 808 events) - [COUNTERS] PROGRAM TOTAL : 1351.7559s - [COUNTERS] Fortran Overhead ( 0 ) : 100.8234s - [COUNTERS] CudaCpp MEs ( 2 ) : 1250.9325s for 90112 events => throughput is 7.20E+01 events/s + [COUNTERS] PROGRAM TOTAL : 1342.5497s + [COUNTERS] Fortran Overhead ( 0 ) : 100.9547s + [COUNTERS] CudaCpp MEs ( 2 ) : 1241.5950s for 90112 events => throughput is 7.26E+01 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -182,12 +182,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.502446e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.476387e+01 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.479145e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.492210e+01 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -210,9 +210,9 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.169e-06 [1.1693101020910778E-006] fbridge_mode=1 [UNWEIGHT] Wrote 15 events (found 163 events) - [COUNTERS] PROGRAM TOTAL : 109.1320s - [COUNTERS] Fortran Overhead ( 0 ) : 50.5216s - [COUNTERS] CudaCpp MEs ( 2 ) : 58.6104s for 8192 events => throughput is 1.40E+02 events/s + [COUNTERS] PROGRAM TOTAL : 110.3661s + [COUNTERS] Fortran Overhead ( 0 ) : 51.0384s + [COUNTERS] CudaCpp MEs ( 2 ) : 59.3277s for 8192 events => throughput is 1.38E+02 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -243,9 +243,9 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.136e-07 [2.1358436284111587E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 808 events) - [COUNTERS] PROGRAM TOTAL : 701.1097s - [COUNTERS] Fortran Overhead ( 0 ) : 54.2795s - [COUNTERS] CudaCpp MEs ( 2 ) : 646.8302s for 90112 events => throughput is 1.39E+02 
events/s + [COUNTERS] PROGRAM TOTAL : 707.2110s + [COUNTERS] Fortran Overhead ( 0 ) : 55.1542s + [COUNTERS] CudaCpp MEs ( 2 ) : 652.0568s for 90112 events => throughput is 1.38E+02 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -258,12 +258,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.636656e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.635229e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.635010e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.631046e+02 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -286,9 +286,9 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.169e-06 [1.1693101021831069E-006] fbridge_mode=1 [UNWEIGHT] Wrote 15 events (found 163 events) - [COUNTERS] PROGRAM TOTAL : 47.2278s - [COUNTERS] Fortran Overhead ( 0 ) : 21.8795s - [COUNTERS] CudaCpp MEs ( 2 ) : 25.3483s for 8192 events => throughput is 3.23E+02 events/s + [COUNTERS] PROGRAM TOTAL : 47.8962s + [COUNTERS] Fortran Overhead ( 0 ) : 22.3291s + [COUNTERS] CudaCpp MEs ( 2 ) : 25.5671s for 8192 events => throughput is 3.20E+02 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -319,9 +319,9 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.136e-07 [2.1358436281462147E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 808 events) - [COUNTERS] PROGRAM TOTAL : 308.5589s - [COUNTERS] Fortran Overhead ( 0 ) : 25.6558s - [COUNTERS] CudaCpp MEs ( 2 ) : 282.9030s for 90112 events => throughput is 3.19E+02 events/s + [COUNTERS] PROGRAM TOTAL : 307.3684s + [COUNTERS] Fortran Overhead ( 0 ) : 25.4999s + [COUNTERS] CudaCpp MEs ( 2 ) : 281.8685s for 90112 events => throughput is 3.20E+02 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -334,12 +334,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.824293e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.835214e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.845973e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.800587e+02 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -362,9 +362,9 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.169e-06 [1.1693101021831069E-006] fbridge_mode=1 [UNWEIGHT] Wrote 15 events (found 163 events) - [COUNTERS] PROGRAM TOTAL : 42.5262s - [COUNTERS] Fortran Overhead ( 0 ) : 19.1736s - [COUNTERS] CudaCpp MEs ( 2 ) : 23.3526s for 8192 events => throughput is 3.51E+02 events/s + [COUNTERS] PROGRAM TOTAL : 42.0423s + [COUNTERS] Fortran Overhead ( 0 ) : 19.2394s + [COUNTERS] CudaCpp MEs ( 2 ) : 22.8029s for 8192 events => throughput is 3.59E+02 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -395,9 +395,9 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.136e-07 [2.1358436281462147E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 808 events) - [COUNTERS] PROGRAM TOTAL : 276.3285s - [COUNTERS] Fortran Overhead ( 0 ) : 22.8039s - [COUNTERS] CudaCpp MEs ( 2 ) : 253.5246s for 90112 events => throughput is 3.55E+02 events/s + [COUNTERS] PROGRAM TOTAL : 275.6535s + [COUNTERS] Fortran Overhead ( 0 ) : 22.9689s + [COUNTERS] CudaCpp MEs ( 2 ) : 252.6846s for 90112 events => throughput is 3.57E+02 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -410,12 +410,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.315827e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.383037e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.342660e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.388150e+02 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -438,9 +438,9 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.169e-06 [1.1693101021831069E-006] fbridge_mode=1 [UNWEIGHT] Wrote 15 events (found 163 events) - [COUNTERS] PROGRAM TOTAL : 44.1656s - [COUNTERS] Fortran Overhead ( 0 ) : 21.4926s - [COUNTERS] CudaCpp MEs ( 2 ) : 22.6730s for 8192 events => throughput is 3.61E+02 events/s + [COUNTERS] PROGRAM TOTAL : 44.1798s + [COUNTERS] Fortran Overhead ( 0 ) : 21.7720s + [COUNTERS] CudaCpp MEs ( 2 ) : 22.4079s for 8192 events => throughput is 3.66E+02 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -471,9 +471,9 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.136e-07 [2.1358436281462147E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 808 events) - [COUNTERS] PROGRAM TOTAL : 271.5361s - [COUNTERS] Fortran Overhead ( 0 ) : 25.1621s - [COUNTERS] CudaCpp MEs ( 2 ) : 246.3740s for 90112 events => throughput is 3.66E+02 events/s + [COUNTERS] PROGRAM TOTAL : 273.0007s + [COUNTERS] Fortran Overhead ( 0 ) : 25.3100s + [COUNTERS] CudaCpp MEs ( 2 ) : 247.6907s for 90112 events => throughput is 3.64E+02 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -486,12 +486,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.884301e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.889496e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.891512e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.892701e+02 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -514,9 +514,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttggg [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.169e-06 [1.1693100942770687E-006] fbridge_mode=1 [UNWEIGHT] Wrote 15 events (found 163 events) - [COUNTERS] PROGRAM TOTAL : 3.5335s - [COUNTERS] Fortran Overhead ( 0 ) : 2.6723s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.8612s for 8192 events => throughput is 9.51E+03 events/s + [COUNTERS] PROGRAM TOTAL : 3.5395s + [COUNTERS] Fortran Overhead ( 0 ) : 2.6766s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.8629s for 8192 events => throughput is 9.49E+03 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -547,9 +547,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttggg [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.136e-07 [2.1358436157495368E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 808 events) - [COUNTERS] PROGRAM TOTAL : 15.7822s - [COUNTERS] Fortran Overhead ( 0 ) : 6.3002s - [COUNTERS] CudaCpp MEs ( 2 ) : 9.4820s for 90112 events => throughput is 9.50E+03 events/s + [COUNTERS] PROGRAM TOTAL : 15.8536s + [COUNTERS] Fortran Overhead ( 0 ) : 6.3552s + [COUNTERS] CudaCpp MEs ( 2 ) : 9.4985s for 90112 events => throughput is 9.49E+03 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** @@ -562,41 +562,41 @@ OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.442598e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.420712e+03 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.084385e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.085781e+04 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 512 32 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.111822e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.110953e+04 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 512 32 1 *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.156542e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.160771e+04 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 128 128 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.110917e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.113092e+04 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 128 128 1 *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.113389e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.109796e+04 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 --bridge *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.108659e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.110480e+04 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 *** Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.656419e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.644429e+03 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt index 18a212f5c3..8eabbec827 100644 --- a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt @@ -2,9 +2,9 @@ Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/g CUDACPP_BUILDDIR='.' 
- make USEBUILDDIR=1 AVX=none + make USEBUILDDIR=1 AVX=sse4 make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512y @@ -15,25 +15,25 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' OMP_NUM_THREADS= -DATE: 2024-01-26_00:31:25 +DATE: 2024-01-27_20:00:14 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050333309703716] fbridge_mode=0 [UNWEIGHT] Wrote 78 events (found 561 events) - [COUNTERS] PROGRAM TOTAL : 0.3131s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2435s - [COUNTERS] Fortran MEs ( 1 ) : 0.0696s for 8192 events => throughput is 1.18E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3143s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2439s + [COUNTERS] Fortran MEs ( 1 ) : 0.0704s for 8192 events => throughput is 1.16E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050333309703716] fbridge_mode=0 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.3035s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2330s - [COUNTERS] Fortran MEs ( 1 ) : 0.0705s for 8192 events => throughput is 1.16E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3054s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2351s + [COUNTERS] Fortran MEs ( 1 ) : 0.0703s for 8192 events => throughput is 1.16E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x10_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.218 [0.21801182648615872] fbridge_mode=0 
[UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 2.2442s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4822s - [COUNTERS] Fortran MEs ( 1 ) : 0.7620s for 90112 events => throughput is 1.18E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.2457s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4837s + [COUNTERS] Fortran MEs ( 1 ) : 0.7619s for 90112 events => throughput is 1.18E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -134,9 +134,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050333309703710] fbridge_mode=1 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.3921s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3155s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0767s for 8192 events => throughput is 1.07E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3917s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3152s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0765s for 8192 events => throughput is 1.07E+05 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -167,9 +167,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.218 [0.21801182648615872] fbridge_mode=1 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 2.4324s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5851s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.8472s for 90112 events => throughput is 1.06E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.4447s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5965s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.8482s for 90112 events => throughput is 1.06E+05 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -182,12 +182,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.071399e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.048805e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.068727e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.069775e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -210,9 +210,9 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050333309703727] fbridge_mode=1 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.3172s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2776s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0397s for 8192 events => throughput is 2.07E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3352s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2925s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0427s for 8192 events => throughput is 1.92E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -243,9 +243,9 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.218 [0.21801182648615872] fbridge_mode=1 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 1.9768s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5408s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.4360s for 90112 events => throughput is 2.07E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.9882s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5515s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.4367s for 90112 events => throughput is 2.06E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -258,12 +258,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.067409e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.077139e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.048112e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.055483e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -286,9 +286,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050333309703727] fbridge_mode=1 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.2851s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2615s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0236s for 8192 events => throughput is 3.47E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2859s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2622s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0237s for 8192 events => throughput is 3.46E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -319,9 +319,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.218 [0.21801182648615869] fbridge_mode=1 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 1.7839s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5289s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2550s for 90112 events => throughput is 3.53E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.7885s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5334s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2551s for 90112 events => throughput is 3.53E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -334,12 +334,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.399317e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.447216e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.503492e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.524839e+05 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -362,9 +362,9 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050333309703727] fbridge_mode=1 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.2793s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2587s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0206s for 8192 events => throughput is 3.99E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2826s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2619s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0206s for 8192 events => throughput is 3.97E+05 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -395,9 +395,9 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.218 [0.21801182648615869] fbridge_mode=1 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 1.7452s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5225s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2227s for 90112 events => throughput is 4.05E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.7547s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5306s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2241s for 90112 events => throughput is 4.02E+05 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -410,12 +410,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.854052e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.011517e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.940709e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.937850e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -438,9 +438,9 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050333309703727] fbridge_mode=1 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.3198s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2874s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0324s for 8192 events => throughput is 2.53E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2980s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2688s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0292s for 8192 events => throughput is 2.81E+05 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -471,9 +471,9 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.218 [0.21801182648615869] fbridge_mode=1 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 1.8630s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5384s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3247s for 90112 events => throughput is 2.78E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.8787s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5475s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3312s for 90112 events => throughput is 2.72E+05 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -486,12 +486,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.752350e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.707317e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.723924e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.724189e+05 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -514,9 +514,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_gqttq_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050333309703733] fbridge_mode=1 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.6671s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6665s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.18E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.6679s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6673s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.20E+07 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -547,9 +547,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_gqttq_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.218 [0.21801182648615869] fbridge_mode=1 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 1.9360s - [COUNTERS] Fortran Overhead ( 0 ) : 1.9284s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0076s for 90112 events => throughput is 1.19E+07 events/s + [COUNTERS] PROGRAM TOTAL : 1.9463s + [COUNTERS] Fortran Overhead ( 0 ) : 1.9386s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0077s for 90112 events => throughput is 1.17E+07 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** @@ -562,41 +562,41 @@ OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.551656e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.531327e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.992280e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.025533e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.371489e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.381686e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.506906e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.530559e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.393433e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.390744e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.788279e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.773323e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.383321e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.392670e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.769455e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.778464e+07 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt index f81f538e39..e7ce883183 100644 --- a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt @@ -4,8 +4,8 @@ CUDACPP_BUILDDIR='.' 
make USEBUILDDIR=1 AVX=none - make USEBUILDDIR=1 AVX=sse4 + make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512y @@ -15,13 +15,13 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. @@ -33,7 +33,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2024-01-26_00:31:55 +DATE: 2024-01-27_20:00:44 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050333309703716] fbridge_mode=0 [UNWEIGHT] Wrote 78 events (found 561 events) - [COUNTERS] PROGRAM TOTAL : 0.3054s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2364s - [COUNTERS] Fortran MEs ( 1 ) : 0.0691s for 8192 events => throughput is 1.19E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3058s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2368s + [COUNTERS] Fortran MEs ( 1 ) : 0.0690s for 8192 events => throughput is 1.19E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050333309703716] fbridge_mode=0 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.3002s + [COUNTERS] PROGRAM TOTAL : 0.3001s [COUNTERS] Fortran Overhead ( 0 ) : 0.2314s - [COUNTERS] Fortran MEs ( 1 ) : 0.0688s for 8192 events => throughput is 1.19E+05 events/s + [COUNTERS] Fortran MEs ( 1 ) : 0.0687s for 8192 events => throughput is 1.19E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x10_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.218 [0.21801182648615872] fbridge_mode=0 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 2.2344s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4729s - [COUNTERS] Fortran MEs ( 1 ) : 0.7615s for 90112 events => throughput is 1.18E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.2330s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4705s + [COUNTERS] Fortran MEs ( 1 ) : 0.7624s for 90112 events => throughput is 1.18E+05 events/s *** (2-none) EXECUTE 
MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -134,9 +134,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050314903825744] fbridge_mode=1 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.3899s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3166s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0733s for 8192 events => throughput is 1.12E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3755s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3063s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0691s for 8192 events => throughput is 1.19E+05 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -167,9 +167,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.218 [0.21801181770186087] fbridge_mode=1 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 2.3620s - [COUNTERS] Fortran Overhead ( 0 ) : 1.6003s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.7617s for 90112 events => throughput is 1.18E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.3365s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5781s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.7584s for 90112 events => throughput is 1.19E+05 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -182,12 +182,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.201826e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.186203e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.203566e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.205535e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -210,9 +210,9 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050310835231938] fbridge_mode=1 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.2875s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2639s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0236s for 8192 events => throughput is 3.47E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2887s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2652s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0235s for 8192 events => throughput is 3.48E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -243,9 +243,9 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.218 [0.21801177817838580] fbridge_mode=1 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 1.7927s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5347s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2579s for 90112 events => throughput is 3.49E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.7885s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5301s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2584s for 90112 events => throughput is 3.49E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -258,12 +258,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.501400e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.410521e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.487643e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.312954e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -286,9 +286,9 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050310803492405] fbridge_mode=1 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.2644s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2519s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0125s for 8192 events => throughput is 6.53E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2633s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2510s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0123s for 8192 events => throughput is 6.65E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -319,9 +319,9 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.218 [0.21801177493542723] fbridge_mode=1 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 1.7527s - [COUNTERS] Fortran Overhead ( 0 ) : 1.6097s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1430s for 90112 events => throughput is 6.30E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.6598s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5246s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1352s for 90112 events => throughput is 6.66E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -334,12 +334,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.944046e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.502420e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.933575e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.518865e+05 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -362,9 +362,9 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050310803492405] fbridge_mode=1 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.2699s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2582s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0117s for 8192 events => throughput is 6.98E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2606s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2497s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0109s for 8192 events => throughput is 7.55E+05 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -395,9 +395,9 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.218 [0.21801177493542723] fbridge_mode=1 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 1.6415s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5199s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1215s for 90112 events => throughput is 7.42E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.6476s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5254s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1222s for 90112 events => throughput is 7.37E+05 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -410,12 +410,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.170126e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.140387e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.259977e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.180727e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -438,9 +438,9 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050317064561834] fbridge_mode=1 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.2699s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2547s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0152s for 8192 events => throughput is 5.38E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2742s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2583s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0159s for 8192 events => throughput is 5.15E+05 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -471,9 +471,9 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.218 [0.21801182143140752] fbridge_mode=1 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 1.6930s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5258s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1672s for 90112 events => throughput is 5.39E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.7363s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5620s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1743s for 90112 events => throughput is 5.17E+05 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -486,12 +486,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.209904e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.838710e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.116271e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.732174e+05 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -514,9 +514,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_gqttq_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050319131407651] fbridge_mode=1 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.6657s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6651s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0005s for 8192 events => throughput is 1.56E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.6668s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6662s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0005s for 8192 events => throughput is 1.53E+07 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -547,9 +547,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_gqttq_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.218 [0.21801186038252196] fbridge_mode=1 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 1.9408s - [COUNTERS] Fortran Overhead ( 0 ) : 1.9350s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0059s for 90112 events => throughput is 1.53E+07 events/s + [COUNTERS] PROGRAM TOTAL : 2.0126s + [COUNTERS] Fortran Overhead ( 0 ) : 2.0061s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0065s for 90112 events => throughput is 1.40E+07 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** @@ -562,41 +562,41 @@ OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.849520e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.753799e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.485887e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.525254e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.841126e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.832384e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.716684e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.703610e+08 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.852475e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.848857e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.797125e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.802905e+08 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.386392e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.395978e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.948163e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.145333e+07 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt index 655570da22..ecf11d905f 100644 --- a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt @@ -1,11 +1,11 @@ Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu CUDACPP_BUILDDIR='.' 
-make USEBUILDDIR=1 AVX=none - +make USEBUILDDIR=1 AVX=none make USEBUILDDIR=1 AVX=sse4 + make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512y @@ -15,13 +15,13 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' -CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' -CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. @@ -33,7 +33,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2024-01-26_00:32:23 +DATE: 2024-01-27_20:01:12 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050333309703716] fbridge_mode=0 [UNWEIGHT] Wrote 78 events (found 561 events) - [COUNTERS] PROGRAM TOTAL : 0.3048s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2353s - [COUNTERS] Fortran MEs ( 1 ) : 0.0695s for 8192 events => throughput is 1.18E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3155s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2413s + [COUNTERS] Fortran MEs ( 1 ) : 0.0742s for 8192 events => throughput is 1.10E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050333309703716] fbridge_mode=0 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.3071s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2376s - [COUNTERS] Fortran MEs ( 1 ) : 0.0695s for 8192 events => throughput is 1.18E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3197s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2476s + [COUNTERS] Fortran MEs ( 1 ) : 0.0721s for 8192 events => throughput is 1.14E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x10_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.218 [0.21801182648615872] fbridge_mode=0 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 2.2412s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4768s - [COUNTERS] Fortran MEs ( 1 ) : 0.7644s for 90112 events => throughput is 1.18E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.2369s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4748s + [COUNTERS] Fortran MEs ( 1 ) : 0.7621s 
for 90112 events => throughput is 1.18E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -134,7 +134,7 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050333282657206] fbridge_mode=1 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.3930s + [COUNTERS] PROGRAM TOTAL : 0.3928s [COUNTERS] Fortran Overhead ( 0 ) : 0.3153s [COUNTERS] CudaCpp MEs ( 2 ) : 0.0776s for 8192 events => throughput is 1.06E+05 events/s @@ -167,9 +167,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.218 [0.21801182636608801] fbridge_mode=1 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 2.4382s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5825s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.8557s for 90112 events => throughput is 1.05E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.4489s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5943s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.8546s for 90112 events => throughput is 1.05E+05 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -182,12 +182,12 @@ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.063006e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.058266e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.063531e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.067039e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -210,9 +210,9 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050333282657212] fbridge_mode=1 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.3167s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2775s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0392s for 8192 events => throughput is 2.09E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3162s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2769s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0393s for 8192 events => throughput is 2.08E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -243,9 +243,9 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.218 [0.21801182636608804] fbridge_mode=1 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 1.9800s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5472s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.4327s for 90112 events => throughput is 2.08E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.0083s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5670s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.4414s for 90112 events => throughput is 2.04E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -258,12 +258,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.019971e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.029172e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.997392e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.018217e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -286,9 +286,9 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050333291481387] fbridge_mode=1 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.2859s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2625s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0233s for 8192 events => throughput is 3.51E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2842s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2614s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0228s for 8192 events => throughput is 3.59E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -319,9 +319,9 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.218 [0.21801182638680733] fbridge_mode=1 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 1.7804s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5288s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2516s for 90112 events => throughput is 3.58E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.7845s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5315s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2530s for 90112 events => throughput is 3.56E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -334,12 +334,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.589769e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.609443e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.540136e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.566388e+05 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -362,9 +362,9 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050333291481387] fbridge_mode=1 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.2781s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2581s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0201s for 8192 events => throughput is 4.08E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2776s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2578s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0199s for 8192 events => throughput is 4.12E+05 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -395,9 +395,9 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.218 [0.21801182638680733] fbridge_mode=1 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 1.7435s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5265s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2170s for 90112 events => throughput is 4.15E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.7504s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5324s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2180s for 90112 events => throughput is 4.13E+05 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -410,12 +410,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.065243e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.031661e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.047006e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.094627e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -438,9 +438,9 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050333291481387] fbridge_mode=1 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.2998s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2695s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0303s for 8192 events => throughput is 2.71E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2987s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2688s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0299s for 8192 events => throughput is 2.74E+05 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -471,9 +471,9 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.218 [0.21801182638680733] fbridge_mode=1 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 1.8798s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5454s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3344s for 90112 events => throughput is 2.69E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.8739s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5413s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3326s for 90112 events => throughput is 2.71E+05 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -486,12 +486,12 @@ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.636835e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.587817e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.671197e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.653156e+05 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -514,9 +514,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_gqttq_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050333301029699] fbridge_mode=1 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.6635s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6628s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.23E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.6688s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6681s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.19E+07 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -547,9 +547,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_gqttq_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.218 [0.21801182637219937] fbridge_mode=1 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 1.9357s - [COUNTERS] Fortran Overhead ( 0 ) : 1.9282s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0075s for 90112 events => throughput is 1.20E+07 events/s + [COUNTERS] PROGRAM TOTAL : 1.9436s + [COUNTERS] Fortran Overhead ( 0 ) : 1.9359s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0077s for 90112 events => throughput is 1.17E+07 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** @@ -562,41 +562,41 @@ OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.571736e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.531745e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.159114e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.050673e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.394463e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.398873e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.528264e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.543266e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.383492e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.390552e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.837843e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.836975e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.384878e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.397476e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.837843e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.780810e+07 ) sec^-1 TEST COMPLETED From 134d1260428bd5cfe3bc47508ce71de1524be467 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Sun, 28 Jan 2024 17:42:41 +0200 Subject: [PATCH 72/96] [jt774] first execution of all 78 tput tests on LUMI (on CPUs and AMD GPUs) - several failures in gqttq #806 Note: the test was done in two stages: first the build on the login node (~20h), then the test on the worker node (~1h). (1) This is from the log of the first step, building on the login node (~20h). I did not want to waste the allocation of the GPU worker nodes on what I could do on the login nodes.
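A minimal sketch of the two-stage pattern, shown here only for the first of the five teeThroughputX.sh invocations (the full command lists follow below): the two stages run identical commands, and the only difference is the -makeonly flag in the first stage, which (as the build/test split above implies) compiles all the executables without running them.

  # Stage 1 (login node): build only, no GPU needed
  ./tput/teeThroughputX.sh -mix -hrd -makej -eemumu -ggtt -ggttg -ggttgg -gqttq -ggttggg -makeclean -makeonly
  # Stage 2 (GPU worker node): same test matrix, now actually executed
  ./tput/teeThroughputX.sh -mix -hrd -makej -eemumu -ggtt -ggttg -ggttgg -gqttq -ggttggg -makeclean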
This took 20+ hours for C++/HIP builds (much more than for C++/CUDA builds at CERN), from 4pm to about 12:30pm the next day. STARTED AT Sat 27 Jan 2024 04:16:54 PM EET ./tput/teeThroughputX.sh -mix -hrd -makej -eemumu -ggtt -ggttg -ggttgg -gqttq -ggttggg -makeclean -makeonly ENDED(1) AT Sun 28 Jan 2024 11:52:38 AM EET [Status=0] ./tput/teeThroughputX.sh -flt -hrd -makej -eemumu -ggtt -ggttgg -inlonly -makeclean -makeonly ENDED(2) AT Sun 28 Jan 2024 12:15:33 PM EET [Status=0] ./tput/teeThroughputX.sh -makej -eemumu -ggtt -ggttg -gqttq -ggttgg -ggttggg -flt -bridge -makeclean -makeonly ENDED(3) AT Sun 28 Jan 2024 12:22:49 PM EET [Status=0] ./tput/teeThroughputX.sh -eemumu -ggtt -ggttgg -flt -rmbhst -makeonly ENDED(4) AT Sun 28 Jan 2024 12:23:44 PM EET [Status=0] ./tput/teeThroughputX.sh -eemumu -ggtt -ggttgg -flt -curhst -makeonly ENDED(5) AT Sun 28 Jan 2024 12:24:39 PM EET [Status=0] (2) This is from the log of the second step, testing on the worker node (~1h). STARTED AT Sun 28 Jan 2024 12:49:04 PM EET ./tput/teeThroughputX.sh -mix -hrd -makej -eemumu -ggtt -ggttg -ggttgg -gqttq -ggttggg -makeclean ENDED(1) AT Sun 28 Jan 2024 01:25:16 PM EET [Status=2] ./tput/teeThroughputX.sh -flt -hrd -makej -eemumu -ggtt -ggttgg -inlonly -makeclean ENDED(2) AT Sun 28 Jan 2024 01:37:05 PM EET [Status=0] ./tput/teeThroughputX.sh -makej -eemumu -ggtt -ggttg -gqttq -ggttgg -ggttggg -flt -bridge -makeclean ENDED(3) AT Sun 28 Jan 2024 01:50:42 PM EET [Status=2] ./tput/teeThroughputX.sh -eemumu -ggtt -ggttgg -flt -rmbhst ENDED(4) AT Sun 28 Jan 2024 01:53:22 PM EET [Status=0] ./tput/teeThroughputX.sh -eemumu -ggtt -ggttgg -flt -curhst ENDED(5) AT Sun 28 Jan 2024 01:54:37 PM EET [Status=0] ./tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0_bridge.txt:Backtrace for this error: ./tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0_bridge.txt:ERROR! Fortran calculation (F77/CUDA) crashed ./tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd1.txt:Backtrace for this error: ./tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd1.txt:ERROR! Fortran calculation (F77/CUDA) crashed ./tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd1.txt:Backtrace for this error: ./tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd1.txt:ERROR! Fortran calculation (F77/CUDA) crashed ./tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt:Backtrace for this error: ./tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt:ERROR! Fortran calculation (F77/CUDA) crashed ./tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt:Backtrace for this error: ./tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt:ERROR! Fortran calculation (F77/CUDA) crashed ./tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd1.txt:Backtrace for this error: ./tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd1.txt:ERROR! Fortran calculation (F77/CUDA) crashed ./tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt:Backtrace for this error: ./tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt:ERROR! Fortran calculation (F77/CUDA) crashed ./tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0_bridge.txt:Backtrace for this error: ./tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0_bridge.txt:ERROR!
Fortran calculation (F77/CUDA) crashed --- .../log_eemumu_mad_d_inl0_hrd0.txt | 227 ++++++--------- .../log_eemumu_mad_d_inl0_hrd0_bridge.txt | 234 ++++++--------- .../log_eemumu_mad_d_inl0_hrd0_common.txt | 213 ++++++-------- .../log_eemumu_mad_d_inl0_hrd0_curhst.txt | 210 +++++--------- .../log_eemumu_mad_d_inl0_hrd0_rmbhst.txt | 229 ++++++--------- .../log_eemumu_mad_d_inl0_hrd1.txt | 227 ++++++--------- .../log_eemumu_mad_d_inl1_hrd0.txt | 225 ++++++-------- .../log_eemumu_mad_d_inl1_hrd1.txt | 225 ++++++-------- .../log_eemumu_mad_f_inl0_hrd0.txt | 239 +++++++-------- .../log_eemumu_mad_f_inl0_hrd0_bridge.txt | 246 +++++++--------- .../log_eemumu_mad_f_inl0_hrd0_common.txt | 227 ++++++--------- .../log_eemumu_mad_f_inl0_hrd0_curhst.txt | 222 +++++--------- .../log_eemumu_mad_f_inl0_hrd0_rmbhst.txt | 241 +++++++-------- .../log_eemumu_mad_f_inl0_hrd1.txt | 239 +++++++-------- .../log_eemumu_mad_f_inl1_hrd0.txt | 237 ++++++--------- .../log_eemumu_mad_f_inl1_hrd1.txt | 237 ++++++--------- .../log_eemumu_mad_m_inl0_hrd0.txt | 227 ++++++--------- .../log_eemumu_mad_m_inl0_hrd1.txt | 227 ++++++--------- .../log_ggtt_mad_d_inl0_hrd0.txt | 227 ++++++--------- .../log_ggtt_mad_d_inl0_hrd0_bridge.txt | 234 ++++++--------- .../log_ggtt_mad_d_inl0_hrd0_common.txt | 213 ++++++-------- .../log_ggtt_mad_d_inl0_hrd0_curhst.txt | 210 +++++--------- .../log_ggtt_mad_d_inl0_hrd0_rmbhst.txt | 229 ++++++--------- .../log_ggtt_mad_d_inl0_hrd1.txt | 227 ++++++--------- .../log_ggtt_mad_d_inl1_hrd0.txt | 225 ++++++-------- .../log_ggtt_mad_d_inl1_hrd1.txt | 225 ++++++-------- .../log_ggtt_mad_f_inl0_hrd0.txt | 245 +++++++--------- .../log_ggtt_mad_f_inl0_hrd0_bridge.txt | 252 +++++++--------- .../log_ggtt_mad_f_inl0_hrd0_common.txt | 239 +++++++-------- .../log_ggtt_mad_f_inl0_hrd0_curhst.txt | 228 +++++---------- .../log_ggtt_mad_f_inl0_hrd0_rmbhst.txt | 247 +++++++--------- .../log_ggtt_mad_f_inl0_hrd1.txt | 245 +++++++--------- .../log_ggtt_mad_f_inl1_hrd0.txt | 239 +++++++-------- .../log_ggtt_mad_f_inl1_hrd1.txt | 239 +++++++-------- .../log_ggtt_mad_m_inl0_hrd0.txt | 225 ++++++-------- .../log_ggtt_mad_m_inl0_hrd1.txt | 225 ++++++-------- .../log_ggttg_mad_d_inl0_hrd0.txt | 250 +++++++--------- .../log_ggttg_mad_d_inl0_hrd0_bridge.txt | 258 +++++++---------- .../log_ggttg_mad_d_inl0_hrd1.txt | 250 +++++++--------- .../log_ggttg_mad_f_inl0_hrd0.txt | 264 +++++++---------- .../log_ggttg_mad_f_inl0_hrd0_bridge.txt | 272 +++++++---------- .../log_ggttg_mad_f_inl0_hrd1.txt | 264 +++++++---------- .../log_ggttg_mad_m_inl0_hrd0.txt | 250 +++++++--------- .../log_ggttg_mad_m_inl0_hrd1.txt | 250 +++++++--------- .../log_ggttgg_mad_d_inl0_hrd0.txt | 250 +++++++--------- .../log_ggttgg_mad_d_inl0_hrd0_bridge.txt | 258 +++++++---------- .../log_ggttgg_mad_d_inl0_hrd0_common.txt | 234 ++++++--------- .../log_ggttgg_mad_d_inl0_hrd0_curhst.txt | 228 +++++---------- .../log_ggttgg_mad_d_inl0_hrd0_rmbhst.txt | 253 +++++++--------- .../log_ggttgg_mad_d_inl0_hrd1.txt | 250 +++++++--------- .../log_ggttgg_mad_d_inl1_hrd0.txt | 252 +++++++--------- .../log_ggttgg_mad_d_inl1_hrd1.txt | 252 +++++++--------- .../log_ggttgg_mad_f_inl0_hrd0.txt | 266 +++++++---------- .../log_ggttgg_mad_f_inl0_hrd0_bridge.txt | 274 +++++++----------- .../log_ggttgg_mad_f_inl0_hrd0_common.txt | 258 +++++++---------- .../log_ggttgg_mad_f_inl0_hrd0_curhst.txt | 244 ++++++---------- .../log_ggttgg_mad_f_inl0_hrd0_rmbhst.txt | 269 +++++++---------- .../log_ggttgg_mad_f_inl0_hrd1.txt | 266 +++++++---------- .../log_ggttgg_mad_f_inl1_hrd0.txt | 270 
++++++++--------- .../log_ggttgg_mad_f_inl1_hrd1.txt | 270 ++++++++--------- .../log_ggttgg_mad_m_inl0_hrd0.txt | 246 +++++++--------- .../log_ggttgg_mad_m_inl0_hrd1.txt | 246 +++++++--------- .../log_ggttggg_mad_d_inl0_hrd0.txt | 250 +++++++--------- .../log_ggttggg_mad_d_inl0_hrd0_bridge.txt | 258 +++++++---------- .../log_ggttggg_mad_d_inl0_hrd1.txt | 250 +++++++--------- .../log_ggttggg_mad_f_inl0_hrd0.txt | 266 +++++++---------- .../log_ggttggg_mad_f_inl0_hrd0_bridge.txt | 274 +++++++----------- .../log_ggttggg_mad_f_inl0_hrd1.txt | 266 +++++++---------- .../log_ggttggg_mad_m_inl0_hrd0.txt | 250 +++++++--------- .../log_ggttggg_mad_m_inl0_hrd1.txt | 250 +++++++--------- .../log_gqttq_mad_d_inl0_hrd0.txt | 255 +++++----------- .../log_gqttq_mad_d_inl0_hrd0_bridge.txt | 265 +++++------------ .../log_gqttq_mad_d_inl0_hrd1.txt | 255 +++++----------- .../log_gqttq_mad_f_inl0_hrd0.txt | 255 +++++----------- .../log_gqttq_mad_f_inl0_hrd0_bridge.txt | 265 +++++------------ .../log_gqttq_mad_f_inl0_hrd1.txt | 255 +++++----------- .../log_gqttq_mad_m_inl0_hrd0.txt | 255 +++++----------- .../log_gqttq_mad_m_inl0_hrd1.txt | 255 +++++----------- 78 files changed, 7406 insertions(+), 11668 deletions(-) diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt index 4330c287c1..c73ffa26a2 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt @@ -1,209 +1,164 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasCurand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) +RNDGEN=hasNoCurand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-01-27_18:27:57 +DATE: 2024-01-28_13:07:36 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 12 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.460884e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.590509e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.143394e+08 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 0.806184 sec - 2,658,194,401 cycles # 2.838 GHz - 4,112,400,936 instructions # 1.55 insn per cycle - 1.130813294 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 166 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 5.187163e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.113051e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.341239e+07 ) sec^-1 +MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 +TOTAL : 5.085142 sec + 15,401,484,650 cycles:u # 2.956 GHz (75.00%) + 53,460,298 stalled-cycles-frontend:u # 0.35% frontend cycles idle (75.25%) + 6,922,628,832 stalled-cycles-backend:u # 44.95% backend cycles idle (75.11%) + 11,641,711,170 instructions:u # 0.76 insn per cycle + # 0.59 stalled cycles per insn (74.84%) + 5.625918161 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.282804e-02 -Avg ME (F77/CUDA) = 1.2828039868165201E-002 -Relative difference = 1.0277080522138477e-08 +Avg ME (F77/CUDA) = 1.2828039868165208E-002 +Relative difference = 1.0277079981222336e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe -p 2048 256 12 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.039825e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.208009e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.208009e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 6.449240 sec - 19,506,969,820 cycles # 3.023 GHz - 46,933,193,939 instructions # 2.41 insn per cycle - 6.458664542 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 472) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.250244e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.427953e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.427953e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 +TOTAL : 5.755044 sec + 19,517,650,978 cycles:u # 3.375 GHz (74.97%) + 52,736,956 stalled-cycles-frontend:u # 0.27% frontend cycles idle (74.91%) + 64,319,790 stalled-cycles-backend:u # 0.33% backend cycles idle (74.91%) + 47,058,402,875 instructions:u # 2.41 insn per cycle + # 0.00 stalled cycles per insn (75.00%) + 5.786528246 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 471) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 12 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.633214e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.140541e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.140541e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.254028 sec - 12,841,479,967 cycles # 3.015 GHz - 31,185,544,346 instructions # 2.43 insn per cycle - 4.269731971 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 1.935680e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.431114e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.431114e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 +TOTAL : 3.970217 sec + 13,235,454,941 cycles:u # 3.310 GHz (74.99%) + 49,338,655 stalled-cycles-frontend:u # 0.37% frontend cycles idle (74.99%) + 973,459,800 stalled-cycles-backend:u # 7.35% backend cycles idle (74.99%) + 31,182,239,136 instructions:u # 2.36 insn per cycle + # 0.03 stalled cycles per insn (75.01%) + 4.002630879 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1626) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 12 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.986234e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.781810e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.781810e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.574760 sec - 10,055,647,274 cycles # 2.809 GHz - 19,481,127,196 instructions # 1.94 insn per cycle - 3.590049555 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1964) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.652416e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.531228e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.531228e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 +TOTAL : 3.091469 sec + 10,182,202,026 cycles:u # 3.264 GHz (74.89%) + 48,463,224 stalled-cycles-frontend:u # 0.48% frontend cycles idle (74.87%) + 450,467,756 stalled-cycles-backend:u # 4.42% backend cycles idle (75.00%) + 19,351,447,734 instructions:u # 1.90 insn per cycle + # 0.02 stalled cycles per insn (75.13%) + 3.123604900 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1946) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165090E-002 Relative difference = 1.0277089176796747e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe -p 2048 256 12 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.155866e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.107363e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.107363e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.317920 sec - 9,570,235,391 cycles # 2.879 GHz - 18,943,127,668 instructions # 1.98 insn per cycle - 3.334912589 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1655) (512y: 161) (512z: 0) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039868165090E-002 -Relative difference = 1.0277089176796747e-08 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe -p 2048 256 12 OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.937915e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.653688e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.653688e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.645707 sec - 8,178,608,809 cycles # 2.240 GHz - 15,512,146,730 instructions # 1.90 insn per cycle - 3.661596044 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 920) (512y: 59) (512z: 1220) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. 
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039868165090E-002 -Relative difference = 1.0277089176796747e-08 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_bridge.txt index 53b275c90b..2ece6f60cd 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_bridge.txt @@ -1,222 +1,170 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasCurand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) +RNDGEN=hasNoCurand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-01-27_19:20:43 +DATE: 2024-01-28_13:45:03 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 12 --bridge OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 12 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.561948e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.477751e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.477751e+07 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 2.278018 sec - 7,491,205,159 cycles # 2.971 GHz - 13,264,724,403 instructions # 1.77 insn per cycle - 2.579981356 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -==PROF== Profiling "sigmaKin": launch__registers_per_thread 166 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 6.489385e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.351558e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.351558e+07 ) sec^-1 +MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 +TOTAL : 5.536230 sec + 18,358,209,500 cycles:u # 3.296 GHz (74.96%) + 121,997,790 stalled-cycles-frontend:u # 0.66% frontend cycles idle (74.90%) + 6,997,087,110 stalled-cycles-backend:u # 38.11% backend cycles idle (74.96%) + 17,155,260,780 instructions:u # 0.93 insn per cycle + # 0.41 stalled cycles per insn (75.00%) + 5.598707861 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.282804e-02 -Avg ME (F77/CUDA) = 1.2828039868165201E-002 -Relative difference = 1.0277080522138477e-08 +Avg ME (F77/CUDA) = 1.2828039868165208E-002 +Relative difference = 1.0277079981222336e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.004091e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.162639e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.162639e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 6.868532 sec - 20,782,502,647 cycles # 3.023 GHz - 47,159,656,287 instructions # 2.27 insn per cycle - 6.876357714 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 472) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.233749e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.406395e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.406395e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 +TOTAL : 5.930356 sec + 19,949,452,995 cycles:u # 3.342 GHz (74.94%) + 52,759,407 stalled-cycles-frontend:u # 0.26% frontend cycles idle (74.94%) + 115,219,441 stalled-cycles-backend:u # 0.58% backend cycles idle (74.96%) + 47,211,806,753 instructions:u # 2.37 insn per cycle + # 0.00 stalled cycles per insn (75.03%) + 5.971481804 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 471) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.539557e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.982485e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.982485e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.685255 sec - 14,050,607,376 cycles # 2.995 GHz - 32,025,794,268 instructions # 2.28 insn per cycle - 4.692691527 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 1.865640e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.331378e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.331378e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 +TOTAL : 4.232961 sec + 13,973,241,123 cycles:u # 3.271 GHz (74.91%) + 50,351,129 stalled-cycles-frontend:u # 0.36% frontend cycles idle (74.91%) + 1,042,859,941 stalled-cycles-backend:u # 7.46% backend cycles idle (74.94%) + 31,893,025,253 instructions:u # 2.28 insn per cycle + # 0.03 stalled cycles per insn (75.03%) + 4.276043039 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1626) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.907018e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.613110e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.613110e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.915539 sec - 11,287,161,118 cycles # 2.878 GHz - 20,844,870,086 instructions # 1.85 insn per cycle - 3.923278878 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1964) (512y: 0) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039868165090E-002 -Relative difference = 1.0277089176796747e-08 -OK (relative difference <= 5E-3) +EvtsPerSec[Rmb+ME] (23) = ( 2.549092e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.345903e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.345903e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 +TOTAL : 3.336339 sec + 10,839,641,195 cycles:u # 3.211 GHz (74.91%) + 49,204,874 stalled-cycles-frontend:u # 0.45% frontend cycles idle (74.88%) + 476,585,531 stalled-cycles-backend:u # 4.40% backend cycles idle (74.97%) + 20,620,320,024 instructions:u # 1.90 insn per cycle + # 0.02 stalled cycles per insn (75.09%) + 3.379348572 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1946) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.003365e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.805935e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.805935e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.758349 sec - 10,888,847,708 cycles # 2.893 GHz - 20,303,188,846 instructions # 1.86 insn per cycle - 3.766056385 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1655) (512y: 161) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165090E-002 Relative difference = 1.0277089176796747e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.761057e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.355372e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.355372e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.206753 sec - 9,459,711,024 cycles # 2.252 GHz - 16,668,908,023 instructions # 1.76 insn per cycle - 4.214232398 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 920) (512y: 59) (512z: 1220) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. 
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039868165090E-002 -Relative difference = 1.0277089176796747e-08 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_common.txt index 9ce6b6fe11..baa89d7a03 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_common.txt @@ -1,209 +1,164 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasCurand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) +RNDGEN=hasNoCurand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-01-27_19:34:14 +DATE: 2024-01-28_13:55:08 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 12 --common OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 12 --common OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.457550e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.577422e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.129022e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.146697e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.104851e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.333756e+07 ) sec^-1 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 1.335024 sec - 4,629,417,841 cycles # 2.946 GHz - 7,166,523,600 instructions # 1.55 insn per cycle - 1.629362314 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --common -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 166 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 4.653795 sec + 15,325,067,781 cycles:u # 3.273 GHz (75.06%) + 53,542,980 stalled-cycles-frontend:u # 0.35% frontend cycles idle (75.08%) + 6,940,523,786 stalled-cycles-backend:u # 45.29% backend cycles idle (75.08%) + 11,587,453,443 instructions:u # 0.76 insn per cycle + # 0.60 stalled cycles per insn (75.05%) + 4.706698959 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.282804e-02 -Avg ME (F77/CUDA) = 1.2828039868165201E-002 -Relative difference = 1.0277080522138477e-08 +Avg ME (F77/CUDA) = 1.2828039868165208E-002 +Relative difference = 1.0277079981222336e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.039853e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.208997e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.208997e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.250607e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.427612e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.427612e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 6.812043 sec - 20,582,176,590 cycles # 3.020 GHz - 47,035,833,730 instructions # 2.29 insn per cycle - 6.818449406 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 472) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 5.756745 sec + 19,505,733,856 cycles:u # 3.372 GHz (74.98%) + 52,621,983 stalled-cycles-frontend:u # 0.27% frontend cycles idle (74.97%) + 64,317,116 stalled-cycles-backend:u # 0.33% backend cycles idle (74.98%) + 47,050,652,034 instructions:u # 2.41 insn per cycle + # 0.00 stalled cycles per insn (74.99%) + 5.788198860 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 471) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.629024e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.135452e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.135452e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.919218e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.430888e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.430888e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 4.620925 sec - 13,913,704,075 cycles # 3.008 GHz - 31,185,324,761 instructions # 2.24 insn per cycle - 4.627444403 seconds time elapsed +TOTAL : 4.000634 sec + 13,345,658,667 cycles:u # 3.312 GHz (74.99%) + 50,131,805 stalled-cycles-frontend:u # 0.38% frontend cycles idle (74.99%) + 998,870,131 stalled-cycles-backend:u # 7.48% backend cycles idle (74.99%) + 31,165,536,709 instructions:u # 2.34 insn per cycle + # 0.03 stalled cycles per insn (75.00%) + 4.032008178 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1626) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.989875e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.784851e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.784851e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.655792e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.530860e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.530860e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 3.941176 sec - 11,133,646,215 cycles # 2.821 GHz - 19,381,056,872 instructions # 1.74 insn per cycle - 3.947558254 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1964) (512y: 0) (512z: 0) +TOTAL : 3.092974 sec + 10,173,625,147 cycles:u # 3.259 GHz (74.78%) + 47,880,835 stalled-cycles-frontend:u # 0.47% frontend cycles idle (74.78%) + 452,668,828 stalled-cycles-backend:u # 4.45% backend cycles idle (74.93%) + 19,370,864,981 instructions:u # 1.90 insn per cycle + # 0.02 stalled cycles per insn (75.06%) + 3.124544479 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1946) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165090E-002 Relative difference = 1.0277089176796747e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.141027e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.094342e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.094342e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 3.715294 sec - 10,784,366,987 cycles # 2.900 GHz - 18,647,015,795 instructions # 1.73 insn per cycle - 3.721294535 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1655) (512y: 161) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039868165090E-002 -Relative difference = 1.0277089176796747e-08 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.936713e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.650521e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.650521e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 4.022118 sec - 9,287,587,077 cycles # 2.307 GHz - 15,211,770,176 instructions # 1.64 insn per cycle - 4.028347644 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 920) (512y: 59) (512z: 1220) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. 
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039868165090E-002 -Relative difference = 1.0277089176796747e-08 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_curhst.txt index 836acf7957..536dc86c3a 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_curhst.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_curhst.txt @@ -1,209 +1,133 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasCurand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) +RNDGEN=hasNoCurand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-01-27_19:30:54 +DATE: 2024-01-28_13:53:53 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 12 --curhst OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.470668e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.588254e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.100392e+08 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 0.979876 sec - 3,586,199,558 cycles # 2.963 GHz - 7,128,330,253 instructions # 1.99 insn per cycle - 1.267817011 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --curhst +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 12 --curhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 166 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe: Aborted + 46,495,171 cycles:u # 0.560 GHz (71.08%) + 42,178 stalled-cycles-frontend:u # 0.09% frontend cycles idle (71.08%) + 540,867 stalled-cycles-backend:u # 1.16% backend cycles idle (71.78%) + 37,772,172 instructions:u # 0.81 insn per cycle + # 0.01 stalled cycles per insn (72.86%) + 0.084519732 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.282804e-02 -Avg ME (F77/CUDA) = 1.2828039868165201E-002 -Relative difference = 1.0277080522138477e-08 +Avg ME (F77/CUDA) = 1.2828039868165208E-002 +Relative difference = 1.0277079981222336e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.014231e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.178496e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.178496e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 6.613489 sec - 19,532,347,142 cycles # 2.952 GHz - 46,936,306,541 instructions # 2.40 insn per cycle - 6.620356394 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 472) (avx2: 0) (512y: 0) (512z: 0) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe: Aborted + 41,203,264 cycles:u # 2.003 GHz (61.15%) + 56,042 stalled-cycles-frontend:u # 0.14% frontend cycles idle (61.15%) + 362,269 stalled-cycles-backend:u # 0.88% backend cycles idle (61.52%) + 48,313,050 instructions:u # 1.17 insn per cycle + # 0.01 stalled cycles per insn (74.89%) + 0.021587347 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 471) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.633234e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.136276e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.136276e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.251400 sec - 12,835,866,498 cycles # 3.016 GHz - 31,183,814,813 instructions # 2.43 insn per cycle - 4.257984131 seconds time elapsed +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe: Aborted + 53,509,383 cycles:u # 2.609 GHz (61.03%) + 45,277 stalled-cycles-frontend:u # 0.08% frontend cycles idle (61.03%) + 627,896 stalled-cycles-backend:u # 1.17% backend cycles idle (61.03%) + 44,491,939 instructions:u # 0.83 insn per cycle + # 0.01 stalled cycles per insn (61.50%) + 0.021612075 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1626) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.037351e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.863431e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.863431e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.486720 sec - 10,078,775,566 cycles # 2.886 GHz - 19,479,100,383 instructions # 1.93 insn per cycle - 3.493126449 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1964) (512y: 0) (512z: 0) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe: Aborted + 51,566,973 cycles:u # 2.530 GHz (60.78%) + 45,600 stalled-cycles-frontend:u # 0.09% frontend cycles idle (60.78%) + 606,684 stalled-cycles-backend:u # 1.18% backend cycles idle (60.78%) + 45,251,251 instructions:u # 0.88 insn per cycle + # 0.01 stalled cycles per insn (63.00%) + 0.021400674 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1946) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165090E-002 Relative difference = 1.0277089176796747e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.154891e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.100039e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.100039e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.319080 sec - 9,600,829,018 cycles # 2.888 GHz - 18,941,966,116 instructions # 1.97 insn per cycle - 3.325208400 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1655) (512y: 161) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039868165090E-002 -Relative difference = 1.0277089176796747e-08 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.935914e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.655345e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.655345e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.654207 sec - 8,183,104,435 cycles # 2.238 GHz - 15,511,507,696 instructions # 1.90 insn per cycle - 3.660556748 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 920) (512y: 59) (512z: 1220) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. 
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039868165090E-002 -Relative difference = 1.0277089176796747e-08 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_rmbhst.txt index 14c7a5e7c6..da2e035b05 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_rmbhst.txt @@ -1,211 +1,164 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasCurand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) +RNDGEN=hasNoCurand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-01-27_19:27:32 +DATE: 2024-01-28_13:51:13 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 12 --rmbhst OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 12 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+MESDEV/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.784353e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.530206e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.003918e+08 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 1.937918 sec - 6,278,717,730 cycles # 2.894 GHz - 11,472,658,869 instructions # 1.83 insn per cycle - 2.226188088 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --rmbhst -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost -==PROF== Profiling "sigmaKin": launch__registers_per_thread 166 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 7.528826e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.088503e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.316519e+07 ) sec^-1 +MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 +TOTAL : 5.364210 sec + 17,868,547,478 cycles:u # 3.311 GHz (74.97%) + 119,610,671 stalled-cycles-frontend:u # 0.67% frontend cycles idle (74.97%) + 6,902,829,832 stalled-cycles-backend:u # 38.63% backend cycles idle (74.94%) + 16,786,814,157 instructions:u # 0.94 insn per cycle + # 0.41 stalled cycles per insn (74.96%) + 5.413225567 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.282804e-02 -Avg ME (F77/CUDA) = 1.2828039868165201E-002 -Relative difference = 1.0277080522138477e-08 +Avg ME (F77/CUDA) = 1.2828039868165208E-002 +Relative difference = 1.0277079981222336e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.039291e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.207939e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.207939e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 6.454604 sec - 19,532,999,538 cycles # 3.024 GHz - 46,935,481,953 instructions # 2.40 insn per cycle - 6.461056975 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 472) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.249977e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.427403e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.427403e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 +TOTAL : 5.760731 sec + 19,510,622,559 cycles:u # 3.370 GHz (74.99%) + 53,071,491 stalled-cycles-frontend:u # 0.27% frontend cycles idle (74.99%) + 61,693,741 stalled-cycles-backend:u # 0.32% backend cycles idle (74.99%) + 47,039,619,399 instructions:u # 2.41 insn per cycle + # 0.00 stalled cycles per insn (75.00%) + 5.792220139 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 471) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.631706e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.134724e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.134724e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.254773 sec - 12,810,681,861 cycles # 3.007 GHz - 31,183,005,971 instructions # 2.43 insn per cycle - 4.260937327 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 1.917858e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.433331e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.433331e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 +TOTAL : 4.002580 sec + 13,351,914,529 cycles:u # 3.312 GHz (75.00%) + 46,277,153 stalled-cycles-frontend:u # 0.35% frontend cycles idle (75.00%) + 1,061,232,155 stalled-cycles-backend:u # 7.95% backend cycles idle (75.00%) + 31,154,424,323 instructions:u # 2.33 insn per cycle + # 0.03 stalled cycles per insn (75.01%) + 4.033810883 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1626) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.047550e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.879066e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.879066e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.473336 sec - 10,042,042,727 cycles # 2.887 GHz - 19,480,063,297 instructions # 1.94 insn per cycle - 3.479649151 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1964) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.653997e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.532085e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.532085e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 +TOTAL : 3.094943 sec + 10,158,909,804 cycles:u # 3.252 GHz (74.91%) + 47,904,950 stalled-cycles-frontend:u # 0.47% frontend cycles idle (74.93%) + 453,479,389 stalled-cycles-backend:u # 4.46% backend cycles idle (74.93%) + 19,410,239,671 instructions:u # 1.91 insn per cycle + # 0.02 stalled cycles per insn (74.92%) + 3.126353323 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1946) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165090E-002 Relative difference = 1.0277089176796747e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.111127e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.042846e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.042846e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.382998 sec - 9,611,186,335 cycles # 2.840 GHz - 18,944,470,603 instructions # 1.97 insn per cycle - 3.389579987 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1655) (512y: 161) (512z: 0) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039868165090E-002 -Relative difference = 1.0277089176796747e-08 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.931654e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.635550e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.635550e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.662797 sec - 8,145,684,875 cycles # 2.221 GHz - 15,512,034,536 instructions # 1.90 insn per cycle - 3.669369470 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 920) (512y: 59) (512z: 1220) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. 
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039868165090E-002 -Relative difference = 1.0277089176796747e-08 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd1.txt index f52d4e00ed..441b1bb6f4 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd1.txt @@ -1,209 +1,164 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasCurand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) +RNDGEN=hasNoCurand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.512y_d_inl0_hrd1' +CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 make USEBUILDDIR=1 AVX=512y
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 CUDACPP_BUILDDIR='build.512y_d_inl0_hrd1'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 make USEBUILDDIR=1 AVX=512z
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 CUDACPP_BUILDDIR='build.512z_d_inl0_hrd1'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
-DATE: 2024-01-27_18:28:32
+DATE: 2024-01-28_13:08:02
-On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
+On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]:
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/gcheck.exe -p 2048 256 12 OMP=
+runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/gcheck.exe -p 2048 256 12 OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1]
-Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
+Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1]
+Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 6.471911e+07 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.612195e+08 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.193653e+08 ) sec^-1
-MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 0.671427 sec
- 2,669,234,037 cycles # 2.944 GHz
- 4,137,486,194 instructions # 1.55 insn per cycle
- 0.981732984 seconds time elapsed
-runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/gcheck.exe -p 2048 256 1
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-==PROF== Profiling "sigmaKin": launch__registers_per_thread 154
-==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
+EvtsPerSec[Rmb+ME] (23) = ( 5.750096e+07 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 9.589937e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 9.910057e+07 ) sec^-1
+MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0
+TOTAL : 4.645918 sec
+ 15,351,943,429 cycles:u # 3.290 GHz (75.00%)
+ 53,595,143 stalled-cycles-frontend:u # 0.35% frontend cycles idle (75.02%)
+ 6,958,307,311 stalled-cycles-backend:u # 45.33% backend cycles idle (74.92%)
+ 11,532,854,200 instructions:u # 0.75 insn per cycle
+ # 0.60 stalled cycles per insn (74.87%)
+ 4.697581630 seconds time elapsed
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/gcheck.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/fgcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/gcheck.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/fgcheck.exe 2 64 2
 Avg ME (C++/CUDA) = 1.282804e-02
-Avg ME (F77/CUDA) = 1.2828039868165206E-002
-Relative difference = 1.027708011645137e-08
+Avg ME (F77/CUDA) = 1.2828039868165216E-002
+Relative difference = 1.0277079305077159e-08
 OK (relative difference <= 5E-3)
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/check.exe -p 2048 256 12 OMP=
+runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/check.exe -p 2048 256 12 OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1]
+Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.106630e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.298019e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.298019e+06 ) sec^-1
-MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 6.087988 sec
- 18,395,527,837 cycles # 3.019 GHz
- 44,716,201,853 instructions # 2.43 insn per cycle
- 6.096178198 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 486) (avx2: 0) (512y: 0) (512z: 0)
+EvtsPerSec[Rmb+ME] (23) = ( 1.322984e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.523214e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.523214e+06 ) sec^-1
+MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0
+TOTAL : 5.479199 sec
+ 18,560,763,128 cycles:u # 3.371 GHz (74.95%)
+ 52,081,225 stalled-cycles-frontend:u # 0.28% frontend cycles idle (75.02%)
+ 62,524,949 stalled-cycles-backend:u # 0.34% backend cycles idle (75.02%)
+ 44,776,264,781 instructions:u # 2.41 insn per cycle
+ # 0.00 stalled cycles per insn (75.02%)
+ 5.509932315 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 485) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/runTest.exe
+runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/runTest.exe
 [ PASSED ] 6 tests.
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/fcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/check.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/fcheck.exe 2 64 2
 Avg ME (C++/C++) = 1.282804e-02
 Avg ME (F77/C++) = 1.2828039868164921E-002
 Relative difference = 1.0277102294013186e-08
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd1/check.exe -p 2048 256 12 OMP=
+runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd1/check.exe -p 2048 256 12 OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1]
+Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.696940e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.245448e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.245448e+06 ) sec^-1
-MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 4.105272 sec
- 12,425,142,934 cycles # 3.023 GHz
- 30,107,655,240 instructions # 2.42 insn per cycle
- 4.121731739 seconds time elapsed
+EvtsPerSec[Rmb+ME] (23) = ( 2.012355e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.562911e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.562911e+06 ) sec^-1
+MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0
+TOTAL : 3.845256 sec
+ 12,804,475,737 cycles:u # 3.306 GHz (75.01%)
+ 50,272,921 stalled-cycles-frontend:u # 0.39% frontend cycles idle (75.01%)
+ 87,328,770 stalled-cycles-backend:u # 0.68% backend cycles idle (75.01%)
+ 30,087,742,697 instructions:u # 2.35 insn per cycle
+ # 0.00 stalled cycles per insn (75.03%)
+ 3.877788096 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 1569) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd1/runTest.exe
+runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd1/runTest.exe
 [ PASSED ] 6 tests.
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd1/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd1/fcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd1/check.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd1/fcheck.exe 2 64 2
 Avg ME (C++/C++) = 1.282804e-02
 Avg ME (F77/C++) = 1.2828039868164921E-002
 Relative difference = 1.0277102294013186e-08
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd1/check.exe -p 2048 256 12 OMP=
+runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd1/check.exe -p 2048 256 12 OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1]
+Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.029464e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.852722e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.852722e+06 ) sec^-1
-MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 3.502232 sec
- 10,140,699,058 cycles # 2.893 GHz
- 19,115,428,299 instructions # 1.89 insn per cycle
- 3.516474054 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1902) (512y: 0) (512z: 0)
+EvtsPerSec[Rmb+ME] (23) = ( 2.598596e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.426861e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.426861e+06 ) sec^-1
+MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0
+TOTAL : 3.141732 sec
+ 10,294,444,223 cycles:u # 3.247 GHz (75.02%)
+ 44,149,693 stalled-cycles-frontend:u # 0.43% frontend cycles idle (75.02%)
+ 284,584,719 stalled-cycles-backend:u # 2.76% backend cycles idle (75.02%)
+ 19,044,099,895 instructions:u # 1.85 insn per cycle
+ # 0.01 stalled cycles per insn (74.91%)
+ 3.173982174 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1884) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd1/runTest.exe
+runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd1/runTest.exe
 [ PASSED ] 6 tests.
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd1/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd1/fcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd1/check.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd1/fcheck.exe 2 64 2
 Avg ME (C++/C++) = 1.282804e-02
 Avg ME (F77/C++) = 1.2828039868165093E-002
 Relative difference = 1.0277088906338675e-08
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd1/check.exe -p 2048 256 12 OMP=
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
-FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.193891e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.198028e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.198028e+06 ) sec^-1
-MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 3.264064 sec
- 9,475,807,672 cycles # 2.898 GHz
- 18,489,716,208 instructions # 1.95 insn per cycle
- 3.281752367 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1576) (512y: 159) (512z: 0)
+/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd1/runTest.exe
-[ PASSED ] 6 tests.
--------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd1/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd1/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 1.282804e-02
-Avg ME (F77/C++) = 1.2828039868165093E-002
-Relative difference = 1.0277088906338675e-08
-OK (relative difference <= 5E-3)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd1/check.exe -p 2048 256 12 OMP=
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
-FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.330207e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.439946e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.439946e+06 ) sec^-1
-MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 3.105240 sec
- 7,166,370,615 cycles # 2.304 GHz
- 13,864,882,407 instructions # 1.93 insn per cycle
- 3.117484360 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 818) (512y: 57) (512z: 898)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd1/runTest.exe
-[ PASSED ] 6 tests.
--------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd1/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd1/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 1.282804e-02
-Avg ME (F77/C++) = 1.2828039868165093E-002
-Relative difference = 1.0277088906338675e-08
-OK (relative difference <= 5E-3)
+/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo)
 =========================================================================
 TEST COMPLETED
diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd0.txt
index 2449bf3ae3..d60d821d9f 100644
--- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd0.txt
+++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd0.txt
@@ -1,209 +1,164 @@
 export CUDACPP_RUNTIME_ENABLEFPE=on
-Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum
-OMPFLAGS=-fopenmp
-AVX=512y
+Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum
+OMPFLAGS=
+AVX=avx2
 FPTYPE=d
 HELINL=0
 HRDCOD=0
-RNDGEN=hasCurand
-Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1)
+RNDGEN=hasNoCurand
+Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1)
 make: Nothing to be done for 'gtestlibs'.
-CUDACPP_BUILDDIR='build.512y_d_inl1_hrd0'
+CUDACPP_BUILDDIR='build.avx2_d_inl1_hrd0'
 make USEBUILDDIR=1 AVX=none
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 CUDACPP_BUILDDIR='build.none_d_inl1_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 make USEBUILDDIR=1 AVX=sse4
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 CUDACPP_BUILDDIR='build.sse4_d_inl1_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 make USEBUILDDIR=1 AVX=avx2
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 CUDACPP_BUILDDIR='build.avx2_d_inl1_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 make USEBUILDDIR=1 AVX=512y
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 CUDACPP_BUILDDIR='build.512y_d_inl1_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 make USEBUILDDIR=1 AVX=512z
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 CUDACPP_BUILDDIR='build.512z_d_inl1_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
-DATE: 2024-01-27_19:09:32
+DATE: 2024-01-28_13:31:07
-On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
+On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]:
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/gcheck.exe -p 2048 256 12 OMP=
+runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/gcheck.exe -p 2048 256 12 OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=1] [hardcodePARAM=0]
-Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
+Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=1] [hardcodePARAM=0]
+Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 6.458496e+07 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.569270e+08 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.116381e+08 ) sec^-1
-MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 0.677068 sec
- 2,678,926,612 cycles # 2.934 GHz
- 4,161,327,353 instructions # 1.55 insn per cycle
- 0.970928697 seconds time elapsed
-runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/gcheck.exe -p 2048 256 1
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-==PROF== Profiling "sigmaKin": launch__registers_per_thread 166
-==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
+EvtsPerSec[Rmb+ME] (23) = ( 5.321527e+07 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 8.099297e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.328708e+07 ) sec^-1
+MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0
+TOTAL : 4.668893 sec
+ 15,347,165,163 cycles:u # 3.281 GHz (75.06%)
+ 53,650,593 stalled-cycles-frontend:u # 0.35% frontend cycles idle (75.11%)
+ 6,940,894,773 stalled-cycles-backend:u # 45.23% backend cycles idle (75.00%)
+ 11,573,190,238 instructions:u # 0.75 insn per cycle
+ # 0.60 stalled cycles per insn (74.94%)
+ 4.722818283 seconds time elapsed
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/gcheck.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/fgcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/gcheck.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/fgcheck.exe 2 64 2
 Avg ME (C++/CUDA) = 1.282804e-02
-Avg ME (F77/CUDA) = 1.2828039868165201E-002
-Relative difference = 1.0277080522138477e-08
+Avg ME (F77/CUDA) = 1.2828039868165208E-002
+Relative difference = 1.0277079981222336e-08
 OK (relative difference <= 5E-3)
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/check.exe -p 2048 256 12 OMP=
+runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/check.exe -p 2048 256 12 OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=0]
+Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.410042e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.740847e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.740847e+06 ) sec^-1
-MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 4.871448 sec
- 14,605,683,453 cycles # 3.002 GHz
- 36,698,911,392 instructions # 2.51 insn per cycle
- 4.878202601 seconds time elapsed
+EvtsPerSec[Rmb+ME] (23) = ( 1.781161e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.162763e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.162763e+06 ) sec^-1
+MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0
+TOTAL : 4.256323 sec
+ 14,257,338,770 cycles:u # 3.327 GHz (74.99%)
+ 54,041,283 stalled-cycles-frontend:u # 0.38% frontend cycles idle (74.99%)
+ 501,869,670 stalled-cycles-backend:u # 3.52% backend cycles idle (74.99%)
+ 36,747,633,350 instructions:u # 2.58 insn per cycle
+ # 0.01 stalled cycles per insn (75.00%)
+ 4.288355678 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 707) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/runTest.exe
+runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/runTest.exe
 [ PASSED ] 6 tests.
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/fcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/check.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/fcheck.exe 2 64 2
 Avg ME (C++/C++) = 1.282804e-02
 Avg ME (F77/C++) = 1.2828039868164916E-002
 Relative difference = 1.0277102699700292e-08
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd0/check.exe -p 2048 256 12 OMP=
+runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd0/check.exe -p 2048 256 12 OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=0]
+Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.076022e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.960696e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.960696e+06 ) sec^-1
-MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 3.429016 sec
- 10,365,417,662 cycles # 3.018 GHz
- 24,752,936,476 instructions # 2.39 insn per cycle
- 3.436031943 seconds time elapsed
+EvtsPerSec[Rmb+ME] (23) = ( 2.395527e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.236314e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.236314e+06 ) sec^-1
+MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0
+TOTAL : 3.348575 sec
+ 11,075,223,056 cycles:u # 3.279 GHz (74.94%)
+ 49,913,517 stalled-cycles-frontend:u # 0.45% frontend cycles idle (74.92%)
+ 64,007,549 stalled-cycles-backend:u # 0.58% backend cycles idle (74.92%)
+ 24,757,662,826 instructions:u # 2.24 insn per cycle
+ # 0.00 stalled cycles per insn (74.90%)
+ 3.381886159 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 2334) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd0/runTest.exe
+runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd0/runTest.exe
 [ PASSED ] 6 tests.
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd0/fcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd0/check.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd0/fcheck.exe 2 64 2
 Avg ME (C++/C++) = 1.282804e-02
 Avg ME (F77/C++) = 1.2828039868164916E-002
 Relative difference = 1.0277102699700292e-08
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/check.exe -p 2048 256 12 OMP=
+runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/check.exe -p 2048 256 12 OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=0]
+Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.353664e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.514455e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.514455e+06 ) sec^-1
-MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 3.070316 sec
- 8,872,607,166 cycles # 2.884 GHz
- 16,955,269,127 instructions # 1.91 insn per cycle
- 3.077512547 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1604) (512y: 0) (512z: 0)
+EvtsPerSec[Rmb+ME] (23) = ( 3.003345e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.172522e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.172522e+06 ) sec^-1
+MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0
+TOTAL : 2.815790 sec
+ 9,191,114,084 cycles:u # 3.230 GHz (74.98%)
+ 48,061,079 stalled-cycles-frontend:u # 0.52% frontend cycles idle (74.98%)
+ 531,733,278 stalled-cycles-backend:u # 5.79% backend cycles idle (74.86%)
+ 16,897,381,469 instructions:u # 1.84 insn per cycle
+ # 0.03 stalled cycles per insn (74.86%)
+ 2.849085348 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1586) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/runTest.exe
+runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/runTest.exe
 [ PASSED ] 6 tests.
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/fcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/check.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/fcheck.exe 2 64 2
 Avg ME (C++/C++) = 1.282804e-02
 Avg ME (F77/C++) = 1.2828039868165090E-002
 Relative difference = 1.0277089176796747e-08
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd0/check.exe -p 2048 256 12 OMP=
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
-FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.542456e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.959279e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.959279e+06 ) sec^-1
-MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 2.874211 sec
- 8,368,078,777 cycles # 2.906 GHz
- 16,297,728,457 instructions # 1.95 insn per cycle
- 2.880948455 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2403) (512y: 292) (512z: 0)
+/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd0/runTest.exe
-[ PASSED ] 6 tests.
--------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd0/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 1.282804e-02
-Avg ME (F77/C++) = 1.2828039868165090E-002
-Relative difference = 1.0277089176796747e-08
-OK (relative difference <= 5E-3)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd0/check.exe -p 2048 256 12 OMP=
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
-FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.132511e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.019948e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.019948e+06 ) sec^-1
-MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 3.353312 sec
- 7,653,501,619 cycles # 2.279 GHz
- 14,351,383,386 instructions # 1.88 insn per cycle
- 3.360009404 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 892) (512y: 63) (512z: 975)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd0/runTest.exe
-[ PASSED ] 6 tests.
--------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd0/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 1.282804e-02
-Avg ME (F77/C++) = 1.2828039868165090E-002
-Relative difference = 1.0277089176796747e-08
-OK (relative difference <= 5E-3)
+/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo)
 =========================================================================
 TEST COMPLETED
diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd1.txt
index b4ec2d3a38..ec20f0a107 100644
--- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd1.txt
+++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd1.txt
@@ -1,209 +1,164 @@
 export CUDACPP_RUNTIME_ENABLEFPE=on
-Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum
-OMPFLAGS=-fopenmp
-AVX=512y
+Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum
+OMPFLAGS=
+AVX=avx2
 FPTYPE=d
 HELINL=0
 HRDCOD=0
-RNDGEN=hasCurand
-Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1)
+RNDGEN=hasNoCurand
+Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1)
 make: Nothing to be done for 'gtestlibs'.
-CUDACPP_BUILDDIR='build.512y_d_inl1_hrd1'
+CUDACPP_BUILDDIR='build.avx2_d_inl1_hrd1'
 make USEBUILDDIR=1 AVX=none
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 CUDACPP_BUILDDIR='build.none_d_inl1_hrd1'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 make USEBUILDDIR=1 AVX=sse4
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 CUDACPP_BUILDDIR='build.sse4_d_inl1_hrd1'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 make USEBUILDDIR=1 AVX=avx2
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 CUDACPP_BUILDDIR='build.avx2_d_inl1_hrd1'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 make USEBUILDDIR=1 AVX=512y
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 CUDACPP_BUILDDIR='build.512y_d_inl1_hrd1'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 make USEBUILDDIR=1 AVX=512z
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 CUDACPP_BUILDDIR='build.512z_d_inl1_hrd1'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
-DATE: 2024-01-27_19:10:03
+DATE: 2024-01-28_13:31:30
-On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
+On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]:
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/gcheck.exe -p 2048 256 12 OMP=
+runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/gcheck.exe -p 2048 256 12 OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=1] [hardcodePARAM=1]
-Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
+Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=1] [hardcodePARAM=1]
+Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 6.469420e+07 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.602744e+08 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.150892e+08 ) sec^-1
-MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 0.671847 sec
- 2,663,633,770 cycles # 2.936 GHz
- 4,118,051,741 instructions # 1.55 insn per cycle
- 0.967574090 seconds time elapsed
-runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/gcheck.exe -p 2048 256 1
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-==PROF== Profiling "sigmaKin": launch__registers_per_thread 154
-==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
+EvtsPerSec[Rmb+ME] (23) = ( 5.908895e+07 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 9.595304e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 9.915035e+07 ) sec^-1
+MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0
+TOTAL : 4.637721 sec
+ 15,337,436,005 cycles:u # 3.285 GHz (75.00%)
+ 53,919,682 stalled-cycles-frontend:u # 0.35% frontend cycles idle (74.91%)
+ 6,941,948,091 stalled-cycles-backend:u # 45.26% backend cycles idle (75.05%)
+ 11,535,932,480 instructions:u # 0.75 insn per cycle
+ # 0.60 stalled cycles per insn (75.04%)
+ 4.692843363 seconds time elapsed
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/gcheck.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/fgcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/gcheck.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/fgcheck.exe 2 64 2
 Avg ME (C++/CUDA) = 1.282804e-02
-Avg ME (F77/CUDA) = 1.2828039868165206E-002
-Relative difference = 1.027708011645137e-08
+Avg ME (F77/CUDA) = 1.2828039868165216E-002
+Relative difference = 1.0277079305077159e-08
 OK (relative difference <= 5E-3)
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/check.exe -p 2048 256 12 OMP=
+runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/check.exe -p 2048 256 12 OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=1]
+Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.977799e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.694572e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.694572e+06 ) sec^-1
-MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 3.589012 sec
- 10,801,142,359 cycles # 3.007 GHz
- 28,357,477,472 instructions # 2.63 insn per cycle
- 3.595542686 seconds time elapsed
+EvtsPerSec[Rmb+ME] (23) = ( 2.437871e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.219329e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.219329e+06 ) sec^-1
+MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0
+TOTAL : 3.306988 sec
+ 10,920,128,927 cycles:u # 3.273 GHz (74.97%)
+ 51,678,826 stalled-cycles-frontend:u # 0.47% frontend cycles idle (75.06%)
+ 49,333,079 stalled-cycles-backend:u # 0.45% backend cycles idle (75.07%)
+ 28,405,049,430 instructions:u # 2.60 insn per cycle
+ # 0.00 stalled cycles per insn (75.07%)
+ 3.339028113 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 600) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/runTest.exe
+runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/runTest.exe
 [ PASSED ] 6 tests.
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/fcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/check.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/fcheck.exe 2 64 2
 Avg ME (C++/C++) = 1.282804e-02
 Avg ME (F77/C++) = 1.2828039868164921E-002
 Relative difference = 1.0277102294013186e-08
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd1/check.exe -p 2048 256 12 OMP=
+runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd1/check.exe -p 2048 256 12 OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=1]
+Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.349686e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.544629e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.544629e+06 ) sec^-1
-MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 3.077598 sec
- 9,269,407,224 cycles # 3.007 GHz
- 21,586,225,370 instructions # 2.33 insn per cycle
- 3.084358676 seconds time elapsed
+EvtsPerSec[Rmb+ME] (23) = ( 2.603844e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.647404e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.647404e+06 ) sec^-1
+MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0
+TOTAL : 3.136509 sec
+ 10,320,360,832 cycles:u # 3.260 GHz (75.01%)
+ 49,989,115 stalled-cycles-frontend:u # 0.48% frontend cycles idle (74.99%)
+ 70,471,619 stalled-cycles-backend:u # 0.68% backend cycles idle (74.99%)
+ 21,503,541,481 instructions:u # 2.08 insn per cycle
+ # 0.00 stalled cycles per insn (74.88%)
+ 3.169925065 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 2117) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd1/runTest.exe
+runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd1/runTest.exe
 [ PASSED ] 6 tests.
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd1/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd1/fcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd1/check.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd1/fcheck.exe 2 64 2
 Avg ME (C++/C++) = 1.282804e-02
 Avg ME (F77/C++) = 1.2828039868164921E-002
 Relative difference = 1.0277102294013186e-08
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd1/check.exe -p 2048 256 12 OMP=
+runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd1/check.exe -p 2048 256 12 OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=1]
+Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.477292e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.836027e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.836027e+06 ) sec^-1
-MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 2.942679 sec
- 8,447,551,404 cycles # 2.865 GHz
- 15,943,888,519 instructions # 1.89 insn per cycle
- 2.949434950 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1497) (512y: 0) (512z: 0)
+EvtsPerSec[Rmb+ME] (23) = ( 3.294610e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.771701e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.771701e+06 ) sec^-1
+MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0
+TOTAL : 2.631363 sec
+ 8,555,088,028 cycles:u # 3.215 GHz (74.96%)
+ 48,783,917 stalled-cycles-frontend:u # 0.57% frontend cycles idle (75.06%)
+ 150,017,587 stalled-cycles-backend:u # 1.75% backend cycles idle (75.06%)
+ 15,831,128,730 instructions:u # 1.85 insn per cycle
+ # 0.01 stalled cycles per insn (75.05%)
+ 2.664837791 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1479) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd1/runTest.exe
+runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd1/runTest.exe
 [ PASSED ] 6 tests.
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd1/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd1/fcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd1/check.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd1/fcheck.exe 2 64 2
 Avg ME (C++/C++) = 1.282804e-02
 Avg ME (F77/C++) = 1.2828039868165093E-002
 Relative difference = 1.0277088906338675e-08
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd1/check.exe -p 2048 256 12 OMP=
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
-FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.690819e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.403339e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.403339e+06 ) sec^-1
-MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 2.736882 sec
- 7,944,112,596 cycles # 2.897 GHz
- 15,370,187,528 instructions # 1.93 insn per cycle
- 2.743566336 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2179) (512y: 307) (512z: 0)
+/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd1/runTest.exe
-[ PASSED ] 6 tests.
--------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd1/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd1/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 1.282804e-02
-Avg ME (F77/C++) = 1.2828039868165093E-002
-Relative difference = 1.0277088906338675e-08
-OK (relative difference <= 5E-3)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd1/check.exe -p 2048 256 12 OMP=
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
-FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.268572e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.306657e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.306657e+06 ) sec^-1
-MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
-TOTAL : 3.173602 sec
- 7,340,865,622 cycles # 2.309 GHz
- 13,879,899,813 instructions # 1.89 insn per cycle
- 3.180576626 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 853) (512y: 69) (512z: 905)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd1/runTest.exe
-[ PASSED ] 6 tests.
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039868165093E-002 -Relative difference = 1.0277088906338675e-08 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt index 6f2bced9e0..7990668dd5 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt @@ -1,209 +1,164 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasCurand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) +RNDGEN=hasNoCurand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-01-27_18:29:06 +DATE: 2024-01-28_13:08:27 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 12 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.298641e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.194659e+09 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.296861e+09 ) sec^-1 -MeanMatrixElemValue = ( 1.371687e-02 +- 3.270220e-06 ) GeV^0 -TOTAL : 0.570967 sec - 2,334,592,448 cycles # 2.929 GHz - 3,632,363,421 instructions # 1.56 insn per cycle - 0.870945562 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 117 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 1.857196e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.216060e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.974564e+08 ) sec^-1 +MeanMatrixElemValue = ( 1.371895e-02 +- 3.272985e-06 ) GeV^0 +TOTAL : 4.521435 sec + 14,977,880,374 cycles:u # 3.295 GHz (75.03%) + 53,787,695 stalled-cycles-frontend:u # 0.36% frontend cycles idle (75.04%) + 6,894,589,003 stalled-cycles-backend:u # 46.03% backend cycles idle (75.03%) + 11,454,419,619 instructions:u # 0.76 insn per cycle + # 0.60 stalled cycles per insn (75.05%) + 4.570327482 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.282802e-02 -Avg ME (F77/CUDA) = 1.2828112125134794E-002 -Relative difference = 7.1815552823662555e-06 +Avg ME (F77/CUDA) = 1.2828036033170065E-002 +Relative difference = 1.2498553996774023e-06 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe -p 2048 256 12 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.076073e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.264714e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.264714e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 6.215762 sec - 18,583,152,636 cycles # 2.988 GHz - 47,046,423,322 instructions # 2.53 insn per cycle - 6.224321917 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 542) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.418580e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.645597e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.645597e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371887e-02 +- 3.270267e-06 ) GeV^0 +TOTAL : 5.103130 sec + 17,318,337,356 cycles:u # 3.377 GHz (74.95%) + 39,375,779 stalled-cycles-frontend:u # 0.23% frontend cycles idle (75.02%) + 37,405,246 stalled-cycles-backend:u # 0.22% backend cycles idle (75.05%) + 47,178,870,494 instructions:u # 2.72 insn per cycle + # 0.00 stalled cycles per insn (75.05%) + 5.130781235 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 541) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039441956207E-002 -Relative difference = 4.35018750695023e-08 +Avg ME (F77/C++) = 1.2828039569285465E-002 +Relative difference = 3.357602059382168e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 12 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.325879e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.548347e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.548347e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 3.060589 sec - 9,220,984,727 cycles # 3.007 GHz - 22,092,674,966 instructions # 2.40 insn per cycle - 3.078385978 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 2.949128e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.190153e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.190153e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371887e-02 +- 3.270266e-06 ) GeV^0 +TOTAL : 2.797502 sec + 9,212,831,935 cycles:u # 3.264 GHz (75.06%) + 41,259,262 stalled-cycles-frontend:u # 0.45% frontend cycles idle (75.06%) + 630,902,387 stalled-cycles-backend:u # 6.85% backend cycles idle (75.06%) + 22,090,019,438 instructions:u # 2.40 insn per cycle + # 0.03 stalled cycles per insn (75.06%) + 2.827553242 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1883) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039280066150E-002 -Relative difference = 5.612189004572479e-08 +Avg ME (F77/C++) = 1.2828039385567536E-002 +Relative difference = 4.7897610623017996e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 12 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.559835e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.974343e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.974343e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.818676 sec - 8,206,303,802 cycles # 2.906 GHz - 15,625,319,388 instructions # 1.90 insn per cycle - 2.834366031 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2619) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 3.419503e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.013338e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.013338e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 +TOTAL : 2.504134 sec + 8,196,207,955 cycles:u # 3.241 GHz (74.99%) + 39,538,728 stalled-cycles-frontend:u # 0.48% frontend cycles idle (75.02%) + 1,438,596,268 stalled-cycles-backend:u # 17.55% backend cycles idle (75.02%) + 15,521,964,298 instructions:u # 1.89 insn per cycle + # 0.09 stalled cycles per insn (75.02%) + 2.533039511 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2601) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828053255361738E-002 -Relative difference = 2.5376902468575066e-07 +Avg ME (F77/C++) = 1.2828053369958070E-002 +Relative difference = 2.627022867500074e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe -p 2048 256 12 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.684772e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.307693e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.307693e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.703549 sec - 7,897,847,455 cycles # 2.916 GHz - 15,297,871,291 instructions # 1.94 insn per cycle - 2.719233537 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2414) (512y: 13) (512z: 0) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828053255361738E-002 -Relative difference = 2.5376902468575066e-07 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe -p 2048 256 12 OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.690237e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.251244e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.251244e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 -TOTAL : 2.700056 sec - 6,406,470,129 cycles # 2.368 GHz - 12,623,925,199 instructions # 1.97 insn per cycle - 2.715604692 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1615) (512y: 12) (512z: 1404) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. 
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828052589611616E-002 -Relative difference = 2.0187102602673518e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_bridge.txt index f22799bcd8..673b97ee3f 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_bridge.txt @@ -1,222 +1,170 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasCurand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) +RNDGEN=hasNoCurand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-01-27_19:21:21 +DATE: 2024-01-28_13:45:30 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 12 --bridge OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 12 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.091110e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.370205e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.370205e+07 ) sec^-1 -MeanMatrixElemValue = ( 1.371710e-02 +- 3.270389e-06 ) GeV^0 -TOTAL : 1.693092 sec - 5,717,209,248 cycles # 2.967 GHz - 10,220,644,323 instructions # 1.79 insn per cycle - 1.983808985 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -==PROF== Profiling "sigmaKin": launch__registers_per_thread 117 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 7.594584e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.312331e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.312331e+08 ) sec^-1 +MeanMatrixElemValue = ( 1.371886e-02 +- 3.270260e-06 ) GeV^0 +TOTAL : 5.342740 sec + 17,767,754,357 cycles:u # 3.310 GHz (74.97%) + 118,735,343 stalled-cycles-frontend:u # 0.67% frontend cycles idle (74.97%) + 6,951,532,407 stalled-cycles-backend:u # 39.12% backend cycles idle (74.97%) + 17,078,729,274 instructions:u # 0.96 insn per cycle + # 0.41 stalled cycles per insn (74.94%) + 5.393944782 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.282802e-02 -Avg ME (F77/CUDA) = 1.2828112125134794E-002 -Relative difference = 7.1815552823662555e-06 +Avg ME (F77/CUDA) = 1.2828036033170065E-002 +Relative difference = 1.2498553996774023e-06 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.068065e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.251615e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.251615e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 6.360320 sec - 19,222,492,344 cycles # 3.020 GHz - 47,195,059,630 instructions # 2.46 insn per cycle - 6.368180474 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 542) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.408640e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.632906e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.632906e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371887e-02 +- 3.270267e-06 ) GeV^0 +TOTAL : 5.190782 sec + 17,512,984,466 cycles:u # 3.354 GHz (74.97%) + 39,606,603 stalled-cycles-frontend:u # 0.23% frontend cycles idle (75.03%) + 88,796,249 stalled-cycles-backend:u # 0.51% backend cycles idle (75.03%) + 47,394,033,970 instructions:u # 2.71 insn per cycle + # 0.00 stalled cycles per insn (75.03%) + 5.223039115 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 541) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039441956207E-002 -Relative difference = 4.35018750695023e-08 +Avg ME (F77/C++) = 1.2828039569285465E-002 +Relative difference = 3.357602059382168e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.240125e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.345873e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.345873e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 3.287034 sec - 9,979,773,014 cycles # 3.031 GHz - 23,429,352,506 instructions # 2.35 insn per cycle - 3.294344601 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 2.871178e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.029124e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.029124e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371887e-02 +- 3.270266e-06 ) GeV^0 +TOTAL : 2.943122 sec + 9,626,438,902 cycles:u # 3.237 GHz (75.01%) + 41,048,676 stalled-cycles-frontend:u # 0.43% frontend cycles idle (74.98%) + 690,269,864 stalled-cycles-backend:u # 7.17% backend cycles idle (74.98%) + 23,392,634,777 instructions:u # 2.43 insn per cycle + # 0.03 stalled cycles per insn (75.01%) + 2.976979266 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1883) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039280066150E-002 -Relative difference = 5.612189004572479e-08 +Avg ME (F77/C++) = 1.2828039385567536E-002 +Relative difference = 4.7897610623017996e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.465891e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.742130e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.742130e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 3.037490 sec - 8,892,438,239 cycles # 2.922 GHz - 16,751,131,896 instructions # 1.88 insn per cycle - 3.044961686 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2619) (512y: 0) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828053255361738E-002 -Relative difference = 2.5376902468575066e-07 -OK (relative difference <= 5E-3) +EvtsPerSec[Rmb+ME] (23) = ( 3.317484e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.804012e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.804012e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 +TOTAL : 2.641383 sec + 8,570,286,537 cycles:u # 3.208 GHz (74.87%) + 39,777,448 stalled-cycles-frontend:u # 0.46% frontend cycles idle (74.85%) + 1,467,952,662 stalled-cycles-backend:u # 17.13% backend cycles idle (74.90%) + 16,609,873,977 instructions:u # 1.94 insn per cycle + # 0.09 stalled cycles per insn (75.05%) + 2.675246600 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2601) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.544387e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.966168e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.966168e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.957079 sec - 8,640,458,152 cycles # 2.916 GHz - 16,423,625,949 instructions # 1.90 insn per cycle - 2.964471548 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2414) (512y: 13) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828053255361738E-002 -Relative difference = 2.5376902468575066e-07 +Avg ME (F77/C++) = 1.2828053369958070E-002 +Relative difference = 2.627022867500074e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.457706e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.744955e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.744955e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 -TOTAL : 3.055377 sec - 7,136,558,861 cycles # 2.331 GHz - 13,849,895,297 instructions # 1.94 insn per cycle - 3.062598456 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1615) (512y: 12) (512z: 1404) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. 
+/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828052589611616E-002 -Relative difference = 2.0187102602673518e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_common.txt index 45389c409a..b4a4449cb7 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_common.txt @@ -1,209 +1,164 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasCurand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) +RNDGEN=hasNoCurand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-01-27_19:34:51 +DATE: 2024-01-28_13:55:33 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 12 --common OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 12 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.299345e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.174091e+09 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.247451e+09 ) sec^-1 -MeanMatrixElemValue = ( 1.371863e-02 +- 3.269951e-06 ) GeV^0 -TOTAL : 1.177271 sec - 4,143,091,386 cycles # 2.965 GHz - 6,559,856,203 instructions # 1.58 insn per cycle - 1.455622740 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --common -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 117 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 1.696384e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.226897e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.984829e+08 ) sec^-1 +MeanMatrixElemValue = ( 1.371895e-02 +- 3.272985e-06 ) GeV^0 +TOTAL : 4.529425 sec + 14,979,431,077 cycles:u # 3.289 GHz (75.05%) + 54,269,019 stalled-cycles-frontend:u # 0.36% frontend cycles idle (75.05%) + 7,021,630,686 stalled-cycles-backend:u # 46.88% backend cycles idle (74.98%) + 11,114,469,756 instructions:u # 0.74 insn per cycle + # 0.63 stalled cycles per insn (74.97%) + 4.577625816 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.282802e-02 -Avg ME (F77/CUDA) = 1.2828112125134794E-002 -Relative difference = 7.1815552823662555e-06 +Avg ME (F77/CUDA) = 1.2828036033170065E-002 +Relative difference = 1.2498553996774023e-06 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.075887e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.265834e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.265834e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.419098e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.646169e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.646169e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371887e-02 +- 3.270267e-06 ) GeV^0 -TOTAL : 6.556546 sec - 19,614,107,542 cycles # 2.990 GHz - 47,234,086,077 instructions # 2.41 insn per cycle - 6.562589182 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 542) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 5.101861 sec + 17,329,919,651 cycles:u # 3.380 GHz (74.94%) + 39,371,838 stalled-cycles-frontend:u # 0.23% frontend cycles idle (75.01%) + 34,710,609 stalled-cycles-backend:u # 0.20% backend cycles idle (75.04%) + 47,183,984,338 instructions:u # 2.72 insn per cycle + # 0.00 stalled cycles per insn (75.04%) + 5.130521932 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 541) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039441956207E-002 -Relative difference = 4.35018750695023e-08 +Avg ME (F77/C++) = 1.2828039569285465E-002 +Relative difference = 3.357602059382168e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.322270e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.543513e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.543513e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.927016e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.134988e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.134988e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371887e-02 +- 3.270266e-06 ) GeV^0 -TOTAL : 3.397249 sec - 10,249,413,020 cycles # 3.013 GHz - 22,172,282,512 instructions # 2.16 insn per cycle - 3.403512517 seconds time elapsed +TOTAL : 2.817773 sec + 9,282,007,688 cycles:u # 3.265 GHz (74.96%) + 40,680,415 stalled-cycles-frontend:u # 0.44% frontend cycles idle (74.96%) + 649,301,144 stalled-cycles-backend:u # 7.00% backend cycles idle (74.96%) + 22,192,724,454 instructions:u # 2.39 insn per cycle + # 0.03 stalled cycles per insn (74.84%) + 2.845062092 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1883) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039280066150E-002 -Relative difference = 5.612189004572479e-08 +Avg ME (F77/C++) = 1.2828039385567536E-002 +Relative difference = 4.7897610623017996e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.554657e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.975014e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.975014e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.418424e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.011922e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.011922e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 -TOTAL : 3.159703 sec - 9,187,571,116 cycles # 2.903 GHz - 15,535,454,607 instructions # 1.69 insn per cycle - 3.165655811 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2619) (512y: 0) (512z: 0) +TOTAL : 2.504992 sec + 8,200,675,323 cycles:u # 3.241 GHz (74.96%) + 39,490,772 stalled-cycles-frontend:u # 0.48% frontend cycles idle (75.02%) + 1,444,236,897 stalled-cycles-backend:u # 17.61% backend cycles idle (75.02%) + 15,520,221,160 instructions:u # 1.89 insn per cycle + # 0.09 stalled cycles per insn (75.03%) + 2.532599396 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2601) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828053255361738E-002 -Relative difference = 2.5376902468575066e-07 +Avg ME (F77/C++) = 1.2828053369958070E-002 +Relative difference = 2.627022867500074e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.670952e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.286921e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.286921e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 -TOTAL : 3.060873 sec - 8,923,018,524 cycles # 2.911 GHz - 15,006,649,649 instructions # 1.68 insn per cycle - 3.067260470 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2414) (512y: 13) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828053255361738E-002 -Relative difference = 2.5376902468575066e-07 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.684276e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.241659e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.241659e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 -TOTAL : 3.051374 sec - 7,450,470,409 cycles # 2.438 GHz - 12,333,404,291 instructions # 1.66 insn per cycle - 3.057604398 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1615) (512y: 12) (512z: 1404) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. 
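(Aside on the recurring "Avg ME" / "Relative difference" / "OK (relative difference <= 5E-3)" triplets in these logs: they cross-check the Fortran (F77) and C++/GPU averages of the matrix element. The following is a minimal standalone sketch of that acceptance arithmetic, illustrative only and not the repository's actual cmpExe code; the two averages are copied from the log above.)

  // Illustrative sketch of the "relative difference <= 5E-3" acceptance check.
  #include <cmath>
  #include <cstdio>

  int main()
  {
    const double avgMEcpp = 1.282804e-02;            // "Avg ME (C++/C++)" from the log
    const double avgMEf77 = 1.2828039569285465E-002; // "Avg ME (F77/C++)" from the log
    const double reldif = std::fabs( avgMEf77 / avgMEcpp - 1. );
    printf( "Relative difference = %g\n", reldif );  // ~3.36e-08 for the values above
    printf( reldif <= 5E-3 ? "OK (relative difference <= 5E-3)\n"
                           : "ERROR (relative difference > 5E-3)\n" );
    return reldif <= 5E-3 ? 0 : 1;
  }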
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828052589611616E-002 -Relative difference = 2.0187102602673518e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_curhst.txt index e73503aded..6e6cd5e5eb 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_curhst.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_curhst.txt @@ -1,209 +1,133 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasCurand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) +RNDGEN=hasNoCurand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-01-27_19:31:29 +DATE: 2024-01-28_13:54:01 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 12 --curhst OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 12 --curhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.302768e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.181644e+09 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.262369e+09 ) sec^-1 -MeanMatrixElemValue = ( 1.371687e-02 +- 3.270220e-06 ) GeV^0 -TOTAL : 0.866658 sec - 3,108,411,536 cycles # 2.855 GHz - 6,348,867,669 instructions # 2.04 insn per cycle - 1.146196691 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --curhst -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 117 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe: Aborted + 54,856,998 cycles:u # 2.647 GHz (61.43%) + 47,618 stalled-cycles-frontend:u # 0.09% frontend cycles idle (61.44%) + 635,574 stalled-cycles-backend:u # 1.16% backend cycles idle (61.43%) + 43,586,624 instructions:u # 0.79 insn per cycle + # 0.01 stalled cycles per insn (61.91%) + 0.021324806 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.282802e-02 -Avg ME (F77/CUDA) = 1.2828112125134794E-002 -Relative difference = 7.1815552823662555e-06 +Avg ME (F77/CUDA) = 1.2828036033170065E-002 +Relative difference = 1.2498553996774023e-06 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.088138e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.278620e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.278620e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 6.146091 sec - 18,574,305,773 cycles # 3.020 GHz - 47,046,243,044 instructions # 2.53 insn per cycle - 6.152109127 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 542) (avx2: 0) (512y: 0) (512z: 0) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe: Aborted + 42,059,852 cycles:u # 2.045 GHz (61.13%) + 62,739 stalled-cycles-frontend:u # 0.15% frontend cycles idle (61.13%) + 384,873 stalled-cycles-backend:u # 0.92% backend cycles idle (61.30%) + 47,983,719 instructions:u # 1.14 insn per cycle + # 0.01 stalled cycles per insn (74.16%) + 0.021642865 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 541) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039441956207E-002 -Relative difference = 4.35018750695023e-08 +Avg ME (F77/C++) = 1.2828039569285465E-002 +Relative difference = 3.357602059382168e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.274001e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.460512e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.460512e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 3.132795 sec - 9,215,263,646 cycles # 2.939 GHz - 22,089,271,619 instructions # 2.40 insn per cycle - 3.138223391 seconds time elapsed +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe: Aborted + 55,264,236 cycles:u # 2.696 GHz (61.00%) + 45,621 stalled-cycles-frontend:u # 0.08% frontend cycles idle (61.01%) + 619,143 stalled-cycles-backend:u # 1.12% backend cycles idle (61.01%) + 42,869,609 instructions:u # 0.78 insn per cycle + # 0.01 stalled cycles per insn (61.51%) + 0.021507280 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1883) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039280066150E-002 -Relative difference = 5.612189004572479e-08 +Avg ME (F77/C++) = 1.2828039385567536E-002 +Relative difference = 4.7897610623017996e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.567359e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.986612e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.986612e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.814203 sec - 8,177,091,662 cycles # 2.901 GHz - 15,624,445,850 instructions # 1.91 insn per cycle - 2.820203735 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2619) (512y: 0) (512z: 0) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe: Aborted + 51,252,361 cycles:u # 2.482 GHz (61.29%) + 43,034 stalled-cycles-frontend:u # 0.08% frontend cycles idle (61.30%) + 555,172 stalled-cycles-backend:u # 1.08% backend cycles idle (61.30%) + 45,320,101 instructions:u # 0.88 insn per cycle + # 0.01 stalled cycles per insn (63.05%) + 0.021723512 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2601) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828053255361738E-002 -Relative difference = 2.5376902468575066e-07 +Avg ME (F77/C++) = 1.2828053369958070E-002 +Relative difference = 2.627022867500074e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.681978e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.294941e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.294941e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.704822 sec - 7,875,784,738 cycles # 2.906 GHz - 15,295,996,369 instructions # 1.94 insn per cycle - 2.710825901 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2414) (512y: 13) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828053255361738E-002 -Relative difference = 2.5376902468575066e-07 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.674756e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.219292e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.219292e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 -TOTAL : 2.714087 sec - 6,412,065,146 cycles # 2.358 GHz - 12,623,099,114 instructions # 1.97 insn per cycle - 2.720288344 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1615) (512y: 12) (512z: 1404) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. 
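(Aside: on this AMD EPYC host the 512y and 512z runs are now skipped with "... is not supported (no avx512vl in /proc/cpuinfo)". Below is a hedged sketch of such a guard, assuming a hypothetical hasCpuFlag() helper that is not part of the repository; it simply scans the space-separated "flags" line of /proc/cpuinfo.)

  // Illustrative sketch: skip AVX512 builds when the CPU lacks avx512vl.
  #include <fstream>
  #include <iostream>
  #include <string>

  // Hypothetical helper (not from the repository): true if /proc/cpuinfo
  // advertises the given flag, e.g. "avx512vl" (simplified substring match).
  bool hasCpuFlag( const std::string& flag )
  {
    std::ifstream cpuinfo( "/proc/cpuinfo" );
    std::string line;
    while ( std::getline( cpuinfo, line ) )
      if ( line.rfind( "flags", 0 ) == 0 && line.find( " " + flag ) != std::string::npos )
        return true;
    return false;
  }

  int main()
  {
    if ( !hasCpuFlag( "avx512vl" ) )
    {
      std::cout << "check.exe is not supported (no avx512vl in /proc/cpuinfo)" << std::endl;
      return 0; // skip the 512y/512z tests on this host
    }
    std::cout << "avx512vl available: 512y/512z builds can be exercised" << std::endl;
    return 0;
  }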
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828052589611616E-002 -Relative difference = 2.0187102602673518e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_rmbhst.txt index 5d7294e83b..c52d2f3b6b 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_rmbhst.txt @@ -1,211 +1,164 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasCurand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) +RNDGEN=hasNoCurand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-01-27_19:28:08 +DATE: 2024-01-28_13:51:38 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 12 --rmbhst OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 12 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+MESDEV/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.932152e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.107082e+09 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.127630e+09 ) sec^-1 -MeanMatrixElemValue = ( 1.371710e-02 +- 3.270389e-06 ) GeV^0 -TOTAL : 1.482237 sec - 5,093,185,858 cycles # 2.977 GHz - 9,216,243,099 instructions # 1.81 insn per cycle - 1.767188248 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --rmbhst -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost -==PROF== Profiling "sigmaKin": launch__registers_per_thread 117 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 8.367357e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.035509e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.758783e+08 ) sec^-1 +MeanMatrixElemValue = ( 1.371886e-02 +- 3.270260e-06 ) GeV^0 +TOTAL : 5.244736 sec + 17,536,732,896 cycles:u # 3.326 GHz (74.91%) + 119,175,031 stalled-cycles-frontend:u # 0.68% frontend cycles idle (74.98%) + 6,903,153,396 stalled-cycles-backend:u # 39.36% backend cycles idle (74.96%) + 16,743,149,372 instructions:u # 0.95 insn per cycle + # 0.41 stalled cycles per insn (74.97%) + 5.289589090 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.282802e-02 -Avg ME (F77/CUDA) = 1.2828112125134794E-002 -Relative difference = 7.1815552823662555e-06 +Avg ME (F77/CUDA) = 1.2828036033170065E-002 +Relative difference = 1.2498553996774023e-06 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.079462e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.279176e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.279176e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 6.194746 sec - 18,699,626,867 cycles # 3.016 GHz - 47,046,281,728 instructions # 2.52 insn per cycle - 6.200980585 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 542) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.418632e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.645414e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.645414e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371887e-02 +- 3.270267e-06 ) GeV^0 +TOTAL : 5.102122 sec + 17,315,339,907 cycles:u # 3.377 GHz (74.95%) + 39,374,566 stalled-cycles-frontend:u # 0.23% frontend cycles idle (75.03%) + 39,011,627 stalled-cycles-backend:u # 0.23% backend cycles idle (75.04%) + 47,172,305,604 instructions:u # 2.72 insn per cycle + # 0.00 stalled cycles per insn (75.04%) + 5.129517275 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 541) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039441956207E-002 -Relative difference = 4.35018750695023e-08 +Avg ME (F77/C++) = 1.2828039569285465E-002 +Relative difference = 3.357602059382168e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.311625e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.528131e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.528131e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 3.084622 sec - 9,236,990,240 cycles # 2.992 GHz - 22,092,949,356 instructions # 2.39 insn per cycle - 3.090816103 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 2.945940e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.187792e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.187792e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371887e-02 +- 3.270266e-06 ) GeV^0 +TOTAL : 2.802417 sec + 9,259,359,761 cycles:u # 3.275 GHz (74.84%) + 41,158,501 stalled-cycles-frontend:u # 0.44% frontend cycles idle (74.87%) + 639,991,478 stalled-cycles-backend:u # 6.91% backend cycles idle (75.00%) + 22,117,141,816 instructions:u # 2.39 insn per cycle + # 0.03 stalled cycles per insn (75.11%) + 2.829728232 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1883) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039280066150E-002 -Relative difference = 5.612189004572479e-08 +Avg ME (F77/C++) = 1.2828039385567536E-002 +Relative difference = 4.7897610623017996e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.536730e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.937077e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.937077e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.847164 sec - 8,175,263,272 cycles # 2.867 GHz - 15,626,202,398 instructions # 1.91 insn per cycle - 2.853472032 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2619) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 3.418158e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.012006e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.012006e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 +TOTAL : 2.505742 sec + 8,205,932,650 cycles:u # 3.243 GHz (74.94%) + 39,626,552 stalled-cycles-frontend:u # 0.48% frontend cycles idle (75.03%) + 1,450,339,071 stalled-cycles-backend:u # 17.67% backend cycles idle (75.03%) + 15,521,271,212 instructions:u # 1.89 insn per cycle + # 0.09 stalled cycles per insn (75.03%) + 2.533203757 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2601) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828053255361738E-002 -Relative difference = 2.5376902468575066e-07 +Avg ME (F77/C++) = 1.2828053369958070E-002 +Relative difference = 2.627022867500074e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.677071e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.284453e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.284453e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.710590 sec - 7,878,414,326 cycles # 2.901 GHz - 15,295,945,032 instructions # 1.94 insn per cycle - 2.716499916 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2414) (512y: 13) (512z: 0) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828053255361738E-002 -Relative difference = 2.5376902468575066e-07 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.677827e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.225134e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.225134e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 -TOTAL : 2.710843 sec - 6,424,470,372 cycles # 2.366 GHz - 12,623,450,760 instructions # 1.96 insn per cycle - 2.716911235 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1615) (512y: 12) (512z: 1404) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. 
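(Aside: the throughput lines follow directly from the "-p 2048 256 12" arguments, i.e. blocks, threads per block, iterations: each run processes 2048 x 256 x 12 = 6291456 events, and each "EvtsPerSec" figure is that total divided by the time spent in the corresponding phase, as labelled Rmb+ME, MatrixElems or MECalcOnly. A toy illustration of the arithmetic, with a placeholder timing value standing in for the real phase timer:)

  // Illustrative sketch of the EvtsPerSec arithmetic (placeholder timing).
  #include <cstdio>

  int main()
  {
    // "-p 2048 256 12": 2048 blocks x 256 threads x 12 iterations
    const int gpublocks = 2048, gputhreads = 256, niter = 12;
    const double nevtall = double( gpublocks ) * gputhreads * niter; // 6291456 events
    const double tMatrixElems = 1.5; // hypothetical seconds spent in the ME phase only
    printf( "Processed %.0f events\n", nevtall );
    printf( "EvtsPerSec[MatrixElems] = ( %e ) sec^-1\n", nevtall / tMatrixElems );
    return 0;
  }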
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828052589611616E-002 -Relative difference = 2.0187102602673518e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd1.txt index fb199146e4..46a5acafae 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd1.txt @@ -1,209 +1,164 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasCurand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) +RNDGEN=hasNoCurand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.512y_f_inl0_hrd1' +CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-01-27_18:29:37 +DATE: 2024-01-28_13:08:49 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/gcheck.exe -p 2048 256 12 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/gcheck.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.298570e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.200393e+09 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.328660e+09 ) sec^-1 -MeanMatrixElemValue = ( 1.371687e-02 +- 3.270220e-06 ) GeV^0 -TOTAL : 0.567315 sec - 2,331,905,478 cycles # 2.937 GHz - 3,641,453,783 instructions # 1.56 insn per cycle - 0.866083617 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/gcheck.exe -p 2048 256 1 -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 95 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 1.900843e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.260720e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.038102e+08 ) sec^-1 +MeanMatrixElemValue = ( 1.371895e-02 +- 3.272985e-06 ) GeV^0 +TOTAL : 4.521444 sec + 14,976,634,085 cycles:u # 3.293 GHz (75.04%) + 53,985,383 stalled-cycles-frontend:u # 0.36% frontend cycles idle (75.05%) + 6,901,037,652 stalled-cycles-backend:u # 46.08% backend cycles idle (75.04%) + 11,488,651,226 instructions:u # 0.77 insn per cycle + # 0.60 stalled cycles per insn (75.06%) + 4.571984011 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/fgcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.282802e-02 -Avg ME (F77/CUDA) = 1.2828112125134794E-002 -Relative difference = 7.1815552823662555e-06 +Avg ME (F77/CUDA) = 1.2828036033170065E-002 +Relative difference = 1.2498553996774023e-06 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/check.exe -p 2048 256 12 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.143460e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.358554e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.358554e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 5.860932 sec - 17,746,278,890 cycles # 3.026 GHz - 43,887,716,368 instructions # 2.47 insn per cycle - 5.869103029 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 467) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.542902e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.814867e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.814867e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371887e-02 +- 3.270267e-06 ) GeV^0 +TOTAL : 4.744903 sec + 16,072,266,413 cycles:u # 3.370 GHz (74.96%) + 39,424,962 stalled-cycles-frontend:u # 0.25% frontend cycles idle (75.01%) + 35,814,307 stalled-cycles-backend:u # 0.22% backend cycles idle (75.01%) + 44,034,562,321 instructions:u # 2.74 insn per cycle + # 0.00 stalled cycles per insn (75.01%) + 4.772487503 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 466) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039441956207E-002 -Relative difference = 4.35018750695023e-08 +Avg ME (F77/C++) = 1.2828039569285465E-002 +Relative difference = 3.357602059382168e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd1/check.exe -p 2048 256 12 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd1/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.385096e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.700933e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.700933e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 2.994427 sec - 9,034,947,440 cycles # 3.012 GHz - 21,581,997,443 instructions # 2.39 insn per cycle - 3.009305656 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 3.020745e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.326154e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.326154e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371887e-02 +- 3.270266e-06 ) GeV^0 +TOTAL : 2.748753 sec + 9,050,825,753 cycles:u # 3.263 GHz (74.92%) + 41,840,049 stalled-cycles-frontend:u # 0.46% frontend cycles idle (74.91%) + 116,810,300 stalled-cycles-backend:u # 1.29% backend cycles idle (74.93%) + 21,679,103,378 instructions:u # 2.40 insn per cycle + # 0.01 stalled cycles per insn (74.94%) + 2.777807373 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1827) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039280066150E-002 -Relative difference = 5.612189004572479e-08 +Avg ME (F77/C++) = 1.2828039385567536E-002 +Relative difference = 4.7897610623017996e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd1/check.exe -p 2048 256 12 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd1/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.569603e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.998364e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.998364e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.814567 sec - 8,181,606,187 cycles # 2.903 GHz - 15,432,175,910 instructions # 1.89 insn per cycle - 2.829746470 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2542) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 3.467576e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.119998e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.119998e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 +TOTAL : 2.476518 sec + 8,099,155,876 cycles:u # 3.238 GHz (74.96%) + 39,649,327 stalled-cycles-frontend:u # 0.49% frontend cycles idle (75.06%) + 1,776,371,700 stalled-cycles-backend:u # 21.93% backend cycles idle (75.06%) + 15,292,619,997 instructions:u # 1.89 insn per cycle + # 0.12 stalled cycles per insn (75.06%) + 2.505693559 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2524) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828053255361738E-002 -Relative difference = 2.5376902468575066e-07 +Avg ME (F77/C++) = 1.2828053369958070E-002 +Relative difference = 2.627022867500074e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd1/check.exe -p 2048 256 12 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.686786e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.309382e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.309382e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.704117 sec - 7,855,075,052 cycles # 2.899 GHz - 15,087,119,018 instructions # 1.92 insn per cycle - 2.721017005 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2323) (512y: 15) (512z: 0) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828053255361738E-002 -Relative difference = 2.5376902468575066e-07 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd1/check.exe -p 2048 256 12 OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.750668e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.464689e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.464689e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 -TOTAL : 2.657551 sec - 6,188,907,055 cycles # 2.325 GHz - 12,247,038,736 instructions # 1.98 insn per cycle - 2.670267656 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1538) (512y: 8) (512z: 1258) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. 
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828052431359538E-002 -Relative difference = 1.895346165094282e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd0.txt index 91bd8b8a95..daac20482b 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd0.txt @@ -1,209 +1,164 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasCurand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) +RNDGEN=hasNoCurand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.512y_f_inl1_hrd0' +CUDACPP_BUILDDIR='build.avx2_f_inl1_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-01-27_19:10:32 +DATE: 2024-01-28_13:31:51 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/gcheck.exe -p 2048 256 12 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/gcheck.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.300007e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.186316e+09 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.282085e+09 ) sec^-1 -MeanMatrixElemValue = ( 1.371687e-02 +- 3.270220e-06 ) GeV^0 -TOTAL : 0.569021 sec - 2,334,082,963 cycles # 2.936 GHz - 3,662,913,290 instructions # 1.57 insn per cycle - 0.853460362 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/gcheck.exe -p 2048 256 1 -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 117 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 1.898761e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.220331e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.976189e+08 ) sec^-1 +MeanMatrixElemValue = ( 1.371895e-02 +- 3.272985e-06 ) GeV^0 +TOTAL : 4.520492 sec + 15,059,249,887 cycles:u # 3.311 GHz (74.86%) + 54,631,651 stalled-cycles-frontend:u # 0.36% frontend cycles idle (75.01%) + 6,985,929,963 stalled-cycles-backend:u # 46.39% backend cycles idle (75.05%) + 11,318,993,683 instructions:u # 0.75 insn per cycle + # 0.62 stalled cycles per insn (75.03%) + 4.572850398 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/fgcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.282802e-02 -Avg ME (F77/CUDA) = 1.2828112125134794E-002 -Relative difference = 7.1815552823662555e-06 +Avg ME (F77/CUDA) = 1.2828036033170065E-002 +Relative difference = 1.2498553996774023e-06 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/check.exe -p 2048 256 12 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.487465e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.868119e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.868119e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 4.589418 sec - 13,761,744,502 cycles # 2.995 GHz - 37,848,018,392 instructions # 2.75 insn per cycle - 4.595713118 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 1.929607e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.375061e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.375061e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371887e-02 +- 3.270267e-06 ) GeV^0 +TOTAL : 3.928807 sec + 13,191,423,416 cycles:u # 3.336 GHz (74.92%) + 40,090,845 stalled-cycles-frontend:u # 0.30% frontend cycles idle (74.93%) + 1,171,298,475 stalled-cycles-backend:u # 8.88% backend cycles idle (74.93%) + 38,076,177,040 instructions:u # 2.89 insn per cycle + # 0.03 stalled cycles per insn (74.94%) + 3.957305605 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 833) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039414671366E-002 -Relative difference = 4.562884388571957e-08 +Avg ME (F77/C++) = 1.2828039543819614E-002 +Relative difference = 3.5561191488957804e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/check.exe -p 2048 256 12 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.784278e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.740957e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.740957e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 2.614438 sec - 7,924,064,298 cycles # 3.025 GHz - 18,603,596,851 instructions # 2.35 insn per cycle - 2.621027066 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 3.477612e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.337347e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.337347e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371887e-02 +- 3.270266e-06 ) GeV^0 +TOTAL : 2.477386 sec + 8,106,573,579 cycles:u # 3.238 GHz (74.86%) + 40,966,446 stalled-cycles-frontend:u # 0.51% frontend cycles idle (75.02%) + 252,074,197 stalled-cycles-backend:u # 3.11% backend cycles idle (75.08%) + 18,635,497,586 instructions:u # 2.30 insn per cycle + # 0.01 stalled cycles per insn (75.08%) + 2.507314949 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2808) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039280066150E-002 -Relative difference = 5.612189004572479e-08 +Avg ME (F77/C++) = 1.2828039385567536E-002 +Relative difference = 4.7897610623017996e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/check.exe -p 2048 256 12 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.819736e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.685571e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.685571e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.599024 sec - 7,413,580,157 cycles # 2.847 GHz - 14,339,699,168 instructions # 1.93 insn per cycle - 2.605472047 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2251) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 3.859236e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.023346e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.023346e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 +TOTAL : 2.293893 sec + 7,466,220,256 cycles:u # 3.218 GHz (74.69%) + 40,524,921 stalled-cycles-frontend:u # 0.54% frontend cycles idle (74.69%) + 1,088,040,683 stalled-cycles-backend:u # 14.57% backend cycles idle (75.01%) + 14,239,691,356 instructions:u # 1.91 insn per cycle + # 0.08 stalled cycles per insn (75.18%) + 2.323651015 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2233) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828053246266791E-002 -Relative difference = 2.5306003563303186e-07 +Avg ME (F77/C++) = 1.2828053337216261E-002 +Relative difference = 2.601499261602198e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd0/check.exe -p 2048 256 12 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.938769e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.994926e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.994926e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.497547 sec - 7,313,071,613 cycles # 2.922 GHz - 13,954,423,658 instructions # 1.91 insn per cycle - 2.504054684 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3875) (512y: 9) (512z: 0) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd0/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828053277189611E-002 -Relative difference = 2.5547059841227576e-07 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd0/check.exe -p 2048 256 12 OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.785331e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.502319e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.502319e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 -TOTAL : 2.620000 sec - 6,277,230,496 cycles # 2.391 GHz - 13,208,120,155 instructions # 2.10 insn per cycle - 2.626514632 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1734) (512y: 3) (512z: 1266) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd0/runTest.exe -[ PASSED ] 6 tests. 
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828052540498902E-002 -Relative difference = 1.980424851420537e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd1.txt index 8eea08f24e..3b7030832c 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd1.txt @@ -1,209 +1,164 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasCurand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) +RNDGEN=hasNoCurand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.512y_f_inl1_hrd1' +CUDACPP_BUILDDIR='build.avx2_f_inl1_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-01-27_19:11:00 +DATE: 2024-01-28_13:32:12 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/gcheck.exe -p 2048 256 12 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/gcheck.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.295810e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.202482e+09 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.333937e+09 ) sec^-1 -MeanMatrixElemValue = ( 1.371687e-02 +- 3.270220e-06 ) GeV^0 -TOTAL : 0.565983 sec - 2,308,536,588 cycles # 2.913 GHz - 3,558,580,353 instructions # 1.54 insn per cycle - 0.849931283 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/gcheck.exe -p 2048 256 1 -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 95 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 1.721468e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.255914e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.027322e+08 ) sec^-1 +MeanMatrixElemValue = ( 1.371895e-02 +- 3.272985e-06 ) GeV^0 +TOTAL : 4.530061 sec + 14,994,454,794 cycles:u # 3.292 GHz (75.04%) + 54,303,127 stalled-cycles-frontend:u # 0.36% frontend cycles idle (75.07%) + 6,983,878,696 stalled-cycles-backend:u # 46.58% backend cycles idle (74.99%) + 11,268,174,387 instructions:u # 0.75 insn per cycle + # 0.62 stalled cycles per insn (75.01%) + 4.578127450 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/fgcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.282802e-02 -Avg ME (F77/CUDA) = 1.2828112125134794E-002 -Relative difference = 7.1815552823662555e-06 +Avg ME (F77/CUDA) = 1.2828036033170065E-002 +Relative difference = 1.2498553996774023e-06 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/check.exe -p 2048 256 12 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.086474e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.918622e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.918622e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 3.373319 sec - 10,128,204,715 cycles # 2.998 GHz - 28,399,238,885 instructions # 2.80 insn per cycle - 3.379817629 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 2.676795e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.617824e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.617824e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371887e-02 +- 3.270267e-06 ) GeV^0 +TOTAL : 3.020069 sec + 10,020,679,446 cycles:u # 3.290 GHz (74.84%) + 38,651,870 stalled-cycles-frontend:u # 0.39% frontend cycles idle (74.97%) + 29,763,435 stalled-cycles-backend:u # 0.30% backend cycles idle (75.05%) + 28,572,799,722 instructions:u # 2.85 insn per cycle + # 0.00 stalled cycles per insn (75.05%) + 3.048326361 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 632) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039441956207E-002 -Relative difference = 4.35018750695023e-08 +Avg ME (F77/C++) = 1.2828039569285465E-002 +Relative difference = 3.357602059382168e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd1/check.exe -p 2048 256 12 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd1/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.064441e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.600432e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.600432e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 2.411342 sec - 7,296,023,056 cycles # 3.019 GHz - 16,785,936,380 instructions # 2.30 insn per cycle - 2.418115810 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 3.864511e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.307467e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.307467e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371887e-02 +- 3.270266e-06 ) GeV^0 +TOTAL : 2.296267 sec + 7,446,477,849 cycles:u # 3.207 GHz (74.71%) + 40,130,343 stalled-cycles-frontend:u # 0.54% frontend cycles idle (74.75%) + 35,437,360 stalled-cycles-backend:u # 0.48% backend cycles idle (75.07%) + 16,867,197,741 instructions:u # 2.27 insn per cycle + # 0.00 stalled cycles per insn (75.20%) + 2.326184137 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2463) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039280066150E-002 -Relative difference = 5.612189004572479e-08 +Avg ME (F77/C++) = 1.2828039385567536E-002 +Relative difference = 4.7897610623017996e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/check.exe -p 2048 256 12 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.033566e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.263699e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.263699e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.433739 sec - 7,119,919,933 cycles # 2.919 GHz - 13,729,208,550 instructions # 1.93 insn per cycle - 2.440430559 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2082) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 4.057425e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.514930e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.514930e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 +TOTAL : 2.217339 sec + 7,173,513,532 cycles:u # 3.198 GHz (75.01%) + 39,854,558 stalled-cycles-frontend:u # 0.56% frontend cycles idle (75.04%) + 383,455,538 stalled-cycles-backend:u # 5.35% backend cycles idle (75.04%) + 13,648,938,720 instructions:u # 1.90 insn per cycle + # 0.03 stalled cycles per insn (75.04%) + 2.247204928 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2064) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828053198973066E-002 -Relative difference = 2.4937329255889414e-07 +Avg ME (F77/C++) = 1.2828053331759293E-002 +Relative difference = 2.597245327285885e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd1/check.exe -p 2048 256 12 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.014449e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.261345e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.261345e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.447471 sec - 7,063,168,115 cycles # 2.883 GHz - 13,462,876,791 instructions # 1.91 insn per cycle - 2.454197899 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3649) (512y: 12) (512z: 0) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd1/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828053198973066E-002 -Relative difference = 2.4937329255889414e-07 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd1/check.exe -p 2048 256 12 OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.900977e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.825415e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.825415e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 -TOTAL : 2.531713 sec - 6,065,556,227 cycles # 2.391 GHz - 12,910,852,886 instructions # 2.13 insn per cycle - 2.538092940 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1671) (512y: 3) (512z: 1155) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd1/runTest.exe -[ PASSED ] 6 tests. 
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828052431359538E-002 -Relative difference = 1.895346165094282e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt index 832212d518..82cbbddff3 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt @@ -1,209 +1,164 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasCurand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) +RNDGEN=hasNoCurand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-01-27_18:30:07 +DATE: 2024-01-28_13:09:11 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/gcheck.exe -p 2048 256 12 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/gcheck.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.454943e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.580243e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.121312e+08 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 0.672615 sec - 2,659,516,695 cycles # 2.930 GHz - 4,189,600,657 instructions # 1.58 insn per cycle - 0.981568577 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/gcheck.exe -p 2048 256 1 -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 166 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 5.320880e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.106815e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.335809e+07 ) sec^-1 +MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 +TOTAL : 4.650032 sec + 15,393,538,908 cycles:u # 3.291 GHz (74.99%) + 53,951,381 stalled-cycles-frontend:u # 0.35% frontend cycles idle (74.87%) + 6,953,241,438 stalled-cycles-backend:u # 45.17% backend cycles idle (74.87%) + 11,556,335,890 instructions:u # 0.75 insn per cycle + # 0.60 stalled cycles per insn (75.02%) + 4.701473801 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.282804e-02 -Avg ME (F77/CUDA) = 1.2828039901590279E-002 -Relative difference = 7.671454200650844e-09 +Avg ME (F77/CUDA) = 1.2828039901590281E-002 +Relative difference = 7.67145406542181e-09 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/check.exe -p 2048 256 12 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.017729e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.180513e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.180513e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 6.582975 sec - 19,745,039,576 cycles # 2.997 GHz - 46,971,043,157 instructions # 2.38 insn per cycle - 6.591430351 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 474) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.243334e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.416802e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.416802e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 +TOTAL : 5.782818 sec + 19,603,752,476 cycles:u # 3.374 GHz (74.95%) + 52,047,296 stalled-cycles-frontend:u # 0.27% frontend cycles idle (74.96%) + 155,941,389 stalled-cycles-backend:u # 0.80% backend cycles idle (74.95%) + 47,111,954,713 instructions:u # 2.40 insn per cycle + # 0.00 stalled cycles per insn (74.95%) + 5.813875962 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 473) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039952548879E-002 Relative difference = 3.6990156841838714e-09 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/check.exe -p 2048 256 12 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.645408e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.170213e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.170213e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.230014 sec - 12,493,958,485 cycles # 2.949 GHz - 30,922,835,853 instructions # 2.48 insn per cycle - 4.249036641 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 1.992307e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.534851e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.534851e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 +TOTAL : 3.878462 sec + 12,930,844,192 cycles:u # 3.310 GHz (75.02%) + 50,945,470 stalled-cycles-frontend:u # 0.39% frontend cycles idle (75.02%) + 2,179,183,100 stalled-cycles-backend:u # 16.85% backend cycles idle (75.02%) + 30,864,577,310 instructions:u # 2.39 insn per cycle + # 0.07 stalled cycles per insn (75.02%) + 3.910479605 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1667) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039952548879E-002 Relative difference = 3.6990156841838714e-09 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/check.exe -p 2048 256 12 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.995238e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.784735e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.784735e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.553540 sec - 10,262,583,129 cycles # 2.883 GHz - 19,549,264,327 instructions # 1.90 insn per cycle - 3.571464047 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2119) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.583056e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.412060e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.412060e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 +TOTAL : 3.156866 sec + 10,403,980,296 cycles:u # 3.267 GHz (74.90%) + 49,718,589 stalled-cycles-frontend:u # 0.48% frontend cycles idle (74.89%) + 897,671,639 stalled-cycles-backend:u # 8.63% backend cycles idle (74.97%) + 19,404,180,648 instructions:u # 1.87 insn per cycle + # 0.05 stalled cycles per insn (75.09%) + 3.188892056 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2101) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039951670679E-002 Relative difference = 3.767475112924841e-09 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd0/check.exe -p 2048 256 12 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.124505e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.036900e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.036900e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.360416 sec - 9,705,312,606 cycles # 2.883 GHz - 18,859,288,785 instructions # 1.94 insn per cycle - 3.374514912 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1850) (512y: 174) (512z: 0) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039951670679E-002 -Relative difference = 3.767475112924841e-09 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd0/check.exe -p 2048 256 12 OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.964752e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.703026e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.703026e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.605664 sec - 8,110,671,606 cycles # 2.246 GHz - 14,814,965,569 instructions # 1.83 insn per cycle - 3.618037534 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1023) (512y: 64) (512z: 1327) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. 
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039951670679E-002 -Relative difference = 3.767475112924841e-09 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd1.txt index d9aa3524ae..62e74bce4b 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd1.txt @@ -1,209 +1,164 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasCurand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) +RNDGEN=hasNoCurand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.512y_m_inl0_hrd1' +CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-01-27_18:30:42 +DATE: 2024-01-28_13:09:36 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/gcheck.exe -p 2048 256 12 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/gcheck.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = HIP:MIX+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.450313e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.588685e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.109937e+08 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 0.674384 sec - 2,670,081,298 cycles # 2.940 GHz - 4,170,136,902 instructions # 1.56 insn per cycle - 0.981068595 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/gcheck.exe -p 2048 256 1 -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 154 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 5.914249e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.588634e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.909522e+07 ) sec^-1 +MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 +TOTAL : 4.630922 sec + 15,311,777,524 cycles:u # 3.287 GHz (74.97%) + 53,797,529 stalled-cycles-frontend:u # 0.35% frontend cycles idle (74.97%) + 6,956,915,971 stalled-cycles-backend:u # 45.44% backend cycles idle (74.99%) + 11,493,289,788 instructions:u # 0.75 insn per cycle + # 0.61 stalled cycles per insn (75.02%) + 4.682003984 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/fgcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.282804e-02 -Avg ME (F77/CUDA) = 1.2828039901590279E-002 -Relative difference = 7.671454200650844e-09 +Avg ME (F77/CUDA) = 1.2828039901590284E-002 +Relative difference = 7.67145379496374e-09 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/check.exe -p 2048 256 12 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.097800e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.287094e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.287094e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 6.133979 sec - 18,525,337,668 cycles # 3.018 GHz - 44,592,174,478 instructions # 2.41 insn per cycle - 6.142638569 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 498) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.314424e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.512274e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.512274e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 +TOTAL : 5.509220 sec + 18,660,709,287 cycles:u # 3.370 GHz (74.98%) + 52,180,839 stalled-cycles-frontend:u # 0.28% frontend cycles idle (75.01%) + 66,543,074 stalled-cycles-backend:u # 0.36% backend cycles idle (75.01%) + 44,638,453,582 instructions:u # 2.39 insn per cycle + # 0.00 stalled cycles per insn (75.01%) + 5.539870253 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 497) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039952548879E-002 Relative difference = 3.6990156841838714e-09 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd1/check.exe -p 2048 256 12 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd1/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.703487e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.266234e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.266234e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.099663 sec - 12,180,659,451 cycles # 2.973 GHz - 30,220,479,220 instructions # 2.48 insn per cycle - 4.117142209 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 2.008406e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.555937e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.555937e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 +TOTAL : 3.851213 sec + 12,866,795,762 cycles:u # 3.317 GHz (74.86%) + 55,538,083 stalled-cycles-frontend:u # 0.43% frontend cycles idle (74.97%) + 1,919,872,856 stalled-cycles-backend:u # 14.92% backend cycles idle (75.05%) + 30,173,135,111 instructions:u # 2.35 insn per cycle + # 0.06 stalled cycles per insn (75.05%) + 3.883388788 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1650) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039952548879E-002 Relative difference = 3.6990156841838714e-09 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd1/check.exe -p 2048 256 12 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd1/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.986943e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.775011e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.775011e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.571308 sec - 10,218,424,056 cycles # 2.857 GHz - 19,038,472,456 instructions # 1.86 insn per cycle - 3.585618618 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2072) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.614465e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.455102e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.455102e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 +TOTAL : 3.126652 sec + 10,226,352,605 cycles:u # 3.241 GHz (74.91%) + 44,861,835 stalled-cycles-frontend:u # 0.44% frontend cycles idle (74.92%) + 260,075,731 stalled-cycles-backend:u # 2.54% backend cycles idle (74.91%) + 19,023,510,957 instructions:u # 1.86 insn per cycle + # 0.01 stalled cycles per insn (74.96%) + 3.158892568 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2054) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039951670679E-002 Relative difference = 3.767475112924841e-09 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd1/check.exe -p 2048 256 12 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.155033e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.101436e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.101436e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.316742 sec - 9,589,461,377 cycles # 2.886 GHz - 18,452,385,566 instructions # 1.92 insn per cycle - 3.332340390 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1775) (512y: 174) (512z: 0) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039951670679E-002 -Relative difference = 3.767475112924841e-09 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd1/check.exe -p 2048 256 12 OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.319997e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.417969e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.417969e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.114149 sec - 7,202,313,446 cycles # 2.308 GHz - 13,242,868,760 instructions # 1.84 insn per cycle - 3.127979565 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 911) (512y: 56) (512z: 993) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. 
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039951670679E-002 -Relative difference = 3.767475112924841e-09 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt index 1d57a488e8..b39b2317cf 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt @@ -1,209 +1,164 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasCurand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) +RNDGEN=hasNoCurand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-01-27_18:31:15 +DATE: 2024-01-28_13:10:01 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.551731e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.158111e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.273591e+08 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.525728 sec - 2,253,076,627 cycles # 2.922 GHz - 3,207,655,601 instructions # 1.42 insn per cycle - 0.846879808 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 2.775687e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.960414e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.014344e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 1.057707 sec + 3,244,067,809 cycles:u # 2.994 GHz (74.80%) + 10,927,029 stalled-cycles-frontend:u # 0.34% frontend cycles idle (74.60%) + 1,144,622,647 stalled-cycles-backend:u # 35.28% backend cycles idle (75.00%) + 2,968,035,234 instructions:u # 0.91 insn per cycle + # 0.39 stalled cycles per insn (75.25%) + 1.109400147 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 2.028807e+00 -Avg ME (F77/CUDA) = 2.0288063388516822 -Relative difference = 3.2588034143755247e-07 +Avg ME (F77/CUDA) = 2.0288063388516817 +Relative difference = 3.258803416564443e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe -p 2048 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.135550e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.199605e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.199605e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 5.018432 sec - 14,961,002,228 cycles # 2.978 GHz - 38,722,736,643 instructions # 2.59 insn per cycle - 5.027069823 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 2.518752e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.584131e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.584131e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 4.339121 sec + 14,991,957,756 cycles:u # 3.432 GHz (74.93%) + 8,916,309 stalled-cycles-frontend:u # 0.06% frontend cycles idle (74.93%) + 680,019,338 stalled-cycles-backend:u # 4.54% backend cycles idle (74.91%) + 38,746,350,584 instructions:u # 2.58 insn per cycle + # 0.02 stalled cycles per insn (74.99%) + 4.371226147 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 719) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515649 Relative difference = 3.258803992249869e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.629750e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.834785e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.834785e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.997221 sec - 8,956,048,148 cycles # 2.983 GHz - 24,430,777,255 instructions # 2.73 insn per cycle - 3.011580672 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2067) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 4.488177e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.712575e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.712575e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 2.511930 sec + 8,602,900,108 cycles:u # 3.386 GHz (74.90%) + 9,613,119 stalled-cycles-frontend:u # 0.11% frontend cycles idle (74.84%) + 200,536,797 stalled-cycles-backend:u # 2.33% backend cycles idle (74.84%) + 24,409,281,538 instructions:u # 2.84 insn per cycle + # 0.01 stalled cycles per insn (75.00%) + 2.544912038 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 2071) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515654 Relative difference = 3.2588039900609506e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.732472e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.230367e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.230367e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.937260 sec - 5,537,687,120 cycles # 2.850 GHz - 11,562,218,639 instructions # 2.09 insn per cycle - 1.954475286 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2396) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 7.685094e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.280006e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.280006e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 1.540578 sec + 5,164,515,981 cycles:u # 3.289 GHz (75.04%) + 7,867,047 stalled-cycles-frontend:u # 0.15% frontend cycles idle (75.04%) + 1,067,968,522 stalled-cycles-backend:u # 20.68% backend cycles idle (75.04%) + 11,505,183,362 instructions:u # 2.23 insn per cycle + # 0.09 stalled cycles per insn (75.04%) + 1.573597661 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2383) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe -p 2048 256 2 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.676829e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.369329e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.369329e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.676387 sec - 4,810,331,929 cycles # 2.859 GHz - 10,339,255,035 instructions # 2.15 insn per cycle - 1.693288563 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1972) (512y: 131) (512z: 0) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063388516204 -Relative difference = 3.2588037186351226e-07 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe -p 2048 256 2 OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.334499e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.612056e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.612056e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.527112 sec - 4,949,555,328 cycles # 1.954 GHz - 7,556,291,004 instructions # 1.53 insn per cycle - 2.542441624 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1212) (512y: 65) (512z: 1543) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. 
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063388516204 -Relative difference = 3.2588037186351226e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_bridge.txt index f32c05e165..76ed1fb7b1 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_bridge.txt @@ -1,222 +1,170 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasCurand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) +RNDGEN=hasNoCurand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-01-27_19:21:54 +DATE: 2024-01-28_13:45:54 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 2 --bridge OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.443181e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.836789e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.836789e+07 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.815552 sec - 3,152,223,253 cycles # 2.950 GHz - 4,839,428,338 instructions # 1.54 insn per cycle - 1.127787770 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost -WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 5.985499e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.802862e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.802862e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 1.228595 sec + 3,735,435,804 cycles:u # 2.952 GHz (74.95%) + 22,377,197 stalled-cycles-frontend:u # 0.60% frontend cycles idle (74.72%) + 1,174,440,900 stalled-cycles-backend:u # 31.44% backend cycles idle (74.73%) + 3,886,989,918 instructions:u # 1.04 insn per cycle + # 0.30 stalled cycles per insn (75.04%) + 1.290048457 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 2.028807e+00 -Avg ME (F77/CUDA) = 2.0288063388516822 -Relative difference = 3.2588034143755247e-07 +Avg ME (F77/CUDA) = 2.0288063388516817 +Relative difference = 3.258803416564443e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.139565e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.202609e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.202609e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 5.087770 sec - 15,324,937,529 cycles # 3.010 GHz - 38,785,835,079 instructions # 2.53 insn per cycle - 5.095438900 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 2.513057e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.577274e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.577274e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 4.424072 sec + 15,124,386,423 cycles:u # 3.389 GHz (74.91%) + 10,131,846 stalled-cycles-frontend:u # 0.07% frontend cycles idle (74.91%) + 795,869,225 stalled-cycles-backend:u # 5.26% backend cycles idle (74.91%) + 38,688,842,712 instructions:u # 2.56 insn per cycle + # 0.02 stalled cycles per insn (75.00%) + 4.465717630 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 719) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515649 Relative difference = 3.258803992249869e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.642792e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.847282e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.847282e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.061852 sec - 9,297,134,773 cycles # 3.030 GHz - 24,611,929,147 instructions # 2.65 insn per cycle - 3.069748837 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2067) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 4.456146e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.677718e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.677718e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 2.614372 sec + 8,741,909,741 cycles:u # 3.294 GHz (74.86%) + 9,776,807 stalled-cycles-frontend:u # 0.11% frontend cycles idle (74.86%) + 212,474,586 stalled-cycles-backend:u # 2.43% backend cycles idle (74.99%) + 24,606,463,326 instructions:u # 2.81 insn per cycle + # 0.01 stalled cycles per insn (74.99%) + 2.658072308 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 2071) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515654 Relative difference = 3.2588039900609506e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.618551e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.100275e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.100275e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.052297 sec - 5,885,039,003 cycles # 2.858 GHz - 11,848,510,845 instructions # 2.01 insn per cycle - 2.060062165 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2396) (512y: 0) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063388516204 -Relative difference = 3.2588037186351226e-07 -OK (relative difference <= 5E-3) +EvtsPerSec[Rmb+ME] (23) = ( 7.584321e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.161048e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.161048e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 1.643539 sec + 5,359,056,860 cycles:u # 3.183 GHz (74.86%) + 8,448,236 stalled-cycles-frontend:u # 0.16% frontend cycles idle (74.82%) + 1,100,765,531 stalled-cycles-backend:u # 20.54% backend cycles idle (74.80%) + 11,831,761,142 instructions:u # 2.21 insn per cycle + # 0.09 stalled cycles per insn (74.82%) + 1.687195550 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2383) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.504273e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.161017e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.161017e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.799191 sec - 5,170,745,323 cycles # 2.863 GHz - 10,625,305,495 instructions # 2.05 insn per cycle - 1.807054488 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1972) (512y: 131) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.143672e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.398027e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.398027e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.719298 sec - 5,299,827,945 cycles # 1.944 GHz - 7,799,359,597 instructions # 1.47 insn per cycle - 2.727100647 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1212) (512y: 65) (512z: 1543) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. 
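(Editorial sketch: the "OK (relative difference <= 5E-3)" verdicts printed by cmpExe above come from a simple tolerance test on the two average matrix elements. The self-contained snippet below reproduces the avx2 numbers just shown; it is illustrative only, and the exact normalization used by the comparison script may differ.)

#include <cmath>
#include <cstdio>

// Worked example of the cross-check above: two average matrix elements
// agree if their relative difference is at most 5E-3.
int main()
{
  const double avgME1 = 2.028807e+00;       // Avg ME (C++/C++)
  const double avgME2 = 2.0288063388516204; // Avg ME (F77/C++)
  const double relDiff = std::fabs( avgME1 - avgME2 ) / avgME2; // ~3.2588e-07
  if( relDiff <= 5e-3 )
    printf( "OK (relative difference = %.16e <= 5E-3)\n", relDiff );
  else
    printf( "ERROR (relative difference = %.16e > 5E-3)\n", relDiff );
  return 0;
}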
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063388516204 -Relative difference = 3.2588037186351226e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_common.txt index ac84fb1512..523a7bca51 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_common.txt @@ -1,209 +1,164 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasCurand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) +RNDGEN=hasNoCurand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-01-27_19:35:24 +DATE: 2024-01-28_13:55:56 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 2 --common OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 2 --common OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.553160e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.155068e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.270145e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.818186e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.966197e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.020465e+07 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 0.621735 sec - 2,489,043,210 cycles # 2.929 GHz - 3,562,840,484 instructions # 1.43 insn per cycle - 0.909390238 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --common -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 1.054122 sec + 3,218,591,678 cycles:u # 2.968 GHz (74.69%) + 10,707,638 stalled-cycles-frontend:u # 0.33% frontend cycles idle (74.92%) + 1,102,035,877 stalled-cycles-backend:u # 34.24% backend cycles idle (74.89%) + 3,000,124,436 instructions:u # 0.93 insn per cycle + # 0.37 stalled cycles per insn (74.87%) + 1.106279619 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 2.028807e+00 -Avg ME (F77/CUDA) = 2.0288063388516822 -Relative difference = 3.2588034143755247e-07 +Avg ME (F77/CUDA) = 2.0288063388516817 +Relative difference = 3.258803416564443e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.159128e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.223397e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.223397e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.514969e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.580100e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.580100e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 5.024025 sec - 15,162,088,953 cycles # 3.015 GHz - 38,738,505,601 instructions # 2.55 insn per cycle - 5.030201813 seconds time elapsed +TOTAL : 4.346990 sec + 15,001,586,306 cycles:u # 3.426 GHz (74.97%) + 9,215,025 stalled-cycles-frontend:u # 0.06% frontend cycles idle (74.97%) + 839,349,990 stalled-cycles-backend:u # 5.60% backend cycles idle (74.98%) + 38,768,715,929 instructions:u # 2.58 insn per cycle + # 0.02 stalled cycles per insn (74.98%) + 4.380757819 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 719) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515649 Relative difference = 3.258803992249869e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.533105e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.728807e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.728807e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.493557e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.718867e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.718867e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 3.136715 sec - 9,148,263,180 cycles # 2.918 GHz - 24,432,145,400 instructions # 2.67 insn per cycle - 3.143009438 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2067) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 2.510137 sec + 8,597,251,783 cycles:u # 3.385 GHz (74.81%) + 9,316,093 stalled-cycles-frontend:u # 0.11% frontend cycles idle (74.90%) + 191,978,085 stalled-cycles-backend:u # 2.23% backend cycles idle (75.06%) + 24,324,544,763 instructions:u # 2.83 insn per cycle + # 0.01 stalled cycles per insn (75.12%) + 2.542379153 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 2071) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515654 Relative difference = 3.2588039900609506e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.704073e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.211191e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.211191e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.691867e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.285163e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.285163e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 2.008048 sec - 5,721,668,107 cycles # 2.843 GHz - 11,545,118,469 instructions # 2.02 insn per cycle - 2.014569715 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2396) (512y: 0) (512z: 0) +TOTAL : 1.540808 sec + 5,173,463,246 cycles:u # 3.293 GHz (75.05%) + 8,639,028 stalled-cycles-frontend:u # 0.17% frontend cycles idle (75.05%) + 1,060,923,299 stalled-cycles-backend:u # 20.51% backend cycles idle (75.05%) + 11,469,896,836 instructions:u # 2.22 insn per cycle + # 0.09 stalled cycles per insn (75.06%) + 1.573272946 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2383) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.616897e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.301045e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.301045e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 1.752962 sec - 5,013,464,915 cycles # 2.851 GHz - 10,287,485,495 instructions # 2.05 insn per cycle - 1.759537330 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1972) (512y: 131) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063388516204 -Relative difference = 3.2588037186351226e-07 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.330900e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.607229e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.607229e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 2.590954 sec - 5,124,956,809 cycles # 1.974 GHz - 7,502,866,606 instructions # 1.46 insn per cycle - 2.597313801 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1212) (512y: 65) (512z: 1543) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. 
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063388516204 -Relative difference = 3.2588037186351226e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_curhst.txt index 8419d20f23..b10f7871e6 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_curhst.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_curhst.txt @@ -1,209 +1,133 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasCurand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) +RNDGEN=hasNoCurand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-01-27_19:32:00 +DATE: 2024-01-28_13:54:08 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 2 --curhst OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.546501e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.155022e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.270969e+08 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.562633 sec - 2,295,587,413 cycles # 2.889 GHz - 3,520,473,490 instructions # 1.53 insn per cycle - 0.852379916 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --curhst +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 2 --curhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe: Aborted + 55,592,144 cycles:u # 2.674 GHz (61.55%) + 45,049 stalled-cycles-frontend:u # 0.08% frontend cycles idle (61.55%) + 659,446 stalled-cycles-backend:u # 1.19% backend cycles idle (61.55%) + 41,558,480 instructions:u # 0.75 insn per cycle + # 0.02 stalled cycles per insn (63.66%) + 0.021719403 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 2.028807e+00 -Avg ME (F77/CUDA) = 2.0288063388516822 -Relative difference = 3.2588034143755247e-07 +Avg ME (F77/CUDA) = 2.0288063388516817 +Relative difference = 3.258803416564443e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.166522e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.231205e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.231205e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 4.947217 sec - 14,985,234,554 cycles # 3.027 GHz - 38,723,944,283 instructions # 2.58 insn per cycle - 4.953553340 seconds time elapsed +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe: Aborted + 41,534,797 cycles:u # 2.016 GHz (61.20%) + 59,946 stalled-cycles-frontend:u # 0.14% frontend cycles idle (61.20%) + 347,247 stalled-cycles-backend:u # 0.84% backend cycles idle (61.43%) + 48,334,762 instructions:u # 1.16 insn per cycle + # 0.01 stalled cycles per insn (74.40%) + 0.021856960 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 719) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
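(Editorial note on this curhst log: on the new platform both gcheck.exe and check.exe abort after ~0.02 s. These executables were built with RNDGEN=hasNoCurand, as shown in the build preamble above, so the --curhst option, which requests curand random numbers on the host, is presumably unavailable here; this reading is an inference from the log, not a confirmed diagnosis. Separately, the "insn per cycle" figure printed by perf stat is just the ratio of the two hardware counters above it, as this arithmetic sketch shows.)

#include <cstdio>

// Worked arithmetic only: perf stat's "insn per cycle" is instructions:u
// divided by cycles:u. Counters taken from the aborted check.exe run above.
int main()
{
  const double cycles = 41534797.;       // cycles:u
  const double instructions = 48334762.; // instructions:u
  printf( "%.2f insn per cycle\n", instructions / cycles ); // prints 1.16
  return 0;
}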
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515649 Relative difference = 3.258803992249869e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.688654e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.894194e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.894194e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.948182 sec - 8,953,378,937 cycles # 3.032 GHz - 24,428,439,372 instructions # 2.73 insn per cycle - 2.954516821 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2067) (avx2: 0) (512y: 0) (512z: 0) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe: Aborted + 41,633,925 cycles:u # 2.004 GHz (61.54%) + 54,713 stalled-cycles-frontend:u # 0.13% frontend cycles idle (61.54%) + 364,326 stalled-cycles-backend:u # 0.88% backend cycles idle (54.46%) + 48,568,253 instructions:u # 1.17 insn per cycle + # 0.01 stalled cycles per insn (73.69%) + 0.021960561 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 2071) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515654 Relative difference = 3.2588039900609506e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.719036e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.215680e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.215680e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.940059 sec - 5,554,282,154 cycles # 2.855 GHz - 11,561,246,559 instructions # 2.08 insn per cycle - 1.946824789 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2396) (512y: 0) (512z: 0) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe: Aborted + 55,539,954 cycles:u # 2.689 GHz (61.30%) + 44,952 stalled-cycles-frontend:u # 0.08% frontend cycles idle (61.30%) + 601,127 stalled-cycles-backend:u # 1.08% backend cycles idle (61.30%) + 41,345,033 instructions:u # 0.74 insn per cycle + # 0.01 stalled cycles per insn (63.10%) + 0.021938930 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2383) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.429208e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.087522e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.087522e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.739475 sec - 4,812,378,358 cycles # 2.758 GHz - 10,338,594,579 instructions # 2.15 insn per cycle - 1.745816916 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1972) (512y: 131) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063388516204 -Relative difference = 3.2588037186351226e-07 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.316145e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.588918e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.588918e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.536864 sec - 4,945,627,878 cycles # 1.945 GHz - 7,553,585,636 instructions # 1.53 insn per cycle - 2.543212075 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1212) (512y: 65) (512z: 1543) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063388516204 -Relative difference = 3.2588037186351226e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_rmbhst.txt index fafe86fb7f..69cc2c26c0 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_rmbhst.txt @@ -1,211 +1,164 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasCurand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) +RNDGEN=hasNoCurand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-01-27_19:28:40 +DATE: 2024-01-28_13:52:02 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 2 --rmbhst OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 2 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.803191e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.152740e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.269734e+08 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.709880 sec - 2,799,296,914 cycles # 2.939 GHz - 4,368,443,555 instructions # 1.56 insn per cycle - 1.010847274 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --rmbhst -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 6.820548e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.967686e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.021920e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 1.171354 sec + 3,639,696,725 cycles:u # 3.021 GHz (74.98%) + 21,809,161 stalled-cycles-frontend:u # 0.60% frontend cycles idle (74.77%) + 1,134,646,968 stalled-cycles-backend:u # 31.17% backend cycles idle (74.47%) + 3,841,517,977 instructions:u # 1.06 insn per cycle + # 0.30 stalled cycles per insn (74.69%) + 1.224070359 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 2.028807e+00 -Avg ME (F77/CUDA) = 2.0288063388516822 -Relative difference = 3.2588034143755247e-07 +Avg ME (F77/CUDA) = 2.0288063388516817 +Relative difference = 3.258803416564443e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.160840e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.225195e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.225195e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 4.959059 sec - 14,967,483,942 cycles # 3.016 GHz - 38,722,165,482 instructions # 2.59 insn per cycle - 4.965370367 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 2.516404e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.581636e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.581636e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 4.343161 sec + 14,989,971,532 cycles:u # 3.428 GHz (74.94%) + 8,919,078 stalled-cycles-frontend:u # 0.06% frontend cycles idle (74.95%) + 760,504,082 stalled-cycles-backend:u # 5.07% backend cycles idle (74.95%) + 38,772,327,618 instructions:u # 2.59 insn per cycle + # 0.02 stalled cycles per insn (74.94%) + 4.377763204 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 719) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515649 Relative difference = 3.258803992249869e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.669279e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.872934e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.872934e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.964577 sec - 8,950,291,023 cycles # 3.014 GHz - 24,429,249,082 instructions # 2.73 insn per cycle - 2.971122655 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2067) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 4.495611e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.720881e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.720881e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 2.509925 sec + 8,593,440,049 cycles:u # 3.382 GHz (74.82%) + 9,282,790 stalled-cycles-frontend:u # 0.11% frontend cycles idle (74.86%) + 199,808,787 stalled-cycles-backend:u # 2.33% backend cycles idle (75.02%) + 24,320,330,564 instructions:u # 2.83 insn per cycle + # 0.01 stalled cycles per insn (75.14%) + 2.543748452 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 2071) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515654 Relative difference = 3.2588039900609506e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.730432e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.235711e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.235711e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.938370 sec - 5,533,844,555 cycles # 2.848 GHz - 11,561,296,052 instructions # 2.09 insn per cycle - 1.944753850 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2396) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 7.687059e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.280702e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.280702e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 1.541096 sec + 5,172,320,908 cycles:u # 3.292 GHz (74.96%) + 8,064,746 stalled-cycles-frontend:u # 0.16% frontend cycles idle (75.05%) + 1,064,659,158 stalled-cycles-backend:u # 20.58% backend cycles idle (75.05%) + 11,490,392,245 instructions:u # 2.22 insn per cycle + # 0.09 stalled cycles per insn (75.06%) + 1.573435516 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2383) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.564439e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.248886e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.248886e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.706782 sec - 4,816,337,213 cycles # 2.813 GHz - 10,338,424,480 instructions # 2.15 insn per cycle - 1.713243341 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1972) (512y: 131) (512z: 0) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063388516204 -Relative difference = 3.2588037186351226e-07 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.268560e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.543865e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.543865e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.565289 sec - 4,934,138,178 cycles # 1.920 GHz - 7,553,812,035 instructions # 1.53 insn per cycle - 2.571920195 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1212) (512y: 65) (512z: 1543) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. 
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063388516204 -Relative difference = 3.2588037186351226e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd1.txt index a132a6cc9e..0f0d7b7fde 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd1.txt @@ -1,209 +1,164 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasCurand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) +RNDGEN=hasNoCurand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.512y_d_inl0_hrd1' +CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-01-27_18:31:43 +DATE: 2024-01-28_13:10:18 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/gcheck.exe -p 2048 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/gcheck.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.565902e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.160035e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.275649e+08 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.524945 sec - 2,211,104,325 cycles # 2.917 GHz - 3,161,744,126 instructions # 1.43 insn per cycle - 0.831302328 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/gcheck.exe -p 2048 256 1 -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 208 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 2.753738e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.923588e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.976812e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 1.054772 sec + 3,211,784,029 cycles:u # 2.968 GHz (74.94%) + 10,780,601 stalled-cycles-frontend:u # 0.34% frontend cycles idle (74.90%) + 1,168,643,143 stalled-cycles-backend:u # 36.39% backend cycles idle (75.33%) + 2,939,310,363 instructions:u # 0.92 insn per cycle + # 0.40 stalled cycles per insn (75.31%) + 1.104771986 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/fgcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 2.028807e+00 -Avg ME (F77/CUDA) = 2.0288063388516822 -Relative difference = 3.2588034143755247e-07 +Avg ME (F77/CUDA) = 2.0288063388516817 +Relative difference = 3.258803416564443e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/check.exe -p 2048 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.211402e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.278629e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.278629e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 4.847715 sec - 14,695,905,192 cycles # 3.028 GHz - 39,546,427,226 instructions # 2.69 insn per cycle - 4.856366875 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 2.435774e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.497108e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.497108e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 4.479308 sec + 15,482,569,040 cycles:u # 3.434 GHz (74.98%) + 8,881,462 stalled-cycles-frontend:u # 0.06% frontend cycles idle (74.98%) + 22,813,560 stalled-cycles-backend:u # 0.15% backend cycles idle (74.98%) + 39,561,506,752 instructions:u # 2.56 insn per cycle + # 0.00 stalled cycles per insn (75.00%) + 4.510830156 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 596) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515649 Relative difference = 3.258803992249869e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/check.exe -p 2048 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.853804e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.080865e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.080865e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.826209 sec - 8,585,516,509 cycles # 3.032 GHz - 23,576,146,180 instructions # 2.75 insn per cycle - 2.839651136 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1948) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 4.395625e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.613119e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.613119e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 2.560908 sec + 8,744,328,727 cycles:u # 3.376 GHz (74.98%) + 9,294,975 stalled-cycles-frontend:u # 0.11% frontend cycles idle (74.98%) + 1,209,598,590 stalled-cycles-backend:u # 13.83% backend cycles idle (74.98%) + 23,579,297,009 instructions:u # 2.70 insn per cycle + # 0.05 stalled cycles per insn (75.01%) + 2.593662543 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 1952) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515654 Relative difference = 3.2588039900609506e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/check.exe -p 2048 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.280108e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.704463e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.704463e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.095729 sec - 5,966,190,094 cycles # 2.842 GHz - 13,193,303,338 instructions # 2.21 insn per cycle - 2.108681044 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2560) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 6.912061e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.387522e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.387522e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 1.691589 sec + 5,689,436,332 cycles:u # 3.305 GHz (74.91%) + 9,009,653 stalled-cycles-frontend:u # 0.16% frontend cycles idle (74.94%) + 1,010,811,881 stalled-cycles-backend:u # 17.77% backend cycles idle (74.94%) + 13,208,068,159 instructions:u # 2.32 insn per cycle + # 0.08 stalled cycles per insn (74.91%) + 1.724686172 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2547) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/check.exe -p 2048 256 2 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.635750e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.133552e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.133552e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.968095 sec - 5,545,071,065 cycles # 2.809 GHz - 12,102,600,869 instructions # 2.18 insn per cycle - 1.994477360 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2030) (512y: 278) (512z: 0) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063388516204 -Relative difference = 3.2588037186351226e-07 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/check.exe -p 2048 256 2 OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.941444e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.175192e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.175192e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.769335 sec - 5,356,977,226 cycles # 1.931 GHz - 9,382,516,259 instructions # 1.75 insn per cycle - 2.796011726 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1350) (512y: 88) (512z: 1989) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. 
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063388516204 -Relative difference = 3.2588037186351226e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd0.txt index fe3b97e60f..2f14e3d02f 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd0.txt @@ -1,209 +1,164 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasCurand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) +RNDGEN=hasNoCurand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.512y_d_inl1_hrd0' +CUDACPP_BUILDDIR='build.avx2_d_inl1_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-01-27_19:11:26 +DATE: 2024-01-28_13:32:31 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/gcheck.exe -p 2048 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/gcheck.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.556891e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.156782e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.272118e+08 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.523274 sec - 2,225,190,296 cycles # 2.920 GHz - 3,165,313,508 instructions # 1.42 insn per cycle - 0.819371043 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/gcheck.exe -p 2048 256 1 -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 2.619762e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.965495e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.019770e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 1.058889 sec + 3,239,800,555 cycles:u # 2.982 GHz (74.98%) + 10,954,603 stalled-cycles-frontend:u # 0.34% frontend cycles idle (75.21%) + 1,165,027,980 stalled-cycles-backend:u # 35.96% backend cycles idle (75.14%) + 3,002,700,856 instructions:u # 0.93 insn per cycle + # 0.39 stalled cycles per insn (74.90%) + 1.110182078 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/fgcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 2.028807e+00 -Avg ME (F77/CUDA) = 2.0288063388516822 -Relative difference = 3.2588034143755247e-07 +Avg ME (F77/CUDA) = 2.0288063388516817 +Relative difference = 3.258803416564443e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/check.exe -p 2048 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.338474e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.413557e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.413557e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 4.590948 sec - 13,903,491,071 cycles # 3.025 GHz - 35,849,286,374 instructions # 2.58 insn per cycle - 4.597844545 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 2.854206e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.937254e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.937254e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 3.850677 sec + 13,292,864,444 cycles:u # 3.425 GHz (74.88%) + 9,458,667 stalled-cycles-frontend:u # 0.07% frontend cycles idle (74.98%) + 556,825,469 stalled-cycles-backend:u # 4.19% backend cycles idle (75.06%) + 35,759,647,817 instructions:u # 2.69 insn per cycle + # 0.02 stalled cycles per insn (75.07%) + 3.883732167 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1078) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515649 Relative difference = 3.258803992249869e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd0/check.exe -p 2048 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd0/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.033937e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.283303e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.283303e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.705956 sec - 8,203,204,697 cycles # 3.025 GHz - 21,906,275,135 instructions # 2.67 insn per cycle - 2.712679780 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 4.425692e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.644208e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.644208e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 2.546004 sec + 8,703,098,620 cycles:u # 3.378 GHz (74.87%) + 9,150,233 stalled-cycles-frontend:u # 0.11% frontend cycles idle (74.85%) + 2,356,849,986 stalled-cycles-backend:u # 27.08% backend cycles idle (74.90%) + 21,880,814,968 instructions:u # 2.51 insn per cycle + # 0.11 stalled cycles per insn (75.06%) + 2.580733666 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2334) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515654 Relative difference = 3.2588039900609506e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/check.exe -p 2048 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.611264e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.105261e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.105261e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.979671 sec - 5,534,366,412 cycles # 2.793 GHz - 12,076,831,406 instructions # 2.18 insn per cycle - 1.986665078 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3062) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 6.667814e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.109083e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.109083e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 1.749675 sec + 5,907,885,632 cycles:u # 3.319 GHz (74.91%) + 9,115,682 stalled-cycles-frontend:u # 0.15% frontend cycles idle (74.87%) + 2,250,445,505 stalled-cycles-backend:u # 38.09% backend cycles idle (74.87%) + 12,110,680,805 instructions:u # 2.05 insn per cycle + # 0.19 stalled cycles per insn (74.84%) + 1.784067388 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3046) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd0/check.exe -p 2048 256 2 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.207475e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.811915e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.811915e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.796434 sec - 5,140,536,788 cycles # 2.853 GHz - 11,141,735,276 instructions # 2.17 insn per cycle - 1.803318165 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2527) (512y: 224) (512z: 0) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd0/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063388516204 -Relative difference = 3.2588037186351226e-07 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd0/check.exe -p 2048 256 2 OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.467257e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.761466e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.761466e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.454036 sec - 4,810,701,100 cycles # 1.956 GHz - 8,841,217,398 instructions # 1.84 insn per cycle - 2.460702395 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1821) (512y: 97) (512z: 2034) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd0/runTest.exe -[ PASSED ] 6 tests. 
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063388516204 -Relative difference = 3.2588037186351226e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd1.txt index a9f7002915..0f4e353ce0 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd1.txt @@ -1,209 +1,164 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasCurand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) +RNDGEN=hasNoCurand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.512y_d_inl1_hrd1' +CUDACPP_BUILDDIR='build.avx2_d_inl1_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-01-27_19:11:53 +DATE: 2024-01-28_13:32:48 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/gcheck.exe -p 2048 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/gcheck.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.566602e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.157557e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.274796e+08 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.526133 sec - 2,269,985,735 cycles # 2.938 GHz - 3,227,777,224 instructions # 1.42 insn per cycle - 0.830677919 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/gcheck.exe -p 2048 256 1 -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 208 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 2.791937e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.927379e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.980472e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 1.054617 sec + 3,224,715,439 cycles:u # 2.971 GHz (74.96%) + 10,815,466 stalled-cycles-frontend:u # 0.34% frontend cycles idle (74.96%) + 1,178,327,260 stalled-cycles-backend:u # 36.54% backend cycles idle (74.90%) + 2,958,776,019 instructions:u # 0.92 insn per cycle + # 0.40 stalled cycles per insn (74.87%) + 1.109739959 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/fgcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 2.028807e+00 -Avg ME (F77/CUDA) = 2.0288063388516822 -Relative difference = 3.2588034143755247e-07 +Avg ME (F77/CUDA) = 2.0288063388516817 +Relative difference = 3.258803416564443e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/check.exe -p 2048 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.558795e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.650149e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.650149e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 4.204720 sec - 12,508,246,419 cycles # 2.972 GHz - 35,732,210,405 instructions # 2.86 insn per cycle - 4.211518798 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 3.213590e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.319097e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.319097e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 3.439517 sec + 11,845,463,703 cycles:u # 3.413 GHz (74.88%) + 8,526,830 stalled-cycles-frontend:u # 0.07% frontend cycles idle (74.98%) + 10,258,082 stalled-cycles-backend:u # 0.09% backend cycles idle (75.09%) + 35,658,260,629 instructions:u # 3.01 insn per cycle + # 0.00 stalled cycles per insn (75.11%) + 3.472955154 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 469) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515649 Relative difference = 3.258803992249869e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd1/check.exe -p 2048 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd1/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.001813e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.253552e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.253552e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.727801 sec - 8,032,446,977 cycles # 2.939 GHz - 21,259,935,359 instructions # 2.65 insn per cycle - 2.734713005 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 4.809965e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.068802e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.068802e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 2.356282 sec + 8,029,495,156 cycles:u # 3.365 GHz (74.89%) + 9,114,097 stalled-cycles-frontend:u # 0.11% frontend cycles idle (74.89%) + 1,760,515,427 stalled-cycles-backend:u # 21.93% backend cycles idle (74.86%) + 21,259,836,160 instructions:u # 2.65 insn per cycle + # 0.08 stalled cycles per insn (75.00%) + 2.390346582 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2088) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515654 Relative difference = 3.2588039900609506e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd1/check.exe -p 2048 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd1/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.970376e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.518278e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.518278e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.863938 sec - 5,333,333,807 cycles # 2.853 GHz - 11,406,492,896 instructions # 2.14 insn per cycle - 1.870797127 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2370) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 7.910662e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.540729e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.540729e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 1.502023 sec + 5,018,790,621 cycles:u # 3.276 GHz (74.94%) + 9,171,634 stalled-cycles-frontend:u # 0.18% frontend cycles idle (74.98%) + 305,812,820 stalled-cycles-backend:u # 6.09% backend cycles idle (74.72%) + 11,448,661,799 instructions:u # 2.28 insn per cycle + # 0.03 stalled cycles per insn (74.69%) + 1.535931380 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2354) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd1/check.exe -p 2048 256 2 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.390644e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.024648e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.024648e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.748196 sec - 4,995,883,688 cycles # 2.848 GHz - 10,598,736,895 instructions # 2.12 insn per cycle - 1.755165718 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1970) (512y: 162) (512z: 0) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd1/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063388516204 -Relative difference = 3.2588037186351226e-07 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd1/check.exe -p 2048 256 2 OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.515233e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.832010e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.832010e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.434876 sec - 4,705,444,696 cycles # 1.931 GHz - 8,568,550,279 instructions # 1.82 insn per cycle - 2.441557167 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1392) (512y: 70) (512z: 1630) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd1/runTest.exe -[ PASSED ] 6 tests. 
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063388516204 -Relative difference = 3.2588037186351226e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt index f704509ce3..e868ff1e3b 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt @@ -1,209 +1,164 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasCurand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) +RNDGEN=hasNoCurand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-01-27_18:32:10 +DATE: 2024-01-28_13:10:35 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.371378e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.647289e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.968239e+08 ) sec^-1 -MeanMatrixElemValue = ( 2.086718e+00 +- 3.413389e-03 ) GeV^0 -TOTAL : 0.481545 sec - 2,062,266,198 cycles # 2.917 GHz - 2,949,886,970 instructions # 1.43 insn per cycle - 0.779421283 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 128 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 8.867870e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.949186e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.114682e+08 ) sec^-1 +MeanMatrixElemValue = ( 2.080169e+00 +- 3.463853e-03 ) GeV^0 +TOTAL : 1.002212 sec + 3,146,193,514 cycles:u # 3.063 GHz (74.09%) + 10,775,596 stalled-cycles-frontend:u # 0.34% frontend cycles idle (74.96%) + 1,162,765,320 stalled-cycles-backend:u # 36.96% backend cycles idle (75.10%) + 2,821,273,239 instructions:u # 0.90 insn per cycle + # 0.41 stalled cycles per insn (75.15%) + 1.053695884 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 2.028811e+00 -Avg ME (F77/CUDA) = 2.0288499749731272 -Relative difference = 1.9210746159747678e-05 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 2.028815e+00 +Avg ME (F77/CUDA) = 2.0288173652952537 +Relative difference = 1.1658506339321586e-06 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe -p 2048 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.329952e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.407106e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.407106e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 -TOTAL : 4.585250 sec - 13,896,605,295 cycles # 3.027 GHz - 37,077,674,283 instructions # 2.67 insn per cycle - 4.593480894 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 2.983975e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.073711e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.073711e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079573e+00 +- 3.404712e-03 ) GeV^0 +TOTAL : 3.653959 sec + 12,674,685,102 cycles:u # 3.445 GHz (75.00%) + 6,976,259 stalled-cycles-frontend:u # 0.06% frontend cycles idle (75.00%) + 10,395,979 stalled-cycles-backend:u # 0.08% backend cycles idle (75.00%) + 37,069,583,951 instructions:u # 2.92 insn per cycle + # 0.00 stalled cycles per insn (75.00%) + 3.681563485 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 578) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028820e+00 -Avg ME (F77/C++) = 2.0288197983754799 -Relative difference = 9.938019153537065e-08 +Avg ME (F77/C++) = 2.0288198367925361 +Relative difference = 8.044452636897417e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.194608e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.645421e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.645421e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 -TOTAL : 2.104955 sec - 6,163,876,669 cycles # 2.922 GHz - 15,212,935,053 instructions # 2.47 insn per cycle - 2.119735474 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2459) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 6.084246e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.484646e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.484646e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079573e+00 +- 3.404713e-03 ) GeV^0 +TOTAL : 1.863981 sec + 6,409,143,938 cycles:u # 3.392 GHz (74.92%) + 6,864,416 stalled-cycles-frontend:u # 0.11% frontend cycles idle (75.03%) + 2,243,634,732 stalled-cycles-backend:u # 35.01% backend cycles idle (75.03%) + 15,212,303,834 instructions:u # 2.37 insn per cycle + # 0.15 stalled cycles per insn (75.03%) + 1.893254518 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 2463) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028819e+00 -Avg ME (F77/C++) = 2.0288191968575120 -Relative difference = 9.703059369476286e-08 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028820e+00 +Avg ME (F77/C++) = 2.0288198773050681 +Relative difference = 6.047600673895608e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.463230e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.088896e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.088896e+06 ) sec^-1 -MeanMatrixElemValue = ( 2.086810e+00 +- 3.414230e-03 ) GeV^0 -TOTAL : 1.193160 sec - 3,445,241,392 cycles # 2.874 GHz - 7,715,704,867 instructions # 2.24 insn per cycle - 1.208683669 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3071) (512y: 0) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288179996423423 -Relative difference = 1.7628858734720142e-10 -OK (relative difference <= 5E-3) +EvtsPerSec[Rmb+ME] (23) = ( 1.220620e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.377299e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.377299e+06 ) sec^-1 +MeanMatrixElemValue = ( 2.079551e+00 +- 3.404208e-03 ) GeV^0 +TOTAL : 0.999080 sec + 3,381,583,250 cycles:u # 3.300 GHz (74.93%) + 6,760,789 stalled-cycles-frontend:u # 0.20% frontend cycles idle (75.02%) + 924,609,270 stalled-cycles-backend:u # 27.34% backend cycles idle (75.02%) + 7,662,389,776 instructions:u # 2.27 insn per cycle + # 0.12 stalled cycles per insn (75.03%) + 1.028114549 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3055) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe -p 2048 256 2 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.031770e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.205353e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.205353e+06 ) sec^-1 -MeanMatrixElemValue = ( 2.086810e+00 +- 3.414230e-03 ) GeV^0 -TOTAL : 1.101345 sec - 3,175,112,198 cycles # 2.868 GHz - 7,109,521,586 instructions # 2.24 insn per cycle - 1.114551636 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2733) (512y: 13) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288179996423423 -Relative difference = 1.7628858734720142e-10 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028819e+00 +Avg ME (F77/C++) = 2.0288186294492334 +Relative difference = 1.826435805832187e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe -p 2048 256 2 OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.234175e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.048165e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.048165e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.535385 sec - 2,985,815,567 cycles # 1.938 GHz - 5,764,090,445 instructions # 1.93 insn per cycle - 1.547903228 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2088) (512y: 20) (512z: 1914) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. 
+/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288183195516467 -Relative difference = 1.5750631496822894e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_bridge.txt index 6f10c4e596..63d5e71b58 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_bridge.txt @@ -1,222 +1,170 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasCurand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) +RNDGEN=hasNoCurand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-01-27_19:22:22 +DATE: 2024-01-28_13:46:11 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 2 --bridge OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.965218e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.420236e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.420236e+07 ) sec^-1 -MeanMatrixElemValue = ( 2.086805e+00 +- 3.414078e-03 ) GeV^0 -TOTAL : 0.672742 sec - 2,699,076,990 cycles # 2.952 GHz - 4,122,510,277 instructions # 1.53 insn per cycle - 0.972058483 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost -WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -==PROF== Profiling "sigmaKin": launch__registers_per_thread 128 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 7.468362e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.052225e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.052225e+08 ) sec^-1 +MeanMatrixElemValue = ( 2.079682e+00 +- 3.408341e-03 ) GeV^0 +TOTAL : 1.151346 sec + 3,578,976,244 cycles:u # 3.030 GHz (74.85%) + 21,174,126 stalled-cycles-frontend:u # 0.59% frontend cycles idle (74.99%) + 1,154,032,876 stalled-cycles-backend:u # 32.24% backend cycles idle (75.02%) + 3,915,654,276 instructions:u # 1.09 insn per cycle + # 0.29 stalled cycles per insn (74.86%) + 1.202695970 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 2.028811e+00 -Avg ME (F77/CUDA) = 2.0288499749731272 -Relative difference = 1.9210746159747678e-05 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 2.028815e+00 +Avg ME (F77/CUDA) = 2.0288173652952537 +Relative difference = 1.1658506339321586e-06 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.308184e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.383926e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.383926e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 -TOTAL : 4.671374 sec - 14,083,951,451 cycles # 3.011 GHz - 37,120,772,584 instructions # 2.64 insn per cycle - 4.679096629 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 2.973829e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.065181e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.065181e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079573e+00 +- 3.404712e-03 ) GeV^0 +TOTAL : 3.706344 sec + 12,727,386,308 cycles:u # 3.407 GHz (74.96%) + 6,881,823 stalled-cycles-frontend:u # 0.05% frontend cycles idle (74.97%) + 22,282,019 stalled-cycles-backend:u # 0.18% backend cycles idle (74.97%) + 37,158,211,527 instructions:u # 2.92 insn per cycle + # 0.00 stalled cycles per insn (74.95%) + 3.739232598 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 578) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028820e+00 -Avg ME (F77/C++) = 2.0288197983754799 -Relative difference = 9.938019153537065e-08 +Avg ME (F77/C++) = 2.0288198367925361 +Relative difference = 8.044452636897417e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.315538e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.768348e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.768348e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 -TOTAL : 2.105430 sec - 6,354,554,621 cycles # 3.011 GHz - 15,492,019,285 instructions # 2.44 insn per cycle - 2.112526903 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2459) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 6.050687e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.438614e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.438614e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079573e+00 +- 3.404713e-03 ) GeV^0 +TOTAL : 1.919778 sec + 6,485,492,313 cycles:u # 3.325 GHz (74.99%) + 7,563,911 stalled-cycles-frontend:u # 0.12% frontend cycles idle (74.99%) + 2,203,791,666 stalled-cycles-backend:u # 33.98% backend cycles idle (74.99%) + 15,490,081,510 instructions:u # 2.39 insn per cycle + # 0.14 stalled cycles per insn (75.02%) + 1.954573380 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 2463) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028819e+00 -Avg ME (F77/C++) = 2.0288191968575120 -Relative difference = 9.703059369476286e-08 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028820e+00 +Avg ME (F77/C++) = 2.0288198773050681 +Relative difference = 6.047600673895608e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.249872e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.061367e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.061367e+06 ) sec^-1 -MeanMatrixElemValue = ( 2.086810e+00 +- 3.414230e-03 ) GeV^0 -TOTAL : 1.265664 sec - 3,645,033,176 cycles # 2.866 GHz - 7,953,463,690 instructions # 2.18 insn per cycle - 1.273185269 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3071) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.208801e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.362492e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.362492e+06 ) sec^-1 +MeanMatrixElemValue = ( 2.079551e+00 +- 3.404208e-03 ) GeV^0 +TOTAL : 1.052643 sec + 3,440,201,075 cycles:u # 3.174 GHz (75.04%) + 6,842,051 stalled-cycles-frontend:u # 0.20% frontend cycles idle (74.91%) + 935,859,186 stalled-cycles-backend:u # 27.20% backend cycles idle (74.91%) + 7,927,062,593 instructions:u # 2.30 insn per cycle + # 0.12 stalled cycles per insn (74.96%) + 1.087632583 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3055) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288179996423423 -Relative difference = 1.7628858734720142e-10 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.008124e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.175608e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.175608e+06 ) sec^-1 -MeanMatrixElemValue = ( 2.086810e+00 +- 3.414230e-03 ) GeV^0 -TOTAL : 1.171580 sec - 3,374,627,804 cycles # 2.865 GHz - 7,347,172,592 instructions # 2.18 insn per cycle - 1.179048762 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2733) (512y: 13) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288179996423423 -Relative difference = 1.7628858734720142e-10 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028819e+00 +Avg ME (F77/C++) = 2.0288186294492334 +Relative difference = 1.826435805832187e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.385113e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.212336e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.212336e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.552093 sec - 3,187,803,861 cycles # 2.046 GHz - 6,021,486,201 instructions # 1.89 insn per cycle - 1.559571454 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2088) (512y: 20) (512z: 1914) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. 
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288183195516467 -Relative difference = 1.5750631496822894e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_common.txt index 14a879576f..45b49ea418 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_common.txt @@ -1,209 +1,164 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasCurand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) +RNDGEN=hasNoCurand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-01-27_19:35:51 +DATE: 2024-01-28_13:56:13 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 2 --common OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 2 --common OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.393831e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.627983e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.948882e+08 ) sec^-1 -MeanMatrixElemValue = ( 2.079446e+00 +- 3.403306e-03 ) GeV^0 -TOTAL : 0.563905 sec - 2,307,075,319 cycles # 2.940 GHz - 3,388,511,891 instructions # 1.47 insn per cycle - 0.842336969 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --common -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 128 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 9.259857e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.954473e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.117418e+08 ) sec^-1 +MeanMatrixElemValue = ( 2.080169e+00 +- 3.463853e-03 ) GeV^0 +TOTAL : 1.005687 sec + 3,069,314,095 cycles:u # 2.975 GHz (75.12%) + 10,688,770 stalled-cycles-frontend:u # 0.35% frontend cycles idle (75.21%) + 1,145,250,134 stalled-cycles-backend:u # 37.31% backend cycles idle (75.26%) + 2,907,743,689 instructions:u # 0.95 insn per cycle + # 0.39 stalled cycles per insn (75.15%) + 1.053740664 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 2.028811e+00 -Avg ME (F77/CUDA) = 2.0288499749731272 -Relative difference = 1.9210746159747678e-05 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 2.028815e+00 +Avg ME (F77/CUDA) = 2.0288173652952537 +Relative difference = 1.1658506339321586e-06 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.310833e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.387688e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.387688e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079572e+00 +- 3.404712e-03 ) GeV^0 -TOTAL : 4.679354 sec - 14,057,950,577 cycles # 3.001 GHz - 37,107,834,585 instructions # 2.64 insn per cycle - 4.685514940 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 2.984371e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.073750e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.073750e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079573e+00 +- 3.404712e-03 ) GeV^0 +TOTAL : 3.654664 sec + 12,668,038,640 cycles:u # 3.442 GHz (75.00%) + 6,895,601 stalled-cycles-frontend:u # 0.05% frontend cycles idle (75.00%) + 10,315,591 stalled-cycles-backend:u # 0.08% backend cycles idle (75.01%) + 37,055,294,095 instructions:u # 2.93 insn per cycle + # 0.00 stalled cycles per insn (75.01%) + 3.682293734 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 578) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028820e+00 -Avg ME (F77/C++) = 2.0288197983754799 -Relative difference = 9.938019153537065e-08 +Avg ME (F77/C++) = 2.0288198367925361 +Relative difference = 8.044452636897417e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.376525e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.841873e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.841873e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079572e+00 +- 3.404711e-03 ) GeV^0 -TOTAL : 2.090400 sec - 6,325,242,283 cycles # 3.019 GHz - 15,223,298,660 instructions # 2.41 insn per cycle - 2.096577762 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2459) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 6.085800e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.485279e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.485279e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079573e+00 +- 3.404713e-03 ) GeV^0 +TOTAL : 1.863877 sec + 6,385,234,513 cycles:u # 3.379 GHz (75.02%) + 6,627,854 stalled-cycles-frontend:u # 0.10% frontend cycles idle (75.02%) + 2,211,355,000 stalled-cycles-backend:u # 34.63% backend cycles idle (75.02%) + 15,199,909,839 instructions:u # 2.38 insn per cycle + # 0.15 stalled cycles per insn (75.03%) + 1.891750968 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 2463) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028819e+00 -Avg ME (F77/C++) = 2.0288191968575120 -Relative difference = 9.703059369476286e-08 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028820e+00 +Avg ME (F77/C++) = 2.0288198773050681 +Relative difference = 6.047600673895608e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.372411e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.078800e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.078800e+06 ) sec^-1 -MeanMatrixElemValue = ( 2.079550e+00 +- 3.404207e-03 ) GeV^0 -TOTAL : 1.259927 sec - 3,617,133,063 cycles # 2.859 GHz - 7,699,481,453 instructions # 2.13 insn per cycle - 1.266476529 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3071) (512y: 0) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288179996423423 -Relative difference = 1.7628858734720142e-10 -OK (relative difference <= 5E-3) +EvtsPerSec[Rmb+ME] (23) = ( 1.222307e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.379467e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.379467e+06 ) sec^-1 +MeanMatrixElemValue = ( 2.079551e+00 +- 3.404208e-03 ) GeV^0 +TOTAL : 0.999380 sec + 3,374,682,598 cycles:u # 3.292 GHz (74.90%) + 6,664,818 stalled-cycles-frontend:u # 0.20% frontend cycles idle (75.03%) + 923,960,136 stalled-cycles-backend:u # 27.38% backend cycles idle (75.03%) + 7,662,101,331 instructions:u # 2.27 insn per cycle + # 0.12 stalled cycles per insn (75.04%) + 1.027286724 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3055) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.024833e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.196928e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.196928e+06 ) sec^-1 -MeanMatrixElemValue = ( 2.079550e+00 +- 3.404207e-03 ) GeV^0 -TOTAL : 1.164722 sec - 3,345,597,755 cycles # 2.860 GHz - 7,059,028,825 instructions # 2.11 insn per cycle - 1.171001818 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2733) (512y: 13) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288179996423423 -Relative difference = 1.7628858734720142e-10 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028819e+00 +Avg ME (F77/C++) = 2.0288186294492334 +Relative difference = 1.826435805832187e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.526851e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.397649e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.397649e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079550e+00 +- 3.404208e-03 ) GeV^0 -TOTAL : 1.535907 sec - 3,151,943,893 cycles # 2.047 GHz - 5,714,706,075 instructions # 1.81 insn per cycle - 1.542090891 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2088) (512y: 20) (512z: 1914) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. 
+/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288183195516467 -Relative difference = 1.5750631496822894e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_curhst.txt index c7f0d3b000..269f84a482 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_curhst.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_curhst.txt @@ -1,209 +1,133 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasCurand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) +RNDGEN=hasNoCurand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-01-27_19:32:27 +DATE: 2024-01-28_13:54:16 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 2 --curhst OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 2 --curhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.419343e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.634429e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.957416e+08 ) sec^-1 -MeanMatrixElemValue = ( 2.086718e+00 +- 3.413389e-03 ) GeV^0 -TOTAL : 0.510130 sec - 2,137,942,145 cycles # 2.934 GHz - 3,348,306,775 instructions # 1.57 insn per cycle - 0.788342368 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --curhst -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 128 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe: Aborted + 50,492,933 cycles:u # 2.431 GHz (61.51%) + 48,157 stalled-cycles-frontend:u # 0.10% frontend cycles idle (61.51%) + 587,577 stalled-cycles-backend:u # 1.16% backend cycles idle (61.51%) + 46,402,407 instructions:u # 0.92 insn per cycle + # 0.01 stalled cycles per insn (63.40%) + 0.021670098 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 2.028811e+00 -Avg ME (F77/CUDA) = 2.0288499749731272 -Relative difference = 1.9210746159747678e-05 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 2.028815e+00 +Avg ME (F77/CUDA) = 2.0288173652952537 +Relative difference = 1.1658506339321586e-06 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.320811e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.398671e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.398671e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 -TOTAL : 4.604425 sec - 13,892,293,847 cycles # 3.014 GHz - 37,077,656,533 instructions # 2.67 insn per cycle - 4.610650729 seconds time elapsed +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe: Aborted + 39,894,533 cycles:u # 1.933 GHz (61.27%) + 104,234 stalled-cycles-frontend:u # 0.26% frontend cycles idle (61.27%) + 371,226 stalled-cycles-backend:u # 0.93% backend cycles idle (56.66%) + 48,727,655 instructions:u # 1.22 insn per cycle + # 0.01 stalled cycles per insn (76.02%) + 0.021936465 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 578) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028820e+00 -Avg ME (F77/C++) = 2.0288197983754799 -Relative difference = 9.938019153537065e-08 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.374425e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.841836e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.841836e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 -TOTAL : 2.035974 sec - 6,175,209,269 cycles # 3.027 GHz - 15,211,533,732 instructions # 2.46 insn per cycle - 2.042362317 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2459) (avx2: 0) (512y: 0) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028819e+00 -Avg ME (F77/C++) = 2.0288191968575120 -Relative difference = 9.703059369476286e-08 +Avg ME (F77/C++) = 2.0288198367925361 +Relative difference = 8.044452636897417e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.409429e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.081523e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.081523e+06 ) sec^-1 -MeanMatrixElemValue = ( 2.086810e+00 +- 3.414230e-03 ) GeV^0 -TOTAL : 1.200285 sec - 3,444,552,271 cycles # 2.857 GHz - 7,714,694,129 instructions # 2.24 insn per cycle - 1.206466922 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3071) (512y: 0) (512z: 0) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe: Aborted + 55,699,986 cycles:u # 2.705 GHz (61.18%) + 44,852 stalled-cycles-frontend:u # 0.08% frontend cycles idle (61.19%) + 642,472 stalled-cycles-backend:u # 1.15% backend cycles idle (61.19%) + 41,318,724 instructions:u # 0.74 insn per cycle + # 0.02 stalled cycles per insn (63.02%) + 0.021877861 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 2463) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288179996423423 -Relative difference = 1.7628858734720142e-10 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028820e+00 +Avg ME (F77/C++) = 2.0288198773050681 +Relative difference = 6.047600673895608e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.031458e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.204817e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.204817e+06 ) sec^-1 -MeanMatrixElemValue = ( 2.086810e+00 +- 3.414230e-03 ) GeV^0 -TOTAL : 1.101185 sec - 3,173,622,936 cycles # 2.869 GHz - 7,108,508,585 instructions # 2.24 insn per cycle - 1.107582757 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2733) (512y: 13) (512z: 0) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe: Aborted + 55,921,394 cycles:u # 2.729 GHz (60.99%) + 45,186 stalled-cycles-frontend:u # 0.08% frontend cycles idle (61.00%) + 616,775 stalled-cycles-backend:u # 1.10% backend cycles idle (61.00%) + 41,283,104 instructions:u # 0.74 insn per cycle + # 0.01 stalled cycles per insn (62.83%) + 0.021782505 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3055) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288179996423423 -Relative difference = 1.7628858734720142e-10 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028819e+00 +Avg ME (F77/C++) = 2.0288186294492334 +Relative difference = 1.826435805832187e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.603863e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.500334e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.500334e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.465047 sec - 2,989,253,718 cycles # 2.034 GHz - 5,763,096,085 instructions # 1.93 insn per cycle - 1.471313886 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2088) (512y: 20) (512z: 1914) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288183195516467 -Relative difference = 1.5750631496822894e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_rmbhst.txt index efbd8ab8d1..802f24068e 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_rmbhst.txt @@ -1,211 +1,164 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasCurand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) +RNDGEN=hasNoCurand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-01-27_19:29:07 +DATE: 2024-01-28_13:52:19 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 2 --rmbhst OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 2 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.782075e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.635958e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.962133e+08 ) sec^-1 -MeanMatrixElemValue = ( 2.086805e+00 +- 3.414078e-03 ) GeV^0 -TOTAL : 0.616379 sec - 2,457,237,369 cycles # 2.941 GHz - 3,790,082,335 instructions # 1.54 insn per cycle - 0.895160860 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --rmbhst -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -==PROF== Profiling "sigmaKin": launch__registers_per_thread 128 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 8.199018e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.934663e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.097479e+08 ) sec^-1 +MeanMatrixElemValue = ( 2.079682e+00 +- 3.408341e-03 ) GeV^0 +TOTAL : 1.125978 sec + 3,517,472,024 cycles:u # 3.046 GHz (75.08%) + 21,795,711 stalled-cycles-frontend:u # 0.62% frontend cycles idle (75.13%) + 1,145,098,111 stalled-cycles-backend:u # 32.55% backend cycles idle (75.10%) + 3,784,048,433 instructions:u # 1.08 insn per cycle + # 0.30 stalled cycles per insn (75.00%) + 1.174336253 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 2.028811e+00 -Avg ME (F77/CUDA) = 2.0288499749731272 -Relative difference = 1.9210746159747678e-05 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 2.028815e+00 +Avg ME (F77/CUDA) = 2.0288173652952537 +Relative difference = 1.1658506339321586e-06 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.321377e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.398534e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.398534e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 -TOTAL : 4.603077 sec - 13,891,133,529 cycles # 3.015 GHz - 37,077,670,179 instructions # 2.67 insn per cycle - 4.609272116 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 2.981458e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.070741e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.070741e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079573e+00 +- 3.404712e-03 ) GeV^0 +TOTAL : 3.658653 sec + 12,683,469,123 cycles:u # 3.443 GHz (75.00%) + 6,818,041 stalled-cycles-frontend:u # 0.05% frontend cycles idle (75.04%) + 10,612,246 stalled-cycles-backend:u # 0.08% backend cycles idle (75.03%) + 37,048,878,080 instructions:u # 2.92 insn per cycle + # 0.00 stalled cycles per insn (75.03%) + 3.686174764 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 578) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028820e+00 -Avg ME (F77/C++) = 2.0288197983754799 -Relative difference = 9.938019153537065e-08 +Avg ME (F77/C++) = 2.0288198367925361 +Relative difference = 8.044452636897417e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.362616e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.828214e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.828214e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 -TOTAL : 2.039889 sec - 6,158,523,818 cycles # 3.012 GHz - 15,211,006,796 instructions # 2.47 insn per cycle - 2.046105279 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2459) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 6.083752e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.480232e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.480232e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079573e+00 +- 3.404713e-03 ) GeV^0 +TOTAL : 1.864743 sec + 6,386,352,954 cycles:u # 3.378 GHz (75.03%) + 6,620,904 stalled-cycles-frontend:u # 0.10% frontend cycles idle (75.03%) + 2,207,737,284 stalled-cycles-backend:u # 34.57% backend cycles idle (75.04%) + 15,202,737,268 instructions:u # 2.38 insn per cycle + # 0.15 stalled cycles per insn (75.04%) + 1.892656081 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 2463) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028819e+00 -Avg ME (F77/C++) = 2.0288191968575120 -Relative difference = 9.703059369476286e-08 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028820e+00 +Avg ME (F77/C++) = 2.0288198773050681 +Relative difference = 6.047600673895608e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.276982e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.065857e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.065857e+06 ) sec^-1 -MeanMatrixElemValue = ( 2.086810e+00 +- 3.414230e-03 ) GeV^0 -TOTAL : 1.216623 sec - 3,449,112,422 cycles # 2.826 GHz - 7,715,526,788 instructions # 2.24 insn per cycle - 1.222716671 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3071) (512y: 0) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288179996423423 -Relative difference = 1.7628858734720142e-10 -OK (relative difference <= 5E-3) +EvtsPerSec[Rmb+ME] (23) = ( 1.221796e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.378717e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.378717e+06 ) sec^-1 +MeanMatrixElemValue = ( 2.079551e+00 +- 3.404208e-03 ) GeV^0 +TOTAL : 1.000149 sec + 3,377,486,531 cycles:u # 3.292 GHz (74.84%) + 6,684,712 stalled-cycles-frontend:u # 0.20% frontend cycles idle (75.05%) + 924,062,070 stalled-cycles-backend:u # 27.36% backend cycles idle (75.05%) + 7,659,777,132 instructions:u # 2.27 insn per cycle + # 0.12 stalled cycles per insn (75.06%) + 1.027697884 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3055) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.011316e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.180119e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.180119e+06 ) sec^-1 -MeanMatrixElemValue = ( 2.086810e+00 +- 3.414230e-03 ) GeV^0 -TOTAL : 1.123119 sec - 3,188,964,800 cycles # 2.827 GHz - 7,109,363,220 instructions # 2.23 insn per cycle - 1.129107814 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2733) (512y: 13) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288179996423423 -Relative difference = 1.7628858734720142e-10 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028819e+00 +Avg ME (F77/C++) = 2.0288186294492334 +Relative difference = 1.826435805832187e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.275437e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.094175e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.094175e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.528710 sec - 2,977,974,799 cycles # 1.942 GHz - 5,762,709,816 instructions # 1.94 insn per cycle - 1.534960548 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2088) (512y: 20) (512z: 1914) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. 
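For reference, the "Avg ME" comparison lines throughout these logs accept a result when the F77/C++ and C++/C++ averages agree to within 5E-3 in relative terms. The minimal C++ sketch below reproduces that tolerance test; normalizing the difference by the C++ average reproduces the logged figures (e.g. 8.044452636897417e-08 for the 'none' build above), but this is an illustration of the arithmetic, not the actual cmpExe/fcheck.exe implementation.

#include <cmath>
#include <cstdio>

// Minimal sketch (not the repository's actual comparison code) of the
// "Relative difference <= 5E-3" acceptance test printed in these logs.
// ASSUMPTION: the difference is normalized by the C++ average.
bool compareAvgME( double avgMeCpp, double avgMeF77, double tolerance = 5e-3 )
{
  const double relDiff = std::fabs( avgMeF77 - avgMeCpp ) / std::fabs( avgMeCpp );
  std::printf( "Relative difference = %.16e\n", relDiff ); // ~8.04e-08 for the values below
  return relDiff <= tolerance;
}

int main()
{
  // Values copied from the 'none' build log entry above
  if( compareAvgME( 2.028820e+00, 2.0288198367925361 ) )
    std::printf( "OK (relative difference <= 5E-3)\n" );
  return 0;
}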
+/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288183195516467 -Relative difference = 1.5750631496822894e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd1.txt index b3cfffed0a..042e3de501 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd1.txt @@ -1,209 +1,164 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasCurand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) +RNDGEN=hasNoCurand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.512y_f_inl0_hrd1' +CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-01-27_18:32:34 +DATE: 2024-01-28_13:10:50 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/gcheck.exe -p 2048 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/gcheck.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.432312e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.695963e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.031521e+08 ) sec^-1 -MeanMatrixElemValue = ( 2.086718e+00 +- 3.413389e-03 ) GeV^0 -TOTAL : 0.479202 sec - 2,063,006,228 cycles # 2.917 GHz - 2,917,318,985 instructions # 1.41 insn per cycle - 0.776497427 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/gcheck.exe -p 2048 256 1 -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 127 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 9.278205e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.122561e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.307625e+08 ) sec^-1 +MeanMatrixElemValue = ( 2.080169e+00 +- 3.463853e-03 ) GeV^0 +TOTAL : 1.000858 sec + 3,085,382,162 cycles:u # 3.007 GHz (75.37%) + 10,612,277 stalled-cycles-frontend:u # 0.34% frontend cycles idle (75.18%) + 1,158,398,012 stalled-cycles-backend:u # 37.54% backend cycles idle (75.07%) + 2,777,468,294 instructions:u # 0.90 insn per cycle + # 0.42 stalled cycles per insn (75.12%) + 1.053344191 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 2.028811e+00 -Avg ME (F77/CUDA) = 2.0288499749731272 -Relative difference = 1.9210746159747678e-05 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 2.028815e+00 +Avg ME (F77/CUDA) = 2.0288173652952537 +Relative difference = 1.1658506339321586e-06 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/check.exe -p 2048 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.339008e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.417276e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.417276e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 -TOTAL : 4.568333 sec - 13,810,912,026 cycles # 3.020 GHz - 37,479,722,319 instructions # 2.71 insn per cycle - 4.576499144 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 2.960614e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.049279e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.049279e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079573e+00 +- 3.404712e-03 ) GeV^0 +TOTAL : 3.682401 sec + 12,779,950,462 cycles:u # 3.447 GHz (74.97%) + 7,147,335 stalled-cycles-frontend:u # 0.06% frontend cycles idle (74.97%) + 13,342,030 stalled-cycles-backend:u # 0.10% backend cycles idle (74.97%) + 37,437,661,838 instructions:u # 2.93 insn per cycle + # 0.00 stalled cycles per insn (74.98%) + 3.710552831 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 503) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028820e+00 -Avg ME (F77/C++) = 2.0288197983754799 -Relative difference = 9.938019153537065e-08 +Avg ME (F77/C++) = 2.0288198367925361 +Relative difference = 8.044452636897417e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/check.exe -p 2048 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.912563e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.481847e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.481847e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 -TOTAL : 1.857535 sec - 5,469,534,249 cycles # 2.935 GHz - 15,245,119,579 instructions # 2.79 insn per cycle - 1.874458797 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2330) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 7.316599e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.898743e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.898743e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079573e+00 +- 3.404713e-03 ) GeV^0 +TOTAL : 1.573407 sec + 5,370,146,557 cycles:u # 3.358 GHz (74.99%) + 7,049,522 stalled-cycles-frontend:u # 0.13% frontend cycles idle (74.99%) + 1,296,967,325 stalled-cycles-backend:u # 24.15% backend cycles idle (74.99%) + 15,246,844,666 instructions:u # 2.84 insn per cycle + # 0.09 stalled cycles per insn (74.77%) + 1.602465611 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 2334) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028819e+00 -Avg ME (F77/C++) = 2.0288191968575120 -Relative difference = 9.703059369476286e-08 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028820e+00 +Avg ME (F77/C++) = 2.0288198773050681 +Relative difference = 6.047600673895608e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/check.exe -p 2048 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.695444e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.379016e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.379016e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086810e+00 +- 3.414230e-03 ) GeV^0 -TOTAL : 1.651281 sec - 4,710,630,291 cycles # 2.843 GHz - 9,850,049,828 instructions # 2.09 insn per cycle - 1.667735017 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3750) (512y: 0) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288180243223906 -Relative difference = 1.1988453753912676e-08 -OK (relative difference <= 5E-3) +EvtsPerSec[Rmb+ME] (23) = ( 8.871601e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.683658e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.683658e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079551e+00 +- 3.404208e-03 ) GeV^0 +TOTAL : 1.322244 sec + 4,487,840,490 cycles:u # 3.330 GHz (75.03%) + 6,293,982 stalled-cycles-frontend:u # 0.14% frontend cycles idle (75.07%) + 1,662,737,954 stalled-cycles-backend:u # 37.05% backend cycles idle (75.08%) + 9,797,942,949 instructions:u # 2.18 insn per cycle + # 0.17 stalled cycles per insn (75.10%) + 1.351229383 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3734) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/check.exe -p 2048 256 2 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.060982e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.844113e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.844113e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086810e+00 +- 3.414230e-03 ) GeV^0 -TOTAL : 1.570550 sec - 4,488,154,546 cycles # 2.847 GHz - 9,201,957,327 instructions # 2.05 insn per cycle - 1.584001028 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3384) (512y: 0) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288180243223906 -Relative difference = 1.1988453753912676e-08 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028819e+00 +Avg ME (F77/C++) = 2.0288186428369954 +Relative difference = 1.7604478492421832e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/check.exe -p 2048 256 2 OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.363474e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.970097e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.970097e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.734074 sec - 3,450,778,166 cycles # 1.984 GHz - 6,874,943,117 instructions # 1.99 insn per cycle - 1.747504108 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2257) (512y: 8) (512z: 2261) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. 
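The perf-stat blocks in these logs report raw hardware counters (cycles:u, instructions:u, stalled cycles) together with derived ratios. The derived numbers are plain quotients of the raw counters, as the sketch below illustrates using the figures from the 'none' scalar run earlier in this file; small deviations from the logged values (e.g. 3.441 vs 3.443 GHz) are expected because the counters are multiplexed, as the trailing percentages such as "(75.00%)" indicate.

#include <cstdio>

// Sketch of the derived metrics in the perf-stat blocks above: the ratios
// are simple quotients of the raw counters (ASSUMPTION: this mirrors how
// perf itself derives them, up to counter-multiplexing corrections).
int main()
{
  const double cycles         = 12683469123.0; // "cycles:u"
  const double instructions   = 37048878080.0; // "instructions:u"
  const double stalledBackend = 10612246.0;    // "stalled-cycles-backend:u"
  const double elapsedSec     = 3.686174764;   // "seconds time elapsed"
  std::printf( "insn per cycle       = %.2f\n", instructions / cycles );            // ~2.92
  std::printf( "backend cycles idle  = %.2f%%\n", 100.0 * stalledBackend / cycles ); // ~0.08%
  std::printf( "stalled cyc per insn = %.2f\n", stalledBackend / instructions );     // ~0.00
  std::printf( "effective GHz        = %.3f\n", cycles / elapsedSec / 1e9 );         // ~3.44
  return 0;
}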
+/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288183217635378 -Relative difference = 1.5859655131013432e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd0.txt index d13cada649..bf507682ad 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd0.txt @@ -1,209 +1,164 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasCurand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) +RNDGEN=hasNoCurand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.512y_f_inl1_hrd0' +CUDACPP_BUILDDIR='build.avx2_f_inl1_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-01-27_19:12:19 +DATE: 2024-01-28_13:33:04 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/gcheck.exe -p 2048 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/gcheck.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.400890e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.630288e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.950964e+08 ) sec^-1 -MeanMatrixElemValue = ( 2.086718e+00 +- 3.413389e-03 ) GeV^0 -TOTAL : 0.479613 sec - 2,055,675,343 cycles # 2.919 GHz - 2,963,853,527 instructions # 1.44 insn per cycle - 0.762433330 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/gcheck.exe -p 2048 256 1 -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 128 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 9.113239e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.954105e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.119229e+08 ) sec^-1 +MeanMatrixElemValue = ( 2.080169e+00 +- 3.463853e-03 ) GeV^0 +TOTAL : 1.004793 sec + 3,157,964,934 cycles:u # 3.065 GHz (74.05%) + 10,832,495 stalled-cycles-frontend:u # 0.34% frontend cycles idle (74.62%) + 1,159,454,099 stalled-cycles-backend:u # 36.72% backend cycles idle (74.91%) + 2,839,370,366 instructions:u # 0.90 insn per cycle + # 0.41 stalled cycles per insn (75.16%) + 1.054666781 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 2.028811e+00 -Avg ME (F77/CUDA) = 2.0288499749731272 -Relative difference = 1.9210746159747678e-05 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 2.028815e+00 +Avg ME (F77/CUDA) = 2.0288173652952537 +Relative difference = 1.1658506339321586e-06 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/check.exe -p 2048 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.584030e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.679206e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.679206e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 -TOTAL : 4.144569 sec - 12,413,620,811 cycles # 2.992 GHz - 34,216,317,298 instructions # 2.76 insn per cycle - 4.151106330 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 3.217981e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.322339e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.322339e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079573e+00 +- 3.404712e-03 ) GeV^0 +TOTAL : 3.400341 sec + 11,783,062,686 cycles:u # 3.439 GHz (74.97%) + 6,747,931 stalled-cycles-frontend:u # 0.06% frontend cycles idle (75.02%) + 1,691,071,194 stalled-cycles-backend:u # 14.35% backend cycles idle (75.02%) + 34,217,886,896 instructions:u # 2.90 insn per cycle + # 0.05 stalled cycles per insn (75.02%) + 3.428596526 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 768) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028820e+00 Avg ME (F77/C++) = 2.0288199088536203 Relative difference = 4.4925808981097166e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/check.exe -p 2048 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.211764e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.840756e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.840756e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 -TOTAL : 1.773327 sec - 5,361,205,653 cycles # 3.014 GHz - 14,586,960,101 instructions # 2.72 insn per cycle - 1.779952229 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 7.202361e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.763493e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.763493e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079573e+00 +- 3.404713e-03 ) GeV^0 +TOTAL : 1.597461 sec + 5,462,019,287 cycles:u # 3.364 GHz (74.97%) + 7,524,632 stalled-cycles-frontend:u # 0.14% frontend cycles idle (74.88%) + 2,037,874,394 stalled-cycles-backend:u # 37.31% backend cycles idle (74.88%) + 14,603,291,454 instructions:u # 2.67 insn per cycle + # 0.14 stalled cycles per insn (74.92%) + 1.627778417 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2947) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028819e+00 -Avg ME (F77/C++) = 2.0288192580919713 -Relative difference = 1.2721291123071246e-07 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028820e+00 +Avg ME (F77/C++) = 2.0288198769558221 +Relative difference = 6.06481491495597e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/check.exe -p 2048 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.860060e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.831807e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.831807e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086810e+00 +- 3.414230e-03 ) GeV^0 -TOTAL : 1.420044 sec - 4,064,581,884 cycles # 2.853 GHz - 9,088,361,630 instructions # 2.24 insn per cycle - 1.426753997 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4501) (512y: 0) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288180499337614 -Relative difference = 2.4612242975974814e-08 -OK (relative difference <= 5E-3) +EvtsPerSec[Rmb+ME] (23) = ( 9.440783e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.035053e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.035053e+06 ) sec^-1 +MeanMatrixElemValue = ( 2.079551e+00 +- 3.404208e-03 ) GeV^0 +TOTAL : 1.252142 sec + 4,260,468,182 cycles:u # 3.332 GHz (75.02%) + 6,952,166 stalled-cycles-frontend:u # 0.16% frontend cycles idle (74.98%) + 1,642,503,426 stalled-cycles-backend:u # 38.55% backend cycles idle (74.98%) + 9,034,279,045 instructions:u # 2.12 insn per cycle + # 0.18 stalled cycles per insn (74.98%) + 1.281904591 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4485) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd0/check.exe -p 2048 256 2 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.459229e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.605357e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.605357e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086810e+00 +- 3.414230e-03 ) GeV^0 -TOTAL : 1.325444 sec - 3,806,668,537 cycles # 2.860 GHz - 8,440,473,835 instructions # 2.22 insn per cycle - 1.332061166 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4043) (512y: 0) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288180499337614 -Relative difference = 2.4612242975974814e-08 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028819e+00 +Avg ME (F77/C++) = 2.0288186752004549 +Relative difference = 1.6009291367898262e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd0/check.exe -p 2048 256 2 OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.862576e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.375949e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.375949e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.874251 sec - 3,732,915,412 cycles # 1.986 GHz - 7,571,540,074 instructions # 2.03 insn per cycle - 1.881121521 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3646) (512y: 1) (512z: 2853) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd0/runTest.exe -[ PASSED ] 6 tests. 
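On this AMD EPYC host the 512y and 512z binaries are skipped with "is not supported (no avx512vl in /proc/cpuinfo)", as seen just below. A hypothetical sketch of such a runtime guard follows, assuming a plain substring scan of the cpuinfo flags lines; the repository's actual check may be implemented differently.

#include <fstream>
#include <iostream>
#include <string>

// Minimal sketch of the kind of runtime guard behind the
// "is not supported (no avx512vl in /proc/cpuinfo)" messages in these logs.
// ASSUMPTION: a substring search in the cpuinfo "flags" lines is enough;
// every flag in that line is preceded by a space.
bool cpuSupports( const std::string& flag )
{
  std::ifstream cpuinfo( "/proc/cpuinfo" );
  std::string line;
  while( std::getline( cpuinfo, line ) )
    if( line.rfind( "flags", 0 ) == 0 && line.find( " " + flag ) != std::string::npos )
      return true;
  return false;
}

int main()
{
  if( !cpuSupports( "avx512vl" ) )
    std::cout << "check.exe is not supported (no avx512vl in /proc/cpuinfo)\n";
  return 0;
}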
+/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288183350348845 -Relative difference = 1.6513796936156652e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd1.txt index f03dfd549c..13812e3523 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd1.txt @@ -1,209 +1,164 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasCurand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) +RNDGEN=hasNoCurand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.512y_f_inl1_hrd1' +CUDACPP_BUILDDIR='build.avx2_f_inl1_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-01-27_19:12:43 +DATE: 2024-01-28_13:33:19 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/gcheck.exe -p 2048 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/gcheck.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.450874e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.693366e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.019706e+08 ) sec^-1 -MeanMatrixElemValue = ( 2.086718e+00 +- 3.413389e-03 ) GeV^0 -TOTAL : 0.479144 sec - 2,065,328,354 cycles # 2.934 GHz - 2,937,355,449 instructions # 1.42 insn per cycle - 0.762525562 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/gcheck.exe -p 2048 256 1 -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 127 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 9.130970e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.126031e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.311298e+08 ) sec^-1 +MeanMatrixElemValue = ( 2.080169e+00 +- 3.463853e-03 ) GeV^0 +TOTAL : 1.003536 sec + 3,039,083,138 cycles:u # 2.955 GHz (75.12%) + 10,598,068 stalled-cycles-frontend:u # 0.35% frontend cycles idle (75.18%) + 1,141,275,534 stalled-cycles-backend:u # 37.55% backend cycles idle (75.14%) + 2,935,198,243 instructions:u # 0.97 insn per cycle + # 0.39 stalled cycles per insn (75.10%) + 1.053839204 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 2.028811e+00 -Avg ME (F77/CUDA) = 2.0288499749731272 -Relative difference = 1.9210746159747678e-05 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 2.028815e+00 +Avg ME (F77/CUDA) = 2.0288173652952537 +Relative difference = 1.1658506339321586e-06 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/check.exe -p 2048 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.656538e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.759877e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.759877e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 -TOTAL : 4.035436 sec - 11,950,269,072 cycles # 2.958 GHz - 35,406,289,389 instructions # 2.96 insn per cycle - 4.042104786 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 3.441941e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.561536e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.561536e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079573e+00 +- 3.404712e-03 ) GeV^0 +TOTAL : 3.188009 sec + 11,063,045,419 cycles:u # 3.441 GHz (74.77%) + 7,294,655 stalled-cycles-frontend:u # 0.07% frontend cycles idle (74.95%) + 249,077,227 stalled-cycles-backend:u # 2.25% backend cycles idle (75.08%) + 35,391,614,020 instructions:u # 3.20 insn per cycle + # 0.01 stalled cycles per insn (75.13%) + 3.217485797 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 469) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028820e+00 Avg ME (F77/C++) = 2.0288199088536203 Relative difference = 4.4925808981097166e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/check.exe -p 2048 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.593818e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.308688e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.308688e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 -TOTAL : 1.677045 sec - 5,074,113,057 cycles # 3.016 GHz - 14,044,617,920 instructions # 2.77 insn per cycle - 1.683531971 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 7.813144e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.487452e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.487452e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079573e+00 +- 3.404713e-03 ) GeV^0 +TOTAL : 1.483390 sec + 5,089,681,195 cycles:u # 3.371 GHz (74.69%) + 7,194,060 stalled-cycles-frontend:u # 0.14% frontend cycles idle (74.89%) + 1,337,761,295 stalled-cycles-backend:u # 26.28% backend cycles idle (75.10%) + 14,066,053,324 instructions:u # 2.76 insn per cycle + # 0.10 stalled cycles per insn (75.10%) + 1.513530765 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2487) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028819e+00 -Avg ME (F77/C++) = 2.0288192554144189 -Relative difference = 1.2589315209891237e-07 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028820e+00 +Avg ME (F77/C++) = 2.0288198892958462 +Relative difference = 5.4565783974899003e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/check.exe -p 2048 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.984398e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.987290e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.987290e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086810e+00 +- 3.414230e-03 ) GeV^0 -TOTAL : 1.399748 sec - 4,006,669,403 cycles # 2.852 GHz - 8,630,101,532 instructions # 2.15 insn per cycle - 1.406553471 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3422) (512y: 0) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288180815987289 -Relative difference = 4.021983692325164e-08 -OK (relative difference <= 5E-3) +EvtsPerSec[Rmb+ME] (23) = ( 1.018818e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.125746e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.125746e+06 ) sec^-1 +MeanMatrixElemValue = ( 2.079551e+00 +- 3.404208e-03 ) GeV^0 +TOTAL : 1.170469 sec + 3,966,855,118 cycles:u # 3.314 GHz (74.73%) + 6,278,145 stalled-cycles-frontend:u # 0.16% frontend cycles idle (74.66%) + 1,453,930,293 stalled-cycles-backend:u # 36.65% backend cycles idle (74.78%) + 8,622,552,846 instructions:u # 2.17 insn per cycle + # 0.17 stalled cycles per insn (75.11%) + 1.200791899 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3406) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd1/check.exe -p 2048 256 2 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.705281e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.911157e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.911157e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086810e+00 +- 3.414230e-03 ) GeV^0 -TOTAL : 1.292210 sec - 3,693,940,962 cycles # 2.847 GHz - 8,100,406,211 instructions # 2.19 insn per cycle - 1.298750765 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3105) (512y: 0) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288180815987289 -Relative difference = 4.021983692325164e-08 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028819e+00 +Avg ME (F77/C++) = 2.0288186836987734 +Relative difference = 1.559041129563128e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd1/check.exe -p 2048 256 2 OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.984579e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.516367e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.516367e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.839178 sec - 3,581,181,804 cycles # 1.942 GHz - 7,373,471,148 instructions # 2.06 insn per cycle - 1.845549534 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2803) (512y: 1) (512z: 2230) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd1/runTest.exe -[ PASSED ] 6 tests. 
+/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288183569209650 -Relative difference = 1.7592557106041962e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt index f40e579459..0930c22334 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt @@ -1,209 +1,164 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasCurand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) +RNDGEN=hasNoCurand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-01-27_18:32:59 +DATE: 2024-01-28_13:11:05 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/gcheck.exe -p 2048 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/gcheck.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.534144e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.154422e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.270192e+08 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.530968 sec - 2,216,247,658 cycles # 2.886 GHz - 3,159,617,774 instructions # 1.43 insn per cycle - 0.841223938 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/gcheck.exe -p 2048 256 1 -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 2.814549e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.027041e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.083024e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 1.055290 sec + 3,235,603,215 cycles:u # 2.990 GHz (74.57%) + 10,735,758 stalled-cycles-frontend:u # 0.33% frontend cycles idle (75.04%) + 1,166,700,197 stalled-cycles-backend:u # 36.06% backend cycles idle (75.03%) + 2,957,826,988 instructions:u # 0.91 insn per cycle + # 0.39 stalled cycles per insn (74.81%) + 1.108487534 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 2.028807e+00 -Avg ME (F77/CUDA) = 2.0288063423243874 -Relative difference = 3.241686432649386e-07 +Avg ME (F77/CUDA) = 2.0288063423243869 +Relative difference = 3.241686434838304e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/check.exe -p 2048 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.131579e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.194174e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.194174e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 5.026343 sec - 15,226,407,193 cycles # 3.026 GHz - 39,292,878,837 instructions # 2.58 insn per cycle - 5.034989309 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 2.475956e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.539042e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.539042e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 4.410605 sec + 15,235,239,464 cycles:u # 3.432 GHz (74.95%) + 8,950,019 stalled-cycles-frontend:u # 0.06% frontend cycles idle (74.95%) + 122,481,296 stalled-cycles-backend:u # 0.80% backend cycles idle (74.97%) + 39,339,585,883 instructions:u # 2.58 insn per cycle + # 0.00 stalled cycles per insn (74.97%) + 4.441724700 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 740) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063903750300 Relative difference = 3.0048445715164216e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check.exe -p 2048 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.716282e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.924245e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.924245e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.928277 sec - 8,842,003,385 cycles # 3.013 GHz - 24,093,000,203 instructions # 2.72 insn per cycle - 2.944933034 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 4.568089e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.801322e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.801322e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 2.470309 sec + 8,452,701,924 cycles:u # 3.382 GHz (74.89%) + 8,675,090 stalled-cycles-frontend:u # 0.10% frontend cycles idle (75.01%) + 888,546,333 stalled-cycles-backend:u # 10.51% backend cycles idle (75.03%) + 24,040,285,320 instructions:u # 2.84 insn per cycle + # 0.04 stalled cycles per insn (75.04%) + 2.502796253 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2102) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063903750300 Relative difference = 3.0048445715164216e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check.exe -p 2048 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.756859e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.271313e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.271313e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.927658 sec - 5,479,999,914 cycles # 2.834 GHz - 11,449,005,560 instructions # 2.09 insn per cycle - 1.943367878 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2467) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 7.862178e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.488513e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.488513e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 1.509496 sec + 5,043,612,791 cycles:u # 3.278 GHz (75.04%) + 8,437,371 stalled-cycles-frontend:u # 0.17% frontend cycles idle (75.04%) + 464,861,661 stalled-cycles-backend:u # 9.22% backend cycles idle (75.05%) + 11,420,957,693 instructions:u # 2.26 insn per cycle + # 0.04 stalled cycles per insn (75.07%) + 1.542209096 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2451) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063930599014 Relative difference = 2.9916108265801754e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/check.exe -p 2048 256 2 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.680868e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.379667e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.379667e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.676141 sec - 4,795,916,206 cycles # 2.851 GHz - 10,317,620,181 instructions # 2.15 insn per cycle - 1.692913460 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2076) (512y: 133) (512z: 0) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063930599014 -Relative difference = 2.9916108265801754e-07 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/check.exe -p 2048 256 2 OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.384618e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.665609e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.665609e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.498959 sec - 4,846,181,062 cycles # 1.935 GHz - 7,366,355,573 instructions # 1.52 insn per cycle - 2.513668299 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1366) (512y: 69) (512z: 1611) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. 
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063930599014 -Relative difference = 2.9916108265801754e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd1.txt index d579c4f0fa..a8013babf3 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd1.txt @@ -1,209 +1,164 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasCurand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) +RNDGEN=hasNoCurand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.512y_m_inl0_hrd1' +CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-01-27_18:33:26 +DATE: 2024-01-28_13:11:22 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/gcheck.exe -p 2048 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/gcheck.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = HIP:MIX+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.549566e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.157441e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.275364e+08 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.529385 sec - 2,259,213,903 cycles # 2.923 GHz - 3,197,943,416 instructions # 1.42 insn per cycle - 0.839811434 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/gcheck.exe -p 2048 256 1 -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 208 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 2.791588e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.919510e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.972449e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 1.055156 sec + 3,238,130,925 cycles:u # 2.989 GHz (75.02%) + 10,839,877 stalled-cycles-frontend:u # 0.33% frontend cycles idle (74.96%) + 1,162,562,236 stalled-cycles-backend:u # 35.90% backend cycles idle (74.84%) + 3,024,381,388 instructions:u # 0.93 insn per cycle + # 0.38 stalled cycles per insn (74.94%) + 1.105185059 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/fgcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 2.028807e+00 -Avg ME (F77/CUDA) = 2.0288063423243874 -Relative difference = 3.241686432649386e-07 +Avg ME (F77/CUDA) = 2.0288063423243869 +Relative difference = 3.241686434838304e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/check.exe -p 2048 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.148050e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.211542e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.211542e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 4.988633 sec - 15,083,182,010 cycles # 3.021 GHz - 40,116,301,091 instructions # 2.66 insn per cycle - 4.997454221 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 2.428646e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.488494e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.488494e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 4.492644 sec + 15,561,820,164 cycles:u # 3.440 GHz (74.89%) + 9,743,080 stalled-cycles-frontend:u # 0.06% frontend cycles idle (74.90%) + 22,856,560 stalled-cycles-backend:u # 0.15% backend cycles idle (74.99%) + 40,034,760,935 instructions:u # 2.57 insn per cycle + # 0.00 stalled cycles per insn (75.07%) + 4.526018788 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 630) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063903750300 Relative difference = 3.0048445715164216e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/check.exe -p 2048 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.806171e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.025471e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.025471e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.859908 sec - 8,680,023,988 cycles # 3.031 GHz - 23,533,588,214 instructions # 2.71 insn per cycle - 2.872184850 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 4.507612e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.734439e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.734439e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 2.501251 sec + 8,526,186,614 cycles:u # 3.370 GHz (75.02%) + 9,200,816 stalled-cycles-frontend:u # 0.11% frontend cycles idle (75.02%) + 688,799,308 stalled-cycles-backend:u # 8.08% backend cycles idle (75.03%) + 23,521,583,317 instructions:u # 2.76 insn per cycle + # 0.03 stalled cycles per insn (74.89%) + 2.534137739 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1993) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063903750300 Relative difference = 3.0048445715164216e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/check.exe -p 2048 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.076921e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.467987e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.467987e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.172334 sec - 6,181,256,235 cycles # 2.838 GHz - 13,103,124,082 instructions # 2.12 insn per cycle - 2.189317684 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2711) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 6.849652e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.322321e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.322321e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 1.706554 sec + 5,746,389,955 cycles:u # 3.311 GHz (74.90%) + 8,537,946 stalled-cycles-frontend:u # 0.15% frontend cycles idle (75.11%) + 832,559,663 stalled-cycles-backend:u # 14.49% backend cycles idle (75.12%) + 13,045,445,119 instructions:u # 2.27 insn per cycle + # 0.06 stalled cycles per insn (75.12%) + 1.739374039 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2695) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063930599014 Relative difference = 2.9916108265801754e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/check.exe -p 2048 256 2 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.497425e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.959043e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.959043e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.013879 sec - 5,750,603,441 cycles # 2.847 GHz - 12,210,900,339 instructions # 2.12 insn per cycle - 2.027278919 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2201) (512y: 282) (512z: 0) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063930599014 -Relative difference = 2.9916108265801754e-07 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/check.exe -p 2048 256 2 OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.050220e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.288891e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.288891e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.695806 sec - 5,252,673,350 cycles # 1.944 GHz - 8,448,932,017 instructions # 1.61 insn per cycle - 2.713329625 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1324) (512y: 84) (512z: 1919) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. 
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063930599014 -Relative difference = 2.9916108265801754e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt index 7a5cc5c1da..03468ae98d 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt @@ -1,223 +1,181 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasCurand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) +RNDGEN=hasNoCurand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2024-01-27_18:33:54 +DATE: 2024-01-28_13:11:39 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 10 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.710988e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.043633e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.058136e+07 ) sec^-1 -MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 0.465052 sec - 2,015,443,684 cycles # 2.917 GHz - 2,859,484,489 instructions # 1.42 insn per cycle - 0.769131816 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 1.909982e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.079054e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.084818e+06 ) sec^-1 +MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 +TOTAL : 0.529315 sec + 1,559,559,865 cycles:u # 2.820 GHz (76.06%) + 8,418,204 stalled-cycles-frontend:u # 0.54% frontend cycles idle (75.56%) + 291,021,945 stalled-cycles-backend:u # 18.66% backend cycles idle (75.65%) + 1,867,270,999 instructions:u # 1.20 insn per cycle + # 0.16 stalled cycles per insn (75.83%) + 0.572500015 seconds time elapsed ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.082561e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.322561e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.336982e+07 ) sec^-1 -MeanMatrixElemValue = ( 6.734461e+02 +- 4.775415e+02 ) GeV^-2 -TOTAL : 0.605622 sec - 2,468,411,647 cycles # 2.919 GHz - 3,739,161,994 instructions # 1.51 insn per cycle - 0.904488043 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 3.605181e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.840624e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.846217e+06 ) sec^-1 +MeanMatrixElemValue = ( 2.948724e+03 +- 1.840727e+03 ) GeV^-2 +TOTAL : 1.133892 sec + 3,465,794,559 cycles:u # 2.981 GHz (75.14%) + 21,055,335 stalled-cycles-frontend:u # 0.61% frontend cycles idle (75.22%) + 852,829,869 stalled-cycles-backend:u # 24.61% backend cycles idle (75.30%) + 3,181,983,608 instructions:u # 0.92 insn per cycle + # 0.27 stalled cycles per insn (75.22%) + 1.184714650 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.413122e+00 -Avg ME (F77/CUDA) = 1.4131213684418649 -Relative difference = 4.469239988637851e-07 +Avg ME (F77/CUDA) = 1.4131213684418642 +Relative difference = 4.4692399933517674e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.537137e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.550087e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.550087e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 6.481814 sec - 19,510,102,704 cycles # 3.008 GHz - 57,920,963,917 instructions # 2.97 insn per cycle - 6.498270794 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 2.953719e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.966067e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.966067e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 +TOTAL : 5.570588 sec + 19,578,003,721 cycles:u # 3.501 GHz (74.96%) + 2,464,679 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.96%) + 3,426,051,752 stalled-cycles-backend:u # 17.50% backend cycles idle (74.96%) + 57,962,345,246 instructions:u # 2.96 insn per cycle + # 0.06 stalled cycles per insn (74.92%) + 5.594468757 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1134) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213684432431 Relative difference = 4.4692302355460254e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.882301e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.929920e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.929920e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 3.379513 sec - 10,206,931,044 cycles # 3.016 GHz - 29,943,639,229 instructions # 2.93 insn per cycle - 3.392509806 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 6.034955e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.085881e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.085881e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 +TOTAL : 2.738430 sec + 9,645,335,215 cycles:u # 3.494 GHz (74.88%) + 2,693,840 stalled-cycles-frontend:u # 0.03% frontend cycles idle (75.00%) + 2,377,670,357 stalled-cycles-backend:u # 24.65% backend cycles idle (75.08%) + 29,965,191,092 instructions:u # 3.11 insn per cycle + # 0.08 stalled cycles per insn (75.08%) + 2.769354637 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 4742) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213684432433 Relative difference = 4.46923023397472e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.552526e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.739093e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.739093e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.738955 sec - 4,925,861,013 cycles # 2.825 GHz - 11,211,258,499 instructions # 2.28 insn per cycle - 1.750398674 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4396) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.230860e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.251937e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.251937e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 +TOTAL : 1.354434 sec + 4,773,116,973 cycles:u # 3.469 GHz (75.00%) + 2,265,734 stalled-cycles-frontend:u # 0.05% frontend cycles idle (75.00%) + 1,485,397,434 stalled-cycles-backend:u # 31.12% backend cycles idle (75.01%) + 11,220,229,221 instructions:u # 2.35 insn per cycle + # 0.13 stalled cycles per insn (75.01%) + 1.379111913 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4378) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213684416484 Relative difference = 4.469241520660492e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/check.exe -p 64 256 10 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.094289e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.118334e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.118334e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.520671 sec - 4,316,683,210 cycles # 2.832 GHz - 10,188,546,360 instructions # 2.36 insn per cycle - 1.533592671 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3895) (512y: 81) (512z: 0) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.413122e+00 -Avg ME (F77/C++) = 1.4131213684416484 -Relative difference = 4.469241520660492e-07 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/check.exe -p 64 256 10 OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.872940e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.000632e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.000632e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 2.107587 sec - 3,916,109,587 cycles # 1.854 GHz - 5,709,747,998 instructions # 1.46 insn per cycle - 2.121465703 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1258) (512y: 74) (512z: 3396) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. 
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.413122e+00 -Avg ME (F77/C++) = 1.4131213684416484 -Relative difference = 4.469241520660492e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0_bridge.txt index 26e40e50e6..ccdc768c3e 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0_bridge.txt @@ -1,240 +1,190 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasCurand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) +RNDGEN=hasNoCurand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2024-01-27_19:22:46 +DATE: 2024-01-28_13:46:27 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 10 --bridge OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.570600e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.729427e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.729427e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 0.497423 sec - 2,072,640,631 cycles # 2.921 GHz - 3,121,041,261 instructions # 1.51 insn per cycle - 0.767886427 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost -WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 1.491045e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.013681e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.013681e+06 ) sec^-1 +MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 +TOTAL : 0.559439 sec + 1,647,632,431 cycles:u # 2.822 GHz (75.24%) + 9,693,930 stalled-cycles-frontend:u # 0.59% frontend cycles idle (75.34%) + 268,139,323 stalled-cycles-backend:u # 16.27% backend cycles idle (75.81%) + 2,067,531,007 instructions:u # 1.25 insn per cycle + # 0.13 stalled cycles per insn (75.35%) + 0.606453683 seconds time elapsed ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.687406e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.440630e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.440630e+06 ) sec^-1 -MeanMatrixElemValue = ( 6.734461e+02 +- 4.775415e+02 ) GeV^-2 -TOTAL : 0.831609 sec - 3,177,887,444 cycles # 2.935 GHz - 5,073,989,018 instructions # 1.60 insn per cycle - 1.143844413 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 2.195881e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.673576e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.673576e+06 ) sec^-1 +MeanMatrixElemValue = ( 2.948724e+03 +- 1.840727e+03 ) GeV^-2 +TOTAL : 1.261679 sec + 3,823,358,268 cycles:u # 2.940 GHz (74.73%) + 29,374,830 stalled-cycles-frontend:u # 0.77% frontend cycles idle (74.85%) + 865,565,211 stalled-cycles-backend:u # 22.64% backend cycles idle (75.10%) + 3,922,189,768 instructions:u # 1.03 insn per cycle + # 0.22 stalled cycles per insn (75.36%) + 1.320853639 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe 
/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.413122e+00 -Avg ME (F77/CUDA) = 1.4131213684418649 -Relative difference = 4.469239988637851e-07 +Avg ME (F77/CUDA) = 1.4131213684418642 +Relative difference = 4.4692399933517674e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.536676e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.549688e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.549688e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 6.489507 sec - 19,599,964,283 cycles # 3.018 GHz - 57,925,734,957 instructions # 2.96 insn per cycle - 6.495090644 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 2.938839e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.951076e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.951076e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 +TOTAL : 5.602477 sec + 19,673,576,110 cycles:u # 3.498 GHz (74.97%) + 2,098,950 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.97%) + 3,431,015,726 stalled-cycles-backend:u # 17.44% backend cycles idle (74.97%) + 57,895,900,388 instructions:u # 2.94 insn per cycle + # 0.06 stalled cycles per insn (74.98%) + 5.626300208 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1134) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213684432431 Relative difference = 4.4692302355460254e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.771475e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.819214e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.819214e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 3.464872 sec - 10,246,076,982 cycles # 2.953 GHz - 29,991,327,277 instructions # 2.93 insn per cycle - 3.470312959 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 6.032220e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.083502e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.083502e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 +TOTAL : 2.743979 sec + 9,644,824,281 cycles:u # 3.487 GHz (74.86%) + 2,560,504 stalled-cycles-frontend:u # 0.03% frontend cycles idle (74.92%) + 2,342,809,521 stalled-cycles-backend:u # 24.29% backend cycles idle (75.05%) + 29,999,847,851 instructions:u # 3.11 insn per cycle + # 0.08 stalled cycles per insn (75.13%) + 2.769342677 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 4742) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213684432433 Relative difference = 4.46923023397472e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.481732e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.670422e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.670422e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.761036 sec - 4,976,542,493 cycles # 2.821 GHz - 11,262,030,802 instructions # 2.26 insn per cycle - 1.766514699 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4396) (512y: 0) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. 
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.413122e+00 -Avg ME (F77/C++) = 1.4131213684416484 -Relative difference = 4.469241520660492e-07 -OK (relative difference <= 5E-3) +EvtsPerSec[Rmb+ME] (23) = ( 1.237326e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.258636e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.258636e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 +TOTAL : 1.351956 sec + 4,773,967,453 cycles:u # 3.475 GHz (75.05%) + 2,106,133 stalled-cycles-frontend:u # 0.04% frontend cycles idle (74.97%) + 1,427,762,785 stalled-cycles-backend:u # 29.91% backend cycles idle (74.97%) + 11,253,219,332 instructions:u # 2.36 insn per cycle + # 0.13 stalled cycles per insn (74.97%) + 1.377175051 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4378) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.087761e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.112018e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.112018e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.537342 sec - 4,353,684,327 cycles # 2.825 GHz - 10,235,838,456 instructions # 2.35 insn per cycle - 1.542582778 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3895) (512y: 81) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213684416484 Relative difference = 4.469241520660492e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.820472e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.946042e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.946042e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 2.130009 sec - 3,961,323,367 cycles # 1.856 GHz - 5,747,731,832 instructions # 1.45 insn per cycle - 2.135393810 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1258) (512y: 74) (512z: 3396) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. 
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.413122e+00 -Avg ME (F77/C++) = 1.4131213684416484 -Relative difference = 4.469241520660492e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd1.txt index 82f8f0c137..817e65b1a1 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd1.txt @@ -1,223 +1,181 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasCurand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) +RNDGEN=hasNoCurand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.512y_d_inl0_hrd1' +CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.none_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2024-01-27_18:34:24 +DATE: 2024-01-28_13:11:59 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/gcheck.exe -p 64 256 10 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/gcheck.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.648982e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.034609e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.048969e+07 ) sec^-1 -MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 0.469943 sec - 1,939,557,622 cycles # 2.824 GHz - 2,799,432,511 instructions # 1.44 insn per cycle - 0.759940517 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/gcheck.exe -p 64 256 1 -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 1.918580e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.072340e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.077515e+06 ) sec^-1 +MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 +TOTAL : 0.526241 sec + 1,548,968,841 cycles:u # 2.807 GHz (75.22%) + 8,147,567 stalled-cycles-frontend:u # 0.53% frontend cycles idle (75.22%) + 291,697,457 stalled-cycles-backend:u # 18.83% backend cycles idle (75.50%) + 1,830,421,358 instructions:u # 1.18 insn per cycle + # 0.16 stalled cycles per insn (75.63%) + 0.571689310 seconds time elapsed ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.073086e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.309469e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.323399e+07 ) sec^-1 -MeanMatrixElemValue = ( 6.734461e+02 +- 4.775415e+02 ) GeV^-2 -TOTAL : 0.603676 sec - 2,470,216,480 cycles # 2.932 GHz - 3,659,633,330 instructions # 1.48 insn per cycle - 0.901834911 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 3.536043e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.811270e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.816262e+06 ) sec^-1 +MeanMatrixElemValue = ( 2.948724e+03 +- 1.840727e+03 ) GeV^-2 +TOTAL : 1.126313 sec + 3,458,121,082 cycles:u # 2.996 GHz (74.98%) + 21,160,316 stalled-cycles-frontend:u # 0.61% frontend cycles idle (75.19%) + 854,292,971 stalled-cycles-backend:u # 24.70% backend cycles idle (75.11%) + 3,174,894,361 instructions:u # 0.92 insn per cycle + # 0.27 stalled cycles per insn (75.06%) + 1.176797069 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/fgcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.413122e+00 -Avg ME (F77/CUDA) = 1.4131213684418649 -Relative difference = 4.469239988637851e-07 +Avg ME (F77/CUDA) = 1.4131213684418642 +Relative difference = 4.4692399933517674e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.558200e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.571366e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.571366e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 6.427967 sec - 19,465,588,306 cycles # 3.027 GHz - 57,749,577,513 instructions # 2.97 insn per cycle - 6.435332564 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 2.924382e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.936482e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.936482e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 +TOTAL : 5.625968 sec + 19,768,241,213 cycles:u # 3.501 GHz (74.93%) + 2,655,591 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.94%) + 3,029,751,102 stalled-cycles-backend:u # 15.33% backend cycles idle (74.96%) + 57,778,170,108 instructions:u # 2.92 insn per cycle + # 0.05 stalled cycles per insn (75.03%) + 5.649495914 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1087) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213684432431 Relative difference = 4.4692302355460254e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.855358e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.902213e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.902213e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 3.397645 sec - 10,269,894,933 cycles # 3.019 GHz - 30,334,003,333 instructions # 2.95 insn per cycle - 3.409134445 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 5.953726e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.003371e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.003371e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 +TOTAL : 2.775191 sec + 9,776,032,539 cycles:u # 3.496 GHz (74.88%) + 2,389,136 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.84%) + 2,294,856,841 stalled-cycles-backend:u # 23.47% backend cycles idle (74.89%) + 30,399,307,141 instructions:u # 3.11 insn per cycle + # 0.08 stalled cycles per insn (75.02%) + 2.800182646 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 4806) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213684432433 Relative difference = 4.46923023397472e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.024480e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.196412e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.196412e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.839038 sec - 5,070,636,702 cycles # 2.750 GHz - 11,664,223,400 instructions # 2.30 insn per cycle - 1.852089106 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4489) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.198423e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.218357e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.218357e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 +TOTAL : 1.390357 sec + 4,931,396,987 cycles:u # 3.490 GHz (74.79%) + 2,134,488 stalled-cycles-frontend:u # 0.04% frontend cycles idle (75.01%) + 1,682,834,199 stalled-cycles-backend:u # 34.12% backend cycles idle (75.09%) + 11,671,002,802 instructions:u # 2.37 insn per cycle + # 0.14 stalled cycles per insn (75.09%) + 1.426396900 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4471) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213684416484 Relative difference = 4.469241520660492e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd1/check.exe -p 64 256 10 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.018318e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.039419e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.039419e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.632266 sec - 4,623,527,919 cycles # 2.824 GHz - 10,805,823,321 instructions # 2.34 insn per cycle - 1.647095967 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3988) (512y: 237) (512z: 0) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.413122e+00 -Avg ME (F77/C++) = 1.4131213684416484 -Relative difference = 4.469241520660492e-07 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd1/check.exe -p 64 256 10 OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.787268e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.911321e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.911321e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 2.129389 sec - 3,963,647,711 cycles # 1.858 GHz - 5,999,337,334 instructions # 1.51 insn per cycle - 2.148549810 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1241) (512y: 81) (512z: 3500) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. 
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.413122e+00 -Avg ME (F77/C++) = 1.4131213684416484 -Relative difference = 4.469241520660492e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt index dbb3bf021d..abb9800c5e 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt @@ -1,223 +1,181 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasCurand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) +RNDGEN=hasNoCurand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2024-01-27_18:34:54 +DATE: 2024-01-28_13:12:18 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 10 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.479786e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.361442e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.464103e+07 ) sec^-1 -MeanMatrixElemValue = ( 1.008472e+02 +- 5.002447e+01 ) GeV^-2 -TOTAL : 0.448422 sec - 1,932,993,574 cycles # 2.916 GHz - 2,719,457,311 instructions # 1.41 insn per cycle - 0.741279046 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 254 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 6.363151e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.537668e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.622058e+06 ) sec^-1 +MeanMatrixElemValue = ( 5.334114e+02 +- 3.089427e+02 ) GeV^-2 +TOTAL : 0.462168 sec + 1,368,322,800 cycles:u # 2.802 GHz (73.26%) + 8,194,242 stalled-cycles-frontend:u # 0.60% frontend cycles idle (74.37%) + 276,189,769 stalled-cycles-backend:u # 20.18% backend cycles idle (76.22%) + 1,676,657,905 instructions:u # 1.23 insn per cycle + # 0.16 stalled cycles per insn (75.66%) + 0.509375607 seconds time elapsed ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.217603e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.410482e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.500434e+07 ) sec^-1 -MeanMatrixElemValue = ( 6.630099e+02 +- 4.770719e+02 ) GeV^-2 -TOTAL : 0.496288 sec - 2,128,992,550 cycles # 2.942 GHz - 3,069,969,108 instructions # 1.44 insn per cycle - 0.782011398 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 1.327882e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.632793e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.638488e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.954952e+03 +- 1.880090e+03 ) GeV^-2 +TOTAL : 0.950410 sec + 2,876,862,724 cycles:u # 2.948 GHz (75.32%) + 21,132,083 stalled-cycles-frontend:u # 0.73% frontend cycles idle (75.41%) + 855,313,895 stalled-cycles-backend:u # 29.73% backend cycles idle (75.01%) + 2,764,167,270 instructions:u # 0.96 insn per cycle + # 0.31 stalled cycles per insn (75.09%) + 0.998013250 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 1.412608e+00 -Avg ME (F77/CUDA) = 1.4132214346515752 -Relative difference = 0.00043425681546129636 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 1.412404e+00 +Avg ME (F77/CUDA) = 1.4131669530965212 +Relative difference = 0.0005401804983001964 OK (relative difference <= 5E-3) ========================================================================= -runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.704875e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.719869e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.719869e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2 -TOTAL : 6.082847 sec - 18,191,693,462 cycles # 2.990 GHz - 55,241,232,857 instructions # 3.04 insn per cycle - 6.090107876 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 3.253792e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.269267e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.269267e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.724764e+02 +- 2.665343e+02 ) GeV^-2 +TOTAL : 5.056922 sec + 17,775,574,452 cycles:u # 3.501 GHz (74.95%) + 2,522,383 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.95%) + 3,672,872,120 stalled-cycles-backend:u # 20.66% backend cycles idle (74.95%) + 55,277,772,068 instructions:u # 3.11 insn per cycle + # 0.07 stalled cycles per insn (74.97%) + 5.080122152 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1229) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.412998e+00 -Avg ME (F77/C++) = 1.4129977771372637 -Relative difference = 1.5772332039074602e-07 +Avg ME (F77/C++) = 1.4129978146120550 +Relative difference = 1.3120184529301602e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.792245e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.953433e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.953433e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2 -TOTAL : 1.884910 sec - 5,695,859,688 cycles # 3.015 GHz - 16,129,136,780 instructions # 2.83 insn per cycle - 1.899358612 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 1.086183e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.103464e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.103464e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.724763e+02 +- 2.665342e+02 ) GeV^-2 +TOTAL : 1.529604 sec + 5,398,274,120 cycles:u # 3.481 GHz (74.73%) + 2,212,906 stalled-cycles-frontend:u # 0.04% frontend cycles idle (74.76%) + 1,663,514,226 stalled-cycles-backend:u # 30.82% backend cycles idle (75.00%) + 16,168,806,091 instructions:u # 3.00 insn per cycle + # 0.10 stalled cycles per insn (75.23%) + 1.554011097 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 5205) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.412986e+00 -Avg ME (F77/C++) = 1.4129864902818952 -Relative difference = 3.469828399449743e-07 +Avg ME (F77/C++) = 1.4129857118325333 +Relative difference = 2.039421953066926e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.841113e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.909434e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.909434e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.008855e+02 +- 5.002467e+01 ) GeV^-2 -TOTAL : 0.911679 sec - 2,595,428,617 cycles # 2.832 GHz - 6,086,094,913 instructions # 2.34 insn per cycle - 0.926601135 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4878) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.367753e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.448576e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.448576e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.743733e+02 +- 2.676611e+02 ) GeV^-2 +TOTAL : 0.713114 sec + 2,528,265,935 cycles:u # 3.443 GHz (74.95%) + 1,705,410 stalled-cycles-frontend:u # 0.07% frontend cycles idle (74.95%) + 824,805,734 stalled-cycles-backend:u # 32.62% backend cycles idle (74.96%) + 6,091,644,676 instructions:u # 2.41 insn per cycle + # 0.14 stalled cycles per insn (74.96%) + 0.737720577 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4860) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413316e+00 -Avg ME (F77/C++) = 1.4133158486847037 -Relative difference = 1.0706402269051248e-07 +Avg ME (F77/C++) = 1.4133162680784324 +Relative difference = 1.896804623606238e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/check.exe -p 64 256 10 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.076614e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.164725e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.164725e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.008855e+02 +- 5.002467e+01 ) GeV^-2 -TOTAL : 0.810804 sec - 2,297,569,115 cycles # 2.817 GHz - 5,552,672,846 instructions # 2.42 insn per cycle - 0.829591110 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4415) (512y: 30) (512z: 0) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.413316e+00 -Avg ME (F77/C++) = 1.4133158486847037 -Relative difference = 1.0706402269051248e-07 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/check.exe -p 64 256 10 OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.552666e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.602558e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.602558e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.008856e+02 +- 5.002468e+01 ) GeV^-2 -TOTAL : 1.079184 sec - 2,022,265,688 cycles # 1.866 GHz - 3,286,565,352 instructions # 1.63 insn per cycle - 1.092911477 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1905) (512y: 28) (512z: 3597) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. 
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.413316e+00 -Avg ME (F77/C++) = 1.4133164031689205 -Relative difference = 2.852645271622733e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0_bridge.txt index 9e8745f87b..05cbe02364 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0_bridge.txt @@ -1,240 +1,190 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasCurand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) +RNDGEN=hasNoCurand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2024-01-27_19:23:16 +DATE: 2024-01-28_13:46:46 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 10 --bridge OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.995078e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.153881e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.153881e+07 ) sec^-1 -MeanMatrixElemValue = ( 1.009071e+02 +- 5.002295e+01 ) GeV^-2 -TOTAL : 0.457490 sec - 1,962,472,775 cycles # 2.931 GHz - 2,879,319,483 instructions # 1.47 insn per cycle - 0.727241404 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost -WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -==PROF== Profiling "sigmaKin": launch__registers_per_thread 254 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 3.264091e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.629857e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.629857e+06 ) sec^-1 +MeanMatrixElemValue = ( 4.755516e+02 +- 2.671055e+02 ) GeV^-2 +TOTAL : 0.493262 sec + 1,427,000,492 cycles:u # 2.751 GHz (74.50%) + 11,062,734 stalled-cycles-frontend:u # 0.78% frontend cycles idle (74.75%) + 288,331,422 stalled-cycles-backend:u # 20.21% backend cycles idle (75.47%) + 1,941,450,911 instructions:u # 1.36 insn per cycle + # 0.15 stalled cycles per insn (75.42%) + 0.537422932 seconds time elapsed ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.577440e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.549481e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.549481e+07 ) sec^-1 -MeanMatrixElemValue = ( 6.737500e+02 +- 4.776370e+02 ) GeV^-2 -TOTAL : 0.647454 sec - 2,501,078,986 cycles # 2.832 GHz - 3,834,912,584 instructions # 1.53 insn per cycle - 0.940994508 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 4.123568e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.467201e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.467201e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.855934e+03 +- 1.791981e+03 ) GeV^-2 +TOTAL : 1.059350 sec + 3,243,958,406 cycles:u # 2.970 GHz (74.92%) + 29,201,045 stalled-cycles-frontend:u # 0.90% frontend cycles idle (75.16%) + 860,374,227 stalled-cycles-backend:u # 26.52% backend cycles idle (75.17%) + 3,453,925,859 instructions:u # 1.06 insn per cycle + # 0.25 stalled cycles per insn (75.42%) + 1.113397605 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 1.412608e+00 -Avg ME (F77/CUDA) = 1.4132214346515752 
-Relative difference = 0.00043425681546129636 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 1.412404e+00 +Avg ME (F77/CUDA) = 1.4131669530965212 +Relative difference = 0.0005401804983001964 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.735414e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.750975e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.750975e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2 -TOTAL : 6.016121 sec - 18,199,864,686 cycles # 3.023 GHz - 55,241,753,508 instructions # 3.04 insn per cycle - 6.021157634 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 3.235486e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.250776e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.250776e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.724764e+02 +- 2.665343e+02 ) GeV^-2 +TOTAL : 5.087470 sec + 17,871,733,886 cycles:u # 3.498 GHz (74.95%) + 2,549,807 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.95%) + 3,700,745,689 stalled-cycles-backend:u # 20.71% backend cycles idle (74.96%) + 55,318,123,763 instructions:u # 3.10 insn per cycle + # 0.07 stalled cycles per insn (74.98%) + 5.111017597 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1229) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
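[Editor's illustration] The cmpExe checks throughout these logs all apply one acceptance rule: take the average matrix element from the C++/CUDA/HIP run and from the Fortran (F77) run, form their relative difference, and pass if it does not exceed 5E-3. A minimal self-contained sketch of that rule follows (compareAvgME is a hypothetical helper, not the repository's actual cmpExe/fcheck code; the input values are copied from the log lines above):

    // Sketch of the acceptance rule behind "OK (relative difference <= 5E-3)".
    #include <cmath>
    #include <cstdio>

    bool compareAvgME( double avgMeCpp, double avgMeF77, double tol = 5e-3 )
    {
      // Relative difference of the F77 average ME with respect to the C++ one
      const double relDiff = std::fabs( avgMeF77 / avgMeCpp - 1. );
      std::printf( "Relative difference = %.16g\n", relDiff );
      const bool ok = ( relDiff <= tol );
      std::printf( ok ? "OK (relative difference <= 5E-3)\n"
                      : "ERROR (relative difference > 5E-3)\n" );
      return ok;
    }

    int main()
    {
      // Avg ME (C++/C++) and Avg ME (F77/C++) from the none_f_inl0_hrd0 run above:
      // reproduces "Relative difference = 1.3120184529301602e-07"
      return compareAvgME( 1.412998e+00, 1.4129978146120550 ) ? 0 : 1;
    }
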
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.412998e+00 -Avg ME (F77/C++) = 1.4129977771372637 -Relative difference = 1.5772332039074602e-07 +Avg ME (F77/C++) = 1.4129978146120550 +Relative difference = 1.3120184529301602e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.783564e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.948787e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.948787e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2 -TOTAL : 1.891930 sec - 5,721,918,716 cycles # 3.018 GHz - 16,176,155,673 instructions # 2.83 insn per cycle - 1.897169792 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 1.079389e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.096484e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.096484e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.724763e+02 +- 2.665342e+02 ) GeV^-2 +TOTAL : 1.542157 sec + 5,440,398,597 cycles:u # 3.480 GHz (74.93%) + 2,113,561 stalled-cycles-frontend:u # 0.04% frontend cycles idle (74.93%) + 1,661,597,394 stalled-cycles-backend:u # 30.54% backend cycles idle (74.93%) + 16,214,117,343 instructions:u # 2.98 insn per cycle + # 0.10 stalled cycles per insn (74.94%) + 1.566969925 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 5205) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.412986e+00 -Avg ME (F77/C++) = 1.4129864902818952 -Relative difference = 3.469828399449743e-07 +Avg ME (F77/C++) = 1.4129857118325333 +Relative difference = 2.039421953066926e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.824815e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.894169e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.894169e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.008855e+02 +- 5.002467e+01 ) GeV^-2 -TOTAL : 0.924503 sec - 2,621,240,795 cycles # 2.822 GHz - 6,121,586,144 instructions # 2.34 insn per cycle - 0.929727468 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4878) (512y: 0) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. 
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.413316e+00 -Avg ME (F77/C++) = 1.4133158486847037 -Relative difference = 1.0706402269051248e-07 -OK (relative difference <= 5E-3) +EvtsPerSec[Rmb+ME] (23) = ( 2.359009e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.439177e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.439177e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.743733e+02 +- 2.676611e+02 ) GeV^-2 +TOTAL : 0.718329 sec + 2,573,153,270 cycles:u # 3.478 GHz (74.59%) + 1,760,279 stalled-cycles-frontend:u # 0.07% frontend cycles idle (74.85%) + 822,041,230 stalled-cycles-backend:u # 31.95% backend cycles idle (75.14%) + 6,131,508,895 instructions:u # 2.38 insn per cycle + # 0.13 stalled cycles per insn (75.14%) + 0.743296450 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4860) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.077557e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.168993e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.168993e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.008855e+02 +- 5.002467e+01 ) GeV^-2 -TOTAL : 0.815038 sec - 2,326,750,327 cycles # 2.840 GHz - 5,589,464,486 instructions # 2.40 insn per cycle - 0.820459860 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4415) (512y: 30) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413316e+00 -Avg ME (F77/C++) = 1.4133158486847037 -Relative difference = 1.0706402269051248e-07 +Avg ME (F77/C++) = 1.4133162680784324 +Relative difference = 1.896804623606238e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.563585e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.614955e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.614955e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.008856e+02 +- 5.002468e+01 ) GeV^-2 -TOTAL : 1.075997 sec - 2,047,720,481 cycles # 1.896 GHz - 3,327,332,623 instructions # 1.62 insn per cycle - 1.081428956 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1905) (512y: 28) (512z: 3597) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. 
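[Editor's illustration] The "is not supported (no avx512vl in /proc/cpuinfo)" lines that now replace the 512y/512z runs on the AMD host reflect a capability guard: before launching a 512y or 512z executable, the driver looks for the avx512vl flag in /proc/cpuinfo and skips the run if it is absent. A minimal sketch of such a guard, assuming a Linux /proc/cpuinfo layout (hypothetical code, not the repository's actual test script):

    #include <fstream>
    #include <iostream>
    #include <string>

    // Return true if any line of /proc/cpuinfo mentions the avx512vl flag.
    bool hostHasAvx512vl()
    {
      std::ifstream cpuinfo( "/proc/cpuinfo" );
      std::string line;
      while( std::getline( cpuinfo, line ) )
        if( line.find( "avx512vl" ) != std::string::npos ) return true;
      return false;
    }

    int main()
    {
      if( !hostHasAvx512vl() )
      {
        // Skip the 512y/512z run instead of dying on an illegal instruction
        std::cout << "check.exe is not supported (no avx512vl in /proc/cpuinfo)" << std::endl;
        return 0;
      }
      std::cout << "avx512vl found: 512y/512z builds can be executed" << std::endl;
      return 0;
    }
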
+/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.413316e+00 -Avg ME (F77/C++) = 1.4133164031689205 -Relative difference = 2.852645271622733e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd1.txt index 10091edee9..8d47b19a57 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd1.txt @@ -1,223 +1,181 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasCurand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) +RNDGEN=hasNoCurand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.512y_f_inl0_hrd1' +CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.none_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2024-01-27_18:35:19 +DATE: 2024-01-28_13:12:35 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/gcheck.exe -p 64 256 10 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/gcheck.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.400668e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.203981e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.294099e+07 ) sec^-1 -MeanMatrixElemValue = ( 1.008472e+02 +- 5.002447e+01 ) GeV^-2 -TOTAL : 0.449645 sec - 1,928,359,102 cycles # 2.901 GHz - 2,715,522,194 instructions # 1.41 insn per cycle - 0.732501667 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/gcheck.exe -p 64 256 1 -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 248 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 6.530693e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.520372e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.597614e+06 ) sec^-1 +MeanMatrixElemValue = ( 5.334114e+02 +- 3.089427e+02 ) GeV^-2 +TOTAL : 0.461335 sec + 1,326,167,778 cycles:u # 2.729 GHz (74.59%) + 8,151,718 stalled-cycles-frontend:u # 0.61% frontend cycles idle (75.18%) + 278,720,879 stalled-cycles-backend:u # 21.02% backend cycles idle (75.49%) + 1,659,800,184 instructions:u # 1.25 insn per cycle + # 0.17 stalled cycles per insn (75.76%) + 0.503880984 seconds time elapsed ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.220081e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.406699e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.488271e+07 ) sec^-1 -MeanMatrixElemValue = ( 6.630099e+02 +- 4.770719e+02 ) GeV^-2 -TOTAL : 0.495964 sec - 2,117,808,383 cycles # 2.926 GHz - 3,044,007,413 instructions # 1.44 insn per cycle - 0.782283779 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 1.393837e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.690545e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.695908e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.954952e+03 +- 1.880090e+03 ) GeV^-2 +TOTAL : 0.949511 sec + 2,908,553,453 cycles:u # 2.982 GHz (74.54%) + 20,937,653 stalled-cycles-frontend:u # 0.72% frontend cycles idle (75.32%) + 852,141,150 stalled-cycles-backend:u # 29.30% backend cycles idle (75.32%) + 2,753,093,077 instructions:u # 0.95 insn per cycle + # 0.31 stalled cycles per insn (75.36%) + 0.993929305 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 1.412608e+00 -Avg ME (F77/CUDA) = 1.4132214346515752 -Relative difference = 0.00043425681546129636 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 1.412404e+00 +Avg ME (F77/CUDA) = 1.4131669531526541 +Relative difference = 0.0005401805380429868 OK (relative difference <= 5E-3) ========================================================================= -runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.698857e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.713684e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.713684e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2 -TOTAL : 6.092382 sec - 18,143,037,567 cycles # 2.976 GHz - 54,990,857,470 instructions # 3.03 insn per cycle - 6.099315799 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 3.237369e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.252608e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.252608e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.724764e+02 +- 2.665343e+02 ) GeV^-2 +TOTAL : 5.082280 sec + 17,865,629,686 cycles:u # 3.501 GHz (74.92%) + 2,498,978 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.86%) + 2,991,351,501 stalled-cycles-backend:u # 16.74% backend cycles idle (74.94%) + 55,034,375,096 instructions:u # 3.08 insn per cycle + # 0.05 stalled cycles per insn (75.08%) + 5.105994221 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1171) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.412998e+00 -Avg ME (F77/C++) = 1.4129977771372637 -Relative difference = 1.5772332039074602e-07 +Avg ME (F77/C++) = 1.4129978146120550 +Relative difference = 1.3120184529301602e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.027267e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.200862e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.200862e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2 -TOTAL : 1.835907 sec - 5,542,210,713 cycles # 3.011 GHz - 16,222,683,207 instructions # 2.93 insn per cycle - 1.848191835 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 1.113934e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.132151e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.132151e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.724763e+02 +- 2.665342e+02 ) GeV^-2 +TOTAL : 1.491759 sec + 5,272,618,876 cycles:u # 3.485 GHz (74.88%) + 1,965,606 stalled-cycles-frontend:u # 0.04% frontend cycles idle (75.11%) + 1,542,389,812 stalled-cycles-backend:u # 29.25% backend cycles idle (75.15%) + 16,244,718,537 instructions:u # 3.08 insn per cycle + # 0.09 stalled cycles per insn (75.15%) + 1.515955960 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 5136) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.412986e+00 -Avg ME (F77/C++) = 1.4129863487235070 -Relative difference = 2.4679898241023883e-07 +Avg ME (F77/C++) = 1.4129857712652836 +Relative difference = 1.618803841657786e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.590968e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.642236e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.642236e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.008855e+02 +- 5.002467e+01 ) GeV^-2 -TOTAL : 1.051037 sec - 2,983,610,010 cycles # 2.826 GHz - 6,707,954,290 instructions # 2.25 insn per cycle - 1.063017203 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 5430) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.099762e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.163094e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.163094e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.743733e+02 +- 2.676611e+02 ) GeV^-2 +TOTAL : 0.801224 sec + 2,855,360,913 cycles:u # 3.472 GHz (74.79%) + 1,399,697 stalled-cycles-frontend:u # 0.05% frontend cycles idle (74.72%) + 808,361,645 stalled-cycles-backend:u # 28.31% backend cycles idle (74.72%) + 6,739,199,559 instructions:u # 2.36 insn per cycle + # 0.12 stalled cycles per insn (74.42%) + 0.825544357 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 5412) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413316e+00 -Avg ME (F77/C++) = 1.4133158486847037 -Relative difference = 1.0706402269051248e-07 +Avg ME (F77/C++) = 1.4133162680784324 +Relative difference = 1.896804623606238e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd1/check.exe -p 64 256 10 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.752738e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.814984e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.814984e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.008855e+02 +- 5.002467e+01 ) GeV^-2 -TOTAL : 0.956581 sec - 2,712,532,749 cycles # 2.822 GHz - 6,222,719,641 instructions # 2.29 insn per cycle - 0.972410805 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 5056) (512y: 24) (512z: 0) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.413316e+00 -Avg ME (F77/C++) = 1.4133158486847037 -Relative difference = 1.0706402269051248e-07 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd1/check.exe -p 64 256 10 OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.465208e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.508714e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.508714e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.008856e+02 +- 5.002468e+01 ) GeV^-2 -TOTAL : 1.141622 sec - 2,158,724,317 cycles # 1.883 GHz - 3,642,405,179 instructions # 1.69 insn per cycle - 1.154915737 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2070) (512y: 21) (512z: 3922) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. 
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.413316e+00 -Avg ME (F77/C++) = 1.4133164031689205 -Relative difference = 2.852645271622733e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt index af01fc4dc2..6bc3c8b3b5 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt @@ -1,223 +1,181 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasCurand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) +RNDGEN=hasNoCurand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2024-01-27_18:35:44 +DATE: 2024-01-28_13:12:51 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/gcheck.exe -p 64 256 10 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/gcheck.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.670889e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.038424e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.052798e+07 ) sec^-1 -MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 0.464949 sec - 2,022,736,340 cycles # 2.936 GHz - 2,867,853,283 instructions # 1.42 insn per cycle - 0.765717373 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/gcheck.exe -p 64 256 1 -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 1.930611e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.079940e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.085510e+06 ) sec^-1 +MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 +TOTAL : 0.525099 sec + 1,547,001,642 cycles:u # 2.809 GHz (74.40%) + 8,231,293 stalled-cycles-frontend:u # 0.53% frontend cycles idle (74.40%) + 291,649,341 stalled-cycles-backend:u # 18.85% backend cycles idle (75.29%) + 1,834,262,540 instructions:u # 1.19 insn per cycle + # 0.16 stalled cycles per insn (75.20%) + 0.569315068 seconds time elapsed ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.075008e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.311358e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.325319e+07 ) sec^-1 -MeanMatrixElemValue = ( 6.734461e+02 +- 4.775415e+02 ) GeV^-2 -TOTAL : 0.613502 sec - 2,484,545,734 cycles # 2.920 GHz - 3,754,293,361 instructions # 1.51 insn per cycle - 0.912823393 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 3.622428e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.843086e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.848194e+06 ) sec^-1 +MeanMatrixElemValue = ( 2.948724e+03 +- 1.840727e+03 ) GeV^-2 +TOTAL : 1.126457 sec + 3,453,955,010 cycles:u # 2.984 GHz (74.93%) + 21,196,879 stalled-cycles-frontend:u # 0.61% frontend cycles idle (75.18%) + 853,666,812 stalled-cycles-backend:u # 24.72% backend cycles idle (75.18%) + 3,203,854,630 instructions:u # 0.93 insn per cycle + # 0.27 stalled cycles per insn (75.02%) + 1.177341147 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.413122e+00 -Avg ME (F77/CUDA) = 1.4131213755569487 -Relative difference = 4.418889885423659e-07 +Avg ME (F77/CUDA) = 1.4131213755569483 +Relative difference = 4.4188898885662695e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.455745e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.468100e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.468100e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 6.696763 sec - 19,991,114,547 cycles # 2.984 GHz - 59,158,816,657 instructions # 2.96 insn per cycle - 6.703698004 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 2.871634e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.883278e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.883278e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 +TOTAL : 5.729194 sec + 20,128,898,569 cycles:u # 3.501 GHz (74.96%) + 2,785,588 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.96%) + 3,920,733,211 stalled-cycles-backend:u # 19.48% backend cycles idle (74.96%) + 59,182,030,526 instructions:u # 2.94 insn per cycle + # 0.07 stalled cycles per insn (74.97%) + 5.752521724 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1149) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213859069593 Relative difference = 4.345647726386255e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.903362e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.951737e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.951737e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 3.365217 sec - 10,110,454,300 cycles # 3.000 GHz - 29,763,982,937 instructions # 2.94 insn per cycle - 3.380160698 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 6.102145e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.154461e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.154461e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 +TOTAL : 2.708500 sec + 9,541,976,491 cycles:u # 3.495 GHz (74.85%) + 2,438,086 stalled-cycles-frontend:u # 0.03% frontend cycles idle (74.84%) + 2,419,701,974 stalled-cycles-backend:u # 25.36% backend cycles idle (74.98%) + 29,758,024,267 instructions:u # 3.12 insn per cycle + # 0.08 stalled cycles per insn (75.09%) + 2.733342510 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 4873) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213792564823 Relative difference = 4.392710025734405e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.586442e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.775935e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.775935e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.732951 sec - 4,886,607,168 cycles # 2.812 GHz - 11,200,637,765 instructions # 2.29 insn per cycle - 1.744637675 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4581) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.245529e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.267081e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.267081e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 +TOTAL : 1.338833 sec + 4,724,288,846 cycles:u # 3.473 GHz (74.72%) + 2,391,653 stalled-cycles-frontend:u # 0.05% frontend cycles idle (74.76%) + 1,570,899,905 stalled-cycles-backend:u # 33.25% backend cycles idle (74.89%) + 11,230,417,054 instructions:u # 2.38 insn per cycle + # 0.14 stalled cycles per insn (75.15%) + 1.363499703 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4563) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213600217192 Relative difference = 4.5288254008796884e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd0/check.exe -p 64 256 10 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.111734e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.136544e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.136544e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.500096 sec - 4,242,297,740 cycles # 2.822 GHz - 10,146,067,744 instructions # 2.39 insn per cycle - 1.512418724 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4064) (512y: 73) (512z: 0) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.413122e+00 -Avg ME (F77/C++) = 1.4131213600217192 -Relative difference = 4.5288254008796884e-07 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd0/check.exe -p 64 256 10 OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.683269e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.802496e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.802496e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 2.157833 sec - 4,010,657,164 cycles # 1.855 GHz - 5,838,670,469 instructions # 1.46 insn per cycle - 2.171817278 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1778) (512y: 97) (512z: 3502) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. 
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.413122e+00 -Avg ME (F77/C++) = 1.4131213600217192 -Relative difference = 4.5288254008796884e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd1.txt index 0a96473f8d..ca039e8e9e 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd1.txt @@ -1,223 +1,181 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasCurand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) +RNDGEN=hasNoCurand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.512y_m_inl0_hrd1' +CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.none_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2024-01-27_18:36:14 +DATE: 2024-01-28_13:13:11 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/gcheck.exe -p 64 256 10 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/gcheck.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = HIP:MIX+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.706357e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.039489e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.053162e+07 ) sec^-1 -MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 0.466869 sec - 2,024,358,514 cycles # 2.916 GHz - 2,905,232,876 instructions # 1.44 insn per cycle - 0.763744307 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/gcheck.exe -p 64 256 1 -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 1.923635e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.080806e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.086473e+06 ) sec^-1 +MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 +TOTAL : 0.526606 sec + 1,546,779,017 cycles:u # 2.806 GHz (73.78%) + 8,361,783 stalled-cycles-frontend:u # 0.54% frontend cycles idle (73.85%) + 297,190,296 stalled-cycles-backend:u # 19.21% backend cycles idle (75.51%) + 1,818,934,608 instructions:u # 1.18 insn per cycle + # 0.16 stalled cycles per insn (75.28%) + 0.568562997 seconds time elapsed ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = HIP:MIX+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.070573e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.306663e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.320777e+07 ) sec^-1 -MeanMatrixElemValue = ( 6.734461e+02 +- 4.775415e+02 ) GeV^-2 -TOTAL : 0.604637 sec - 2,476,191,893 cycles # 2.932 GHz - 3,698,353,250 instructions # 1.49 insn per cycle - 0.904013775 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 3.600430e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.833337e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.838870e+06 ) sec^-1 +MeanMatrixElemValue = ( 2.948724e+03 +- 1.840727e+03 ) GeV^-2 +TOTAL : 1.122231 sec + 3,467,237,376 cycles:u # 3.012 GHz (74.99%) + 21,359,651 stalled-cycles-frontend:u # 0.62% frontend cycles idle (75.00%) + 853,272,050 stalled-cycles-backend:u # 24.61% backend cycles idle (75.23%) + 3,214,568,864 instructions:u # 0.93 insn per cycle + # 0.27 stalled cycles per insn (75.19%) + 1.172263402 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/fgcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.413122e+00 -Avg ME (F77/CUDA) = 1.4131213755569487 -Relative difference = 4.418889885423659e-07 +Avg ME (F77/CUDA) = 1.4131213755569483 +Relative difference = 4.4188898885662695e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.506662e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.519241e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.519241e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 6.560428 sec - 19,794,820,917 cycles # 3.016 GHz - 58,706,436,643 instructions # 2.97 insn per cycle - 6.568655504 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 2.896449e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.908309e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.908309e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 +TOTAL : 5.680023 sec + 19,974,916,799 cycles:u # 3.503 GHz (74.94%) + 2,710,663 stalled-cycles-frontend:u # 0.01% frontend cycles idle (75.00%) + 3,510,471,735 stalled-cycles-backend:u # 17.57% backend cycles idle (75.03%) + 58,726,429,808 instructions:u # 2.94 insn per cycle + # 0.06 stalled cycles per insn (75.03%) + 5.704403658 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1026) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213859069593 Relative difference = 4.345647726386255e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.814384e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.862259e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.862259e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 3.426889 sec - 10,132,275,681 cycles # 2.953 GHz - 30,161,186,267 instructions # 2.98 insn per cycle - 3.442826711 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 6.162282e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.215492e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.215492e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 +TOTAL : 2.682251 sec + 9,442,855,192 cycles:u # 3.492 GHz (74.85%) + 2,428,870 stalled-cycles-frontend:u # 0.03% frontend cycles idle (74.88%) + 2,135,949,636 stalled-cycles-backend:u # 22.62% backend cycles idle (74.94%) + 30,202,697,241 instructions:u # 3.20 insn per cycle + # 0.07 stalled cycles per insn (75.07%) + 2.707217238 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 4944) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213792564823 Relative difference = 4.392710025734405e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.985614e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.154939e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.154939e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.847199 sec - 5,039,612,145 cycles # 2.722 GHz - 11,663,713,946 instructions # 2.31 insn per cycle - 1.860008244 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4685) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.219654e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.240294e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.240294e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 +TOTAL : 1.366512 sec + 4,827,563,399 cycles:u # 3.479 GHz (74.66%) + 2,406,118 stalled-cycles-frontend:u # 0.05% frontend cycles idle (74.76%) + 1,568,708,705 stalled-cycles-backend:u # 32.49% backend cycles idle (75.02%) + 11,675,178,552 instructions:u # 2.42 insn per cycle + # 0.13 stalled cycles per insn (75.22%) + 1.390974056 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4667) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213600217192 Relative difference = 4.5288254008796884e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd1/check.exe -p 64 256 10 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.036649e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.058621e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.058621e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.604013 sec - 4,559,317,270 cycles # 2.834 GHz - 10,788,553,583 instructions # 2.37 insn per cycle - 1.619683290 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4159) (512y: 233) (512z: 0) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.413122e+00 -Avg ME (F77/C++) = 1.4131213600217192 -Relative difference = 4.5288254008796884e-07 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd1/check.exe -p 64 256 10 OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.650902e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.770775e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.770775e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 2.167007 sec - 4,062,674,429 cycles # 1.871 GHz - 6,072,986,165 instructions # 1.49 insn per cycle - 2.180347983 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1725) (512y: 104) (512z: 3609) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. 
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.413122e+00 -Avg ME (F77/C++) = 1.4131213600217192 -Relative difference = 4.5288254008796884e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt index 8748ec80e8..4072135c25 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt @@ -1,223 +1,181 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasCurand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) +RNDGEN=hasNoCurand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-01-27_18:36:44 +DATE: 2024-01-28_13:13:30 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.464977e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.493570e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.496338e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.528543 sec - 2,270,184,097 cycles # 2.943 GHz - 3,425,872,249 instructions # 1.51 insn per cycle - 0.844253845 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 7.463763e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.620026e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.621227e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 +TOTAL : 0.644635 sec + 1,987,376,841 cycles:u # 2.988 GHz (74.74%) + 2,457,671 stalled-cycles-frontend:u # 0.12% frontend cycles idle (74.57%) + 48,993,592 stalled-cycles-backend:u # 2.47% backend cycles idle (74.73%) + 2,144,827,242 instructions:u # 1.08 insn per cycle + # 0.02 stalled cycles per insn (75.31%) + 0.688992738 seconds time elapsed ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.124349e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.158647e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.160156e+05 ) sec^-1 -MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.035919 sec - 9,807,716,592 cycles # 2.977 GHz - 21,076,351,164 instructions # 2.15 insn per cycle - 3.350689954 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 1.244104e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.246928e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.246989e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.252232e+02 +- 1.234346e+02 ) GeV^-4 +TOTAL : 8.378258 sec + 28,854,966,045 cycles:u # 3.432 GHz (74.98%) + 11,706,323 stalled-cycles-frontend:u # 0.04% frontend cycles idle (75.00%) + 1,119,668,748 stalled-cycles-backend:u # 3.88% backend cycles idle (75.00%) + 22,680,994,886 instructions:u # 0.79 insn per cycle + # 0.05 stalled cycles per insn (74.98%) + 8.429615859 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 6.626675e-04 -Avg ME (F77/CUDA) = 6.6266731198158133E-004 -Relative difference = 2.837296512218831e-07 +Avg ME (F77/CUDA) = 6.6266731198158101E-004 +Relative difference = 2.837296517127185e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe -p 64 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.874609e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.875505e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.875505e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 8.759617 sec - 26,426,534,581 cycles # 3.016 GHz - 81,752,523,728 instructions # 3.09 insn per cycle - 8.766841308 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 2.214110e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.214999e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.214999e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 +TOTAL : 7.414695 sec + 26,053,581,658 cycles:u # 3.504 GHz (74.97%) + 11,686,455 stalled-cycles-frontend:u # 0.04% frontend cycles idle (75.02%) + 3,872,816,291 stalled-cycles-backend:u # 14.86% backend cycles idle (74.99%) + 81,778,201,866 instructions:u # 3.14 insn per cycle + # 0.05 stalled cycles per insn (74.99%) + 7.438325461 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 6614) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141133E-004 Relative difference = 2.8372990776517314e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.754471e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.757856e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.757856e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 4.380696 sec - 12,879,948,356 cycles # 2.938 GHz - 39,242,098,075 instructions # 3.05 insn per cycle - 4.396802628 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 5.016879e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.021423e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.021423e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 +TOTAL : 3.276622 sec + 11,519,938,608 cycles:u # 3.493 GHz (75.01%) + 1,061,411 stalled-cycles-frontend:u # 0.01% frontend cycles idle (75.03%) + 1,601,322,638 stalled-cycles-backend:u # 13.90% backend cycles idle (75.03%) + 39,253,937,285 instructions:u # 3.41 insn per cycle + # 0.04 stalled cycles per insn (75.02%) + 3.301260420 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:12814) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141122E-004 Relative difference = 2.837299079287849e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.416832e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.434424e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.434424e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.958579 sec - 5,557,464,463 cycles # 2.831 GHz - 13,789,270,718 instructions # 2.48 insn per cycle - 1.973620354 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11059) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.199974e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.202544e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.202544e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 +TOTAL : 1.374389 sec + 4,859,374,228 cycles:u # 3.481 GHz (74.89%) + 725,368 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.79%) + 611,853,703 stalled-cycles-backend:u # 12.59% backend cycles idle (74.79%) + 13,816,024,978 instructions:u # 2.84 insn per cycle + # 0.04 stalled cycles per insn (74.83%) + 1.399253157 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11041) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198157309E-004 Relative difference = 2.837296636563793e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe -p 64 256 1 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.535315e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.557539e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.557539e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.730276 sec - 4,897,094,506 cycles # 2.823 GHz - 12,318,633,778 instructions # 2.52 insn per cycle - 1.748142491 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 9762) (512y: 94) (512z: 0) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198157309E-004 -Relative difference = 2.837296636563793e-07 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe -p 64 256 1 OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.350731e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.364351e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.364351e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.242637 sec - 4,057,109,602 cycles # 1.806 GHz - 6,286,690,411 instructions # 1.55 insn per cycle - 2.253700963 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1516) (512y: 94) (512z: 9019) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. 
--------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 6.626675e-04
-Avg ME (F77/C++) = 6.6266731198157309E-004
-Relative difference = 2.837296636563793e-07
-OK (relative difference <= 5E-3)
+/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo)
=========================================================================
TEST COMPLETED
diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_bridge.txt
index fd89ab8868..49977cc58b 100644
--- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_bridge.txt
+++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_bridge.txt
@@ -1,240 +1,190 @@
export CUDACPP_RUNTIME_ENABLEFPE=on
-Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg
-OMPFLAGS=-fopenmp
-AVX=512y
+Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg
+OMPFLAGS=
+AVX=avx2
FPTYPE=d
HELINL=0
HRDCOD=0
-RNDGEN=hasCurand
-Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1)
+RNDGEN=hasNoCurand
+Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1)
make: Nothing to be done for 'gtestlibs'.
-CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0'
+CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0'
make USEBUILDDIR=1 AVX=none
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
CUDACPP_BUILDDIR='build.none_d_inl0_hrd0'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
make USEBUILDDIR=1 AVX=sse4
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
make USEBUILDDIR=1 AVX=avx2
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
make USEBUILDDIR=1 AVX=512y
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
make USEBUILDDIR=1 AVX=512z
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
-DATE: 2024-01-27_19:24:17
+DATE: 2024-01-28_13:47:22
-On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
+On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]:
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge OMP=
+runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge OMP=
WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost
-WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384)
WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384)
-Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
+Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 3.109503e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.454645e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.454645e+05 ) sec^-1
-MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 0.516589 sec
- 2,186,982,677 cycles # 2.925 GHz
- 3,445,591,612 instructions # 1.58 insn per cycle
- 0.807789349 seconds time elapsed
-runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost
-WARNING!
RamboHost selected: cannot use CurandDevice, will use CurandHost -WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 7.390201e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.527612e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.527612e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 +TOTAL : 0.654035 sec + 1,960,717,262 cycles:u # 2.905 GHz (75.00%) + 2,777,141 stalled-cycles-frontend:u # 0.14% frontend cycles idle (75.21%) + 33,974,103 stalled-cycles-backend:u # 1.73% backend cycles idle (75.21%) + 2,149,307,410 instructions:u # 1.10 insn per cycle + # 0.02 stalled cycles per insn (75.22%) + 0.698063329 seconds time elapsed ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.604275e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.097164e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.097164e+05 ) sec^-1 -MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.331099 sec - 10,692,447,995 cycles # 2.965 GHz - 23,347,460,347 instructions # 2.18 insn per cycle - 3.661893165 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 1.211104e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.245812e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.245812e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.252232e+02 +- 1.234346e+02 ) GeV^-4 +TOTAL : 8.532378 sec + 29,230,109,981 cycles:u # 3.409 GHz (75.01%) + 22,642,081 stalled-cycles-frontend:u # 0.08% frontend cycles idle (75.00%) + 1,132,164,723 stalled-cycles-backend:u # 3.87% backend cycles idle (75.00%) + 23,523,574,924 instructions:u # 0.80 insn per cycle + # 0.05 stalled cycles per insn (74.96%) + 8.593019725 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe 
/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 6.626675e-04 -Avg ME (F77/CUDA) = 6.6266731198158133E-004 -Relative difference = 2.837296512218831e-07 +Avg ME (F77/CUDA) = 6.6266731198158101E-004 +Relative difference = 2.837296517127185e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.878015e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.878871e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.878871e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 8.747004 sec - 26,454,331,369 cycles # 3.023 GHz - 81,758,459,626 instructions # 3.09 insn per cycle - 8.752513277 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 2.203965e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.204858e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.204858e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 +TOTAL : 7.452449 sec + 26,155,459,895 cycles:u # 3.500 GHz (74.95%) + 19,996,051 stalled-cycles-frontend:u # 0.08% frontend cycles idle (74.91%) + 3,848,748,935 stalled-cycles-backend:u # 14.71% backend cycles idle (74.97%) + 81,737,604,098 instructions:u # 3.13 insn per cycle + # 0.05 stalled cycles per insn (75.06%) + 7.476303530 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 6614) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141133E-004 Relative difference = 2.8372990776517314e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.775861e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.779543e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.779543e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 4.357884 sec - 12,894,661,106 cycles # 2.956 GHz - 39,253,566,522 instructions # 3.04 insn per cycle - 4.363419681 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 5.014996e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.019514e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.019514e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 +TOTAL : 3.281547 sec + 11,541,920,268 cycles:u # 3.494 GHz (74.90%) + 1,107,312 stalled-cycles-frontend:u # 0.01% frontend cycles idle (75.00%) + 1,695,248,873 stalled-cycles-backend:u # 14.69% backend cycles idle (75.06%) + 39,247,079,084 instructions:u # 3.40 insn per cycle + # 0.04 stalled cycles per insn (75.06%) + 3.306741027 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:12814) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141122E-004 Relative difference = 2.837299079287849e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.412414e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.430210e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.430210e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.963936 sec - 5,573,295,220 cycles # 2.832 GHz - 13,799,743,853 instructions # 2.48 insn per cycle - 1.969915875 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11059) (512y: 0) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. 
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198157309E-004 -Relative difference = 2.837296636563793e-07 -OK (relative difference <= 5E-3) +EvtsPerSec[Rmb+ME] (23) = ( 1.194708e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.197264e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.197264e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 +TOTAL : 1.383983 sec + 4,882,147,839 cycles:u # 3.472 GHz (75.05%) + 3,172,657 stalled-cycles-frontend:u # 0.06% frontend cycles idle (74.97%) + 581,410,592 stalled-cycles-backend:u # 11.91% backend cycles idle (74.97%) + 13,796,051,404 instructions:u # 2.83 insn per cycle + # 0.04 stalled cycles per insn (74.97%) + 1.409166602 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11041) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.515821e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.539804e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.539804e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.737334 sec - 4,912,657,678 cycles # 2.821 GHz - 12,328,208,012 instructions # 2.51 insn per cycle - 1.742657060 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 9762) (512y: 94) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198157309E-004 Relative difference = 2.837296636563793e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.174374e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.188369e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.188369e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.300281 sec - 4,068,031,162 cycles # 1.766 GHz - 6,296,983,262 instructions # 1.55 insn per cycle - 2.305872630 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1516) (512y: 94) (512z: 9019) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. 
--------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 6.626675e-04
-Avg ME (F77/C++) = 6.6266731198157309E-004
-Relative difference = 2.837296636563793e-07
-OK (relative difference <= 5E-3)
+/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo)
=========================================================================
TEST COMPLETED
diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_common.txt
index ba3e5a6d39..c540183c31 100644
--- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_common.txt
+++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_common.txt
@@ -1,223 +1,181 @@
export CUDACPP_RUNTIME_ENABLEFPE=on
-Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg
-OMPFLAGS=-fopenmp
-AVX=512y
+Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg
+OMPFLAGS=
+AVX=avx2
FPTYPE=d
HELINL=0
HRDCOD=0
-RNDGEN=hasCurand
-Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1)
+RNDGEN=hasNoCurand
+Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1)
make: Nothing to be done for 'gtestlibs'.
-CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0'
+CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0'
make USEBUILDDIR=1 AVX=none
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
CUDACPP_BUILDDIR='build.none_d_inl0_hrd0'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
make USEBUILDDIR=1 AVX=sse4
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
make USEBUILDDIR=1 AVX=avx2
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
make USEBUILDDIR=1 AVX=512y
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
make USEBUILDDIR=1 AVX=512z
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
-DATE: 2024-01-27_19:36:16
+DATE: 2024-01-28_13:56:28
-On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
+On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]:
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --common OMP=
+runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --common OMP=
WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:DBL+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK
+Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 3.502781e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.530729e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.533274e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 7.425613e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.588900e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.590114e+04 ) sec^-1
MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4
-TOTAL : 0.511281 sec
- 2,189,845,313 cycles # 2.932 GHz
- 3,420,225,928 instructions # 1.56 insn per cycle
- 0.809530054 seconds time elapsed
-runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --common
-WARNING!
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.647005 sec + 1,961,714,283 cycles:u # 2.934 GHz (74.69%) + 2,370,613 stalled-cycles-frontend:u # 0.12% frontend cycles idle (75.34%) + 37,765,063 stalled-cycles-backend:u # 1.93% backend cycles idle (75.10%) + 2,176,691,970 instructions:u # 1.11 insn per cycle + # 0.02 stalled cycles per insn (74.75%) + 0.689251287 seconds time elapsed ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --common OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --common OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.147307e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.181790e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.183262e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.243727e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.247395e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.247456e+05 ) sec^-1 MeanMatrixElemValue = ( 1.252232e+02 +- 1.234346e+02 ) GeV^-4 -TOTAL : 3.129728 sec - 10,090,248,576 cycles # 2.983 GHz - 21,741,294,339 instructions # 2.15 insn per cycle - 3.440954058 seconds time elapsed +TOTAL : 8.384949 sec + 28,823,443,221 cycles:u # 3.423 GHz (75.00%) + 11,685,573 stalled-cycles-frontend:u # 0.04% frontend cycles idle (75.02%) + 1,119,948,130 stalled-cycles-backend:u # 3.89% backend cycles idle (75.00%) + 22,700,974,781 instructions:u # 0.79 insn per cycle + # 0.05 stalled cycles per insn (75.00%) + 8.437711468 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 6.626675e-04 -Avg ME (F77/CUDA) = 6.6266731198158133E-004 -Relative difference = 2.837296512218831e-07 +Avg ME (F77/CUDA) = 6.6266731198158101E-004 +Relative difference = 2.837296517127185e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe -p 64 256 1 --common OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe -p 64 256 1 
--common OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.865476e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.866367e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.866367e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.205977e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.206957e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.206957e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4
-TOTAL : 8.805569 sec
- 26,448,557,229 cycles # 3.003 GHz
- 81,752,712,519 instructions # 3.09 insn per cycle
- 8.810814165 seconds time elapsed
+TOTAL : 7.441977 sec
+ 26,144,784,083 cycles:u # 3.503 GHz (74.97%)
+ 19,757,570 stalled-cycles-frontend:u # 0.08% frontend cycles idle (75.02%)
+ 3,926,698,235 stalled-cycles-backend:u # 15.02% backend cycles idle (75.03%)
+ 81,747,372,330 instructions:u # 3.13 insn per cycle
+ # 0.05 stalled cycles per insn (75.03%)
+ 7.465457036 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 6614) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest.exe
+runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest.exe
 [ PASSED ] 6 tests.
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2
 Avg ME (C++/C++) = 6.626675e-04
 Avg ME (F77/C++) = 6.6266731198141133E-004
 Relative difference = 2.8372990776517314e-07
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 1 --common OMP=
+runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 1 --common OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 3.674042e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.677471e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.677471e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 5.016218e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.020848e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.020848e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4
-TOTAL : 4.475227 sec
- 12,907,401,540 cycles # 2.882 GHz
- 39,240,721,411 instructions # 3.04 insn per cycle
- 4.480266384 seconds time elapsed
+TOTAL : 3.277056 sec
+ 11,521,723,848 cycles:u # 3.494 GHz (75.02%)
+ 1,067,616 stalled-cycles-frontend:u # 0.01% frontend cycles idle (75.02%)
+ 1,661,234,169 stalled-cycles-backend:u # 14.42% backend cycles idle (75.02%)
+ 39,259,477,727 instructions:u # 3.41 insn per cycle
+ # 0.04 stalled cycles per insn (75.02%)
+ 3.299831090 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4:12814) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest.exe
+runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest.exe
 [ PASSED ] 6 tests.
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2
 Avg ME (C++/C++) = 6.626675e-04
 Avg ME (F77/C++) = 6.6266731198141122E-004
 Relative difference = 2.837299079287849e-07
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 1 --common OMP=
+runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 1 --common OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 8.337116e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.355239e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.355239e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 1.202912e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.205517e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.205517e+04 ) sec^-1
 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4
-TOTAL : 1.978873 sec
- 5,565,688,850 cycles # 2.807 GHz
- 13,787,538,268 instructions # 2.48 insn per cycle
- 1.983941242 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11059) (512y: 0) (512z: 0)
+TOTAL : 1.370984 sec
+ 4,837,328,450 cycles:u # 3.475 GHz (74.72%)
+ 783,029 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.76%)
+ 584,048,854 stalled-cycles-backend:u # 12.07% backend cycles idle (74.91%)
+ 13,836,924,664 instructions:u # 2.86 insn per cycle
+ # 0.04 stalled cycles per insn (75.16%)
+ 1.393799936 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11041) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest.exe
+runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest.exe
 [ PASSED ] 6 tests.
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2
 Avg ME (C++/C++) = 6.626675e-04
 Avg ME (F77/C++) = 6.6266731198157309E-004
 Relative difference = 2.837296636563793e-07
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe -p 64 256 1 --common OMP=
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK
-FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 9.506300e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 9.530161e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 9.530161e+03 ) sec^-1
-MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4
-TOTAL : 1.736969 sec
- 4,905,247,441 cycles # 2.818 GHz
- 12,316,173,805 instructions # 2.51 insn per cycle
- 1.741977898 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 9762) (512y: 94) (512z: 0)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest.exe
-[ PASSED ] 6 tests.
+/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo)
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 6.626675e-04
-Avg ME (F77/C++) = 6.6266731198157309E-004
-Relative difference = 2.837296636563793e-07
-OK (relative difference <= 5E-3)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe -p 64 256 1 --common OMP=
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK
-FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 7.394634e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.409308e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.409308e+03 ) sec^-1
-MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4
-TOTAL : 2.230744 sec
- 4,063,689,523 cycles # 1.819 GHz
- 6,283,713,565 instructions # 1.55 insn per cycle
- 2.235926812 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1516) (512y: 94) (512z: 9019)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest.exe
-[ PASSED ] 6 tests.
--------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 6.626675e-04
-Avg ME (F77/C++) = 6.6266731198157309E-004
-Relative difference = 2.837296636563793e-07
-OK (relative difference <= 5E-3)
+/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo)
 =========================================================================
 TEST COMPLETED
diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_curhst.txt
index 4fc77c5f3b..6361605e5d 100644
--- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_curhst.txt
+++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_curhst.txt
@@ -1,223 +1,143 @@
 export CUDACPP_RUNTIME_ENABLEFPE=on
-Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg
-OMPFLAGS=-fopenmp
-AVX=512y
+Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg
+OMPFLAGS=
+AVX=avx2
 FPTYPE=d
 HELINL=0
 HRDCOD=0
-RNDGEN=hasCurand
-Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1)
+RNDGEN=hasNoCurand
+Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1)
 make: Nothing to be done for 'gtestlibs'.
-CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0'
+CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0'
 make USEBUILDDIR=1 AVX=none
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 CUDACPP_BUILDDIR='build.none_d_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 make USEBUILDDIR=1 AVX=sse4
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 make USEBUILDDIR=1 AVX=avx2
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 make USEBUILDDIR=1 AVX=512y
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 make USEBUILDDIR=1 AVX=512z
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
-DATE: 2024-01-27_19:32:51
+DATE: 2024-01-28_13:54:23
-On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
+On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]:
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --curhst OMP=
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:DBL+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK
-FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 3.474359e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.503184e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.505889e+05 ) sec^-1
-MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 0.506872 sec
- 2,190,591,729 cycles # 2.928 GHz
- 3,363,193,614 instructions # 1.54 insn per cycle
- 0.813143037 seconds time elapsed
-runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --curhst
+runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --curhst OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
-==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
+/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe: Aborted
+ 54,670,069 cycles:u # 2.630 GHz (61.54%)
+ 34,994 stalled-cycles-frontend:u # 0.06% frontend cycles idle (61.54%)
+ 616,163 stalled-cycles-backend:u # 1.13% backend cycles idle (61.54%)
+ 42,435,100 instructions:u # 0.78 insn per cycle
+ # 0.01 stalled cycles per insn (63.55%)
+ 0.021700778 seconds time elapsed
 .........................................................................
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --curhst OMP=
+runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --curhst OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:DBL+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK
-FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 4.142083e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.176424e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.177917e+05 ) sec^-1
-MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4
-TOTAL : 3.068584 sec
- 9,750,888,418 cycles # 2.933 GHz
- 22,498,978,506 instructions # 2.31 insn per cycle
- 3.380971345 seconds time elapsed
+/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe: Aborted
+ 44,892,864 cycles:u # 2.190 GHz (61.00%)
+ 64,258 stalled-cycles-frontend:u # 0.14% frontend cycles idle (61.01%)
+ 460,979 stalled-cycles-backend:u # 1.03% backend cycles idle (61.00%)
+ 48,182,281 instructions:u # 1.07 insn per cycle
+ # 0.01 stalled cycles per insn (70.82%)
+ 0.021357952 seconds time elapsed
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2
 Avg ME (C++/CUDA) = 6.626675e-04
-Avg ME (F77/CUDA) = 6.6266731198158133E-004
-Relative difference = 2.837296512218831e-07
+Avg ME (F77/CUDA) = 6.6266731198158101E-004
+Relative difference = 2.837296517127185e-07
 OK (relative difference <= 5E-3)
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP=
+runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
-FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.865848e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.866726e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.866726e+03 ) sec^-1
-MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 8.799928 sec
- 26,485,507,873 cycles # 3.009 GHz
- 81,753,130,978 instructions # 3.09 insn per cycle
- 8.805031066 seconds time elapsed
+/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe: Aborted
+ 41,803,467 cycles:u # 2.035 GHz (61.10%)
+ 61,169 stalled-cycles-frontend:u # 0.15% frontend cycles idle (61.10%)
+ 353,876 stalled-cycles-backend:u # 0.85% backend cycles idle (61.27%)
+ 48,282,000 instructions:u # 1.15 insn per cycle
+ # 0.01 stalled cycles per insn (74.13%)
+ 0.022261377 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 6614) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest.exe
+runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest.exe
 [ PASSED ] 6 tests.
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2
 Avg ME (C++/C++) = 6.626675e-04
 Avg ME (F77/C++) = 6.6266731198141133E-004
 Relative difference = 2.8372990776517314e-07
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP=
+runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
-FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 3.783072e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.786617e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.786617e+03 ) sec^-1
-MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 4.345759 sec
- 12,887,960,701 cycles # 2.964 GHz
- 39,240,971,185 instructions # 3.04 insn per cycle
- 4.350680718 seconds time elapsed
+/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe: Aborted
+ 50,214,491 cycles:u # 2.486 GHz (60.43%)
+ 49,776 stalled-cycles-frontend:u # 0.10% frontend cycles idle (60.43%)
+ 575,575 stalled-cycles-backend:u # 1.15% backend cycles idle (60.43%)
+ 45,981,314 instructions:u # 0.92 insn per cycle
+ # 0.01 stalled cycles per insn (64.01%)
+ 0.021462604 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4:12814) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest.exe
+runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest.exe
 [ PASSED ] 6 tests.
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2
 Avg ME (C++/C++) = 6.626675e-04
 Avg ME (F77/C++) = 6.6266731198141122E-004
 Relative difference = 2.837299079287849e-07
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP=
+runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
-FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 8.401098e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.418172e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.418172e+03 ) sec^-1
-MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 1.962097 sec
- 5,557,008,460 cycles # 2.826 GHz
- 13,788,303,853 instructions # 2.48 insn per cycle
- 1.967376308 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11059) (512y: 0) (512z: 0)
+/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe: Aborted
+ 39,374,385 cycles:u # 1.907 GHz (61.30%)
+ 58,366 stalled-cycles-frontend:u # 0.15% frontend cycles idle (61.30%)
+ 361,633 stalled-cycles-backend:u # 0.92% backend cycles idle (56.85%)
+ 49,010,405 instructions:u # 1.24 insn per cycle
+ # 0.01 stalled cycles per insn (76.20%)
+ 0.021854446 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11041) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest.exe
+runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest.exe
 [ PASSED ] 6 tests.
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2
 Avg ME (C++/C++) = 6.626675e-04
 Avg ME (F77/C++) = 6.6266731198157309E-004
 Relative difference = 2.837296636563793e-07
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP=
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
-FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 9.166638e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 9.188094e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 9.188094e+03 ) sec^-1
-MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 1.799007 sec
- 5,088,916,448 cycles # 2.823 GHz
- 12,318,256,629 instructions # 2.42 insn per cycle
- 1.804299971 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 9762) (512y: 94) (512z: 0)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest.exe
-[ PASSED ] 6 tests.
+/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo)
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 6.626675e-04
-Avg ME (F77/C++) = 6.6266731198157309E-004
-Relative difference = 2.837296636563793e-07
-OK (relative difference <= 5E-3)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP=
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
-FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 7.442905e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.457256e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.457256e+03 ) sec^-1
-MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 2.213655 sec
- 4,060,302,969 cycles # 1.831 GHz
- 6,285,302,844 instructions # 1.55 insn per cycle
- 2.218845333 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1516) (512y: 94) (512z: 9019)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest.exe
-[ PASSED ] 6 tests.
--------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 6.626675e-04
-Avg ME (F77/C++) = 6.6266731198157309E-004
-Relative difference = 2.837296636563793e-07
-OK (relative difference <= 5E-3)
+/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo)
 =========================================================================
 TEST COMPLETED
diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_rmbhst.txt
index 1d0a9ae11e..6617ca121c 100644
--- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_rmbhst.txt
+++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_rmbhst.txt
@@ -1,226 +1,181 @@
 export CUDACPP_RUNTIME_ENABLEFPE=on
-Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg
-OMPFLAGS=-fopenmp
-AVX=512y
+Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg
+OMPFLAGS=
+AVX=avx2
 FPTYPE=d
 HELINL=0
 HRDCOD=0
-RNDGEN=hasCurand
-Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1)
+RNDGEN=hasNoCurand
+Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1)
 make: Nothing to be done for 'gtestlibs'.
-CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0'
+CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0'
 make USEBUILDDIR=1 AVX=none
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 CUDACPP_BUILDDIR='build.none_d_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 make USEBUILDDIR=1 AVX=sse4
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 make USEBUILDDIR=1 AVX=avx2
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 make USEBUILDDIR=1 AVX=512y
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 make USEBUILDDIR=1 AVX=512z
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
-DATE: 2024-01-27_19:29:31
+DATE: 2024-01-28_13:52:34
-On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
+On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]:
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --rmbhst OMP=
+runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --rmbhst OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
-Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:DBL+THX:CURHST+RMBHST+MESDEV/none+NAVBRK
+Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+MESDEV/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 3.200930e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.503716e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.506421e+05 ) sec^-1
-MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 0.511242 sec
- 2,174,984,659 cycles # 2.945 GHz
- 3,418,296,130 instructions # 1.57 insn per cycle
- 0.801197579 seconds time elapsed
-runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --rmbhst
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
-==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
-==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
+EvtsPerSec[Rmb+ME] (23) = ( 7.455736e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.587953e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.589041e+04 ) sec^-1
+MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4
+TOTAL : 0.644943 sec
+ 1,960,906,381 cycles:u # 2.930 GHz (74.99%)
+ 2,678,732 stalled-cycles-frontend:u # 0.14% frontend cycles idle (75.16%)
+ 41,502,500 stalled-cycles-backend:u # 2.12% backend cycles idle (75.09%)
+ 2,164,012,883 instructions:u # 1.10 insn per cycle
+ # 0.02 stalled cycles per insn (74.77%)
+ 0.685694018 seconds time elapsed
 .........................................................................
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --rmbhst OMP=
+runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --rmbhst OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
-Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:DBL+THX:CURHST+RMBHST+MESDEV/none+NAVBRK
+Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+MESDEV/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 3.739270e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.176656e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.178124e+05 ) sec^-1
-MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4
-TOTAL : 3.204802 sec
- 10,389,625,524 cycles # 3.004 GHz
- 21,552,317,551 instructions # 2.07 insn per cycle
- 3.515594954 seconds time elapsed
+EvtsPerSec[Rmb+ME] (23) = ( 1.214834e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.247081e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.247142e+05 ) sec^-1
+MeanMatrixElemValue = ( 1.252232e+02 +- 1.234346e+02 ) GeV^-4
+TOTAL : 8.477859 sec
+ 29,184,033,298 cycles:u # 3.428 GHz (74.95%)
+ 23,042,044 stalled-cycles-frontend:u # 0.08% frontend cycles idle (74.96%)
+ 1,124,504,964 stalled-cycles-backend:u # 3.85% backend cycles idle (75.01%)
+ 23,480,301,219 instructions:u # 0.80 insn per cycle
+ # 0.05 stalled cycles per insn (75.04%)
+ 8.533410082 seconds time elapsed
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2
 Avg ME (C++/CUDA) = 6.626675e-04
-Avg ME (F77/CUDA) = 6.6266731198158133E-004
-Relative difference = 2.837296512218831e-07
+Avg ME (F77/CUDA) = 6.6266731198158101E-004
+Relative difference = 2.837296517127185e-07
 OK (relative difference <= 5E-3)
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP=
+runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.881683e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.882597e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.882597e+03 ) sec^-1
-MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 8.725568 sec
- 26,442,204,272 cycles # 3.029 GHz
- 81,754,472,166 instructions # 3.09 insn per cycle
- 8.731630799 seconds time elapsed
+EvtsPerSec[Rmb+ME] (23) = ( 2.203599e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.204480e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.204480e+03 ) sec^-1
+MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4
+TOTAL : 7.450038 sec
+ 26,153,434,105 cycles:u # 3.501 GHz (74.94%)
+ 21,271,365 stalled-cycles-frontend:u # 0.08% frontend cycles idle (74.95%)
+ 3,954,646,940 stalled-cycles-backend:u # 15.12% backend cycles idle (75.00%)
+ 81,744,725,764 instructions:u # 3.13 insn per cycle
+ # 0.05 stalled cycles per insn (75.05%)
+ 7.472964771 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 6614) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest.exe
+runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest.exe
 [ PASSED ] 6 tests.
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2
 Avg ME (C++/C++) = 6.626675e-04
 Avg ME (F77/C++) = 6.6266731198141133E-004
 Relative difference = 2.8372990776517314e-07
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP=
+runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 3.719790e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.723385e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.723385e+03 ) sec^-1
-MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 4.419535 sec
- 12,903,853,279 cycles # 2.917 GHz
- 39,240,792,929 instructions # 3.04 insn per cycle
- 4.424473414 seconds time elapsed
+EvtsPerSec[Rmb+ME] (23) = ( 5.015183e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.019740e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.019740e+03 ) sec^-1
+MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4
+TOTAL : 3.277876 sec
+ 11,548,473,130 cycles:u # 3.501 GHz (74.92%)
+ 1,063,310 stalled-cycles-frontend:u # 0.01% frontend cycles idle (75.03%)
+ 1,663,417,717 stalled-cycles-backend:u # 14.40% backend cycles idle (75.03%)
+ 39,248,215,923 instructions:u # 3.40 insn per cycle
+ # 0.04 stalled cycles per insn (75.03%)
+ 3.300947686 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4:12814) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest.exe
+runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest.exe
 [ PASSED ] 6 tests.
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2
 Avg ME (C++/C++) = 6.626675e-04
 Avg ME (F77/C++) = 6.6266731198141122E-004
 Relative difference = 2.837299079287849e-07
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP=
+runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 8.390147e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.408683e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.408683e+03 ) sec^-1
-MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 1.965025 sec
- 5,556,583,212 cycles # 2.822 GHz
- 13,788,397,685 instructions # 2.48 insn per cycle
- 1.970292377 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11059) (512y: 0) (512z: 0)
+EvtsPerSec[Rmb+ME] (23) = ( 1.197867e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.200436e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.200436e+04 ) sec^-1
+MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4
+TOTAL : 1.376750 sec
+ 4,853,545,908 cycles:u # 3.473 GHz (74.82%)
+ 745,705 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.82%)
+ 584,157,548 stalled-cycles-backend:u # 12.04% backend cycles idle (74.69%)
+ 13,847,035,835 instructions:u # 2.85 insn per cycle
+ # 0.04 stalled cycles per insn (74.97%)
+ 1.399345898 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11041) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest.exe
+runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest.exe
 [ PASSED ] 6 tests.
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2
 Avg ME (C++/C++) = 6.626675e-04
 Avg ME (F77/C++) = 6.6266731198157309E-004
 Relative difference = 2.837296636563793e-07
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP=
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
-FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 9.343307e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 9.365779e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 9.365779e+03 ) sec^-1
-MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 1.765569 sec
- 4,897,889,150 cycles # 2.769 GHz
- 12,318,083,328 instructions # 2.51 insn per cycle
- 1.770720541 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 9762) (512y: 94) (512z: 0)
+/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest.exe
-[ PASSED ] 6 tests.
--------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 6.626675e-04
-Avg ME (F77/C++) = 6.6266731198157309E-004
-Relative difference = 2.837296636563793e-07
-OK (relative difference <= 5E-3)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP=
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
-FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 7.419663e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.434340e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.434340e+03 ) sec^-1
-MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 2.220750 sec
- 4,055,797,181 cycles # 1.823 GHz
- 6,285,553,359 instructions # 1.55 insn per cycle
- 2.225912970 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1516) (512y: 94) (512z: 9019)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest.exe
-[ PASSED ] 6 tests.
--------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 6.626675e-04
-Avg ME (F77/C++) = 6.6266731198157309E-004
-Relative difference = 2.837296636563793e-07
-OK (relative difference <= 5E-3)
+/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo)
 =========================================================================
 TEST COMPLETED
diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd1.txt
index 6ec4da2ebd..d771088f9e 100644
--- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd1.txt
+++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd1.txt
@@ -1,223 +1,181 @@
 export CUDACPP_RUNTIME_ENABLEFPE=on
-Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg
-OMPFLAGS=-fopenmp
-AVX=512y
+Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg
+OMPFLAGS=
+AVX=avx2
 FPTYPE=d
 HELINL=0
 HRDCOD=0
-RNDGEN=hasCurand
-Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1)
+RNDGEN=hasNoCurand
+Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1)
 make: Nothing to be done for 'gtestlibs'.
-CUDACPP_BUILDDIR='build.512y_d_inl0_hrd1'
+CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd1'
 make USEBUILDDIR=1 AVX=none
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 CUDACPP_BUILDDIR='build.none_d_inl0_hrd1'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 make USEBUILDDIR=1 AVX=sse4
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd1'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 make USEBUILDDIR=1 AVX=avx2
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd1'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 make USEBUILDDIR=1 AVX=512y
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 CUDACPP_BUILDDIR='build.512y_d_inl0_hrd1'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 make USEBUILDDIR=1 AVX=512z
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 CUDACPP_BUILDDIR='build.512z_d_inl0_hrd1'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
-DATE: 2024-01-27_18:37:21
+DATE: 2024-01-28_13:14:00
-On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
+On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]:
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/gcheck.exe -p 64 256 1 OMP=
+runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/gcheck.exe -p 64 256 1 OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1]
-Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
+Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1]
+Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 3.443994e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.472121e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.475106e+05 ) sec^-1
-MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 0.529621 sec
- 2,179,867,827 cycles # 2.817 GHz
- 3,377,723,181 instructions # 1.55 insn per cycle
- 0.844194090 seconds time elapsed
-runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/gcheck.exe -p 64 256 1
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
-==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
+EvtsPerSec[Rmb+ME] (23) = ( 1.380880e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.440819e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.441063e+05 ) sec^-1
+MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4
+TOTAL : 0.526573 sec
+ 1,550,068,082 cycles:u # 2.831 GHz (74.80%)
+ 2,379,457 stalled-cycles-frontend:u # 0.15% frontend cycles idle (74.92%)
+ 41,567,895 stalled-cycles-backend:u # 2.68% backend cycles idle (75.11%)
+ 1,842,006,083 instructions:u # 1.19 insn per cycle
+ # 0.02 stalled cycles per insn (75.31%)
+ 0.569023795 seconds time elapsed
 .........................................................................
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP=
+runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1]
-Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
+Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1]
+Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 4.133949e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.168641e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.170113e+05 ) sec^-1
-MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4
-TOTAL : 3.040771 sec
- 9,449,850,956 cycles # 2.864 GHz
- 21,228,312,642 instructions # 2.25 insn per cycle
- 3.355844186 seconds time elapsed
+EvtsPerSec[Rmb+ME] (23) = ( 1.738074e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.743433e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.743544e+05 ) sec^-1
+MeanMatrixElemValue = ( 1.252232e+02 +- 1.234346e+02 ) GeV^-4
+TOTAL : 7.022676 sec
+ 24,105,089,398 cycles:u # 3.417 GHz (74.92%)
+ 11,392,847 stalled-cycles-frontend:u # 0.05% frontend cycles idle (74.96%)
+ 1,121,387,423 stalled-cycles-backend:u # 4.65% backend cycles idle (75.04%)
+ 19,016,168,216 instructions:u # 0.79 insn per cycle
+ # 0.06 stalled cycles per insn (75.06%)
+ 7.072805960 seconds time elapsed
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/gcheck.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/fgcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/gcheck.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/fgcheck.exe 2 64 2
 Avg ME (C++/CUDA) = 6.626675e-04
-Avg ME (F77/CUDA) = 6.6266731198158133E-004
-Relative difference = 2.837296512218831e-07
+Avg ME (F77/CUDA) = 6.6266731198158101E-004
+Relative difference = 2.837296517127185e-07
 OK (relative difference <= 5E-3)
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/check.exe -p 64 256 1 OMP=
+runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/check.exe -p 64 256 1 OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1]
+Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.871145e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.872027e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.872027e+03 ) sec^-1
-MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 8.775522 sec
- 26,467,836,514 cycles # 3.015 GHz
- 81,778,710,330 instructions # 3.09 insn per cycle
- 8.783029262 seconds time elapsed
+EvtsPerSec[Rmb+ME] (23) = ( 2.209445e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.210340e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.210340e+03 ) sec^-1
+MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4
+TOTAL : 7.430105 sec
+ 26,085,936,101 cycles:u # 3.501 GHz (74.98%)
+ 11,236,918 stalled-cycles-frontend:u # 0.04% frontend cycles idle (74.98%)
+ 3,350,397,424 stalled-cycles-backend:u # 12.84% backend cycles idle (74.99%)
+ 81,795,777,968 instructions:u # 3.14 insn per cycle
+ # 0.04 stalled cycles per insn (74.99%)
+ 7.453494108 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 6589) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/runTest.exe
+runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/runTest.exe
 [ PASSED ] 6 tests.
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/fcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/check.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/fcheck.exe 2 64 2
 Avg ME (C++/C++) = 6.626675e-04
 Avg ME (F77/C++) = 6.6266731198141133E-004
 Relative difference = 2.8372990776517314e-07
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd1/check.exe -p 64 256 1 OMP=
+runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd1/check.exe -p 64 256 1 OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1]
+Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 3.712170e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.715556e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.715556e+03 ) sec^-1
-MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
-TOTAL : 4.427464 sec
- 12,911,058,764 cycles # 2.914 GHz
- 39,248,548,650 instructions # 3.04 insn per cycle
- 4.443782704 seconds time elapsed
+EvtsPerSec[Rmb+ME] (23) = ( 4.970390e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.974839e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.974839e+03 ) sec^-1
+MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4
+TOTAL : 3.306999 sec
+ 11,648,485,988 cycles:u # 3.500 GHz (74.98%)
+ 9,866,301 stalled-cycles-frontend:u # 0.08% frontend cycles idle (75.00%)
+ 1,494,462,993 stalled-cycles-backend:u # 12.83% backend cycles idle (75.00%)
+ 39,254,097,381 instructions:u # 3.37 insn per cycle
+ # 0.04 stalled cycles per insn (75.00%)
+ 3.331484334 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4:12771) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd1/runTest.exe
+runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd1/runTest.exe
 [ PASSED ] 6 tests.
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd1/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd1/fcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd1/check.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd1/fcheck.exe 2 64 2
 Avg ME (C++/C++) = 6.626675e-04
 Avg ME (F77/C++) = 6.6266731198141122E-004
 Relative difference = 2.837299079287849e-07
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/check.exe -p 64 256 1 OMP=
+runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/check.exe -p 64 256 1 OMP=
 WARNING!
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.404510e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.421856e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.421856e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.961012 sec - 5,550,780,310 cycles # 2.824 GHz - 13,804,627,273 instructions # 2.49 insn per cycle - 1.973888367 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11048) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.197117e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.199676e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.199676e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 +TOTAL : 1.377372 sec + 4,863,607,601 cycles:u # 3.477 GHz (74.84%) + 1,813,839 stalled-cycles-frontend:u # 0.04% frontend cycles idle (74.84%) + 599,987,605 stalled-cycles-backend:u # 12.34% backend cycles idle (74.84%) + 13,847,758,852 instructions:u # 2.85 insn per cycle + # 0.04 stalled cycles per insn (74.89%) + 1.401964703 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11030) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198157309E-004 Relative difference = 2.837296636563793e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd1/check.exe -p 64 256 1 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.427377e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.450136e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.450136e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.749655 sec - 4,882,694,724 cycles # 2.783 GHz - 12,329,545,304 instructions # 2.53 insn per cycle - 1.763737429 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 9736) (512y: 94) (512z: 0) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198157309E-004 -Relative difference = 2.837296636563793e-07 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd1/check.exe -p 64 256 1 OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.530178e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.544294e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.544294e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.188783 sec - 4,047,765,395 cycles # 1.846 GHz - 6,292,916,815 instructions # 1.55 insn per cycle - 2.202791681 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1497) (512y: 94) (512z: 9019) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. 
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198157309E-004 -Relative difference = 2.837296636563793e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd0.txt index 719bba46fb..5a0d1e0231 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd0.txt @@ -1,223 +1,181 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasCurand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) +RNDGEN=hasNoCurand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.512y_d_inl1_hrd0' +CUDACPP_BUILDDIR='build.avx2_d_inl1_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512y_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-01-27_19:13:06 +DATE: 2024-01-28_13:33:33 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/gcheck.exe -p 64 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/gcheck.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.222007e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.246519e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.249113e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.532665 sec - 2,258,388,500 cycles # 2.939 GHz - 3,394,739,067 instructions # 1.50 insn per cycle - 0.825914000 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/gcheck.exe -p 64 256 1 -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 7.432864e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.607586e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.609131e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 +TOTAL : 0.647725 sec + 1,965,032,551 cycles:u # 2.938 GHz (74.40%) + 2,457,806 stalled-cycles-frontend:u # 0.13% frontend cycles idle (74.84%) + 41,722,257 stalled-cycles-backend:u # 2.12% backend cycles idle (74.83%) + 2,120,082,784 instructions:u # 1.08 insn per cycle + # 0.02 stalled cycles per insn (75.87%) + 0.693866618 seconds time elapsed ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/gcheck.exe -p 2048 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/gcheck.exe -p 2048 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.769378e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.798106e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.799313e+05 ) sec^-1 -MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.307089 sec - 10,656,119,044 cycles # 2.993 GHz - 24,479,944,936 instructions # 2.30 insn per cycle - 3.619320255 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 1.246947e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.249779e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.249840e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.252232e+02 +- 1.234346e+02 ) GeV^-4 +TOTAL : 8.369635 sec + 28,801,506,279 cycles:u # 3.428 GHz (74.94%) + 11,544,325 stalled-cycles-frontend:u # 0.04% frontend cycles idle (75.01%) + 1,125,531,027 stalled-cycles-backend:u # 3.91% backend cycles idle (75.06%) + 22,691,924,055 instructions:u # 0.79 insn per cycle + # 0.05 stalled cycles per insn (75.11%) + 8.425153536 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/fgcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 6.626675e-04 -Avg ME (F77/CUDA) = 6.6266731198158122E-004 -Relative difference = 2.837296513854949e-07 +Avg ME (F77/CUDA) = 6.6266731198158101E-004 +Relative difference = 2.837296517127185e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/check.exe -p 64 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/check.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.367139e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.367639e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.367639e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 37.563805 sec - 113,035,005,873 cycles # 3.009 GHz - 141,513,387,724 instructions # 1.25 insn per cycle - 37.569095568 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:21365) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 4.588775e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.589159e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.589159e+02 ) sec^-1 +MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 +TOTAL : 35.747768 sec + 125,373,892,272 cycles:u # 3.505 GHz (75.00%) + 90,420,899 stalled-cycles-frontend:u # 0.07% frontend cycles idle (75.00%) + 18,811,489,489 stalled-cycles-backend:u # 15.00% backend cycles idle (74.99%) + 141,480,205,816 instructions:u # 1.13 insn per cycle + # 0.13 stalled cycles per insn (75.00%) + 35.771306562 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:21543) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198140461E-004 Relative difference = 2.8372991790910424e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/check.exe -p 64 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.215278e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.217898e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.217898e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 5.110819 sec - 14,991,773,237 cycles # 2.931 GHz - 37,532,307,514 instructions # 2.50 insn per cycle - 5.116400936 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 3.643252e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.645635e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.645635e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 +TOTAL : 4.509497 sec + 15,846,751,189 cycles:u # 3.497 GHz (74.93%) + 1,161,342 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.93%) + 6,805,781,414 stalled-cycles-backend:u # 42.95% backend cycles idle (74.94%) + 37,571,663,160 instructions:u # 2.37 insn per cycle + # 0.18 stalled cycles per insn (74.99%) + 4.534445045 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:68052) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141220E-004 Relative difference = 2.837299064562788e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/check.exe -p 64 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.673563e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.688591e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.688591e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.147516 sec - 6,037,066,805 cycles # 2.806 GHz - 12,947,337,518 instructions # 2.14 insn per cycle - 2.152800643 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:46593) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 7.568945e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.579223e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.579223e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 +TOTAL : 2.174735 sec + 7,662,049,435 cycles:u # 3.489 GHz (74.93%) + 1,838,193 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.87%) + 4,393,495,514 stalled-cycles-backend:u # 57.34% backend cycles idle (74.87%) + 12,965,579,204 instructions:u # 1.69 insn per cycle + # 0.34 stalled cycles per insn (74.89%) + 2.199658630 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:46575) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198156778E-004 Relative difference = 2.837296716733571e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd0/check.exe -p 64 256 1 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.216341e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.238951e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.238951e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.789912 sec - 5,003,070,633 cycles # 2.789 GHz - 11,363,542,829 instructions # 2.27 insn per cycle - 1.795109385 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:40158) (512y: 279) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd0/runTest.exe -[ PASSED ] 6 tests. +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198156778E-004 -Relative difference = 2.837296716733571e-07 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd0/check.exe -p 64 256 1 OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.733648e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.748517e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.748517e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.131221 sec - 3,906,829,492 cycles # 1.830 GHz - 5,854,010,142 instructions # 1.50 insn per cycle - 2.136494313 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2112) (512y: 142) (512z:39211) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd0/runTest.exe -[ PASSED ] 6 tests. 
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198156789E-004 -Relative difference = 2.837296715097453e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd1.txt index 513e439bd0..4cdfc5e542 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd1.txt @@ -1,223 +1,181 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasCurand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) +RNDGEN=hasNoCurand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.512y_d_inl1_hrd1' +CUDACPP_BUILDDIR='build.avx2_d_inl1_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512y_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-01-27_19:14:16 +DATE: 2024-01-28_13:34:37 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/gcheck.exe -p 64 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/gcheck.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.223781e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.250560e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.253073e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.534925 sec - 2,229,964,752 cycles # 2.924 GHz - 3,438,029,140 instructions # 1.54 insn per cycle - 0.823050876 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/gcheck.exe -p 64 256 1 -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 1.418171e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.479591e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.480154e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 +TOTAL : 0.525474 sec + 1,515,282,519 cycles:u # 2.761 GHz (75.24%) + 2,314,963 stalled-cycles-frontend:u # 0.15% frontend cycles idle (75.16%) + 49,253,571 stalled-cycles-backend:u # 3.25% backend cycles idle (75.70%) + 1,836,383,781 instructions:u # 1.21 insn per cycle + # 0.03 stalled cycles per insn (75.69%) + 0.574068605 seconds time elapsed ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/gcheck.exe -p 2048 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/gcheck.exe -p 2048 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.793172e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.822087e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.823326e+05 ) sec^-1 -MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.282737 sec - 10,537,359,774 cycles # 2.978 GHz - 22,975,518,499 instructions # 2.18 insn per cycle - 3.596154660 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 1.738621e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.744167e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.744286e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.252232e+02 +- 1.234346e+02 ) GeV^-4 +TOTAL : 7.027729 sec + 24,090,392,814 cycles:u # 3.412 GHz (74.96%) + 11,578,237 stalled-cycles-frontend:u # 0.05% frontend cycles idle (74.97%) + 1,124,890,854 stalled-cycles-backend:u # 4.67% backend cycles idle (75.06%) + 18,999,230,554 instructions:u # 0.79 insn per cycle + # 0.06 stalled cycles per insn (75.08%) + 7.079579939 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/fgcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 6.626675e-04 -Avg ME (F77/CUDA) = 6.6266731198158122E-004 -Relative difference = 2.837296513854949e-07 +Avg ME (F77/CUDA) = 6.6266731198158101E-004 +Relative difference = 2.837296517127185e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/check.exe -p 64 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/check.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.315697e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.316172e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.316172e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 38.011955 sec - 114,307,951,290 cycles # 3.007 GHz - 141,699,174,548 instructions # 1.24 insn per cycle - 38.017244555 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:21615) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 4.525947e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.526320e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.526320e+02 ) sec^-1 +MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 +TOTAL : 36.243408 sec + 127,088,498,504 cycles:u # 3.505 GHz (74.99%) + 73,503,075 stalled-cycles-frontend:u # 0.06% frontend cycles idle (75.00%) + 17,996,871,744 stalled-cycles-backend:u # 14.16% backend cycles idle (75.01%) + 141,684,901,564 instructions:u # 1.11 insn per cycle + # 0.13 stalled cycles per insn (75.01%) + 36.267040644 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:21831) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198140461E-004 Relative difference = 2.8372991790910424e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/check.exe -p 64 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.178101e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.180722e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.180722e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 5.170239 sec - 14,906,390,957 cycles # 2.881 GHz - 37,595,371,004 instructions # 2.52 insn per cycle - 5.175326172 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 3.601252e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.603641e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.603641e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 +TOTAL : 4.561717 sec + 16,042,436,101 cycles:u # 3.500 GHz (74.91%) + 11,366,493 stalled-cycles-frontend:u # 0.07% frontend cycles idle (74.93%) + 6,568,426,217 stalled-cycles-backend:u # 40.94% backend cycles idle (75.00%) + 37,595,433,276 instructions:u # 2.34 insn per cycle + # 0.17 stalled cycles per insn (75.04%) + 4.586797272 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:68056) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141220E-004 Relative difference = 2.837299064562788e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/check.exe -p 64 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.835494e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.851184e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.851184e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.106511 sec - 5,931,759,794 cycles # 2.813 GHz - 12,831,713,311 instructions # 2.16 insn per cycle - 2.111663205 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:45663) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 7.703634e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.714267e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.714267e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 +TOTAL : 2.136636 sec + 7,530,800,749 cycles:u # 3.489 GHz (74.86%) + 765,598 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.80%) + 4,267,023,774 stalled-cycles-backend:u # 56.66% backend cycles idle (74.83%) + 12,869,855,404 instructions:u # 1.71 insn per cycle + # 0.33 stalled cycles per insn (74.99%) + 2.161501278 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:45645) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198156778E-004 Relative difference = 2.837296716733571e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd1/check.exe -p 64 256 1 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.356622e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.379915e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.379915e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.763132 sec - 4,989,376,638 cycles # 2.823 GHz - 11,359,472,763 instructions # 2.28 insn per cycle - 1.768435525 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:39855) (512y: 212) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd1/runTest.exe -[ PASSED ] 6 tests. +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198156778E-004 -Relative difference = 2.837296716733571e-07 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd1/check.exe -p 64 256 1 OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.751418e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.767332e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.767332e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.126080 sec - 3,898,010,783 cycles # 1.830 GHz - 5,843,135,780 instructions # 1.50 insn per cycle - 2.131353524 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1687) (512y: 116) (512z:38946) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd1/runTest.exe -[ PASSED ] 6 tests. 
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198156789E-004 -Relative difference = 2.837296715097453e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt index 05c77ff41e..95d88aeb3b 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt @@ -1,223 +1,181 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasCurand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) +RNDGEN=hasNoCurand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-01-27_18:37:59 +DATE: 2024-01-28_13:14:29 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.317460e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.368942e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.376232e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4 -TOTAL : 0.485177 sec - 2,060,153,436 cycles # 2.906 GHz - 3,002,003,985 instructions # 1.46 insn per cycle - 0.795438508 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 2.571689e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.781012e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.782456e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.202247e-01 +- 3.251485e-01 ) GeV^-4 +TOTAL : 0.433434 sec + 1,206,803,698 cycles:u # 2.663 GHz (75.31%) + 2,902,559 stalled-cycles-frontend:u # 0.24% frontend cycles idle (75.13%) + 50,941,529 stalled-cycles-backend:u # 4.22% backend cycles idle (75.27%) + 1,597,895,307 instructions:u # 1.32 insn per cycle + # 0.03 stalled cycles per insn (75.87%) + 0.476222044 seconds time elapsed ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.516597e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.590115e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.593568e+05 ) sec^-1 -MeanMatrixElemValue = ( 6.664703e+00 +- 5.072736e+00 ) GeV^-4 -TOTAL : 1.727811 sec - 5,842,248,421 cycles # 2.980 GHz - 11,578,097,409 instructions # 1.98 insn per cycle - 2.017768230 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 4.697888e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.731044e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.731472e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.213664e+02 +- 1.195366e+02 ) GeV^-4 +TOTAL : 3.294945 sec + 11,116,405,588 cycles:u # 3.346 GHz (74.89%) + 27,989,650 stalled-cycles-frontend:u # 0.25% frontend cycles idle (75.02%) + 1,135,046,392 stalled-cycles-backend:u # 10.21% backend cycles idle (74.96%) + 8,992,520,587 instructions:u # 0.81 insn per cycle + # 0.13 stalled cycles per insn (74.96%) + 3.342805006 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 6.626454e-04 -Avg ME (F77/CUDA) = 6.6262659968156085E-004 -Relative difference = 2.8371612387547027e-05 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 6.626791e-04 +Avg ME (F77/CUDA) = 6.6270899361878938E-004 +Relative difference = 4.511024836808726e-05 OK (relative difference <= 5E-3) ========================================================================= 
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe -p 64 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.052118e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.053165e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.053165e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4 -TOTAL : 8.003896 sec - 24,216,644,613 cycles # 3.025 GHz - 75,876,682,144 instructions # 3.13 insn per cycle - 8.010960986 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 2.453805e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.454845e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.454845e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.208458e-01 +- 3.253446e-01 ) GeV^-4 +TOTAL : 6.689474 sec + 23,500,390,704 cycles:u # 3.502 GHz (74.96%) + 1,349,268 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.96%) + 2,707,310,950 stalled-cycles-backend:u # 11.52% backend cycles idle (74.96%) + 75,903,291,879 instructions:u # 3.23 insn per cycle + # 0.04 stalled cycles per insn (74.97%) + 6.712419100 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 3898) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627487e-04 -Avg ME (F77/C++) = 6.6274870439686495E-004 -Relative difference = 6.634286759220428e-09 +Avg ME (F77/C++) = 6.6274866115424713E-004 +Relative difference = 5.861309557415831e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.403824e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.417702e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.417702e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.060119e+00 +- 2.367901e+00 ) GeV^-4 -TOTAL : 2.224003 sec - 6,486,236,444 cycles # 2.911 GHz - 20,115,514,239 instructions # 3.10 insn per cycle - 2.239579450 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 9.874746e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.892103e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.892103e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.208459e-01 +- 3.253446e-01 ) GeV^-4 +TOTAL : 1.666769 sec + 5,880,877,173 cycles:u # 3.485 GHz (74.88%) + 736,307 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.89%) + 919,739,361 stalled-cycles-backend:u # 15.64% backend cycles idle (74.89%) + 20,182,215,345 instructions:u # 3.43 insn per cycle + # 0.05 stalled cycles per insn (74.78%) + 1.690998280 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:13237) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627485e-04 -Avg ME (F77/C++) = 6.6274853360924479E-004 -Relative difference = 5.071191384964548e-08 +Avg ME (F77/C++) = 6.6274845946848876E-004 +Relative difference = 6.115670001294808e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.590551e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.597032e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.597032e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 1.040535 sec - 2,821,593,769 cycles # 2.701 GHz - 7,038,300,357 instructions # 2.49 insn per cycle - 1.056476360 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11604) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.349433e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.359555e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.359555e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.214980e-01 +- 3.255523e-01 ) GeV^-4 +TOTAL : 0.704009 sec + 2,508,283,415 cycles:u # 3.459 GHz (74.63%) + 560,879 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.63%) + 244,621,289 stalled-cycles-backend:u # 9.75% backend cycles idle (74.63%) + 7,097,199,877 instructions:u # 2.83 insn per cycle + # 0.03 stalled cycles per insn (74.72%) + 0.728264552 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11586) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627193e-04 -Avg ME (F77/C++) = 6.6271927529261421E-004 -Relative difference = 3.728182620967159e-08 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe -p 64 256 1 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.875158e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.883769e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.883769e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 0.883644 sec - 2,479,108,367 cycles # 2.791 GHz - 6,280,326,617 instructions # 2.53 insn per cycle - 0.896889444 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10320) (512y: 50) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627193e-04 -Avg ME (F77/C++) = 6.6271927529261421E-004 -Relative difference = 3.728182620967159e-08 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.627195e-04 +Avg ME (F77/C++) = 6.6271947045332125E-004 +Relative difference = 4.4583988847766445e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe -p 64 256 1 OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.414115e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.419249e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.419249e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4 -TOTAL : 1.168827 sec - 2,036,775,596 cycles # 1.736 GHz - 3,248,885,824 instructions # 1.60 insn per cycle - 1.182247107 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2165) (512y: 48) (512z: 9219) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. 
+/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627195e-04 -Avg ME (F77/C++) = 6.6271952818273971E-004 -Relative difference = 4.252589469696448e-08 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_bridge.txt index ec70d3a329..122bc95d94 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_bridge.txt @@ -1,240 +1,190 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasCurand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) +RNDGEN=hasNoCurand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-01-27_19:24:55 +DATE: 2024-01-28_13:47:53 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) WARNING! 
Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.606611e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.289869e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.289869e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.048178e+00 +- 2.364571e+00 ) GeV^-4 -TOTAL : 0.470000 sec - 2,011,017,737 cycles # 2.923 GHz - 2,948,669,744 instructions # 1.47 insn per cycle - 0.746672929 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 2.592507e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.767585e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.767585e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.202335e-01 +- 3.251521e-01 ) GeV^-4 +TOTAL : 0.438826 sec + 1,210,310,947 cycles:u # 2.619 GHz (75.63%) + 3,071,241 stalled-cycles-frontend:u # 0.25% frontend cycles idle (75.78%) + 46,204,045 stalled-cycles-backend:u # 3.82% backend cycles idle (75.54%) + 1,612,879,644 instructions:u # 1.33 insn per cycle + # 0.03 stalled cycles per insn (74.77%) + 0.482142933 seconds time elapsed ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.237746e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.479275e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.479275e+05 ) sec^-1 -MeanMatrixElemValue = ( 6.641710e+00 +- 4.994249e+00 ) GeV^-4 -TOTAL : 1.902902 sec - 6,321,067,258 cycles # 2.957 GHz - 12,132,072,792 instructions # 1.92 insn per cycle - 2.198373975 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 4.268741e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.711405e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.711405e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.213799e+02 +- 1.195366e+02 ) GeV^-4 +TOTAL : 3.442199 sec + 11,495,778,661 cycles:u # 3.309 GHz (74.91%) + 38,646,041 stalled-cycles-frontend:u # 0.34% frontend cycles idle (75.05%) + 1,141,934,285 stalled-cycles-backend:u # 9.93% backend cycles idle (75.05%) + 9,990,513,371 instructions:u # 0.87 insn per cycle + # 0.11 stalled cycles per insn (75.02%) + 3.496820444 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 6.626454e-04 -Avg ME (F77/CUDA) = 6.6262659968156085E-004 -Relative difference = 2.8371612387547027e-05 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 6.626791e-04 +Avg ME (F77/CUDA) = 6.6270899361878938E-004 +Relative difference = 4.511024836808726e-05 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.050984e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.052020e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.052020e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4 -TOTAL : 8.008133 sec - 24,231,473,098 cycles # 3.025 GHz - 75,883,434,879 instructions # 3.13 insn per cycle - 8.013586367 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 2.452635e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.453690e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.453690e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.208458e-01 +- 3.253446e-01 ) GeV^-4 +TOTAL : 6.694669 sec + 23,515,293,319 cycles:u # 3.502 GHz (74.99%) + 1,375,662 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.99%) + 2,849,268,325 stalled-cycles-backend:u # 12.12% backend cycles idle (74.99%) + 75,874,654,450 instructions:u # 3.23 insn per cycle + # 0.04 stalled cycles per insn (74.99%) + 6.718179574 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 3898) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627487e-04 -Avg ME (F77/C++) = 6.6274870439686495E-004 -Relative difference = 6.634286759220428e-09 +Avg ME (F77/C++) = 6.6274866115424713E-004 +Relative difference = 5.861309557415831e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.414640e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.429427e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.429427e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.060119e+00 +- 2.367901e+00 ) GeV^-4 -TOTAL : 2.223220 sec - 6,514,807,099 cycles # 2.925 GHz - 20,124,720,070 instructions # 3.09 insn per cycle - 2.228499201 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 9.888970e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.906318e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.906318e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.208459e-01 +- 3.253446e-01 ) GeV^-4 +TOTAL : 1.666375 sec + 5,877,271,334 cycles:u # 3.482 GHz (74.88%) + 717,296 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.88%) + 874,212,055 stalled-cycles-backend:u # 14.87% backend cycles idle (74.88%) + 20,187,147,538 instructions:u # 3.43 insn per cycle + # 0.04 stalled cycles per insn (74.74%) + 1.691273437 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:13237) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627485e-04 -Avg ME (F77/C++) = 6.6274853360924479E-004 -Relative difference = 5.071191384964548e-08 +Avg ME (F77/C++) = 6.6274845946848876E-004 +Relative difference = 6.115670001294808e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.662841e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.670032e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.670032e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 0.997374 sec - 2,830,956,139 cycles # 2.827 GHz - 7,047,558,426 instructions # 2.49 insn per cycle - 1.002648676 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11604) (512y: 0) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627193e-04 -Avg ME (F77/C++) = 6.6271927529261421E-004 -Relative difference = 3.728182620967159e-08 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.894795e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.904037e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.904037e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 0.876913 sec - 2,489,066,136 cycles # 2.825 GHz - 6,289,333,997 instructions # 2.53 insn per cycle - 0.882075649 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10320) (512y: 50) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.354855e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.364991e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.364991e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.214980e-01 +- 3.255523e-01 ) GeV^-4 +TOTAL : 0.704324 sec + 2,502,599,529 cycles:u # 3.449 GHz (74.65%) + 1,026,743 stalled-cycles-frontend:u # 0.04% frontend cycles idle (74.65%) + 245,383,676 stalled-cycles-backend:u # 9.81% backend cycles idle (74.65%) + 7,099,256,311 instructions:u # 2.84 insn per cycle + # 0.03 stalled cycles per insn (74.74%) + 0.729305773 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11586) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627193e-04 -Avg ME (F77/C++) = 6.6271927529261421E-004 -Relative difference = 3.728182620967159e-08 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.627195e-04 +Avg ME (F77/C++) = 6.6271947045332125E-004 +Relative difference = 4.4583988847766445e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.500045e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.505997e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.505997e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4 -TOTAL : 1.107321 sec - 2,047,079,556 cycles # 1.844 GHz - 3,258,091,397 instructions # 1.59 insn per cycle - 1.112788129 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2165) (512y: 48) (512z: 9219) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627195e-04 -Avg ME (F77/C++) = 6.6271952818273971E-004 -Relative difference = 4.252589469696448e-08 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_common.txt index ea9c8935ca..6f501681db 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_common.txt @@ -1,223 +1,181 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasCurand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) +RNDGEN=hasNoCurand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-01-27_19:36:54 +DATE: 2024-01-28_13:56:58 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --common OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:FLT+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK
+Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 6.345633e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.400482e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.406925e+05 ) sec^-1
-MeanMatrixElemValue = ( 4.159397e-01 +- 3.238804e-01 ) GeV^-4
-TOTAL : 0.467633 sec
- 1,992,809,075 cycles # 2.918 GHz
- 2,979,298,487 instructions # 1.50 insn per cycle
- 0.742813268 seconds time elapsed
-runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --common
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
-==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
+EvtsPerSec[Rmb+ME] (23) = ( 2.567605e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.783158e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.783801e+05 ) sec^-1
+MeanMatrixElemValue = ( 4.202247e-01 +- 3.251485e-01 ) GeV^-4
+TOTAL : 0.433304 sec
+ 1,203,789,126 cycles:u # 2.646 GHz (75.11%)
+ 2,876,739 stalled-cycles-frontend:u # 0.24% frontend cycles idle (75.13%)
+ 47,664,150 stalled-cycles-backend:u # 3.96% backend cycles idle (75.36%)
+ 1,559,004,535 instructions:u # 1.30 insn per cycle
+ # 0.03 stalled cycles per insn (75.65%)
+ 0.473764291 seconds time elapsed
 .........................................................................
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --common OMP=
+runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --common OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:FLT+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK
+Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 8.572479e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.646363e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.649847e+05 ) sec^-1
-MeanMatrixElemValue = ( 1.094367e+02 +- 1.071509e+02 ) GeV^-4
-TOTAL : 1.799926 sec
- 6,037,109,549 cycles # 2.977 GHz
- 12,003,912,176 instructions # 1.99 insn per cycle
- 2.085917224 seconds time elapsed
+EvtsPerSec[Rmb+ME] (23) = ( 4.694456e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.725395e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.725816e+05 ) sec^-1
+MeanMatrixElemValue = ( 1.213664e+02 +- 1.195366e+02 ) GeV^-4
+TOTAL : 3.304741 sec
+ 11,130,877,093 cycles:u # 3.339 GHz (74.92%)
+ 27,954,603 stalled-cycles-frontend:u # 0.25% frontend cycles idle (75.09%)
+ 1,134,042,955 stalled-cycles-backend:u # 10.19% backend cycles idle (75.10%)
+ 9,015,184,927 instructions:u # 0.81 insn per cycle
+ # 0.13 stalled cycles per insn (75.03%)
+ 3.353523598 seconds time elapsed
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2
-Avg ME (C++/CUDA) = 6.626454e-04
-Avg ME (F77/CUDA) = 6.6262659968156085E-004
-Relative difference = 2.8371612387547027e-05
+cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2
+Avg ME (C++/CUDA) = 6.626791e-04
+Avg ME (F77/CUDA) = 6.6270899361878938E-004
+Relative difference = 4.511024836808726e-05
 OK (relative difference <= 5E-3)
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe -p 64 256 1 --common OMP=
+runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe -p 64 256 1 --common OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.024041e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.025057e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.025057e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.452340e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.453385e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.453385e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.208458e-01 +- 3.253446e-01 ) GeV^-4
-TOTAL : 8.112508 sec
- 24,227,662,663 cycles # 2.986 GHz
- 75,875,626,096 instructions # 3.13 insn per cycle
- 8.117324848 seconds time elapsed
+TOTAL : 6.693555 sec
+ 23,512,441,331 cycles:u # 3.502 GHz (74.98%)
+ 1,325,303 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.98%)
+ 2,795,705,162 stalled-cycles-backend:u # 11.89% backend cycles idle (74.98%)
+ 75,871,702,156 instructions:u # 3.23 insn per cycle
+ # 0.04 stalled cycles per insn (74.98%)
+ 6.716315041 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 3898) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest.exe
+runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest.exe
 [ PASSED ] 6 tests.
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2
 Avg ME (C++/C++) = 6.627487e-04
-Avg ME (F77/C++) = 6.6274870439686495E-004
-Relative difference = 6.634286759220428e-09
+Avg ME (F77/C++) = 6.6274866115424713E-004
+Relative difference = 5.861309557415831e-08
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 1 --common OMP=
+runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 1 --common OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 7.392521e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.406919e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.406919e+03 ) sec^-1
-MeanMatrixElemValue = ( 4.208458e-01 +- 3.253446e-01 ) GeV^-4
-TOTAL : 2.228816 sec
- 6,503,950,010 cycles # 2.914 GHz
- 20,114,583,883 instructions # 3.09 insn per cycle
- 2.233561370 seconds time elapsed
+EvtsPerSec[Rmb+ME] (23) = ( 9.836691e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 9.854293e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 9.854293e+03 ) sec^-1
+MeanMatrixElemValue = ( 4.208459e-01 +- 3.253446e-01 ) GeV^-4
+TOTAL : 1.673211 sec
+ 5,901,611,042 cycles:u # 3.484 GHz (74.97%)
+ 2,928,497 stalled-cycles-frontend:u # 0.05% frontend cycles idle (74.97%)
+ 890,340,098 stalled-cycles-backend:u # 15.09% backend cycles idle (74.98%)
+ 20,135,891,180 instructions:u # 3.41 insn per cycle
+ # 0.04 stalled cycles per insn (74.98%)
+ 1.695783923 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4:13237) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest.exe
+runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest.exe
 [ PASSED ] 6 tests.
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2
 Avg ME (C++/C++) = 6.627485e-04
-Avg ME (F77/C++) = 6.6274853360924479E-004
-Relative difference = 5.071191384964548e-08
+Avg ME (F77/C++) = 6.6274845946848876E-004
+Relative difference = 6.115670001294808e-08
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 1 --common OMP=
+runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 1 --common OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.648999e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.656276e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.656276e+04 ) sec^-1
-MeanMatrixElemValue = ( 4.214979e-01 +- 3.255522e-01 ) GeV^-4
-TOTAL : 1.004280 sec
- 2,822,831,236 cycles # 2.800 GHz
- 7,034,381,066 instructions # 2.49 insn per cycle
- 1.009123591 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11604) (512y: 0) (512z: 0)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest.exe
-[ PASSED ] 6 tests.
--------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 6.627193e-04
-Avg ME (F77/C++) = 6.6271927529261421E-004
-Relative difference = 3.728182620967159e-08
-OK (relative difference <= 5E-3)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe -p 64 256 1 --common OMP=
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK
-FP precision = FLOAT (NaN/abnormal=0, zero=0)
-Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.891851e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.900995e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.900995e+04 ) sec^-1
-MeanMatrixElemValue = ( 4.214979e-01 +- 3.255522e-01 ) GeV^-4
-TOTAL : 0.876735 sec
- 2,481,593,013 cycles # 2.816 GHz
- 6,275,566,075 instructions # 2.53 insn per cycle
- 0.882601647 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10320) (512y: 50) (512z: 0)
+EvtsPerSec[Rmb+ME] (23) = ( 2.349784e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.359876e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.359876e+04 ) sec^-1
+MeanMatrixElemValue = ( 4.214980e-01 +- 3.255523e-01 ) GeV^-4
+TOTAL : 0.703938 sec
+ 2,503,013,691 cycles:u # 3.454 GHz (74.62%)
+ 1,047,319 stalled-cycles-frontend:u # 0.04% frontend cycles idle (74.37%)
+ 252,615,998 stalled-cycles-backend:u # 10.09% backend cycles idle (74.38%)
+ 7,084,595,149 instructions:u # 2.83 insn per cycle
+ # 0.04 stalled cycles per insn (75.18%)
+ 0.726588217 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11586) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest.exe
+runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest.exe
 [ PASSED ] 6 tests.
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 6.627193e-04
-Avg ME (F77/C++) = 6.6271927529261421E-004
-Relative difference = 3.728182620967159e-08
+cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2
+Avg ME (C++/C++) = 6.627195e-04
+Avg ME (F77/C++) = 6.6271947045332125E-004
+Relative difference = 4.4583988847766445e-08
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe -p 64 256 1 --common OMP=
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK
-FP precision = FLOAT (NaN/abnormal=0, zero=0)
-Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.497990e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.503806e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.503806e+04 ) sec^-1
-MeanMatrixElemValue = ( 4.214981e-01 +- 3.255523e-01 ) GeV^-4
-TOTAL : 1.104775 sec
- 2,038,939,507 cycles # 1.839 GHz
- 3,244,114,954 instructions # 1.59 insn per cycle
- 1.109760491 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2165) (512y: 48) (512z: 9219)
+/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest.exe
-[ PASSED ] 6 tests.
--------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 6.627195e-04
-Avg ME (F77/C++) = 6.6271952818273971E-004
-Relative difference = 4.252589469696448e-08
-OK (relative difference <= 5E-3)
+/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo)
 =========================================================================
 TEST COMPLETED
diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_curhst.txt
index 9df3fb8320..806e2754e6 100644
--- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_curhst.txt
+++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_curhst.txt
@@ -1,223 +1,143 @@
 export CUDACPP_RUNTIME_ENABLEFPE=on
-Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg
-OMPFLAGS=-fopenmp
-AVX=512y
+Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg
+OMPFLAGS=
+AVX=avx2
 FPTYPE=d
 HELINL=0
 HRDCOD=0
-RNDGEN=hasCurand
-Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1)
+RNDGEN=hasNoCurand
+Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1)
 make: Nothing to be done for 'gtestlibs'.
-CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0'
+CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0'
 make USEBUILDDIR=1 AVX=none
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 CUDACPP_BUILDDIR='build.none_f_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 make USEBUILDDIR=1 AVX=sse4
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 make USEBUILDDIR=1 AVX=avx2
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 make USEBUILDDIR=1 AVX=512y
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 make USEBUILDDIR=1 AVX=512z
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
-DATE: 2024-01-27_19:33:28
+DATE: 2024-01-28_13:54:33
-On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
+On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]:
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --curhst OMP=
+runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --curhst OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:FLT+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK
-FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 6.341989e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.394056e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.399853e+05 ) sec^-1
-MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4
-TOTAL : 0.464721 sec
- 1,978,581,147 cycles # 2.928 GHz
- 2,992,519,083 instructions # 1.51 insn per cycle
- 0.734249164 seconds time elapsed
-runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --curhst
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
-==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
+/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe: Aborted
+ 53,369,864 cycles:u # 2.565 GHz (61.58%)
+ 40,139 stalled-cycles-frontend:u # 0.08% frontend cycles idle (61.59%)
+ 642,753 stalled-cycles-backend:u # 1.20% backend cycles idle (61.59%)
+ 43,640,815 instructions:u # 0.82 insn per cycle
+ # 0.01 stalled cycles per insn (63.54%)
+ 0.021718512 seconds time elapsed
 .........................................................................
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --curhst OMP=
+runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --curhst OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:FLT+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK
-FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 8.563822e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.637597e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.640892e+05 ) sec^-1
-MeanMatrixElemValue = ( 6.664703e+00 +- 5.072736e+00 ) GeV^-4
-TOTAL : 1.748604 sec
- 5,868,128,408 cycles # 2.968 GHz
- 11,897,078,752 instructions # 2.03 insn per cycle
- 2.034152439 seconds time elapsed
+/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe: Aborted
+ 46,067,681 cycles:u # 2.248 GHz (60.99%)
+ 58,322 stalled-cycles-frontend:u # 0.13% frontend cycles idle (60.99%)
+ 517,074 stalled-cycles-backend:u # 1.12% backend cycles idle (60.99%)
+ 47,601,779 instructions:u # 1.03 insn per cycle
+ # 0.01 stalled cycles per insn (68.07%)
+ 0.021333955 seconds time elapsed
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2
-Avg ME (C++/CUDA) = 6.626454e-04
-Avg ME (F77/CUDA) = 6.6262659968156085E-004
-Relative difference = 2.8371612387547027e-05
+cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2
+Avg ME (C++/CUDA) = 6.626791e-04
+Avg ME (F77/CUDA) = 6.6270899361878938E-004
+Relative difference = 4.511024836808726e-05
 OK (relative difference <= 5E-3)
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP=
+runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
-FP precision = FLOAT (NaN/abnormal=0, zero=0)
-Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.036622e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.037632e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.037632e+03 ) sec^-1
-MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4
-TOTAL : 8.062439 sec
- 24,226,666,628 cycles # 3.004 GHz
- 75,876,656,478 instructions # 3.13 insn per cycle
- 8.067552425 seconds time elapsed
+/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe: Aborted
+ 50,181,869 cycles:u # 2.416 GHz (61.53%)
+ 45,388 stalled-cycles-frontend:u # 0.09% frontend cycles idle (61.53%)
+ 590,959 stalled-cycles-backend:u # 1.18% backend cycles idle (61.53%)
+ 45,782,293 instructions:u # 0.91 insn per cycle
+ # 0.01 stalled cycles per insn (63.57%)
+ 0.024908860 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 3898) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest.exe
+runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest.exe
 [ PASSED ] 6 tests.
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2
 Avg ME (C++/C++) = 6.627487e-04
-Avg ME (F77/C++) = 6.6274870439686495E-004
-Relative difference = 6.634286759220428e-09
+Avg ME (F77/C++) = 6.6274866115424713E-004
+Relative difference = 5.861309557415831e-08
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP=
+runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
-FP precision = FLOAT (NaN/abnormal=0, zero=0)
-Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 7.436843e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.451693e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.451693e+03 ) sec^-1
-MeanMatrixElemValue = ( 4.060119e+00 +- 2.367901e+00 ) GeV^-4
-TOTAL : 2.213994 sec
- 6,497,793,089 cycles # 2.930 GHz
- 20,114,232,625 instructions # 3.10 insn per cycle
- 2.218999244 seconds time elapsed
+/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe: Aborted
+ 39,368,932 cycles:u # 1.923 GHz (60.95%)
+ 59,541 stalled-cycles-frontend:u # 0.15% frontend cycles idle (60.95%)
+ 360,551 stalled-cycles-backend:u # 0.92% backend cycles idle (56.80%)
+ 48,866,933 instructions:u # 1.24 insn per cycle
+ # 0.01 stalled cycles per insn (76.32%)
+ 0.021733591 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4:13237) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest.exe
+runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest.exe
 [ PASSED ] 6 tests.
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2
 Avg ME (C++/C++) = 6.627485e-04
-Avg ME (F77/C++) = 6.6274853360924479E-004
-Relative difference = 5.071191384964548e-08
+Avg ME (F77/C++) = 6.6274845946848876E-004
+Relative difference = 6.115670001294808e-08
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP=
+runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
-FP precision = FLOAT (NaN/abnormal=0, zero=0)
-Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.651046e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.657956e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.657956e+04 ) sec^-1
-MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4
-TOTAL : 1.002210 sec
- 2,828,228,234 cycles # 2.811 GHz
- 7,037,152,943 instructions # 2.49 insn per cycle
- 1.006951321 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11604) (512y: 0) (512z: 0)
+/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe: Aborted
+ 54,509,603 cycles:u # 2.643 GHz (61.24%)
+ 45,858 stalled-cycles-frontend:u # 0.08% frontend cycles idle (61.25%)
+ 657,609 stalled-cycles-backend:u # 1.21% backend cycles idle (61.25%)
+ 42,090,677 instructions:u # 0.77 insn per cycle
+ # 0.02 stalled cycles per insn (63.01%)
+ 0.021908360 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11586) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest.exe
+runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest.exe
 [ PASSED ] 6 tests.
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 6.627193e-04
-Avg ME (F77/C++) = 6.6271927529261421E-004
-Relative difference = 3.728182620967159e-08
-OK (relative difference <= 5E-3)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP=
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
-FP precision = FLOAT (NaN/abnormal=0, zero=0)
-Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.896408e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.905353e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.905353e+04 ) sec^-1
-MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4
-TOTAL : 0.873460 sec
- 2,478,765,611 cycles # 2.825 GHz
- 6,279,360,894 instructions # 2.53 insn per cycle
- 0.878454117 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10320) (512y: 50) (512z: 0)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest.exe
-[ PASSED ] 6 tests.
--------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 6.627193e-04
-Avg ME (F77/C++) = 6.6271927529261421E-004
-Relative difference = 3.728182620967159e-08
+cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2
+Avg ME (C++/C++) = 6.627195e-04
+Avg ME (F77/C++) = 6.6271947045332125E-004
+Relative difference = 4.4583988847766445e-08
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP=
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
-FP precision = FLOAT (NaN/abnormal=0, zero=0)
-Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.514471e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.520302e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.520302e+04 ) sec^-1
-MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4
-TOTAL : 1.091799 sec
- 2,035,517,373 cycles # 1.858 GHz
- 3,247,329,615 instructions # 1.60 insn per cycle
- 1.096813214 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2165) (512y: 48) (512z: 9219)
+/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest.exe
-[ PASSED ] 6 tests.
--------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 6.627195e-04
-Avg ME (F77/C++) = 6.6271952818273971E-004
-Relative difference = 4.252589469696448e-08
-OK (relative difference <= 5E-3)
+/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo)
 =========================================================================
 TEST COMPLETED
diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_rmbhst.txt
index bb3df27b99..8f4bea55a8 100644
--- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_rmbhst.txt
+++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_rmbhst.txt
@@ -1,226 +1,181 @@
 export CUDACPP_RUNTIME_ENABLEFPE=on
-Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg
-OMPFLAGS=-fopenmp
-AVX=512y
+Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg
+OMPFLAGS=
+AVX=avx2
 FPTYPE=d
 HELINL=0
 HRDCOD=0
-RNDGEN=hasCurand
-Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1)
+RNDGEN=hasNoCurand
+Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1)
 make: Nothing to be done for 'gtestlibs'.
-CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0'
+CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0'
 make USEBUILDDIR=1 AVX=none
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 CUDACPP_BUILDDIR='build.none_f_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 make USEBUILDDIR=1 AVX=sse4
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 make USEBUILDDIR=1 AVX=avx2
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 make USEBUILDDIR=1 AVX=512y
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 make USEBUILDDIR=1 AVX=512z
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
-DATE: 2024-01-27_19:30:08
+DATE: 2024-01-28_13:53:05
-On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
+On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]:
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --rmbhst OMP=
+runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --rmbhst OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
-Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:FLT+THX:CURHST+RMBHST+MESDEV/none+NAVBRK
+Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+MESDEV/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 5.731672e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.356045e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.362013e+05 ) sec^-1
-MeanMatrixElemValue = ( 4.048178e+00 +- 2.364571e+00 ) GeV^-4
-TOTAL : 0.466576 sec
- 2,012,354,844 cycles # 2.943 GHz
- 2,978,053,730 instructions # 1.48 insn per cycle
- 0.741085715 seconds time elapsed
-runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --rmbhst
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
-==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
-==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
+EvtsPerSec[Rmb+ME] (23) = ( 2.594064e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.762645e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.763306e+05 ) sec^-1
+MeanMatrixElemValue = ( 4.202335e-01 +- 3.251521e-01 ) GeV^-4
+TOTAL : 0.433891 sec
+ 1,224,430,513 cycles:u # 2.667 GHz (75.41%)
+ 2,966,487 stalled-cycles-frontend:u # 0.24% frontend cycles idle (75.62%)
+ 46,035,582 stalled-cycles-backend:u # 3.76% backend cycles idle (75.63%)
+ 1,580,446,250 instructions:u # 1.29 insn per cycle
+ # 0.03 stalled cycles per insn (75.55%)
+ 0.473902530 seconds time elapsed
 .........................................................................
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --rmbhst OMP=
+runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --rmbhst OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
-Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:FLT+THX:CURHST+RMBHST+MESDEV/none+NAVBRK
+Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+MESDEV/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 7.444891e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.622956e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.626310e+05 ) sec^-1
-MeanMatrixElemValue = ( 6.641710e+00 +- 4.994249e+00 ) GeV^-4
-TOTAL : 1.833515 sec
- 6,145,723,719 cycles # 2.985 GHz
- 13,015,613,657 instructions # 2.12 insn per cycle
- 2.128449848 seconds time elapsed
+EvtsPerSec[Rmb+ME] (23) = ( 4.299457e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.729068e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.729503e+05 ) sec^-1
+MeanMatrixElemValue = ( 1.213799e+02 +- 1.195366e+02 ) GeV^-4
+TOTAL : 3.402827 sec
+ 11,510,047,857 cycles:u # 3.351 GHz (74.82%)
+ 38,841,732 stalled-cycles-frontend:u # 0.34% frontend cycles idle (75.05%)
+ 1,144,379,586 stalled-cycles-backend:u # 9.94% backend cycles idle (75.08%)
+ 9,781,410,998 instructions:u # 0.85 insn per cycle
+ # 0.12 stalled cycles per insn (75.20%)
+ 3.452964103 seconds time elapsed
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2
-Avg ME (C++/CUDA) = 6.626454e-04
-Avg ME (F77/CUDA) = 6.6262659968156085E-004
-Relative difference = 2.8371612387547027e-05
+cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2
+Avg ME (C++/CUDA) = 6.626791e-04
+Avg ME (F77/CUDA) = 6.6270899361878938E-004
+Relative difference = 4.511024836808726e-05
 OK (relative difference <= 5E-3)
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP=
+runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.048991e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.050067e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.050067e+03 ) sec^-1
-MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4
-TOTAL : 8.012669 sec
- 24,208,162,061 cycles # 3.020 GHz
- 75,876,396,395 instructions # 3.13 insn per cycle
- 8.017590721 seconds time elapsed
+EvtsPerSec[Rmb+ME] (23) = ( 2.454669e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.455720e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.455720e+03 ) sec^-1
+MeanMatrixElemValue = ( 4.208458e-01 +- 3.253446e-01 ) GeV^-4
+TOTAL : 6.687141 sec
+ 23,500,859,274 cycles:u # 3.504 GHz (74.96%)
+ 1,332,192 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.96%)
+ 2,792,659,646 stalled-cycles-backend:u # 11.88% backend cycles idle (74.96%)
+ 75,931,914,319 instructions:u # 3.23 insn per cycle
+ # 0.04 stalled cycles per insn (74.97%)
+ 6.709819182 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 3898) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest.exe
+runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest.exe
 [ PASSED ] 6 tests.
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2
 Avg ME (C++/C++) = 6.627487e-04
-Avg ME (F77/C++) = 6.6274870439686495E-004
-Relative difference = 6.634286759220428e-09
+Avg ME (F77/C++) = 6.6274866115424713E-004
+Relative difference = 5.861309557415831e-08
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP=
+runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 7.438541e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.453434e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.453434e+03 ) sec^-1
-MeanMatrixElemValue = ( 4.060119e+00 +- 2.367901e+00 ) GeV^-4
-TOTAL : 2.213920 sec
- 6,495,629,253 cycles # 2.929 GHz
- 20,114,944,757 instructions # 3.10 insn per cycle
- 2.219064819 seconds time elapsed
+EvtsPerSec[Rmb+ME] (23) = ( 9.891791e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 9.909604e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 9.909604e+03 ) sec^-1
+MeanMatrixElemValue = ( 4.208459e-01 +- 3.253446e-01 ) GeV^-4
+TOTAL : 1.664190 sec
+ 5,869,351,508 cycles:u # 3.483 GHz (74.84%)
+ 796,441 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.84%)
+ 858,273,259 stalled-cycles-backend:u # 14.62% backend cycles idle (74.74%)
+ 20,185,000,449 instructions:u # 3.44 insn per cycle
+ # 0.04 stalled cycles per insn (74.97%)
+ 1.687127609 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4:13237) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest.exe
+runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest.exe
 [ PASSED ] 6 tests.
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2
 Avg ME (C++/C++) = 6.627485e-04
-Avg ME (F77/C++) = 6.6274853360924479E-004
-Relative difference = 5.071191384964548e-08
+Avg ME (F77/C++) = 6.6274845946848876E-004
+Relative difference = 6.115670001294808e-08
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP=
+runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.587815e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.594524e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.594524e+04 ) sec^-1
-MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4
-TOTAL : 1.042010 sec
- 2,820,248,202 cycles # 2.696 GHz
- 7,037,051,518 instructions # 2.50 insn per cycle
- 1.047085469 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11604) (512y: 0) (512z: 0)
+EvtsPerSec[Rmb+ME] (23) = ( 2.353673e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.363916e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.363916e+04 ) sec^-1
+MeanMatrixElemValue = ( 4.214980e-01 +- 3.255523e-01 ) GeV^-4
+TOTAL : 0.702805 sec
+ 2,501,986,684 cycles:u # 3.458 GHz (74.58%)
+ 1,043,648 stalled-cycles-frontend:u # 0.04% frontend cycles idle (74.58%)
+ 247,119,630 stalled-cycles-backend:u # 9.88% backend cycles idle (74.63%)
+ 7,101,537,921 instructions:u # 2.84 insn per cycle
+ # 0.03 stalled cycles per insn (74.88%)
+ 0.725376097 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11586) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest.exe
+runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest.exe
 [ PASSED ] 6 tests.
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 6.627193e-04
-Avg ME (F77/C++) = 6.6271927529261421E-004
-Relative difference = 3.728182620967159e-08
-OK (relative difference <= 5E-3)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP=
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
-FP precision = FLOAT (NaN/abnormal=0, zero=0)
-Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.892048e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.901381e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.901381e+04 ) sec^-1
-MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4
-TOTAL : 0.875630 sec
- 2,480,888,017 cycles # 2.821 GHz
- 6,279,220,836 instructions # 2.53 insn per cycle
- 0.880698893 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10320) (512y: 50) (512z: 0)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest.exe
-[ PASSED ] 6 tests.
--------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 6.627193e-04
-Avg ME (F77/C++) = 6.6271927529261421E-004
-Relative difference = 3.728182620967159e-08
+cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2
+Avg ME (C++/C++) = 6.627195e-04
+Avg ME (F77/C++) = 6.6271947045332125E-004
+Relative difference = 4.4583988847766445e-08
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP=
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
-FP precision = FLOAT (NaN/abnormal=0, zero=0)
-Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.502673e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.508461e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.508461e+04 ) sec^-1
-MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4
-TOTAL : 1.100164 sec
- 2,036,801,052 cycles # 1.845 GHz
- 3,247,412,725 instructions # 1.59 insn per cycle
- 1.105126246 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2165) (512y: 48) (512z: 9219)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest.exe
-[ PASSED ] 6 tests.
+/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo)
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 6.627195e-04
-Avg ME (F77/C++) = 6.6271952818273971E-004
-Relative difference = 4.252589469696448e-08
-OK (relative difference <= 5E-3)
+/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo)
 =========================================================================
 TEST COMPLETED
diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd1.txt
index c584ebcc69..dadb2ed2c2 100644
--- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd1.txt
+++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd1.txt
@@ -1,223 +1,181 @@
 export CUDACPP_RUNTIME_ENABLEFPE=on
-Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg
-OMPFLAGS=-fopenmp
-AVX=512y
+Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg
+OMPFLAGS=
+AVX=avx2
 FPTYPE=d
 HELINL=0
 HRDCOD=0
-RNDGEN=hasCurand
-Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1)
+RNDGEN=hasNoCurand
+Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1)
 make: Nothing to be done for 'gtestlibs'.
-CUDACPP_BUILDDIR='build.512y_f_inl0_hrd1'
+CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd1'
 make USEBUILDDIR=1 AVX=none
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 CUDACPP_BUILDDIR='build.none_f_inl0_hrd1'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 make USEBUILDDIR=1 AVX=sse4
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd1'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 make USEBUILDDIR=1 AVX=avx2
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd1'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 make USEBUILDDIR=1 AVX=512y
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 CUDACPP_BUILDDIR='build.512y_f_inl0_hrd1'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 make USEBUILDDIR=1 AVX=512z
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 CUDACPP_BUILDDIR='build.512z_f_inl0_hrd1'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
-DATE: 2024-01-27_18:38:29
+DATE: 2024-01-28_13:14:51
-On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
+On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]:
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/gcheck.exe -p 64 256 1 OMP=
+runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/gcheck.exe -p 64 256 1 OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1]
-Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
+Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1]
+Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 6.305087e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.356788e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.363636e+05 ) sec^-1
-MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4
-TOTAL : 0.488004 sec
- 2,004,585,151 cycles # 2.823 GHz
- 2,949,271,873 instructions # 1.47 insn per cycle
- 0.792976067 seconds time elapsed
-runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/gcheck.exe -p 64 256 1
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
-==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
+EvtsPerSec[Rmb+ME] (23) = ( 2.550772e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.750895e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.752340e+05 ) sec^-1
+MeanMatrixElemValue = ( 4.202247e-01 +- 3.251485e-01 ) GeV^-4
+TOTAL : 0.434974 sec
+ 1,204,988,361 cycles:u # 2.634 GHz (75.54%)
+ 3,007,154 stalled-cycles-frontend:u # 0.25% frontend cycles idle (75.35%)
+ 50,869,111 stalled-cycles-backend:u # 4.22% backend cycles idle (74.85%)
+ 1,617,036,515 instructions:u # 1.34 insn per cycle
+ # 0.03 stalled cycles per insn (74.93%)
+ 0.476981080 seconds time elapsed
 .........................................................................
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP=
+runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP=
 WARNING!
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.500853e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.574463e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.577809e+05 ) sec^-1 -MeanMatrixElemValue = ( 6.664703e+00 +- 5.072736e+00 ) GeV^-4 -TOTAL : 1.725709 sec - 5,827,437,776 cycles # 2.978 GHz - 11,866,403,549 instructions # 2.04 insn per cycle - 2.016529689 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 4.724634e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.755120e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.755557e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.213664e+02 +- 1.195366e+02 ) GeV^-4 +TOTAL : 3.286109 sec + 11,097,219,386 cycles:u # 3.349 GHz (74.96%) + 27,923,365 stalled-cycles-frontend:u # 0.25% frontend cycles idle (74.90%) + 1,135,219,349 stalled-cycles-backend:u # 10.23% backend cycles idle (74.93%) + 8,949,546,871 instructions:u # 0.81 insn per cycle + # 0.13 stalled cycles per insn (75.10%) + 3.333916008 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 6.626454e-04 -Avg ME (F77/CUDA) = 6.6262659968156085E-004 -Relative difference = 2.8371612387547027e-05 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 6.626791e-04 +Avg ME (F77/CUDA) = 6.6270899361878938E-004 +Relative difference = 4.511024836808726e-05 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/check.exe -p 64 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.046853e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.047911e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.047911e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4 -TOTAL : 8.021570 sec - 24,208,728,571 cycles # 3.017 GHz - 75,801,225,088 instructions # 3.13 insn per cycle - 8.028618453 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 2.450904e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.451951e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.451951e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.208458e-01 +- 3.253446e-01 ) GeV^-4 +TOTAL : 6.697264 sec + 23,515,902,426 cycles:u # 3.501 GHz (74.99%) + 1,383,417 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.99%) + 2,323,931,551 stalled-cycles-backend:u # 9.88% backend cycles idle (75.00%) + 75,819,267,441 instructions:u # 3.22 insn per cycle + # 0.03 stalled cycles per insn (75.00%) + 6.720228011 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 3848) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627487e-04 -Avg ME (F77/C++) = 6.6274870430095556E-004 -Relative difference = 6.489572191632735e-09 +Avg ME (F77/C++) = 6.6274866108667618E-004 +Relative difference = 5.871505118544242e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/check.exe -p 64 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.406347e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.420022e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.420022e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.060119e+00 +- 2.367901e+00 ) GeV^-4 -TOTAL : 2.223038 sec - 6,499,735,217 cycles # 2.919 GHz - 20,111,699,203 instructions # 3.09 insn per cycle - 2.235569341 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 9.891200e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.908824e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.908824e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.208459e-01 +- 3.253446e-01 ) GeV^-4 +TOTAL : 1.663830 sec + 5,882,752,910 cycles:u # 3.491 GHz (74.93%) + 765,449 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.84%) + 839,987,494 stalled-cycles-backend:u # 14.28% backend cycles idle (74.84%) + 20,180,285,105 instructions:u # 3.43 insn per cycle + # 0.04 stalled cycles per insn (74.67%) + 1.688465440 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:13231) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627485e-04 -Avg ME (F77/C++) = 6.6274853360924479E-004 -Relative difference = 5.071191384964548e-08 +Avg ME (F77/C++) = 6.6274845946848876E-004 +Relative difference = 6.115670001294808e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/check.exe -p 64 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.666569e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.673138e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.673138e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 0.993136 sec - 2,812,239,791 cycles # 2.819 GHz - 7,037,687,814 instructions # 2.50 insn per cycle - 1.010358424 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11587) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.361789e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.371991e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.371991e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.214980e-01 +- 3.255523e-01 ) GeV^-4 +TOTAL : 0.700199 sec + 2,498,969,011 cycles:u # 3.464 GHz (74.61%) + 491,381 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.50%) + 310,187,854 stalled-cycles-backend:u # 12.41% backend cycles idle (74.23%) + 7,093,146,490 instructions:u # 2.84 insn per cycle + # 0.04 stalled cycles per insn (74.80%) + 0.724555225 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11569) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627193e-04 -Avg ME (F77/C++) = 6.6271927529261421E-004 -Relative difference = 3.728182620967159e-08 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd1/check.exe -p 64 256 1 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.868663e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.877654e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.877654e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 0.886341 sec - 2,477,207,877 cycles # 2.780 GHz - 6,280,122,323 instructions # 2.54 insn per cycle - 0.899576679 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10302) (512y: 50) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627193e-04 -Avg ME (F77/C++) = 6.6271927529261421E-004 -Relative difference = 3.728182620967159e-08 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.627195e-04 +Avg ME (F77/C++) = 6.6271947045332125E-004 +Relative difference = 4.4583988847766445e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd1/check.exe -p 64 256 1 OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.506307e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.512102e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.512102e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4 -TOTAL : 1.097481 sec - 2,036,096,996 cycles # 1.847 GHz - 3,247,608,048 instructions # 1.60 insn per cycle - 1.113849823 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2140) (512y: 48) (512z: 9219) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. 
+/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627195e-04 -Avg ME (F77/C++) = 6.6271952818273971E-004 -Relative difference = 4.252589469696448e-08 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd0.txt index f3cd167bf2..60de9f177c 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd0.txt @@ -1,223 +1,181 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasCurand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) +RNDGEN=hasNoCurand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.512y_f_inl1_hrd0' +CUDACPP_BUILDDIR='build.avx2_f_inl1_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512y_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-01-27_19:15:25 +DATE: 2024-01-28_13:35:40 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/gcheck.exe -p 64 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/gcheck.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.585722e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.624773e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.629910e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4 -TOTAL : 0.491441 sec - 2,090,779,282 cycles # 2.923 GHz - 3,169,249,008 instructions # 1.52 insn per cycle - 0.776379492 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/gcheck.exe -p 64 256 1 -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 2.579542e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.779793e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.781143e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.202247e-01 +- 3.251485e-01 ) GeV^-4 +TOTAL : 0.431830 sec + 1,206,052,215 cycles:u # 2.657 GHz (75.12%) + 2,932,842 stalled-cycles-frontend:u # 0.24% frontend cycles idle (75.13%) + 51,275,589 stalled-cycles-backend:u # 4.25% backend cycles idle (75.31%) + 1,613,169,701 instructions:u # 1.34 insn per cycle + # 0.03 stalled cycles per insn (75.27%) + 0.476903668 seconds time elapsed ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/gcheck.exe -p 2048 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/gcheck.exe -p 2048 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.689405e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.750347e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.753010e+05 ) sec^-1 -MeanMatrixElemValue = ( 6.664703e+00 +- 5.072736e+00 ) GeV^-4 -TOTAL : 1.857673 sec - 6,257,073,662 cycles # 2.985 GHz - 12,393,758,755 instructions # 1.98 insn per cycle - 2.152860178 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 4.695239e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.728316e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.728742e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.213664e+02 +- 1.195366e+02 ) GeV^-4 +TOTAL : 3.295663 sec + 11,138,698,712 cycles:u # 3.352 GHz (74.98%) + 27,875,365 stalled-cycles-frontend:u # 0.25% frontend cycles idle (74.98%) + 1,139,078,359 stalled-cycles-backend:u # 10.23% backend cycles idle (74.98%) + 9,010,928,708 instructions:u # 0.81 insn per cycle + # 0.13 stalled cycles per insn (74.95%) + 3.341936107 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 6.626454e-04 -Avg ME (F77/CUDA) = 6.6262660579844562E-004 -Relative difference = 2.836238137986709e-05 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 6.626791e-04 +Avg ME (F77/CUDA) = 6.6270899361878938E-004 +Relative difference = 4.511024836808726e-05 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/check.exe -p 64 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.742749e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.743562e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.743562e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.059968e+00 +- 2.367799e+00 ) GeV^-4 -TOTAL : 28.566959 sec - 86,081,750,957 cycles # 3.014 GHz - 133,992,813,628 instructions # 1.56 insn per cycle - 28.571947196 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:16123) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 6.256291e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.256974e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.256974e+02 ) sec^-1 +MeanMatrixElemValue = ( 4.204931e-01 +- 3.252404e-01 ) GeV^-4 +TOTAL : 26.220009 sec + 91,948,371,418 cycles:u # 3.504 GHz (75.00%) + 504,800,131 stalled-cycles-frontend:u # 0.55% frontend cycles idle (75.00%) + 5,921,064,920 stalled-cycles-backend:u # 6.44% backend cycles idle (75.00%) + 134,071,955,136 instructions:u # 1.46 insn per cycle + # 0.04 stalled cycles per insn (75.00%) + 26.243295454 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:16252) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627535e-04 -Avg ME (F77/C++) = 6.6275354356437610E-004 -Relative difference = 6.573239683366044e-08 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.627534e-04 +Avg ME (F77/C++) = 6.6275340697351248E-004 +Relative difference = 1.052203199451665e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/check.exe -p 64 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.185851e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.199098e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.199098e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.059961e+00 +- 2.367791e+00 ) GeV^-4 -TOTAL : 2.291474 sec - 6,721,144,820 cycles # 2.928 GHz - 19,164,169,698 instructions # 2.85 insn per cycle - 2.297207626 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 8.174878e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.186794e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.186794e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.211992e-01 +- 3.254573e-01 ) GeV^-4 +TOTAL : 2.012411 sec + 7,095,681,533 cycles:u # 3.489 GHz (74.83%) + 5,456,142 stalled-cycles-frontend:u # 0.08% frontend cycles idle (74.83%) + 3,177,706,174 stalled-cycles-backend:u # 44.78% backend cycles idle (74.86%) + 19,220,540,740 instructions:u # 2.71 insn per cycle + # 0.17 stalled cycles per insn (75.01%) + 2.037081095 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:68898) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627486e-04 -Avg ME (F77/C++) = 6.6274859783433532E-004 -Relative difference = 3.2677016209485094e-09 +Avg ME (F77/C++) = 6.6274857053714997E-004 +Relative difference = 4.445554471174176e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/check.exe -p 64 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.422882e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.428217e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.428217e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.060903e+00 +- 2.367376e+00 ) GeV^-4 -TOTAL : 1.161561 sec - 3,140,365,131 cycles # 2.695 GHz - 6,747,298,343 instructions # 2.15 insn per cycle - 1.166681407 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:48625) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.464288e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.468259e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.468259e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.211846e-01 +- 3.254638e-01 ) GeV^-4 +TOTAL : 1.126244 sec + 3,986,800,812 cycles:u # 3.474 GHz (74.91%) + 619,881 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.91%) + 2,256,644,385 stalled-cycles-backend:u # 56.60% backend cycles idle (74.91%) + 6,771,749,492 instructions:u # 1.70 insn per cycle + # 0.33 stalled cycles per insn (74.92%) + 1.150735594 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:48607) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627272e-04 -Avg ME (F77/C++) = 6.6272724143469353E-004 -Relative difference = 6.252149235286529e-08 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.627274e-04 +Avg ME (F77/C++) = 6.6272735722101156E-004 +Relative difference = 6.454990161554483e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd0/check.exe -p 64 256 1 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.799831e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.808210e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.808210e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.060903e+00 +- 2.367376e+00 ) GeV^-4 -TOTAL : 0.920308 sec - 2,609,208,209 cycles # 2.823 GHz - 5,931,137,835 instructions # 2.27 insn per cycle - 0.925334406 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:42219) (512y: 24) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd0/runTest.exe -[ PASSED ] 6 tests. +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627272e-04 -Avg ME (F77/C++) = 6.6272724143469353E-004 -Relative difference = 6.252149235286529e-08 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd0/check.exe -p 64 256 1 OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.499022e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.504830e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.504830e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.060905e+00 +- 2.367377e+00 ) GeV^-4 -TOTAL : 1.103107 sec - 2,050,125,820 cycles # 1.852 GHz - 3,435,619,830 instructions # 1.68 insn per cycle - 1.108237965 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4188) (512y: 9) (512z:44489) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd0/runTest.exe -[ PASSED ] 6 tests. 
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627275e-04 -Avg ME (F77/C++) = 6.6272748295826550E-004 -Relative difference = 2.5714542480216212e-08 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd1.txt index a8fa5a4097..02fee90070 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd1.txt @@ -1,223 +1,181 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasCurand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) +RNDGEN=hasNoCurand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.512y_f_inl1_hrd1' +CUDACPP_BUILDDIR='build.avx2_f_inl1_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512y_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-01-27_19:16:18 +DATE: 2024-01-28_13:36:24 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/gcheck.exe -p 64 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/gcheck.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.542946e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.582854e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.588303e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4 -TOTAL : 0.489439 sec - 2,081,558,468 cycles # 2.919 GHz - 3,114,146,189 instructions # 1.50 insn per cycle - 0.773580082 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/gcheck.exe -p 64 256 1 -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 2.572650e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.765978e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.767488e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.202247e-01 +- 3.251485e-01 ) GeV^-4 +TOTAL : 0.434017 sec + 1,239,819,457 cycles:u # 2.720 GHz (73.81%) + 2,801,408 stalled-cycles-frontend:u # 0.23% frontend cycles idle (75.26%) + 40,578,551 stalled-cycles-backend:u # 3.27% backend cycles idle (75.96%) + 1,570,168,333 instructions:u # 1.27 insn per cycle + # 0.03 stalled cycles per insn (76.03%) + 0.476665475 seconds time elapsed ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/gcheck.exe -p 2048 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/gcheck.exe -p 2048 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.658350e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.717978e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.720726e+05 ) sec^-1 -MeanMatrixElemValue = ( 6.664703e+00 +- 5.072736e+00 ) GeV^-4 -TOTAL : 1.868545 sec - 6,214,634,155 cycles # 2.941 GHz - 12,997,699,197 instructions # 2.09 insn per cycle - 2.169650397 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 4.722157e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.753832e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.754275e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.213664e+02 +- 1.195366e+02 ) GeV^-4 +TOTAL : 3.288731 sec + 11,072,404,348 cycles:u # 3.338 GHz (74.93%) + 27,707,536 stalled-cycles-frontend:u # 0.25% frontend cycles idle (75.14%) + 1,125,717,462 stalled-cycles-backend:u # 10.17% backend cycles idle (75.14%) + 8,983,411,192 instructions:u # 0.81 insn per cycle + # 0.13 stalled cycles per insn (75.04%) + 3.337368964 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 6.626454e-04 -Avg ME (F77/CUDA) = 6.6262660579844562E-004 -Relative difference = 2.836238137986709e-05 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 6.626791e-04 +Avg ME (F77/CUDA) = 6.6270899361878938E-004 +Relative difference = 4.511024836808726e-05 OK (relative difference <= 5E-3) ========================================================================= 
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/check.exe -p 64 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/check.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.785903e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.786730e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.786730e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.059968e+00 +- 2.367799e+00 ) GeV^-4 -TOTAL : 28.353434 sec - 85,553,023,579 cycles # 3.017 GHz - 134,114,238,311 instructions # 1.57 insn per cycle - 28.358305784 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:16109) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 6.152606e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.153268e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.153268e+02 ) sec^-1 +MeanMatrixElemValue = ( 4.204931e-01 +- 3.252404e-01 ) GeV^-4 +TOTAL : 26.661499 sec + 93,493,285,356 cycles:u # 3.504 GHz (75.00%) + 446,484,923 stalled-cycles-frontend:u # 0.48% frontend cycles idle (75.00%) + 6,118,715,750 stalled-cycles-backend:u # 6.54% backend cycles idle (75.00%) + 133,974,562,362 instructions:u # 1.43 insn per cycle + # 0.05 stalled cycles per insn (75.00%) + 26.684773403 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:16105) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627536e-04 -Avg ME (F77/C++) = 6.6275357377482830E-004 -Relative difference = 3.95700176737784e-08 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.627535e-04 +Avg ME (F77/C++) = 6.6275346486299042E-004 +Relative difference = 5.301670926116898e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/check.exe -p 64 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/check.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.247256e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.260766e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.260766e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.059961e+00 +- 2.367791e+00 ) GeV^-4 -TOTAL : 2.271681 sec - 6,731,841,131 cycles # 2.958 GHz - 19,223,791,696 instructions # 2.86 insn per cycle - 2.277043695 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 8.264797e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.277274e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.277274e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.211992e-01 +- 3.254573e-01 ) GeV^-4 +TOTAL : 1.990366 sec + 7,025,594,276 cycles:u # 3.493 GHz (75.03%) + 1,878,173 stalled-cycles-frontend:u # 0.03% frontend cycles idle (74.95%) + 3,085,704,467 stalled-cycles-backend:u # 43.92% backend cycles idle (74.95%) + 19,241,600,987 instructions:u # 2.74 insn per cycle + # 0.16 stalled cycles per insn (74.95%) + 2.014940966 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:68882) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627486e-04 -Avg ME (F77/C++) = 6.6274859765498573E-004 -Relative difference = 3.538316437387639e-09 +Avg ME (F77/C++) = 6.6274857044990032E-004 +Relative difference = 4.4587192899226015e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/check.exe -p 64 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/check.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.513571e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.519969e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.519969e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.060903e+00 +- 2.367376e+00 ) GeV^-4 -TOTAL : 1.092514 sec - 3,078,739,669 cycles # 2.808 GHz - 6,686,073,097 instructions # 2.17 insn per cycle - 1.097464200 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:47416) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.497226e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.501331e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.501331e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.211846e-01 +- 3.254638e-01 ) GeV^-4 +TOTAL : 1.101382 sec + 3,919,935,527 cycles:u # 3.491 GHz (74.81%) + 1,339,023 stalled-cycles-frontend:u # 0.03% frontend cycles idle (74.99%) + 2,195,359,455 stalled-cycles-backend:u # 56.00% backend cycles idle (75.07%) + 6,707,924,892 instructions:u # 1.71 insn per cycle + # 0.33 stalled cycles per insn (75.07%) + 1.125989131 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:47398) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. 
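
The counter lines in the new (+) stanzas come from Linux perf: the ":u" suffix means only user-mode events are counted, the trailing percentages in parentheses give the fraction of time each multiplexed counter was actually scheduled, and the derived columns are plain ratios of the raw counts. A short sketch of those ratios, using the numbers from the avx2 run above (the variable names are illustrative):

    #include <cstdio>
    int main()
    {
      const double cycles = 3919935527., instructions = 6707924892.;
      const double stalledBackend = 2195359455.; // stalled-cycles-backend:u from the avx2 run above
      std::printf( "insn per cycle          = %.2f\n", instructions / cycles );            // 1.71
      std::printf( "backend cycles idle     = %.2f%%\n", 100. * stalledBackend / cycles ); // 56.00%
      std::printf( "stalled cycles per insn = %.2f\n", stalledBackend / instructions );    // 0.33
      return 0;
    }
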
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627272e-04 -Avg ME (F77/C++) = 6.6272724133897148E-004 -Relative difference = 6.237705578619894e-08 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.627274e-04 +Avg ME (F77/C++) = 6.6272735755491807E-004 +Relative difference = 6.404606472340801e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd1/check.exe -p 64 256 1 OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.791355e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.799542e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.799542e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.060903e+00 +- 2.367376e+00 ) GeV^-4 -TOTAL : 0.925307 sec - 2,608,678,296 cycles # 2.808 GHz - 5,935,673,927 instructions # 2.28 insn per cycle - 0.930643254 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:41564) (512y: 18) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd1/runTest.exe -[ PASSED ] 6 tests. +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627272e-04 -Avg ME (F77/C++) = 6.6272724133897148E-004 -Relative difference = 6.237705578619894e-08 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd1/check.exe -p 64 256 1 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.496032e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.501544e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.501544e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.060905e+00 +- 2.367377e+00 ) GeV^-4 -TOTAL : 1.105011 sec - 2,047,507,476 cycles # 1.846 GHz - 3,422,770,262 instructions # 1.67 insn per cycle - 1.110043886 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3375) (512y: 11) (512z:43966) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd1/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627275e-04 -Avg ME (F77/C++) = 6.6272749650985591E-004 -Relative difference = 5.26633351741962e-09 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt index e5cbcb4d2f..639a7fabfa 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt @@ -1,223 +1,181 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasCurand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) +RNDGEN=hasNoCurand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-01-27_18:38:59 +DATE: 2024-01-28_13:15:13 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/gcheck.exe -p 64 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/gcheck.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.451581e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.479933e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.482851e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.526837 sec - 2,251,816,336 cycles # 2.931 GHz - 3,479,387,094 instructions # 1.55 insn per cycle - 0.840260948 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/gcheck.exe -p 64 256 1 -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 7.400495e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.571912e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.573006e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 +TOTAL : 0.644272 sec + 1,957,881,062 cycles:u # 2.941 GHz (74.75%) + 2,293,474 stalled-cycles-frontend:u # 0.12% frontend cycles idle (74.75%) + 42,495,223 stalled-cycles-backend:u # 2.17% backend cycles idle (74.69%) + 2,152,425,070 instructions:u # 1.10 insn per cycle + # 0.02 stalled cycles per insn (74.74%) + 0.688907615 seconds time elapsed ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.122831e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.157050e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.158521e+05 ) sec^-1 -MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.051761 sec - 9,894,820,462 cycles # 2.987 GHz - 21,881,954,164 instructions # 2.21 insn per cycle - 3.372451658 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 1.243610e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.246656e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.246717e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.252232e+02 +- 1.234346e+02 ) GeV^-4 +TOTAL : 8.369690 sec + 28,827,159,075 cycles:u # 3.432 GHz (74.95%) + 11,635,842 stalled-cycles-frontend:u # 0.04% frontend cycles idle (74.97%) + 1,119,173,150 stalled-cycles-backend:u # 3.88% backend cycles idle (75.04%) + 22,599,867,625 instructions:u # 0.78 insn per cycle + # 0.05 stalled cycles per insn (75.05%) + 8.420766112 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 6.626675e-04 Avg ME (F77/CUDA) = 6.6266732376103494E-004 Relative difference = 2.659538381540814e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/check.exe -p 64 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.855146e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.856019e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.856019e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 8.854089 sec - 26,804,815,526 cycles # 3.027 GHz - 82,458,196,081 instructions # 3.08 insn per cycle - 8.861301901 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 2.173414e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.174272e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.174272e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 +TOTAL : 7.553360 sec + 26,525,384,563 cycles:u # 3.502 GHz (74.97%) + 31,700,879 stalled-cycles-frontend:u # 0.12% frontend cycles idle (74.97%) + 3,907,005,284 stalled-cycles-backend:u # 14.73% backend cycles idle (74.97%) + 82,493,224,456 instructions:u # 3.11 insn per cycle + # 0.05 stalled cycles per insn (74.98%) + 7.576999233 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 6623) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731406016235E-004 Relative difference = 2.8059296349552523e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/check.exe -p 64 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.669571e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.672901e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.672901e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 4.480009 sec - 12,632,329,369 cycles # 2.817 GHz - 38,536,772,938 instructions # 3.05 insn per cycle - 4.492192822 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 5.068024e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.072694e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.072694e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 +TOTAL : 3.243606 sec + 11,407,543,003 cycles:u # 3.494 GHz (75.01%) + 3,874,601 stalled-cycles-frontend:u # 0.03% frontend cycles idle (75.01%) + 1,233,532,875 stalled-cycles-backend:u # 10.81% backend cycles idle (75.01%) + 38,549,117,575 instructions:u # 3.38 insn per cycle + # 0.03 stalled cycles per insn (75.01%) + 3.268130380 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:12755) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266730246908442E-004 Relative difference = 2.98084507782618e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/check.exe -p 64 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.441288e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.459099e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.459099e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.952916 sec - 5,537,101,197 cycles # 2.828 GHz - 13,582,628,802 instructions # 2.45 insn per cycle - 1.968634575 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10944) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.212166e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.214786e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.214786e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 +TOTAL : 1.360592 sec + 4,822,632,005 cycles:u # 3.490 GHz (74.69%) + 1,235,278 stalled-cycles-frontend:u # 0.03% frontend cycles idle (74.89%) + 582,075,803 stalled-cycles-backend:u # 12.07% backend cycles idle (75.11%) + 13,597,926,852 instructions:u # 2.82 insn per cycle + # 0.04 stalled cycles per insn (75.11%) + 1.385071998 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10926) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266730409276836E-004 Relative difference = 2.9563428359824236e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/check.exe -p 64 256 1 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.211711e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.232819e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.232819e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.790592 sec - 4,855,490,852 cycles # 2.715 GHz - 12,114,990,747 instructions # 2.50 insn per cycle - 1.806801715 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 9682) (512y: 76) (512z: 0) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266730409276836E-004 -Relative difference = 2.9563428359824236e-07 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/check.exe -p 64 256 1 OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.328139e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.341335e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.341335e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.249055 sec - 4,112,147,048 cycles # 1.825 GHz - 6,282,902,487 instructions # 1.53 insn per cycle - 2.263248047 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1528) (512y: 76) (512z: 9010) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. 
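
The "is not supported (no avx512vl in /proc/cpuinfo)" lines show the 512y and 512z AVX-512 builds being skipped on this AMD EPYC host rather than executed. A hedged sketch of such a guard, assuming the runner simply scans the flags line of /proc/cpuinfo (hasCpuFlag is an illustrative name, not the repository's actual helper):

    #include <fstream>
    #include <string>
    // Return true if /proc/cpuinfo advertises the given feature flag, e.g. "avx512vl".
    bool hasCpuFlag( const std::string& flag )
    {
      std::ifstream cpuinfo( "/proc/cpuinfo" );
      std::string line;
      while( std::getline( cpuinfo, line ) )
        if( line.rfind( "flags", 0 ) == 0 && line.find( " " + flag ) != std::string::npos )
          return true;
      return false;
    }

When hasCpuFlag( "avx512vl" ) is false, the 512y/512z check.exe runs are reported as unsupported instead of being launched, which is why those stanzas collapse to a single line in the new logs.
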
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266730409276836E-004 -Relative difference = 2.9563428359824236e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd1.txt index 67e828539c..69a8ae3caf 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd1.txt @@ -1,223 +1,181 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasCurand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) +RNDGEN=hasNoCurand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.512y_m_inl0_hrd1' +CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-01-27_18:39:37 +DATE: 2024-01-28_13:15:44 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/gcheck.exe -p 64 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/gcheck.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = HIP:MIX+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.479873e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.507682e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.510348e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.527103 sec - 2,250,115,667 cycles # 2.914 GHz - 3,494,224,929 instructions # 1.55 insn per cycle - 0.843300527 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/gcheck.exe -p 64 256 1 -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 1.393338e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.453086e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.453532e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 +TOTAL : 0.527403 sec + 1,560,784,590 cycles:u # 2.845 GHz (75.01%) + 2,427,471 stalled-cycles-frontend:u # 0.16% frontend cycles idle (75.23%) + 33,307,693 stalled-cycles-backend:u # 2.13% backend cycles idle (75.23%) + 1,859,467,187 instructions:u # 1.19 insn per cycle + # 0.02 stalled cycles per insn (75.12%) + 0.568964990 seconds time elapsed ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = HIP:MIX+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.150600e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.185411e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.186955e+05 ) sec^-1 -MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.025723 sec - 9,795,236,195 cycles # 2.982 GHz - 22,232,844,164 instructions # 2.27 insn per cycle - 3.340401396 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 1.734585e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.740297e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.740406e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.252232e+02 +- 1.234346e+02 ) GeV^-4 +TOTAL : 7.037365 sec + 24,134,509,336 cycles:u # 3.414 GHz (74.99%) + 11,447,435 stalled-cycles-frontend:u # 0.05% frontend cycles idle (75.02%) + 1,098,766,355 stalled-cycles-backend:u # 4.55% backend cycles idle (75.02%) + 19,073,912,349 instructions:u # 0.79 insn per cycle + # 0.06 stalled cycles per insn (74.99%) + 7.089527013 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/fgcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 6.626675e-04 Avg ME (F77/CUDA) = 6.6266732376103494E-004 Relative difference = 2.659538381540814e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/check.exe 
-p 64 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/check.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.849609e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.850477e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.850477e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 8.878906 sec - 26,785,691,422 cycles # 3.017 GHz - 82,362,112,849 instructions # 3.07 insn per cycle - 8.886184258 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 2.198791e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.199654e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.199654e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 +TOTAL : 7.465950 sec + 26,214,724,863 cycles:u # 3.501 GHz (75.00%) + 9,362,926 stalled-cycles-frontend:u # 0.04% frontend cycles idle (75.00%) + 3,471,769,514 stalled-cycles-backend:u # 13.24% backend cycles idle (75.00%) + 82,354,955,326 instructions:u # 3.14 insn per cycle + # 0.04 stalled cycles per insn (75.00%) + 7.489806720 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 6491) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731406016235E-004 Relative difference = 2.8059296349552523e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/check.exe -p 64 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.594929e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.598195e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.598195e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 4.572287 sec - 12,660,755,558 cycles # 2.766 GHz - 38,557,577,434 instructions # 3.05 insn per cycle - 4.586940544 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 5.054822e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.059451e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.059451e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 +TOTAL : 3.251859 sec + 11,458,443,041 cycles:u # 3.501 GHz (74.79%) + 5,136,562 stalled-cycles-frontend:u # 0.04% frontend cycles idle (74.88%) + 1,386,600,576 stalled-cycles-backend:u # 12.10% backend cycles idle (75.07%) + 38,564,688,924 instructions:u # 3.37 insn per cycle + # 0.04 stalled cycles per insn (75.07%) + 3.276523087 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:12729) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266730246908442E-004 Relative difference = 2.98084507782618e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/check.exe -p 64 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.414226e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.432226e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.432226e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.959907 sec - 5,496,433,543 cycles # 2.798 GHz - 13,598,067,886 instructions # 2.47 insn per cycle - 1.971121515 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10926) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.217378e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.220027e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.220027e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 +TOTAL : 1.354625 sec + 4,781,705,033 cycles:u # 3.475 GHz (75.00%) + 852,815 stalled-cycles-frontend:u # 0.02% frontend cycles idle (75.00%) + 503,647,282 stalled-cycles-backend:u # 10.53% backend cycles idle (75.00%) + 13,628,789,014 instructions:u # 2.85 insn per cycle + # 0.04 stalled cycles per insn (75.01%) + 1.378976688 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10908) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266730409276836E-004 Relative difference = 2.9563428359824236e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd1/check.exe -p 64 256 1 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.638152e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.661196e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.661196e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.711613 sec - 4,833,583,956 cycles # 2.816 GHz - 12,121,571,130 instructions # 2.51 insn per cycle - 1.724774904 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 9659) (512y: 76) (512z: 0) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266730409276836E-004 -Relative difference = 2.9563428359824236e-07 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd1/check.exe -p 64 256 1 OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.463124e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.476511e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.476511e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.207311 sec - 4,092,914,427 cycles # 1.851 GHz - 6,289,060,943 instructions # 1.54 insn per cycle - 2.223904287 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1508) (512y: 76) (512z: 9009) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. 
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266730409276836E-004 -Relative difference = 2.9563428359824236e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt index b35a15bb6b..f084da33da 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt @@ -1,223 +1,181 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasCurand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) +RNDGEN=hasNoCurand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2024-01-27_18:42:04 +DATE: 2024-01-28_13:17:07 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe -p 1 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe -p 1 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.066289e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.066701e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.066888e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 2.456687 sec - 8,215,788,812 cycles # 2.992 GHz - 18,531,874,132 instructions # 2.26 insn per cycle - 2.855724583 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe -p 1 256 1 -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 8.216667e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.222480e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.222544e+01 ) sec^-1 +MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 +TOTAL : 9.245166 sec + 32,060,830,034 cycles:u # 3.459 GHz (74.96%) + 3,605,306 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.96%) + 7,624,427 stalled-cycles-backend:u # 0.02% backend cycles idle (75.00%) + 25,314,101,052 instructions:u # 0.79 insn per cycle + # 0.00 stalled cycles per insn (75.03%) + 9.290369416 seconds time elapsed ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.233994e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.236219e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.236519e+03 ) sec^-1 -MeanMatrixElemValue = ( 1.856249e-04 +- 8.329951e-05 ) GeV^-6 -TOTAL : 4.000973 sec - 12,930,442,379 cycles # 2.989 GHz - 29,717,455,006 instructions # 2.30 insn per cycle - 4.383843078 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 3.552819e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.556593e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.556628e+03 ) sec^-1 +MeanMatrixElemValue = ( 1.221264e+00 +- 1.219329e+00 ) GeV^-6 +TOTAL : 8.972202 sec + 31,074,350,767 cycles:u # 3.455 GHz (75.00%) + 3,811,519 stalled-cycles-frontend:u # 0.01% frontend cycles idle (75.01%) + 47,730,762 stalled-cycles-backend:u # 0.15% backend cycles idle (74.97%) + 24,585,836,125 instructions:u # 0.79 insn per cycle + # 0.00 stalled cycles per insn (74.94%) + 9.016806724 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 9.872263e-03 -Avg ME (F77/CUDA) = 9.8722595284406640E-003 -Relative difference = 3.5164777671934515e-07 +Avg ME (F77/CUDA) = 9.8722595284406710E-003 +Relative difference = 3.516477760164775e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/check.exe -p 1 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/check.exe -p 1 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.766343e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.766570e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.766570e+01 ) sec^-1 -MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 6.792343 sec - 18,999,792,923 cycles # 2.797 GHz - 55,183,695,010 instructions # 2.90 insn per cycle - 6.800896289 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 1.024242e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.024269e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.024269e+02 ) sec^-1 +MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 +TOTAL : 5.156616 sec + 18,118,586,594 cycles:u # 3.500 GHz (74.97%) + 30,638,307 stalled-cycles-frontend:u # 0.17% frontend cycles idle (74.97%) + 2,178,310,061 stalled-cycles-backend:u # 12.02% backend cycles idle (74.97%) + 55,166,446,419 instructions:u # 3.04 insn per cycle + # 0.04 stalled cycles per insn (74.98%) + 5.179757033 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:44874) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595285514851E-003 Relative difference = 3.5163655122073967e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/check.exe -p 1 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.590532e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.590619e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.590619e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 3.327161 sec - 9,802,926,786 cycles # 2.947 GHz - 27,058,073,971 instructions # 2.76 insn per cycle - 3.343348188 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 2.218638e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.218764e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.218764e+02 ) sec^-1 +MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 +TOTAL : 2.381357 sec + 8,398,634,016 cycles:u # 3.496 GHz (75.00%) + 2,133,753 stalled-cycles-frontend:u # 0.03% frontend cycles idle (75.03%) + 856,109,149 stalled-cycles-backend:u # 10.19% backend cycles idle (75.03%) + 27,064,784,541 instructions:u # 3.22 insn per cycle + # 0.03 stalled cycles per insn (75.03%) + 2.405524535 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:97234) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595285514851E-003 Relative difference = 3.5163655122073967e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/check.exe -p 1 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.407792e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.408212e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.408212e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.558543 sec - 4,283,401,276 cycles # 2.745 GHz - 9,566,262,483 instructions # 2.23 insn per cycle - 1.572820589 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:84279) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 5.162229e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.162884e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.162884e+02 ) sec^-1 +MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 +TOTAL : 1.024290 sec + 3,644,361,131 cycles:u # 3.487 GHz (74.90%) + 1,405,334 stalled-cycles-frontend:u # 0.04% frontend cycles idle (74.75%) + 329,012,774 stalled-cycles-backend:u # 9.03% backend cycles idle (74.75%) + 9,607,967,992 instructions:u # 2.64 insn per cycle + # 0.03 stalled cycles per insn (74.75%) + 1.048659292 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:84261) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595285411531E-003 Relative difference = 3.516375977906115e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/check.exe -p 1 256 2 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.081323e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.081986e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.081986e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.302229 sec - 3,691,612,964 cycles # 2.831 GHz - 8,451,252,932 instructions # 2.29 insn per cycle - 1.313920154 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:79441) (512y: 90) (512z: 0) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 9.872263e-03 -Avg ME (F77/C++) = 9.8722595285411531E-003 -Relative difference = 3.516375977906115e-07 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/check.exe -p 1 256 2 OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.573280e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.573851e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.573851e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.491172 sec - 2,701,853,841 cycles # 1.809 GHz - 4,249,729,716 instructions # 1.57 insn per cycle - 1.510680407 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2166) (512y: 90) (512z:78318) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. 
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 9.872263e-03 -Avg ME (F77/C++) = 9.8722595285411531E-003 -Relative difference = 3.516375977906115e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0_bridge.txt index 397fb214c3..fc839a4e6f 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0_bridge.txt @@ -1,240 +1,190 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasCurand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) +RNDGEN=hasNoCurand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2024-01-27_19:25:25 +DATE: 2024-01-28_13:48:15 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe -p 1 256 2 --bridge OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe -p 1 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=256, gpublocks=1, gputhreads=256, gpublocks*gputhreads=256) WARNING! Set grid in Bridge (nevt=256, gpublocks=1, gputhreads=256, gpublocks*gputhreads=256) -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.063308e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.064264e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.064264e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 2.377552 sec - 7,911,457,318 cycles # 2.939 GHz - 17,776,798,693 instructions # 2.25 insn per cycle - 2.754050489 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe -p 1 256 1 --bridge -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -WARNING! Instantiate device Bridge (nevt=256, gpublocks=1, gputhreads=256, gpublocks*gputhreads=256) -WARNING! Set grid in Bridge (nevt=256, gpublocks=1, gputhreads=256, gpublocks*gputhreads=256) -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 8.131548e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.132311e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.132311e+01 ) sec^-1 +MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 +TOTAL : 9.335186 sec + 32,340,592,683 cycles:u # 3.457 GHz (74.97%) + 3,545,492 stalled-cycles-frontend:u # 0.01% frontend cycles idle (75.00%) + 7,890,548 stalled-cycles-backend:u # 0.02% backend cycles idle (75.03%) + 25,521,625,842 instructions:u # 0.79 insn per cycle + # 0.00 stalled cycles per insn (75.01%) + 9.383613134 seconds time elapsed ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) WARNING! 
Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.220144e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.252068e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.252068e+03 ) sec^-1 -MeanMatrixElemValue = ( 1.856249e-04 +- 8.329951e-05 ) GeV^-6 -TOTAL : 3.978881 sec - 12,946,564,273 cycles # 3.008 GHz - 28,230,768,091 instructions # 2.18 insn per cycle - 4.358810997 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 3.563489e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.567198e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.567198e+03 ) sec^-1 +MeanMatrixElemValue = ( 1.221264e+00 +- 1.219329e+00 ) GeV^-6 +TOTAL : 8.943439 sec + 30,975,452,744 cycles:u # 3.454 GHz (74.91%) + 4,035,364 stalled-cycles-frontend:u # 0.01% frontend cycles idle (75.04%) + 50,378,747 stalled-cycles-backend:u # 0.16% backend cycles idle (75.06%) + 24,492,244,124 instructions:u # 0.79 insn per cycle + # 0.00 stalled cycles per insn (75.07%) + 8.988581838 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 9.872263e-03 -Avg ME (F77/CUDA) = 9.8722595284406640E-003 -Relative difference = 3.5164777671934515e-07 +Avg ME (F77/CUDA) = 9.8722595284406710E-003 +Relative difference = 3.516477760164775e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=256) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.364296e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.364564e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.364564e+01 ) sec^-1 -MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 6.317070 sec - 19,020,611,022 cycles # 3.010 GHz - 55,181,937,523 instructions # 2.90 insn per cycle - 6.322014476 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 1.018492e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.018519e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.018519e+02 ) sec^-1 +MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 +TOTAL : 5.185948 sec + 18,221,133,254 cycles:u # 3.500 GHz (74.96%) + 32,518,833 stalled-cycles-frontend:u # 0.18% frontend cycles idle (74.96%) + 2,083,666,753 stalled-cycles-backend:u # 11.44% backend cycles idle (74.96%) + 55,214,432,417 instructions:u # 3.03 insn per cycle + # 0.04 stalled cycles per insn (74.97%) + 5.209020766 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:44874) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595285514851E-003 Relative difference = 3.5163655122073967e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=256) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.612783e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.612876e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.612876e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 3.279226 sec - 9,851,493,440 cycles # 3.001 GHz - 27,056,657,499 instructions # 2.75 insn per cycle - 3.284240204 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 2.240039e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.240165e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.240165e+02 ) sec^-1 +MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 +TOTAL : 2.358616 sec + 8,325,714,841 cycles:u # 3.499 GHz (74.72%) + 1,164,186 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.88%) + 804,408,576 stalled-cycles-backend:u # 9.66% backend cycles idle (75.13%) + 27,068,505,907 instructions:u # 3.25 insn per cycle + # 0.03 stalled cycles per insn (75.13%) + 2.382968221 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:97234) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595285514851E-003 Relative difference = 3.5163655122073967e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=256) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.513583e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.514021e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.514021e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.509181 sec - 4,246,413,990 cycles # 2.806 GHz - 9,565,193,031 instructions # 2.25 insn per cycle - 1.514258491 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:84279) (512y: 0) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 9.872263e-03 -Avg ME (F77/C++) = 9.8722595285411531E-003 -Relative difference = 3.516375977906115e-07 -OK (relative difference <= 5E-3) +EvtsPerSec[Rmb+ME] (23) = ( 5.196627e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.197309e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.197309e+02 ) sec^-1 +MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 +TOTAL : 1.017831 sec + 3,611,820,548 cycles:u # 3.477 GHz (74.59%) + 1,044,871 stalled-cycles-frontend:u # 0.03% frontend cycles idle (74.65%) + 294,792,388 stalled-cycles-backend:u # 8.16% backend cycles idle (74.90%) + 9,609,700,878 instructions:u # 2.66 insn per cycle + # 0.03 stalled cycles per insn (75.23%) + 1.042164219 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:84261) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
Instantiate host Bridge (nevt=256) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.030244e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.030835e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.030835e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.315714 sec - 3,690,233,076 cycles # 2.797 GHz - 8,450,714,839 instructions # 2.29 insn per cycle - 1.320771644 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:79441) (512y: 90) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595285411531E-003 Relative difference = 3.516375977906115e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! Instantiate host Bridge (nevt=256) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.617080e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.617634e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.617634e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.468596 sec - 2,687,627,626 cycles # 1.828 GHz - 4,249,748,844 instructions # 1.58 insn per cycle - 1.473675650 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2166) (512y: 90) (512z:78318) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. 
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 9.872263e-03 -Avg ME (F77/C++) = 9.8722595285411531E-003 -Relative difference = 3.516375977906115e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd1.txt index 5295435e83..6421475a26 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd1.txt @@ -1,223 +1,181 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasCurand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) +RNDGEN=hasNoCurand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.512y_d_inl0_hrd1' +CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.none_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2024-01-27_18:43:09 +DATE: 2024-01-28_13:18:41 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/gcheck.exe -p 1 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/gcheck.exe -p 1 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.071223e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.071654e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.071839e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 2.430534 sec - 8,241,056,474 cycles # 2.996 GHz - 17,668,710,833 instructions # 2.14 insn per cycle - 2.817128458 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/gcheck.exe -p 1 256 1 -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 8.166030e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.172178e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.172267e+01 ) sec^-1 +MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 +TOTAL : 9.270503 sec + 32,155,147,021 cycles:u # 3.460 GHz (74.94%) + 3,651,238 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.94%) + 8,764,172 stalled-cycles-backend:u # 0.03% backend cycles idle (74.99%) + 25,360,799,852 instructions:u # 0.79 insn per cycle + # 0.00 stalled cycles per insn (75.06%) + 9.315668997 seconds time elapsed ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/gcheck.exe -p 64 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/gcheck.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.234923e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.237215e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.237486e+03 ) sec^-1 -MeanMatrixElemValue = ( 1.856249e-04 +- 8.329951e-05 ) GeV^-6 -TOTAL : 3.996420 sec - 12,989,143,492 cycles # 3.004 GHz - 30,886,524,025 instructions # 2.38 insn per cycle - 4.378619453 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 3.562603e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.566151e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.566185e+03 ) sec^-1 +MeanMatrixElemValue = ( 1.221264e+00 +- 1.219329e+00 ) GeV^-6 +TOTAL : 8.944569 sec + 31,010,331,579 cycles:u # 3.458 GHz (74.92%) + 3,835,223 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.97%) + 48,978,807 stalled-cycles-backend:u # 0.16% backend cycles idle (75.02%) + 24,484,773,895 instructions:u # 0.79 insn per cycle + # 0.00 stalled cycles per insn (75.08%) + 8.988702643 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/fgcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 9.872263e-03 -Avg ME (F77/CUDA) = 9.8722595284406640E-003 -Relative difference = 3.5164777671934515e-07 +Avg ME (F77/CUDA) = 9.8722595284406710E-003 +Relative difference = 3.516477760164775e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/check.exe -p 1 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/check.exe -p 1 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.363577e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.363813e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.363813e+01 ) sec^-1 -MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 6.317358 sec - 18,899,007,109 cycles # 2.990 GHz - 55,157,851,644 instructions # 2.92 insn per cycle - 6.322297725 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 1.023285e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.023312e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.023312e+02 ) sec^-1 +MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 +TOTAL : 5.161102 sec + 18,130,215,132 cycles:u # 3.499 GHz (74.99%) + 28,376,417 stalled-cycles-frontend:u # 0.16% frontend cycles idle (74.99%) + 2,222,279,148 stalled-cycles-backend:u # 12.26% backend cycles idle (74.99%) + 55,131,264,986 instructions:u # 3.04 insn per cycle + # 0.04 stalled cycles per insn (74.99%) + 5.191594222 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:44747) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595285514851E-003 Relative difference = 3.5163655122073967e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd1/check.exe -p 1 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd1/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.616667e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.616762e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.616762e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 3.272043 sec - 9,878,457,696 cycles # 3.015 GHz - 27,063,242,954 instructions # 2.74 insn per cycle - 3.284292649 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 2.239446e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.239571e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.239571e+02 ) sec^-1 +MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 +TOTAL : 2.358940 sec + 8,320,304,916 cycles:u # 3.494 GHz (74.81%) + 674,914 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.85%) + 760,756,627 stalled-cycles-backend:u # 9.14% backend cycles idle (75.02%) + 27,081,649,620 instructions:u # 3.25 insn per cycle + # 0.03 stalled cycles per insn (75.14%) + 2.400337906 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:97230) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595285514851E-003 Relative difference = 3.5163655122073967e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd1/check.exe -p 1 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd1/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.539659e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.540140e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.540140e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.497660 sec - 4,224,561,051 cycles # 2.813 GHz - 9,568,643,351 instructions # 2.27 insn per cycle - 1.508422323 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:84249) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 5.235519e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.236216e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.236216e+02 ) sec^-1 +MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 +TOTAL : 1.009895 sec + 3,594,869,031 cycles:u # 3.487 GHz (74.78%) + 651,676 stalled-cycles-frontend:u # 0.02% frontend cycles idle (75.11%) + 292,999,809 stalled-cycles-backend:u # 8.15% backend cycles idle (75.17%) + 9,584,413,482 instructions:u # 2.67 insn per cycle + # 0.03 stalled cycles per insn (75.18%) + 1.033862475 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:84231) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595285411531E-003 Relative difference = 3.516375977906115e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd1/check.exe -p 1 256 2 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.989197e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.989799e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.989799e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.328783 sec - 3,741,136,636 cycles # 2.806 GHz - 8,454,541,804 instructions # 2.26 insn per cycle - 1.342771339 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:79386) (512y: 90) (512z: 0) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 9.872263e-03 -Avg ME (F77/C++) = 9.8722595285411531E-003 -Relative difference = 3.516375977906115e-07 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd1/check.exe -p 1 256 2 OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.611038e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.611578e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.611578e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.470091 sec - 2,681,169,729 cycles # 1.819 GHz - 4,250,171,659 instructions # 1.59 insn per cycle - 1.480911575 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2130) (512y: 90) (512z:78289) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. 
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 9.872263e-03 -Avg ME (F77/C++) = 9.8722595285411531E-003 -Relative difference = 3.516375977906115e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt index c5db4e21ff..bbce0efa88 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt @@ -1,223 +1,181 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasCurand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) +RNDGEN=hasNoCurand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2024-01-27_18:44:13 +DATE: 2024-01-28_13:20:15 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe -p 1 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe -p 1 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.761759e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.762767e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.763202e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.186984e-05 +- 9.824899e-06 ) GeV^-6 -TOTAL : 1.665455 sec - 5,816,169,406 cycles # 2.981 GHz - 12,222,800,082 instructions # 2.10 insn per cycle - 2.007943438 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe -p 1 256 1 -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 1.873337e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.877052e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.877089e+02 ) sec^-1 +MeanMatrixElemValue = ( 4.927921e-03 +- 4.922372e-03 ) GeV^-6 +TOTAL : 4.380410 sec + 15,030,892,755 cycles:u # 3.412 GHz (75.01%) + 2,743,911 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.95%) + 6,200,580 stalled-cycles-backend:u # 0.04% backend cycles idle (74.87%) + 12,235,328,339 instructions:u # 0.81 insn per cycle + # 0.00 stalled cycles per insn (74.95%) + 4.427625896 seconds time elapsed ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.318123e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.318911e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.319041e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.856829e-04 +- 8.333435e-05 ) GeV^-6 -TOTAL : 1.920955 sec - 6,578,417,034 cycles # 2.994 GHz - 13,330,489,318 instructions # 2.03 insn per cycle - 2.257185780 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 7.374633e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.392593e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.392749e+03 ) sec^-1 +MeanMatrixElemValue = ( 1.216523e+00 +- 1.214588e+00 ) GeV^-6 +TOTAL : 4.650818 sec + 15,950,335,393 cycles:u # 3.415 GHz (75.00%) + 3,205,766 stalled-cycles-frontend:u # 0.02% frontend cycles idle (75.00%) + 59,561,725 stalled-cycles-backend:u # 0.37% backend cycles idle (75.03%) + 12,953,310,098 instructions:u # 0.81 insn per cycle + # 0.00 stalled cycles per insn (75.02%) + 4.691000798 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 9.849636e-03 -Avg ME (F77/CUDA) = 9.8712405367667715E-003 -Relative difference = 0.0021934350433631634 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 9.855155e-03 +Avg ME (F77/CUDA) = 9.8696023209835834E-003 +Relative difference = 0.0014659658811639687 OK (relative difference <= 5E-3) 
========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check.exe -p 1 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check.exe -p 1 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.995520e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.995824e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.995824e+01 ) sec^-1 -MeanMatrixElemValue = ( 1.187013e-05 +- 9.825040e-06 ) GeV^-6 -TOTAL : 5.875855 sec - 17,760,593,846 cycles # 3.021 GHz - 51,788,652,435 instructions # 2.92 insn per cycle - 5.880892508 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 1.091774e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.091804e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.091804e+02 ) sec^-1 +MeanMatrixElemValue = ( 4.924324e-03 +- 4.918778e-03 ) GeV^-6 +TOTAL : 4.837783 sec + 17,016,597,288 cycles:u # 3.503 GHz (74.98%) + 15,666,867 stalled-cycles-frontend:u # 0.09% frontend cycles idle (74.97%) + 1,915,711,543 stalled-cycles-backend:u # 11.26% backend cycles idle (74.97%) + 51,809,157,108 instructions:u # 3.04 insn per cycle + # 0.04 stalled cycles per insn (74.97%) + 4.860535262 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:27812) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.847961e-03 -Avg ME (F77/C++) = 9.8479612087330436E-003 -Relative difference = 2.119555946686223e-08 +Avg ME (F77/C++) = 9.8479612087414119E-003 +Relative difference = 2.1196409216982896e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check.exe -p 1 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.478616e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.479057e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.479057e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187013e-05 +- 9.825038e-06 ) GeV^-6 -TOTAL : 1.522886 sec - 4,542,905,565 cycles # 2.976 GHz - 13,759,097,003 instructions # 3.03 insn per cycle - 1.528290141 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 4.584820e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.585355e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.585355e+02 ) sec^-1 +MeanMatrixElemValue = ( 4.924322e-03 +- 4.918776e-03 ) GeV^-6 +TOTAL : 1.153192 sec + 4,090,224,367 cycles:u # 3.483 GHz (74.80%) + 419,368 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.80%) + 378,967,939 stalled-cycles-backend:u # 9.27% backend cycles idle (74.80%) + 13,802,300,433 instructions:u # 3.37 insn per cycle + # 0.03 stalled cycles per insn (74.85%) + 1.177425051 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:97762) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 9.847955e-03 -Avg ME (F77/C++) = 9.8479546894727158E-003 -Relative difference = 3.1532159158088894e-08 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 9.847957e-03 +Avg ME (F77/C++) = 9.8479574833965355E-003 +Relative difference = 4.9085971470122835e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check.exe -p 1 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.037647e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.039406e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.039406e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187187e-05 +- 9.826763e-06 ) GeV^-6 -TOTAL : 0.756451 sec - 2,139,969,692 cycles # 2.814 GHz - 4,825,813,699 instructions # 2.26 insn per cycle - 0.761605193 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:84831) (512y: 0) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 9.892973e-03 -Avg ME (F77/C++) = 9.8929728159608508E-003 -Relative difference = 1.8603017364363385e-08 -OK (relative difference <= 5E-3) +EvtsPerSec[Rmb+ME] (23) = ( 1.028883e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.029149e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.029149e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.946830e-03 +- 4.941261e-03 ) GeV^-6 +TOTAL : 0.514884 sec + 1,859,416,874 cycles:u # 3.470 GHz (74.84%) + 756,642 stalled-cycles-frontend:u # 0.04% frontend cycles idle (74.63%) + 160,714,291 stalled-cycles-backend:u # 8.64% backend cycles idle (74.62%) + 4,863,962,985 instructions:u # 2.62 insn per cycle + # 0.03 stalled cycles per insn (74.62%) + 0.538920469 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:84813) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/check.exe -p 1 256 2 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.942628e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.944917e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.944917e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187187e-05 +- 9.826763e-06 ) GeV^-6 -TOTAL : 0.670660 sec - 1,891,175,209 cycles # 2.803 GHz - 4,258,254,106 instructions # 2.25 insn per cycle - 0.675764794 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:80038) (512y: 46) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.892973e-03 -Avg ME (F77/C++) = 9.8929728159608508E-003 -Relative difference = 1.8603017364363385e-08 +Avg ME (F77/C++) = 9.8929728161012351E-003 +Relative difference = 1.8588827066662492e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/check.exe -p 1 256 2 OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.220293e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.222452e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.222452e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187188e-05 +- 9.826770e-06 ) GeV^-6 -TOTAL : 0.738034 sec - 1,362,584,884 cycles # 1.836 GHz - 2,147,140,575 instructions # 1.58 insn per cycle - 0.743262931 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2820) (512y: 44) (512z:78510) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. 
+/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 9.892980e-03 -Avg ME (F77/C++) = 9.8929802670331551E-003 -Relative difference = 2.699218597469717e-08 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0_bridge.txt index d666735ca2..36f73d1d8e 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0_bridge.txt @@ -1,240 +1,190 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasCurand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) +RNDGEN=hasNoCurand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2024-01-27_19:26:29 +DATE: 2024-01-28_13:49:49 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe -p 1 256 2 --bridge OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe -p 1 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=256, gpublocks=1, gputhreads=256, gpublocks*gputhreads=256) WARNING! 
Set grid in Bridge (nevt=256, gpublocks=1, gputhreads=256, gpublocks*gputhreads=256) -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.806421e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.808416e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.808416e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187094e-05 +- 9.825664e-06 ) GeV^-6 -TOTAL : 1.603735 sec - 5,618,263,113 cycles # 2.992 GHz - 11,482,646,019 instructions # 2.04 insn per cycle - 1.935006282 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe -p 1 256 1 --bridge -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -WARNING! Instantiate device Bridge (nevt=256, gpublocks=1, gputhreads=256, gpublocks*gputhreads=256) -WARNING! Set grid in Bridge (nevt=256, gpublocks=1, gputhreads=256, gpublocks*gputhreads=256) -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 1.851281e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.851679e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.851679e+02 ) sec^-1 +MeanMatrixElemValue = ( 4.935145e-03 +- 4.929588e-03 ) GeV^-6 +TOTAL : 4.420923 sec + 15,136,537,681 cycles:u # 3.405 GHz (75.00%) + 2,717,273 stalled-cycles-frontend:u # 0.02% frontend cycles idle (75.00%) + 7,517,456 stalled-cycles-backend:u # 0.05% backend cycles idle (75.04%) + 12,304,157,182 instructions:u # 0.81 insn per cycle + # 0.00 stalled cycles per insn (75.01%) + 4.467019241 seconds time elapsed ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) WARNING! 
Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.340764e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.354569e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.354569e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.856441e-04 +- 8.331096e-05 ) GeV^-6 -TOTAL : 1.886823 sec - 6,496,931,993 cycles # 3.000 GHz - 13,298,151,535 instructions # 2.05 insn per cycle - 2.222816329 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 7.366666e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.382282e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.382282e+03 ) sec^-1 +MeanMatrixElemValue = ( 1.258769e+00 +- 1.256832e+00 ) GeV^-6 +TOTAL : 4.659656 sec + 15,995,419,535 cycles:u # 3.416 GHz (74.89%) + 3,646,164 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.97%) + 57,220,683 stalled-cycles-backend:u # 0.36% backend cycles idle (75.04%) + 12,945,746,501 instructions:u # 0.81 insn per cycle + # 0.00 stalled cycles per insn (75.11%) + 4.704619645 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 9.849636e-03 -Avg ME (F77/CUDA) = 9.8712405367667715E-003 -Relative difference = 0.0021934350433631634 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 9.855155e-03 +Avg ME (F77/CUDA) = 9.8696023209835834E-003 +Relative difference = 0.0014659658811639687 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=256) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.077212e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.077512e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.077512e+01 ) sec^-1 -MeanMatrixElemValue = ( 1.187013e-05 +- 9.825040e-06 ) GeV^-6 -TOTAL : 5.822432 sec - 17,590,382,766 cycles # 3.020 GHz - 51,787,107,788 instructions # 2.94 insn per cycle - 5.827233952 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 1.090660e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.090691e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.090691e+02 ) sec^-1 +MeanMatrixElemValue = ( 4.924324e-03 +- 4.918778e-03 ) GeV^-6 +TOTAL : 4.842680 sec + 17,022,170,540 cycles:u # 3.500 GHz (75.00%) + 16,161,742 stalled-cycles-frontend:u # 0.09% frontend cycles idle (75.00%) + 1,836,330,502 stalled-cycles-backend:u # 10.79% backend cycles idle (75.00%) + 51,787,562,489 instructions:u # 3.04 insn per cycle + # 0.04 stalled cycles per insn (75.00%) + 4.865545899 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:27812) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.847961e-03 -Avg ME (F77/C++) = 9.8479612087330436E-003 -Relative difference = 2.119555946686223e-08 +Avg ME (F77/C++) = 9.8479612087414119E-003 +Relative difference = 2.1196409216982896e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=256) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.522700e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.523145e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.523145e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187013e-05 +- 9.825038e-06 ) GeV^-6 -TOTAL : 1.505118 sec - 4,557,873,954 cycles # 3.020 GHz - 13,759,118,019 instructions # 3.02 insn per cycle - 1.510488242 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 4.577962e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.578491e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.578491e+02 ) sec^-1 +MeanMatrixElemValue = ( 4.924322e-03 +- 4.918776e-03 ) GeV^-6 +TOTAL : 1.155076 sec + 4,094,436,058 cycles:u # 3.482 GHz (74.83%) + 878,794 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.83%) + 393,883,989 stalled-cycles-backend:u # 9.62% backend cycles idle (74.83%) + 13,810,704,758 instructions:u # 3.37 insn per cycle + # 0.03 stalled cycles per insn (74.73%) + 1.179224752 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:97762) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 9.847955e-03 -Avg ME (F77/C++) = 9.8479546894727158E-003 -Relative difference = 3.1532159158088894e-08 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 9.847957e-03 +Avg ME (F77/C++) = 9.8479574833965355E-003 +Relative difference = 4.9085971470122835e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=256) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.001081e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.002839e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.002839e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187187e-05 +- 9.826763e-06 ) GeV^-6 -TOTAL : 0.760817 sec - 2,141,009,943 cycles # 2.800 GHz - 4,826,771,994 instructions # 2.25 insn per cycle - 0.765894845 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:84831) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.036106e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.036381e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.036381e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.946830e-03 +- 4.941261e-03 ) GeV^-6 +TOTAL : 0.511703 sec + 1,838,496,544 cycles:u # 3.451 GHz (74.48%) + 814,316 stalled-cycles-frontend:u # 0.04% frontend cycles idle (74.48%) + 161,202,908 stalled-cycles-backend:u # 8.77% backend cycles idle (74.01%) + 4,877,499,492 instructions:u # 2.65 insn per cycle + # 0.03 stalled cycles per insn (74.77%) + 0.535953009 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:84813) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.892973e-03 -Avg ME (F77/C++) = 9.8929728159608508E-003 -Relative difference = 1.8603017364363385e-08 +Avg ME (F77/C++) = 9.8929728161012351E-003 +Relative difference = 1.8588827066662492e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
Instantiate host Bridge (nevt=256) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.013615e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.015939e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.015939e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187187e-05 +- 9.826763e-06 ) GeV^-6 -TOTAL : 0.664865 sec - 1,881,921,417 cycles # 2.814 GHz - 4,259,199,583 instructions # 2.26 insn per cycle - 0.669793712 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:80038) (512y: 46) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 9.892973e-03 -Avg ME (F77/C++) = 9.8929728159608508E-003 -Relative difference = 1.8603017364363385e-08 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! Instantiate host Bridge (nevt=256) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.319978e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.322349e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.322349e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187188e-05 +- 9.826770e-06 ) GeV^-6 -TOTAL : 0.728687 sec - 1,353,147,876 cycles # 1.847 GHz - 2,147,964,410 instructions # 1.59 insn per cycle - 0.733659134 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2820) (512y: 44) (512z:78510) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. 
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 9.892980e-03 -Avg ME (F77/C++) = 9.8929802670331551E-003 -Relative difference = 2.699218597469717e-08 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd1.txt index 1452fdeeca..fe846c064e 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd1.txt @@ -1,223 +1,181 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasCurand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) +RNDGEN=hasNoCurand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.512y_f_inl0_hrd1' +CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.none_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2024-01-27_18:45:01 +DATE: 2024-01-28_13:21:12 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/gcheck.exe -p 1 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/gcheck.exe -p 1 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.763283e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.764164e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.764726e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.186984e-05 +- 9.824899e-06 ) GeV^-6 -TOTAL : 1.666806 sec - 5,766,686,507 cycles # 2.966 GHz - 11,530,148,505 instructions # 2.00 insn per cycle - 2.001410376 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/gcheck.exe -p 1 256 1 -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 1.835082e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.838474e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.838511e+02 ) sec^-1 +MeanMatrixElemValue = ( 4.927921e-03 +- 4.922372e-03 ) GeV^-6 +TOTAL : 4.428832 sec + 15,205,315,081 cycles:u # 3.416 GHz (74.97%) + 2,734,544 stalled-cycles-frontend:u # 0.02% frontend cycles idle (75.02%) + 6,675,019 stalled-cycles-backend:u # 0.04% backend cycles idle (75.02%) + 12,381,921,660 instructions:u # 0.81 insn per cycle + # 0.00 stalled cycles per insn (75.01%) + 4.474102149 seconds time elapsed ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/gcheck.exe -p 64 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/gcheck.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.328340e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.329144e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.329276e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.856829e-04 +- 8.333435e-05 ) GeV^-6 -TOTAL : 1.924938 sec - 6,578,277,944 cycles # 2.973 GHz - 13,443,420,241 instructions # 2.04 insn per cycle - 2.272640588 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 7.370419e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.388480e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.388559e+03 ) sec^-1 +MeanMatrixElemValue = ( 1.216523e+00 +- 1.214588e+00 ) GeV^-6 +TOTAL : 4.639032 sec + 15,939,291,676 cycles:u # 3.420 GHz (74.96%) + 3,288,892 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.93%) + 53,583,558 stalled-cycles-backend:u # 0.34% backend cycles idle (74.87%) + 12,929,386,951 instructions:u # 0.81 insn per cycle + # 0.00 stalled cycles per insn (75.02%) + 4.680410352 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 9.849636e-03 -Avg ME (F77/CUDA) = 9.8712405367667715E-003 -Relative difference = 0.0021934350433631634 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 9.855155e-03 +Avg ME (F77/CUDA) = 9.8696023209835834E-003 +Relative difference = 0.0014659658811639687 OK (relative difference <= 5E-3) 
========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/check.exe -p 1 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/check.exe -p 1 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.038426e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.038714e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.038714e+01 ) sec^-1 -MeanMatrixElemValue = ( 1.187013e-05 +- 9.825040e-06 ) GeV^-6 -TOTAL : 5.850564 sec - 17,645,414,933 cycles # 3.015 GHz - 51,759,467,370 instructions # 2.93 insn per cycle - 5.855310396 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 1.088738e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.088769e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.088769e+02 ) sec^-1 +MeanMatrixElemValue = ( 4.924324e-03 +- 4.918778e-03 ) GeV^-6 +TOTAL : 4.850413 sec + 17,074,036,603 cycles:u # 3.505 GHz (74.89%) + 17,496,903 stalled-cycles-frontend:u # 0.10% frontend cycles idle (74.97%) + 1,831,911,248 stalled-cycles-backend:u # 10.73% backend cycles idle (75.04%) + 51,777,914,318 instructions:u # 3.03 insn per cycle + # 0.04 stalled cycles per insn (75.04%) + 4.873262774 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:27678) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.847961e-03 -Avg ME (F77/C++) = 9.8479612087313262E-003 -Relative difference = 2.1195385077844924e-08 +Avg ME (F77/C++) = 9.8479612087396841E-003 +Relative difference = 2.119623377106246e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/check.exe -p 1 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.517615e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.518094e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.518094e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187013e-05 +- 9.825038e-06 ) GeV^-6 -TOTAL : 1.507049 sec - 4,549,371,148 cycles # 3.011 GHz - 13,756,628,094 instructions # 3.02 insn per cycle - 1.512083566 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 4.573905e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.574445e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.574445e+02 ) sec^-1 +MeanMatrixElemValue = ( 4.924322e-03 +- 4.918776e-03 ) GeV^-6 +TOTAL : 1.155677 sec + 4,095,922,221 cycles:u # 3.482 GHz (74.84%) + 659,909 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.84%) + 437,658,752 stalled-cycles-backend:u # 10.69% backend cycles idle (74.85%) + 13,797,253,519 instructions:u # 3.37 insn per cycle + # 0.03 stalled cycles per insn (74.74%) + 1.179554790 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:97728) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 9.847955e-03 -Avg ME (F77/C++) = 9.8479546894727158E-003 -Relative difference = 3.1532159158088894e-08 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 9.847957e-03 +Avg ME (F77/C++) = 9.8479574833965355E-003 +Relative difference = 4.9085971470122835e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/check.exe -p 1 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.072859e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.074649e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.074649e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187187e-05 +- 9.826763e-06 ) GeV^-6 -TOTAL : 0.752766 sec - 2,125,890,254 cycles # 2.809 GHz - 4,825,175,981 instructions # 2.27 insn per cycle - 0.757781550 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:84793) (512y: 0) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 9.892973e-03 -Avg ME (F77/C++) = 9.8929728159608508E-003 -Relative difference = 1.8603017364363385e-08 -OK (relative difference <= 5E-3) +EvtsPerSec[Rmb+ME] (23) = ( 1.033303e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.033568e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.033568e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.946830e-03 +- 4.941261e-03 ) GeV^-6 +TOTAL : 0.512278 sec + 1,841,408,148 cycles:u # 3.454 GHz (74.50%) + 748,846 stalled-cycles-frontend:u # 0.04% frontend cycles idle (74.50%) + 156,642,601 stalled-cycles-backend:u # 8.51% backend cycles idle (74.50%) + 4,868,507,625 instructions:u # 2.64 insn per cycle + # 0.03 stalled cycles per insn (74.81%) + 0.536248778 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:84775) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd1/check.exe -p 1 256 2 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.077641e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.080007e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.080007e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187187e-05 +- 9.826763e-06 ) GeV^-6 -TOTAL : 0.660009 sec - 1,859,970,750 cycles # 2.801 GHz - 4,257,370,977 instructions # 2.29 insn per cycle - 0.665023060 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:79978) (512y: 46) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.892973e-03 -Avg ME (F77/C++) = 9.8929728159608508E-003 -Relative difference = 1.8603017364363385e-08 +Avg ME (F77/C++) = 9.8929728161012351E-003 +Relative difference = 1.8588827066662492e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd1/check.exe -p 1 256 2 OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.821540e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.823711e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.823711e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187188e-05 +- 9.826770e-06 ) GeV^-6 -TOTAL : 0.780669 sec - 1,354,191,609 cycles # 1.726 GHz - 2,146,469,836 instructions # 1.59 insn per cycle - 0.785741588 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2776) (512y: 44) (512z:78501) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. 
+/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 9.892980e-03 -Avg ME (F77/C++) = 9.8929802670331551E-003 -Relative difference = 2.699218597469717e-08 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt index d5b3f2a192..e92987a82e 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt @@ -1,223 +1,181 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasCurand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) +RNDGEN=hasNoCurand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2024-01-27_18:45:49 +DATE: 2024-01-28_13:22:10 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/gcheck.exe -p 1 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/gcheck.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.695527e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.696068e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.696270e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 2.173361 sec - 7,464,389,868 cycles # 2.992 GHz - 15,636,784,035 instructions # 2.09 insn per cycle - 2.552006424 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/gcheck.exe -p 1 256 1 -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 7.693759e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.698928e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.698967e+01 ) sec^-1 +MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 +TOTAL : 9.637826 sec + 33,435,746,973 cycles:u # 3.461 GHz (74.92%) + 3,516,680 stalled-cycles-frontend:u # 0.01% frontend cycles idle (75.00%) + 9,721,702 stalled-cycles-backend:u # 0.03% backend cycles idle (75.00%) + 26,386,914,135 instructions:u # 0.79 insn per cycle + # 0.00 stalled cycles per insn (75.01%) + 9.682444440 seconds time elapsed ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/gcheck.exe -p 64 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/gcheck.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.114011e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.114348e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.114390e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.856249e-04 +- 8.329951e-05 ) GeV^-6 -TOTAL : 3.404734 sec - 11,172,869,025 cycles # 2.990 GHz - 24,656,254,293 instructions # 2.21 insn per cycle - 3.795433378 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 3.328832e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.332021e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.332051e+03 ) sec^-1 +MeanMatrixElemValue = ( 1.221264e+00 +- 1.219329e+00 ) GeV^-6 +TOTAL : 9.254842 sec + 32,049,610,632 cycles:u # 3.455 GHz (75.00%) + 3,777,778 stalled-cycles-frontend:u # 0.01% frontend cycles idle (75.01%) + 47,547,022 stalled-cycles-backend:u # 0.15% backend cycles idle (74.99%) + 25,349,562,849 instructions:u # 0.79 insn per cycle + # 0.00 stalled cycles per insn (75.02%) + 9.296621371 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 9.872263e-03 -Avg ME (F77/CUDA) = 9.8722599015656498E-003 -Relative difference = 3.1385249252060663e-07 +Avg ME (F77/CUDA) = 9.8722599015656533E-003 +Relative difference = 3.138524921691728e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/check.exe -p 1 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.240396e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.240663e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.240663e+01 ) sec^-1 -MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 6.423863 sec - 19,317,508,811 cycles # 3.006 GHz - 55,390,994,540 instructions # 2.87 insn per cycle - 6.429032251 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 1.015381e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.015408e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.015408e+02 ) sec^-1 +MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 +TOTAL : 5.201480 sec + 18,286,175,540 cycles:u # 3.502 GHz (74.98%) + 33,801,798 stalled-cycles-frontend:u # 0.18% frontend cycles idle (75.03%) + 2,216,780,930 stalled-cycles-backend:u # 12.12% backend cycles idle (75.03%) + 55,369,852,802 instructions:u # 3.03 insn per cycle + # 0.04 stalled cycles per insn (75.03%) + 5.224195387 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:44898) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595861831675E-003 Relative difference = 3.457988134687711e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/check.exe -p 1 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.584594e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.584688e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.584688e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 3.340379 sec - 9,375,006,047 cycles # 2.805 GHz - 25,873,963,058 instructions # 2.76 insn per cycle - 3.345266262 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 2.348883e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.349021e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.349021e+02 ) sec^-1 +MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 +TOTAL : 2.249693 sec + 7,932,943,263 cycles:u # 3.494 GHz (74.99%) + 565,471 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.99%) + 777,441,520 stalled-cycles-backend:u # 9.80% backend cycles idle (74.99%) + 25,879,995,949 instructions:u # 3.26 insn per cycle + # 0.03 stalled cycles per insn (74.99%) + 2.273845421 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:96804) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722594844308162E-003 Relative difference = 3.5610570575237004e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/check.exe -p 1 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.735273e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.735783e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.735783e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 1.419524 sec - 4,002,951,295 cycles # 2.813 GHz - 9,118,794,093 instructions # 2.28 insn per cycle - 1.424775624 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:83820) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 5.486925e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.487671e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.487671e+02 ) sec^-1 +MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 +TOTAL : 0.964011 sec + 3,425,216,696 cycles:u # 3.478 GHz (74.83%) + 1,589,039 stalled-cycles-frontend:u # 0.05% frontend cycles idle (74.83%) + 294,874,215 stalled-cycles-backend:u # 8.61% backend cycles idle (74.83%) + 9,130,471,265 instructions:u # 2.67 insn per cycle + # 0.03 stalled cycles per insn (74.83%) + 0.988073640 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:83802) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722594324461913E-003 Relative difference = 3.613714310412983e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd0/check.exe -p 1 256 2 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.294481e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.295107e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.295107e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 1.235904 sec - 3,506,973,693 cycles # 2.829 GHz - 8,028,693,608 instructions # 2.29 insn per cycle - 1.240980401 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:79028) (512y: 70) (512z: 0) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 9.872263e-03 -Avg ME (F77/C++) = 9.8722594324461913E-003 -Relative difference = 3.613714310412983e-07 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd0/check.exe -p 1 256 2 OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.758822e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.759478e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.759478e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 1.411257 sec - 2,599,112,782 cycles # 1.836 GHz - 4,074,815,519 instructions # 1.57 insn per cycle - 1.416720153 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1903) (512y: 70) (512z:78042) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. 
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 9.872263e-03 -Avg ME (F77/C++) = 9.8722594324461913E-003 -Relative difference = 3.613714310412983e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd1.txt index d966c21cae..ac7918cccb 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd1.txt @@ -1,223 +1,181 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasCurand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) +RNDGEN=hasNoCurand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.512y_m_inl0_hrd1' +CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.none_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2024-01-27_18:46:50 +DATE: 2024-01-28_13:23:45 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/gcheck.exe -p 1 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/gcheck.exe -p 1 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = HIP:MIX+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.691341e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.691858e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.692072e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 2.174115 sec - 7,484,543,519 cycles # 2.999 GHz - 16,704,419,809 instructions # 2.23 insn per cycle - 2.552906187 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/gcheck.exe -p 1 256 1 -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 7.823155e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.829000e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.829089e+01 ) sec^-1 +MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 +TOTAL : 9.511767 sec + 32,984,403,917 cycles:u # 3.459 GHz (74.98%) + 3,473,100 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.98%) + 7,692,372 stalled-cycles-backend:u # 0.02% backend cycles idle (75.00%) + 26,075,654,881 instructions:u # 0.79 insn per cycle + # 0.00 stalled cycles per insn (74.99%) + 9.559020948 seconds time elapsed ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/gcheck.exe -p 64 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/gcheck.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = HIP:MIX+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.104165e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.104486e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.104527e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.856249e-04 +- 8.329951e-05 ) GeV^-6 -TOTAL : 3.414511 sec - 11,184,338,121 cycles # 2.992 GHz - 24,773,735,470 instructions # 2.22 insn per cycle - 3.797778444 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 3.339492e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.347850e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.347878e+03 ) sec^-1 +MeanMatrixElemValue = ( 1.221264e+00 +- 1.219329e+00 ) GeV^-6 +TOTAL : 9.263600 sec + 32,074,175,723 cycles:u # 3.457 GHz (74.96%) + 3,798,204 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.96%) + 47,349,192 stalled-cycles-backend:u # 0.15% backend cycles idle (74.99%) + 25,336,600,811 instructions:u # 0.79 insn per cycle + # 0.00 stalled cycles per insn (74.98%) + 9.309754183 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/fgcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 9.872263e-03 -Avg ME (F77/CUDA) = 9.8722599015656498E-003 -Relative difference = 3.1385249252060663e-07 +Avg ME (F77/CUDA) = 9.8722599015656533E-003 +Relative difference = 3.138524921691728e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/check.exe -p 1 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/check.exe -p 1 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.732242e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.732454e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.732454e+01 ) sec^-1 -MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 6.829477 sec - 19,209,542,583 cycles # 2.812 GHz - 55,420,380,527 instructions # 2.89 insn per cycle - 6.834484266 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 1.024597e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.024625e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.024625e+02 ) sec^-1 +MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 +TOTAL : 5.154454 sec + 18,118,041,561 cycles:u # 3.501 GHz (74.96%) + 27,127,211 stalled-cycles-frontend:u # 0.15% frontend cycles idle (74.96%) + 2,172,862,275 stalled-cycles-backend:u # 11.99% backend cycles idle (74.96%) + 55,424,029,564 instructions:u # 3.06 insn per cycle + # 0.04 stalled cycles per insn (74.97%) + 5.177178107 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:44806) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595861831675E-003 Relative difference = 3.457988134687711e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/check.exe -p 1 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.609123e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.609214e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.609214e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 3.289973 sec - 9,309,252,956 cycles # 2.828 GHz - 25,822,376,754 instructions # 2.77 insn per cycle - 3.295080287 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 2.352094e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.352235e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.352235e+02 ) sec^-1 +MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 +TOTAL : 2.246098 sec + 7,928,069,177 cycles:u # 3.497 GHz (75.01%) + 1,035,429 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.95%) + 811,803,832 stalled-cycles-backend:u # 10.24% backend cycles idle (74.95%) + 25,862,084,099 instructions:u # 3.26 insn per cycle + # 0.03 stalled cycles per insn (74.95%) + 2.270192131 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:96765) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722594844308162E-003 Relative difference = 3.5610570575237004e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/check.exe -p 1 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.752434e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.752989e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.752989e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 1.414298 sec - 3,995,880,255 cycles # 2.817 GHz - 9,098,295,505 instructions # 2.28 insn per cycle - 1.419531270 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:83378) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 5.488000e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.488738e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.488738e+02 ) sec^-1 +MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 +TOTAL : 0.963504 sec + 3,426,340,904 cycles:u # 3.481 GHz (74.81%) + 1,278,872 stalled-cycles-frontend:u # 0.04% frontend cycles idle (74.81%) + 300,861,366 stalled-cycles-backend:u # 8.78% backend cycles idle (74.81%) + 9,116,212,703 instructions:u # 2.66 insn per cycle + # 0.03 stalled cycles per insn (74.82%) + 0.987905106 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:83360) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722594324461913E-003 Relative difference = 3.613714310412983e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd1/check.exe -p 1 256 2 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.317543e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.318199e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.318199e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 1.229231 sec - 3,482,074,372 cycles # 2.824 GHz - 8,009,633,949 instructions # 2.30 insn per cycle - 1.234207236 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:78540) (512y: 70) (512z: 0) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 9.872263e-03 -Avg ME (F77/C++) = 9.8722594324461913E-003 -Relative difference = 3.613714310412983e-07 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd1/check.exe -p 1 256 2 OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.719747e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.720359e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.720359e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 1.425822 sec - 2,595,022,817 cycles # 1.815 GHz - 4,064,590,341 instructions # 1.57 insn per cycle - 1.430892684 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1420) (512y: 70) (512z:78026) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. 
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 9.872263e-03 -Avg ME (F77/C++) = 9.8722594324461913E-003 -Relative difference = 3.613714310412983e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt index 4462f3455a..a1484b6f26 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt @@ -1,223 +1,110 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasCurand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) +RNDGEN=hasNoCurand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2024-01-27_18:40:14 +DATE: 2024-01-28_13:16:13 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 10 OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.639719e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.238275e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.606410e+07 ) sec^-1 -MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.450396 sec - 1,947,310,059 cycles # 2.928 GHz - 2,755,033,645 instructions # 1.41 insn per cycle - 0.740603136 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe: Segmentation fault + 742,362,445 cycles:u # 0.759 GHz (75.80%) + 2,746,219 stalled-cycles-frontend:u # 0.37% frontend cycles idle (75.82%) + 37,618,922 stalled-cycles-backend:u # 5.07% backend cycles idle (75.84%) + 1,244,048,946 instructions:u # 1.68 insn per cycle + # 0.03 stalled cycles per insn (74.19%) + 1.005676312 seconds time elapsed ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.241115e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.110220e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.531234e+07 ) sec^-1 -MeanMatrixElemValue = ( 2.602505e+02 +- 2.116328e+02 ) GeV^-2 -TOTAL : 0.533761 sec - 2,265,806,755 cycles # 2.927 GHz - 3,221,324,415 instructions # 1.42 insn per cycle - 0.832968038 seconds time elapsed +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe: Segmentation fault + 2,664,918,285 cycles:u # 2.823 GHz (74.95%) + 21,119,267 stalled-cycles-frontend:u # 0.79% frontend cycles idle (75.24%) + 863,828,171 stalled-cycles-backend:u # 32.41% backend cycles idle (74.29%) + 2,528,341,905 instructions:u # 0.95 insn per cycle + # 0.34 stalled cycles per insn (74.95%) + 0.965928637 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 1.424749e-01 -Avg ME (F77/CUDA) = 0.14247482467490466 -Relative difference = 5.286902838873106e-07 -OK (relative difference <= 5E-3) +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +Memory access fault by GPU node-4 (Agent handle: 0x693a290) on address 0x1460ca129000. Reason: Unknown. + +Program received signal SIGABRT: Process abort signal. + +Backtrace for this error: +#0 0x146460faf372 in ??? +#1 0x146460fae505 in ??? +#2 0x14645f4a2dbf in ??? +#3 0x14645f4a2d2b in ??? +#4 0x14645f4a43e4 in ??? +#5 0x146457975b64 in ??? +#6 0x146457972b38 in ??? +#7 0x146457930496 in ??? +#8 0x14645f43c6e9 in ??? +#9 0x14645f57049e in ??? +#10 0xffffffffffffffff in ??? +Avg ME (C++/CUDA) = +Avg ME (F77/CUDA) = +ERROR! 
Fortran calculation (F77/CUDA) crashed ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.020352e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.042032e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.042032e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 1.629272 sec - 4,894,825,472 cycles # 2.998 GHz - 13,801,188,692 instructions # 2.82 insn per cycle - 1.636458250 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 1.173059e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.192781e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.192781e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.914935e+02 +- 1.163297e+02 ) GeV^-2 +TOTAL : 1.419980 sec + 5,025,685,634 cycles:u # 3.485 GHz (74.91%) + 2,331,310 stalled-cycles-frontend:u # 0.05% frontend cycles idle (75.04%) + 671,647,332 stalled-cycles-backend:u # 13.36% backend cycles idle (75.04%) + 13,827,806,422 instructions:u # 2.75 insn per cycle + # 0.05 stalled cycles per insn (75.04%) + 1.444043088 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1166) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482467499481 -Relative difference = 5.286896511435107e-07 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/check.exe -p 64 256 10 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.977778e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.057292e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.057292e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.851626 sec - 2,574,234,654 cycles # 3.006 GHz - 7,401,126,330 instructions # 2.88 insn per cycle - 0.868169150 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2895) (avx2: 0) (512y: 0) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482467499475 -Relative difference = 5.286896515331313e-07 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/check.exe -p 64 256 10 OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.313825e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.541459e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.541459e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.517367 sec - 1,479,430,994 cycles # 2.833 GHz - 3,136,844,595 instructions # 2.12 insn per cycle - 0.530727828 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2890) (512y: 0) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. 
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482467492595 -Relative difference = 5.286901344678233e-07 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/check.exe -p 64 256 10 OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.745507e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.030695e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.030695e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.460291 sec - 1,313,909,527 cycles # 2.825 GHz - 2,923,525,061 instructions # 2.23 insn per cycle - 0.475368269 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2543) (512y: 93) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482467492595 -Relative difference = 5.286901344678233e-07 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/check.exe -p 64 256 10 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.584731e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.722315e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.722315e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.658817 sec - 1,274,030,369 cycles # 1.920 GHz - 1,899,828,951 instructions # 1.49 insn per cycle - 0.672755043 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1135) (512y: 62) (512z: 2165) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482467492595 -Relative difference = 5.286901344678233e-07 -OK (relative difference <= 5E-3) -========================================================================= - -TEST COMPLETED +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/runTest.exe +Memory access fault by GPU node-4 (Agent handle: 0x667850) on address 0x1454f3e09000. Reason: Unknown. diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0_bridge.txt index fa3b42477f..68e3ce30cc 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0_bridge.txt @@ -1,240 +1,117 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasCurand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) +RNDGEN=hasNoCurand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2024-01-27_19:23:42 +DATE: 2024-01-28_13:47:03 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 10 --bridge OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -WARNING! 
Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.534750e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.106106e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.106106e+07 ) sec^-1 -MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.477418 sec - 2,013,690,496 cycles # 2.928 GHz - 3,000,584,425 instructions # 1.49 insn per cycle - 0.747869861 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe: Aborted + 955,437,706 cycles:u # 2.461 GHz (75.24%) + 2,971,678 stalled-cycles-frontend:u # 0.31% frontend cycles idle (75.50%) + 29,904,537 stalled-cycles-backend:u # 3.13% backend cycles idle (74.96%) + 1,413,759,904 instructions:u # 1.48 insn per cycle + # 0.02 stalled cycles per insn (74.79%) + 0.553131920 seconds time elapsed ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.202690e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.270294e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.270294e+07 ) sec^-1 -MeanMatrixElemValue = ( 2.602505e+02 +- 2.116328e+02 ) GeV^-2 -TOTAL : 0.760124 sec - 2,966,783,718 cycles # 2.930 GHz - 4,516,593,875 instructions # 1.52 insn per cycle - 1.070193173 seconds time elapsed +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe: Aborted + 3,241,797,323 cycles:u # 2.861 GHz (74.89%) + 30,143,228 stalled-cycles-frontend:u # 0.93% frontend cycles idle (75.30%) + 856,597,171 stalled-cycles-backend:u # 26.42% backend cycles idle (75.27%) + 3,336,680,100 instructions:u # 1.03 insn per cycle + # 0.26 stalled cycles per insn (75.27%) + 1.407429317 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 1.424749e-01 -Avg ME (F77/CUDA) = 0.14247482467490466 -Relative difference = 5.286902838873106e-07 -OK (relative difference <= 5E-3) +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +Memory access fault by GPU node-4 (Agent handle: 0x693a290) on address 0x14694c129000. Reason: Unknown. + +Program received signal SIGABRT: Process abort signal. + +Backtrace for this error: +#0 0x146ce2f9f372 in ??? +#1 0x146ce2f9e505 in ??? +#2 0x146ce1492dbf in ??? +#3 0x146ce1492d2b in ??? +#4 0x146ce14943e4 in ??? +#5 0x146cd9965b64 in ??? +#6 0x146cd9962b38 in ??? +#7 0x146cd9920496 in ??? +#8 0x146ce142c6e9 in ??? +#9 0x146ce156049e in ??? +#10 0xffffffffffffffff in ??? +Avg ME (C++/CUDA) = +Avg ME (F77/CUDA) = +ERROR! Fortran calculation (F77/CUDA) crashed ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.021921e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.043570e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.043570e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 1.633650 sec - 4,930,350,150 cycles # 3.012 GHz - 13,805,734,405 instructions # 2.80 insn per cycle - 1.639181796 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 1.173197e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.192894e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.192894e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.914935e+02 +- 1.163297e+02 ) GeV^-2 +TOTAL : 1.423616 sec + 5,029,275,447 cycles:u # 3.480 GHz (74.88%) + 2,400,452 stalled-cycles-frontend:u # 0.05% frontend cycles idle (75.09%) + 687,995,818 stalled-cycles-backend:u # 13.68% backend cycles idle (75.09%) + 13,816,443,402 instructions:u # 2.75 insn per cycle + # 0.05 stalled cycles per insn (75.10%) + 1.447260926 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1166) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482467499481 -Relative difference = 5.286896511435107e-07 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.966319e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.048340e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.048340e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.863129 sec - 2,612,967,702 cycles # 3.012 GHz - 7,448,123,812 instructions # 2.85 insn per cycle - 0.868429334 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2895) (avx2: 0) (512y: 0) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482467499475 -Relative difference = 5.286896515331313e-07 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.268267e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.501838e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.501838e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.533513 sec - 1,524,823,949 cycles # 2.836 GHz - 3,186,924,139 instructions # 2.09 insn per cycle - 0.538790954 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2890) (512y: 0) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. 
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482467492595 -Relative difference = 5.286901344678233e-07 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.690302e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.973964e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.973964e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.474007 sec - 1,354,698,256 cycles # 2.832 GHz - 2,971,821,426 instructions # 2.19 insn per cycle - 0.479578735 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2543) (512y: 93) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482467492595 -Relative difference = 5.286901344678233e-07 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.556123e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.696455e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.696455e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.674208 sec - 1,321,719,887 cycles # 1.949 GHz - 1,936,933,525 instructions # 1.47 insn per cycle - 0.679606999 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1135) (512y: 62) (512z: 2165) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482467492595 -Relative difference = 5.286901344678233e-07 -OK (relative difference <= 5E-3) -========================================================================= - -TEST COMPLETED +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/runTest.exe +Memory access fault by GPU node-4 (Agent handle: 0x667850) on address 0x1467fdb79000. Reason: Unknown. diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd1.txt index f8687d6f23..2b6e489945 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd1.txt @@ -1,223 +1,110 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasCurand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) +RNDGEN=hasNoCurand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.512y_d_inl0_hrd1' +CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.none_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2024-01-27_18:40:33 +DATE: 2024-01-28_13:16:22 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/gcheck.exe -p 64 256 10 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.642634e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.223079e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.583744e+07 ) sec^-1 -MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.452291 sec - 1,896,382,083 cycles # 2.843 GHz - 2,657,764,456 instructions # 1.40 insn per cycle - 0.741104609 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/gcheck.exe -p 64 256 1 +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/gcheck.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/gcheck.exe: Segmentation fault + 722,585,158 cycles:u # 2.137 GHz (75.90%) + 2,599,585 stalled-cycles-frontend:u # 0.36% frontend cycles idle (76.10%) + 39,487,772 stalled-cycles-backend:u # 5.46% backend cycles idle (75.40%) + 1,266,219,372 instructions:u # 1.75 insn per cycle + # 0.03 stalled cycles per insn (71.84%) + 0.360995823 seconds time elapsed ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.219834e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.980842e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.384273e+07 ) sec^-1 -MeanMatrixElemValue = ( 2.602505e+02 +- 2.116328e+02 ) GeV^-2 -TOTAL : 0.535658 sec - 2,270,552,244 cycles # 2.929 GHz - 3,223,240,823 instructions # 1.42 insn per cycle - 0.833108106 seconds time elapsed +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/gcheck.exe: Segmentation fault + 2,673,014,627 cycles:u # 2.809 GHz (74.77%) + 21,614,665 stalled-cycles-frontend:u # 0.81% frontend cycles idle (73.29%) + 863,129,959 stalled-cycles-backend:u # 32.29% backend cycles idle (74.35%) + 2,509,612,374 instructions:u # 0.94 insn per cycle + # 0.34 stalled cycles per insn (76.13%) + 0.970743750 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 1.424749e-01 -Avg ME (F77/CUDA) = 0.14247482467490466 -Relative difference = 5.286902838873106e-07 -OK (relative difference <= 5E-3) +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/fgcheck.exe 2 64 2 +Memory access fault by GPU node-4 (Agent handle: 0x693a1e0) on address 0x14e47c679000. Reason: Unknown. + +Program received signal SIGABRT: Process abort signal. + +Backtrace for this error: +#0 0x14e7134f5372 in ??? +#1 0x14e7134f4505 in ??? +#2 0x14e7119e8dbf in ??? +#3 0x14e7119e8d2b in ??? +#4 0x14e7119ea3e4 in ??? +#5 0x14e709ebbb64 in ??? +#6 0x14e709eb8b38 in ??? +#7 0x14e709e76496 in ??? +#8 0x14e7119826e9 in ??? +#9 0x14e711ab649e in ??? +#10 0xffffffffffffffff in ??? +Avg ME (C++/CUDA) = +Avg ME (F77/CUDA) = +ERROR! Fortran calculation (F77/CUDA) crashed ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.030327e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.051785e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.051785e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 1.612977 sec - 4,885,882,007 cycles # 3.021 GHz - 13,807,686,866 instructions # 2.83 insn per cycle - 1.620453863 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 1.173950e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.193612e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.193612e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.914935e+02 +- 1.163297e+02 ) GeV^-2 +TOTAL : 1.418629 sec + 5,015,341,842 cycles:u # 3.484 GHz (75.00%) + 2,143,149 stalled-cycles-frontend:u # 0.04% frontend cycles idle (75.00%) + 879,065,341 stalled-cycles-backend:u # 17.53% backend cycles idle (75.00%) + 13,836,256,912 instructions:u # 2.76 insn per cycle + # 0.06 stalled cycles per insn (75.00%) + 1.441391086 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1161) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482467499481 -Relative difference = 5.286896511435107e-07 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd1/check.exe -p 64 256 10 OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.984699e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.065832e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.065832e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.848511 sec - 2,571,840,623 cycles # 3.014 GHz - 7,406,641,051 instructions # 2.88 insn per cycle - 0.865785135 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2892) (avx2: 0) (512y: 0) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. 
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482467499475 -Relative difference = 5.286896515331313e-07 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd1/check.exe -p 64 256 10 OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.247727e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.466178e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.466178e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.527497 sec - 1,486,490,353 cycles # 2.792 GHz - 3,137,529,820 instructions # 2.11 insn per cycle - 0.543813521 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2875) (512y: 0) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482467492595 -Relative difference = 5.286901344678233e-07 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd1/check.exe -p 64 256 10 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.763651e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.053882e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.053882e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.458301 sec - 1,313,225,777 cycles # 2.836 GHz - 2,925,437,911 instructions # 2.23 insn per cycle - 0.472801819 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2527) (512y: 93) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482467492595 -Relative difference = 5.286901344678233e-07 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd1/check.exe -p 64 256 10 OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.598173e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.735760e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.735760e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.655276 sec - 1,273,182,235 cycles # 1.928 GHz - 1,899,785,192 instructions # 1.49 insn per cycle - 0.668056369 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1118) (512y: 62) (512z: 2165) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. 
--------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd1/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd1/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 1.424749e-01
-Avg ME (F77/C++) = 0.14247482467492595
-Relative difference = 5.286901344678233e-07
-OK (relative difference <= 5E-3)
-=========================================================================
-
-TEST COMPLETED
+runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/runTest.exe
+Memory access fault by GPU node-4 (Agent handle: 0x6664d0) on address 0x14bb63199000. Reason: Unknown.
diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt
index 255696cfca..1700562c87 100644
--- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt
+++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt
@@ -1,223 +1,110 @@
 export CUDACPP_RUNTIME_ENABLEFPE=on
-Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux
-OMPFLAGS=-fopenmp
-AVX=512y
+Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux
+OMPFLAGS=
+AVX=avx2
 FPTYPE=d
 HELINL=0
 HRDCOD=0
-RNDGEN=hasCurand
-Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1)
+RNDGEN=hasNoCurand
+Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1)
 make: Nothing to be done for 'gtestlibs'.
-CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0'
+CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0'
 make USEBUILDDIR=1 AVX=none
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
 CUDACPP_BUILDDIR='build.none_f_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
 make USEBUILDDIR=1 AVX=sse4
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
 CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
 make USEBUILDDIR=1 AVX=avx2
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
 CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
 make USEBUILDDIR=1 AVX=512y
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
 CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
 make USEBUILDDIR=1 AVX=512z
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
 CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
-DATE: 2024-01-27_18:40:52
+DATE: 2024-01-28_13:16:31
-On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
+On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]:
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 10 OMP=
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 5.276030e+07 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.197789e+08 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.334861e+08 ) sec^-1
-MeanMatrixElemValue = ( 2.018174e+01 +- 1.429492e+01 ) GeV^-2
-TOTAL : 0.446424 sec
- 1,957,975,833 cycles # 2.902 GHz
- 2,724,877,606 instructions # 1.39 insn per cycle
- 0.752539197 seconds time elapsed
-runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1
+runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 10 OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-==PROF== Profiling "sigmaKin": launch__registers_per_thread 167
-==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
+/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe: Segmentation fault
+ 749,803,031 cycles:u # 2.249 GHz (74.35%)
+ 2,542,073 stalled-cycles-frontend:u # 0.34% frontend cycles idle (75.95%)
+ 32,510,347 stalled-cycles-backend:u # 4.34% backend cycles idle (76.93%)
+ 1,235,780,352 instructions:u # 1.65 insn per cycle
+ # 0.03 stalled cycles per insn (75.33%)
+ 0.357181764 seconds time elapsed
 .........................................................................
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP=
+runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 7.177695e+07 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.803706e+08 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.957397e+08 ) sec^-1
-MeanMatrixElemValue = ( 2.571361e+02 +- 2.114021e+02 ) GeV^-2
-TOTAL : 0.477570 sec
- 2,061,739,652 cycles # 2.925 GHz
- 2,934,253,634 instructions # 1.42 insn per cycle
- 0.762376186 seconds time elapsed
+/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe: Segmentation fault
+ 2,487,432,579 cycles:u # 2.810 GHz (75.50%)
+ 21,154,833 stalled-cycles-frontend:u # 0.85% frontend cycles idle (75.28%)
+ 860,477,044 stalled-cycles-backend:u # 34.59% backend cycles idle (74.55%)
+ 2,480,935,938 instructions:u # 1.00 insn per cycle
+ # 0.35 stalled cycles per insn (75.06%)
+ 0.905942536 seconds time elapsed
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2
-Avg ME (C++/CUDA) = 1.424226e-01
-Avg ME (F77/CUDA) = 0.14247488790821983
-Relative difference = 0.00036713209996037764
-OK (relative difference <= 5E-3)
+cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2
+Memory access fault by GPU node-4 (Agent handle: 0x69382b0) on address 0x14696161c000. Reason: Unknown.
+
+Program received signal SIGABRT: Process abort signal.
+
+Backtrace for this error:
+#0 0x146bf848d372 in ???
+#1 0x146bf848c505 in ???
+#2 0x146bf6982dbf in ???
+#3 0x146bf6982d2b in ???
+#4 0x146bf69843e4 in ???
+#5 0x146beee55b64 in ???
+#6 0x146beee52b38 in ???
+#7 0x146beee10496 in ???
+#8 0x146bf691c6e9 in ???
+#9 0x146bf6a5049e in ???
+#10 0xffffffffffffffff in ???
+Avg ME (C++/CUDA) =
+Avg ME (F77/CUDA) =
+ERROR! Fortran calculation (F77/CUDA) crashed
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/check.exe -p 64 256 10 OMP=
+runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/check.exe -p 64 256 10 OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
+Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.156011e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.183795e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.183795e+05 ) sec^-1
-MeanMatrixElemValue = ( 2.018563e+01 +- 1.429902e+01 ) GeV^-2
-TOTAL : 1.439630 sec
- 4,349,945,055 cycles # 3.014 GHz
- 12,597,057,594 instructions # 2.90 insn per cycle
- 1.446297084 seconds time elapsed
+EvtsPerSec[Rmb+ME] (23) = ( 1.428982e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.459192e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.459192e+05 ) sec^-1
+MeanMatrixElemValue = ( 1.945525e+02 +- 1.186197e+02 ) GeV^-2
+TOTAL : 1.167732 sec
+ 4,148,960,302 cycles:u # 3.491 GHz (74.76%)
+ 2,278,304 stalled-cycles-frontend:u # 0.05% frontend cycles idle (75.05%)
+ 246,543,940 stalled-cycles-backend:u # 5.94% backend cycles idle (75.10%)
+ 12,642,512,101 instructions:u # 3.05 insn per cycle
+ # 0.02 stalled cycles per insn (75.11%)
+ 1.190458747 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 773) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/runTest.exe
-[ PASSED ] 6 tests.
--------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 1.424686e-01
-Avg ME (F77/C++) = 0.14246860569653919
-Relative difference = 3.998452420257791e-08
-OK (relative difference <= 5E-3)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/check.exe -p 64 256 10 OMP=
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
-FP precision = FLOAT (NaN/abnormal=0, zero=0)
-Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 3.236464e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.466077e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.466077e+05 ) sec^-1
-MeanMatrixElemValue = ( 2.018563e+01 +- 1.429902e+01 ) GeV^-2
-TOTAL : 0.527817 sec
- 1,596,321,277 cycles # 2.995 GHz
- 4,246,542,438 instructions # 2.66 insn per cycle
- 0.539092243 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 3265) (avx2: 0) (512y: 0) (512z: 0)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/runTest.exe
-[ PASSED ] 6 tests.
--------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 1.424686e-01
-Avg ME (F77/C++) = 0.14246860808920836
-Relative difference = 5.677888572434963e-08
-OK (relative difference <= 5E-3)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/check.exe -p 64 256 10 OMP=
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
-FP precision = FLOAT (NaN/abnormal=0, zero=0)
-Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 5.734911e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.446877e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.446877e+05 ) sec^-1
-MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2
-TOTAL : 0.307736 sec
- 852,622,477 cycles # 2.728 GHz
- 1,915,871,925 instructions # 2.25 insn per cycle
- 0.320337070 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3488) (512y: 0) (512z: 0)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/runTest.exe
-[ PASSED ] 6 tests.
--------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 1.424749e-01
-Avg ME (F77/C++) = 0.14247490815036912
-Relative difference = 5.7205649062398515e-08
-OK (relative difference <= 5E-3)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/check.exe -p 64 256 10 OMP=
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
-FP precision = FLOAT (NaN/abnormal=0, zero=0)
-Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 6.511200e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.444345e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.444345e+05 ) sec^-1
-MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2
-TOTAL : 0.272725 sec
- 782,291,543 cycles # 2.820 GHz
- 1,797,558,112 instructions # 2.30 insn per cycle
- 0.287215058 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3186) (512y: 15) (512z: 0)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/runTest.exe
-[ PASSED ] 6 tests.
--------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 1.424749e-01
-Avg ME (F77/C++) = 0.14247490815036912
-Relative difference = 5.7205649062398515e-08
-OK (relative difference <= 5E-3)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/check.exe -p 64 256 10 OMP=
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
-FP precision = FLOAT (NaN/abnormal=0, zero=0)
-Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 4.868013e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.379295e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 5.379295e+05 ) sec^-1
-MeanMatrixElemValue = ( 2.018829e+01 +- 1.429922e+01 ) GeV^-2
-TOTAL : 0.359796 sec
- 720,473,569 cycles # 1.977 GHz
- 1,287,790,296 instructions # 1.79 insn per cycle
- 0.372596873 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1730) (512y: 24) (512z: 2387)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/runTest.exe
-[ PASSED ] 6 tests.
--------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 1.424749e-01
-Avg ME (F77/C++) = 0.14247490450137867
-Relative difference = 3.159418737238044e-08
-OK (relative difference <= 5E-3)
-=========================================================================
-
-TEST COMPLETED
+runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/runTest.exe
+Memory access fault by GPU node-4 (Agent handle: 0x645490) on address 0x14bb8f15c000. Reason: Unknown.
diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0_bridge.txt
index 8680fe2d29..f7ef249b0b 100644
--- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0_bridge.txt
+++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0_bridge.txt
@@ -1,240 +1,117 @@
 export CUDACPP_RUNTIME_ENABLEFPE=on
-Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux
-OMPFLAGS=-fopenmp
-AVX=512y
+Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux
+OMPFLAGS=
+AVX=avx2
 FPTYPE=d
 HELINL=0
 HRDCOD=0
-RNDGEN=hasCurand
-Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1)
+RNDGEN=hasNoCurand
+Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1)
 make: Nothing to be done for 'gtestlibs'.
-CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0'
+CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0'
 make USEBUILDDIR=1 AVX=none
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
 CUDACPP_BUILDDIR='build.none_f_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
 make USEBUILDDIR=1 AVX=sse4
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
 CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
 make USEBUILDDIR=1 AVX=avx2
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
 CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
 make USEBUILDDIR=1 AVX=512y
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
 CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
 make USEBUILDDIR=1 AVX=512z
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
 CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
-DATE: 2024-01-27_19:24:00
+DATE: 2024-01-28_13:47:13
-On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
+On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]:
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 10 --bridge OMP=
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost
-WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
-WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384)
-WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384)
-Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 5.329665e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.932243e+07 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.932243e+07 ) sec^-1
-MeanMatrixElemValue = ( 2.017654e+01 +- 1.429184e+01 ) GeV^-2
-TOTAL : 0.458470 sec
- 1,932,748,670 cycles # 2.884 GHz
- 2,880,300,848 instructions # 1.49 insn per cycle
- 0.747483766 seconds time elapsed
-runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge
+runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 10 --bridge OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost
-WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
 WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384)
 WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384)
-==PROF== Profiling "sigmaKin": launch__registers_per_thread 167
-==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
+/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe: Segmentation fault
+ 733,284,638 cycles:u # 2.220 GHz (75.38%)
+ 2,873,339 stalled-cycles-frontend:u # 0.39% frontend cycles idle (75.28%)
+ 39,969,279 stalled-cycles-backend:u # 5.45% backend cycles idle (74.66%)
+ 1,260,934,352 instructions:u # 1.72 insn per cycle
+ # 0.03 stalled cycles per insn (75.93%)
+ 0.359797530 seconds time elapsed
 .........................................................................
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge OMP=
+runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost
-WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
 WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288)
 WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288)
-Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 5.128345e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.590470e+07 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.590470e+07 ) sec^-1
-MeanMatrixElemValue = ( 2.609942e+02 +- 2.115590e+02 ) GeV^-2
-TOTAL : 0.625994 sec
- 2,512,334,849 cycles # 2.925 GHz
- 3,779,505,778 instructions # 1.50 insn per cycle
- 0.916809729 seconds time elapsed
+/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe: Segmentation fault
+ 2,819,121,395 cycles:u # 2.794 GHz (75.71%)
+ 28,954,822 stalled-cycles-frontend:u # 1.03% frontend cycles idle (75.44%)
+ 853,775,539 stalled-cycles-backend:u # 30.29% backend cycles idle (75.44%)
+ 3,119,187,632 instructions:u # 1.11 insn per cycle
+ # 0.27 stalled cycles per insn (75.40%)
+ 1.028219917 seconds time elapsed
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2
-Avg ME (C++/CUDA) = 1.424226e-01
-Avg ME (F77/CUDA) = 0.14247488790821983
-Relative difference = 0.00036713209996037764
-OK (relative difference <= 5E-3)
+cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2
+Memory access fault by GPU node-4 (Agent handle: 0x69382b0) on address 0x15013b1bc000. Reason: Unknown.
+
+Program received signal SIGABRT: Process abort signal.
+
+Backtrace for this error:
+#0 0x1503d202b372 in ???
+#1 0x1503d202a505 in ???
+#2 0x1503d0520dbf in ???
+#3 0x1503d0520d2b in ???
+#4 0x1503d05223e4 in ???
+#5 0x1503c89f3b64 in ???
+#6 0x1503c89f0b38 in ???
+#7 0x1503c89ae496 in ???
+#8 0x1503d04ba6e9 in ???
+#9 0x1503d05ee49e in ???
+#10 0xffffffffffffffff in ???
+Avg ME (C++/CUDA) =
+Avg ME (F77/CUDA) =
+ERROR! Fortran calculation (F77/CUDA) crashed
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP=
+runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 WARNING! Instantiate host Bridge (nevt=16384)
-Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK
+Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.152331e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.180767e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.180767e+05 ) sec^-1
-MeanMatrixElemValue = ( 2.018563e+01 +- 1.429902e+01 ) GeV^-2
-TOTAL : 1.447405 sec
- 4,369,343,989 cycles # 3.011 GHz
- 12,600,604,701 instructions # 2.88 insn per cycle
- 1.452464327 seconds time elapsed
+EvtsPerSec[Rmb+ME] (23) = ( 1.427180e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.457374e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.457374e+05 ) sec^-1
+MeanMatrixElemValue = ( 1.945525e+02 +- 1.186197e+02 ) GeV^-2
+TOTAL : 1.171262 sec
+ 4,152,238,121 cycles:u # 3.482 GHz (74.65%)
+ 2,263,727 stalled-cycles-frontend:u # 0.05% frontend cycles idle (74.95%)
+ 248,276,893 stalled-cycles-backend:u # 5.98% backend cycles idle (75.18%)
+ 12,634,997,297 instructions:u # 3.04 insn per cycle
+ # 0.02 stalled cycles per insn (75.19%)
+ 1.194742461 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 773) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/runTest.exe
-[ PASSED ] 6 tests.
--------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 1.424686e-01
-Avg ME (F77/C++) = 0.14246860569653919
-Relative difference = 3.998452420257791e-08
-OK (relative difference <= 5E-3)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP=
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-WARNING! Instantiate host Bridge (nevt=16384)
-Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK
-FP precision = FLOAT (NaN/abnormal=0, zero=0)
-Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 3.208026e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.437849e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.437849e+05 ) sec^-1
-MeanMatrixElemValue = ( 2.018563e+01 +- 1.429902e+01 ) GeV^-2
-TOTAL : 0.537081 sec
- 1,624,227,785 cycles # 3.000 GHz
- 4,293,772,611 instructions # 2.64 insn per cycle
- 0.542420637 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 3265) (avx2: 0) (512y: 0) (512z: 0)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/runTest.exe
-[ PASSED ] 6 tests.
--------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 1.424686e-01
-Avg ME (F77/C++) = 0.14246860808920836
-Relative difference = 5.677888572434963e-08
-OK (relative difference <= 5E-3)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP=
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-WARNING! Instantiate host Bridge (nevt=16384)
-Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK
-FP precision = FLOAT (NaN/abnormal=0, zero=0)
-Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 5.879547e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.646923e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.646923e+05 ) sec^-1
-MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2
-TOTAL : 0.304262 sec
- 874,283,993 cycles # 2.836 GHz
- 1,951,967,000 instructions # 2.23 insn per cycle
- 0.309391709 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3488) (512y: 0) (512z: 0)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/runTest.exe
-[ PASSED ] 6 tests.
--------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 1.424749e-01
-Avg ME (F77/C++) = 0.14247490815036912
-Relative difference = 5.7205649062398515e-08
-OK (relative difference <= 5E-3)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP=
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-WARNING! Instantiate host Bridge (nevt=16384)
-Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK
-FP precision = FLOAT (NaN/abnormal=0, zero=0)
-Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 5.999359e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.842417e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.842417e+05 ) sec^-1
-MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2
-TOTAL : 0.300020 sec
- 806,296,956 cycles # 2.668 GHz
- 1,834,830,287 instructions # 2.28 insn per cycle
- 0.305527883 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3186) (512y: 15) (512z: 0)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/runTest.exe
-[ PASSED ] 6 tests.
--------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 1.424749e-01
-Avg ME (F77/C++) = 0.14247490815036912
-Relative difference = 5.7205649062398515e-08
-OK (relative difference <= 5E-3)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP=
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-WARNING! Instantiate host Bridge (nevt=16384)
-Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK
-FP precision = FLOAT (NaN/abnormal=0, zero=0)
-Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 4.823582e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.321298e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 5.321298e+05 ) sec^-1
-MeanMatrixElemValue = ( 2.018829e+01 +- 1.429922e+01 ) GeV^-2
-TOTAL : 0.367386 sec
- 744,117,486 cycles # 2.002 GHz
- 1,329,029,664 instructions # 1.79 insn per cycle
- 0.372710510 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1730) (512y: 24) (512z: 2387)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/runTest.exe
-[ PASSED ] 6 tests.
--------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 1.424749e-01
-Avg ME (F77/C++) = 0.14247490450137867
-Relative difference = 3.159418737238044e-08
-OK (relative difference <= 5E-3)
-=========================================================================
-
-TEST COMPLETED
+runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/runTest.exe
+Memory access fault by GPU node-4 (Agent handle: 0x645490) on address 0x1493c8844000. Reason: Unknown.
diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd1.txt
index 490050e744..d199db87ba 100644
--- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd1.txt
+++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd1.txt
@@ -1,223 +1,110 @@
 export CUDACPP_RUNTIME_ENABLEFPE=on
-Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux
-OMPFLAGS=-fopenmp
-AVX=512y
+Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux
+OMPFLAGS=
+AVX=avx2
 FPTYPE=d
 HELINL=0
 HRDCOD=0
-RNDGEN=hasCurand
-Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1)
+RNDGEN=hasNoCurand
+Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1)
 make: Nothing to be done for 'gtestlibs'.
-CUDACPP_BUILDDIR='build.512y_f_inl0_hrd1'
+CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd1'
 make USEBUILDDIR=1 AVX=none
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
 CUDACPP_BUILDDIR='build.none_f_inl0_hrd1'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
 make USEBUILDDIR=1 AVX=sse4
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
 CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd1'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
 make USEBUILDDIR=1 AVX=avx2
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
 CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd1'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
 make USEBUILDDIR=1 AVX=512y
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
 CUDACPP_BUILDDIR='build.512y_f_inl0_hrd1'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
 make USEBUILDDIR=1 AVX=512z
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
 CUDACPP_BUILDDIR='build.512z_f_inl0_hrd1'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
-DATE: 2024-01-27_18:41:10
+DATE: 2024-01-28_13:16:40
-On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
+On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]:
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/gcheck.exe -p 64 256 10 OMP=
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1]
-Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 5.294931e+07 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.189738e+08 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.329663e+08 ) sec^-1
-MeanMatrixElemValue = ( 2.018174e+01 +- 1.429492e+01 ) GeV^-2
-TOTAL : 0.445614 sec
- 1,926,682,589 cycles # 2.902 GHz
- 2,707,428,922 instructions # 1.41 insn per cycle
- 0.733083595 seconds time elapsed
-runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/gcheck.exe -p 64 256 1
+runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/gcheck.exe -p 64 256 10 OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-==PROF== Profiling "sigmaKin": launch__registers_per_thread 167
-==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
+/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/gcheck.exe: Segmentation fault
+ 729,890,155 cycles:u # 2.185 GHz (76.20%)
+ 2,776,032 stalled-cycles-frontend:u # 0.38% frontend cycles idle (74.23%)
+ 41,483,645 stalled-cycles-backend:u # 5.68% backend cycles idle (71.44%)
+ 1,265,802,703 instructions:u # 1.73 insn per cycle
+ # 0.03 stalled cycles per insn (73.74%)
+ 0.356567682 seconds time elapsed
 .........................................................................
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP=
+runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1]
-Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 7.149926e+07 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.777474e+08 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.917051e+08 ) sec^-1
-MeanMatrixElemValue = ( 2.571361e+02 +- 2.114021e+02 ) GeV^-2
-TOTAL : 0.479694 sec
- 2,060,682,910 cycles # 2.910 GHz
- 2,931,090,456 instructions # 1.42 insn per cycle
- 0.766292085 seconds time elapsed
+/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/gcheck.exe: Segmentation fault
+ 2,586,507,860 cycles:u # 2.874 GHz (74.53%)
+ 20,964,775 stalled-cycles-frontend:u # 0.81% frontend cycles idle (75.43%)
+ 845,541,977 stalled-cycles-backend:u # 32.69% backend cycles idle (75.13%)
+ 2,441,025,850 instructions:u # 0.94 insn per cycle
+ # 0.35 stalled cycles per insn (75.35%)
+ 0.918884236 seconds time elapsed
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/gcheck.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/fgcheck.exe 2 64 2
-Avg ME (C++/CUDA) = 1.424226e-01
-Avg ME (F77/CUDA) = 0.14247488790821983
-Relative difference = 0.00036713209996037764
-OK (relative difference <= 5E-3)
+cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/gcheck.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/fgcheck.exe 2 64 2
+Memory access fault by GPU node-4 (Agent handle: 0x6938200) on address 0x14a208bdc000. Reason: Unknown.
+
+Program received signal SIGABRT: Process abort signal.
+
+Backtrace for this error:
+#0 0x14a49fa51372 in ???
+#1 0x14a49fa50505 in ???
+#2 0x14a49df46dbf in ???
+#3 0x14a49df46d2b in ???
+#4 0x14a49df483e4 in ???
+#5 0x14a496419b64 in ???
+#6 0x14a496416b38 in ???
+#7 0x14a4963d4496 in ???
+#8 0x14a49dee06e9 in ???
+#9 0x14a49e01449e in ???
+#10 0xffffffffffffffff in ???
+Avg ME (C++/CUDA) =
+Avg ME (F77/CUDA) =
+ERROR! Fortran calculation (F77/CUDA) crashed
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/check.exe -p 64 256 10 OMP=
+runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/check.exe -p 64 256 10 OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
+Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1]
+Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.155890e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.183617e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.183617e+05 ) sec^-1
-MeanMatrixElemValue = ( 2.018563e+01 +- 1.429902e+01 ) GeV^-2
-TOTAL : 1.441481 sec
- 4,347,023,152 cycles # 3.011 GHz
- 12,588,009,166 instructions # 2.90 insn per cycle
- 1.448735748 seconds time elapsed
+EvtsPerSec[Rmb+ME] (23) = ( 1.424910e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.455027e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.455027e+05 ) sec^-1
+MeanMatrixElemValue = ( 1.945525e+02 +- 1.186197e+02 ) GeV^-2
+TOTAL : 1.170740 sec
+ 4,155,869,791 cycles:u # 3.488 GHz (74.64%)
+ 1,914,547 stalled-cycles-frontend:u # 0.05% frontend cycles idle (74.96%)
+ 509,366,618 stalled-cycles-backend:u # 12.26% backend cycles idle (75.16%)
+ 12,626,103,775 instructions:u # 3.04 insn per cycle
+ # 0.04 stalled cycles per insn (75.17%)
+ 1.193298382 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 759) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/runTest.exe
-[ PASSED ] 6 tests.
--------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 1.424686e-01
-Avg ME (F77/C++) = 0.14246860569653919
-Relative difference = 3.998452420257791e-08
-OK (relative difference <= 5E-3)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd1/check.exe -p 64 256 10 OMP=
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
-FP precision = FLOAT (NaN/abnormal=0, zero=0)
-Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 3.253491e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.483392e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.483392e+05 ) sec^-1
-MeanMatrixElemValue = ( 2.018563e+01 +- 1.429902e+01 ) GeV^-2
-TOTAL : 0.524401 sec
- 1,588,950,296 cycles # 3.004 GHz
- 4,240,918,979 instructions # 2.67 insn per cycle
- 0.538341289 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 3248) (avx2: 0) (512y: 0) (512z: 0)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd1/runTest.exe
-[ PASSED ] 6 tests.
--------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd1/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd1/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 1.424686e-01
-Avg ME (F77/C++) = 0.14246860808920836
-Relative difference = 5.677888572434963e-08
-OK (relative difference <= 5E-3)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd1/check.exe -p 64 256 10 OMP=
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
-FP precision = FLOAT (NaN/abnormal=0, zero=0)
-Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 5.961223e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.731117e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.731117e+05 ) sec^-1
-MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2
-TOTAL : 0.295877 sec
- 848,904,121 cycles # 2.823 GHz
- 1,913,779,976 instructions # 2.25 insn per cycle
- 0.307372896 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3463) (512y: 0) (512z: 0)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd1/runTest.exe
-[ PASSED ] 6 tests.
--------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd1/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd1/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 1.424749e-01
-Avg ME (F77/C++) = 0.14247490815036912
-Relative difference = 5.7205649062398515e-08
-OK (relative difference <= 5E-3)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd1/check.exe -p 64 256 10 OMP=
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
-FP precision = FLOAT (NaN/abnormal=0, zero=0)
-Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 6.485658e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.411446e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.411446e+05 ) sec^-1
-MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2
-TOTAL : 0.275232 sec
- 779,943,899 cycles # 2.803 GHz
- 1,795,594,447 instructions # 2.30 insn per cycle
- 0.289346704 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3164) (512y: 15) (512z: 0)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd1/runTest.exe
-[ PASSED ] 6 tests.
--------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd1/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd1/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 1.424749e-01
-Avg ME (F77/C++) = 0.14247490815036912
-Relative difference = 5.7205649062398515e-08
-OK (relative difference <= 5E-3)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd1/check.exe -p 64 256 10 OMP=
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
-FP precision = FLOAT (NaN/abnormal=0, zero=0)
-Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 4.833560e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.329755e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 5.329755e+05 ) sec^-1
-MeanMatrixElemValue = ( 2.018829e+01 +- 1.429922e+01 ) GeV^-2
-TOTAL : 0.362099 sec
- 720,251,915 cycles # 1.963 GHz
- 1,286,597,647 instructions # 1.79 insn per cycle
- 0.375243448 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1709) (512y: 24) (512z: 2387)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd1/runTest.exe
-[ PASSED ] 6 tests.
--------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd1/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd1/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 1.424749e-01
-Avg ME (F77/C++) = 0.14247490450137867
-Relative difference = 3.159418737238044e-08
-OK (relative difference <= 5E-3)
-=========================================================================
-
-TEST COMPLETED
+runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/runTest.exe
+Memory access fault by GPU node-4 (Agent handle: 0x644060) on address 0x152b34aec000. Reason: Unknown.
diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt
index 864c5b4dac..1f1050ab1d 100644
--- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt
+++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt
@@ -1,223 +1,110 @@
 export CUDACPP_RUNTIME_ENABLEFPE=on
-Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux
-OMPFLAGS=-fopenmp
-AVX=512y
+Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux
+OMPFLAGS=
+AVX=avx2
 FPTYPE=d
 HELINL=0
 HRDCOD=0
-RNDGEN=hasCurand
-Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1)
+RNDGEN=hasNoCurand
+Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1)
 make: Nothing to be done for 'gtestlibs'.
-CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0'
+CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0'
 make USEBUILDDIR=1 AVX=none
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
 CUDACPP_BUILDDIR='build.none_m_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2024-01-27_18:41:27 +DATE: 2024-01-28_13:16:49 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/gcheck.exe -p 64 256 10 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.679353e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.341805e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.729214e+07 ) sec^-1 -MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.448080 sec - 1,940,047,979 cycles # 2.920 GHz - 2,742,937,870 instructions # 1.41 insn per cycle - 0.735393353 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/gcheck.exe -p 64 256 1 +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/gcheck.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/gcheck.exe: Segmentation fault + 726,421,911 cycles:u # 2.167 GHz (76.90%) + 2,699,812 stalled-cycles-frontend:u # 0.37% frontend cycles idle (76.94%) + 39,906,193 stalled-cycles-backend:u # 5.49% backend cycles idle (74.90%) + 1,292,140,442 instructions:u # 1.78 insn per cycle + # 0.03 stalled cycles per insn (72.28%) + 0.356876792 seconds time elapsed ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.226696e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.100972e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.538337e+07 ) sec^-1 -MeanMatrixElemValue = ( 2.602505e+02 +- 2.116328e+02 ) GeV^-2 -TOTAL : 0.540123 sec - 2,295,280,774 cycles # 2.915 GHz - 3,248,742,569 instructions # 1.42 insn per cycle - 0.847439533 seconds time elapsed +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/gcheck.exe: Segmentation fault + 2,747,293,496 cycles:u # 2.868 GHz (72.77%) + 21,347,674 stalled-cycles-frontend:u # 0.78% frontend cycles idle (74.38%) + 843,785,312 stalled-cycles-backend:u # 30.71% backend cycles idle (76.08%) + 2,518,401,996 instructions:u # 0.92 insn per cycle + # 0.34 stalled cycles per insn (75.72%) + 0.977993834 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 1.424749e-01 -Avg ME (F77/CUDA) = 0.14247482577104625 -Relative difference = 5.209967070245855e-07 -OK (relative difference <= 5E-3) +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/fgcheck.exe 2 64 2 +Memory access fault by GPU node-4 (Agent handle: 0x693a290) on address 0x147ca1d69000. Reason: Unknown. + +Program received signal SIGABRT: Process abort signal. + +Backtrace for this error: +#0 0x147f38bda372 in ??? +#1 0x147f38bd9505 in ??? +#2 0x147f370cddbf in ??? +#3 0x147f370cdd2b in ??? +#4 0x147f370cf3e4 in ??? +#5 0x147f2f5a0b64 in ??? +#6 0x147f2f59db38 in ??? +#7 0x147f2f55b496 in ??? +#8 0x147f370676e9 in ??? +#9 0x147f3719b49e in ??? +#10 0xffffffffffffffff in ??? +Avg ME (C++/CUDA) = +Avg ME (F77/CUDA) = +ERROR! Fortran calculation (F77/CUDA) crashed ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.025332e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.046638e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.046638e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 1.621376 sec - 4,906,088,168 cycles # 3.018 GHz - 13,824,588,217 instructions # 2.82 insn per cycle - 1.628687956 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 1.168410e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.187978e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.187978e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.914935e+02 +- 1.163297e+02 ) GeV^-2 +TOTAL : 1.425565 sec + 5,053,853,375 cycles:u # 3.490 GHz (74.65%) + 2,147,400 stalled-cycles-frontend:u # 0.04% frontend cycles idle (74.86%) + 857,993,584 stalled-cycles-backend:u # 16.98% backend cycles idle (75.11%) + 13,839,852,873 instructions:u # 2.74 insn per cycle + # 0.06 stalled cycles per insn (75.15%) + 1.450066232 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1135) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482734618697 -Relative difference = 5.099411406595165e-07 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd0/check.exe -p 64 256 10 OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.920741e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.996876e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.996876e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.875767 sec - 2,604,975,465 cycles # 2.959 GHz - 7,349,296,033 instructions # 2.82 insn per cycle - 0.890044499 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2967) (avx2: 0) (512y: 0) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. 
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482734618697 -Relative difference = 5.099411406595165e-07 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd0/check.exe -p 64 256 10 OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.212351e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.429946e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.429946e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.533775 sec - 1,473,910,132 cycles # 2.737 GHz - 3,084,284,378 instructions # 2.09 insn per cycle - 0.547758581 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3008) (512y: 0) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482643254802 -Relative difference = 5.163537715318965e-07 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd0/check.exe -p 64 256 10 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.852658e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.155181e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.155181e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.448502 sec - 1,285,781,151 cycles # 2.837 GHz - 2,873,225,261 instructions # 2.23 insn per cycle - 0.462355639 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2653) (512y: 96) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482643254802 -Relative difference = 5.163537715318965e-07 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd0/check.exe -p 64 256 10 OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.496967e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.624427e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.624427e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.681772 sec - 1,314,048,592 cycles # 1.914 GHz - 1,914,956,895 instructions # 1.46 insn per cycle - 0.694401401 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1493) (512y: 70) (512z: 2164) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. 
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482643254802 -Relative difference = 5.163537715318965e-07 -OK (relative difference <= 5E-3) -========================================================================= - -TEST COMPLETED +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/runTest.exe +Memory access fault by GPU node-4 (Agent handle: 0x667850) on address 0x1477c4599000. Reason: Unknown. diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd1.txt index 4ce5d2d103..f8c7f13430 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd1.txt @@ -1,223 +1,110 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasCurand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) +RNDGEN=hasNoCurand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.512y_m_inl0_hrd1' +CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.none_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2024-01-27_18:41:45 +DATE: 2024-01-28_13:16:58 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/gcheck.exe -p 64 256 10 OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.642077e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.178835e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.540093e+07 ) sec^-1 -MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.449788 sec - 1,941,972,721 cycles # 2.918 GHz - 2,738,636,267 instructions # 1.41 insn per cycle - 0.738186725 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/gcheck.exe -p 64 256 1 +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/gcheck.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/gcheck.exe: Segmentation fault + 735,921,519 cycles:u # 2.199 GHz (76.38%) + 2,583,381 stalled-cycles-frontend:u # 0.35% frontend cycles idle (75.85%) + 39,817,056 stalled-cycles-backend:u # 5.41% backend cycles idle (74.64%) + 1,284,923,774 instructions:u # 1.75 insn per cycle + # 0.03 stalled cycles per insn (72.51%) + 0.356777627 seconds time elapsed ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.213581e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.983987e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.394706e+07 ) sec^-1 -MeanMatrixElemValue = ( 2.602505e+02 +- 2.116328e+02 ) GeV^-2 -TOTAL : 0.539583 sec - 2,245,163,074 cycles # 2.866 GHz - 3,192,763,021 instructions # 1.42 insn per cycle - 0.840841647 seconds time elapsed +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/gcheck.exe: Segmentation fault + 2,678,401,129 cycles:u # 2.826 GHz (74.65%) + 21,071,117 stalled-cycles-frontend:u # 0.79% frontend cycles idle (75.83%) + 849,774,878 stalled-cycles-backend:u # 31.73% backend cycles idle (75.53%) + 2,561,641,144 instructions:u # 0.96 insn per cycle + # 0.33 stalled cycles per insn (75.48%) + 0.986518250 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 1.424749e-01 -Avg ME (F77/CUDA) = 0.14247482577104625 -Relative difference = 5.209967070245855e-07 -OK (relative difference <= 5E-3) +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/fgcheck.exe 2 64 2 +Memory access fault by GPU node-4 (Agent handle: 0x693a1e0) on address 0x154e15329000. Reason: Unknown. + +Program received signal SIGABRT: Process abort signal. + +Backtrace for this error: +#0 0x1551ac1af372 in ??? +#1 0x1551ac1ae505 in ??? +#2 0x1551aa6a2dbf in ??? +#3 0x1551aa6a2d2b in ??? +#4 0x1551aa6a43e4 in ??? +#5 0x1551a2b75b64 in ??? +#6 0x1551a2b72b38 in ??? +#7 0x1551a2b30496 in ??? +#8 0x1551aa63c6e9 in ??? +#9 0x1551aa77049e in ??? +#10 0xffffffffffffffff in ??? +Avg ME (C++/CUDA) = +Avg ME (F77/CUDA) = +ERROR! 
Fortran calculation (F77/CUDA) crashed ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.021933e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.043806e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.043806e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 1.626260 sec - 4,912,245,932 cycles # 3.013 GHz - 13,831,784,678 instructions # 2.82 insn per cycle - 1.633425184 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 1.169455e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.189040e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.189040e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.914935e+02 +- 1.163297e+02 ) GeV^-2 +TOTAL : 1.424073 sec + 5,045,556,451 cycles:u # 3.490 GHz (74.78%) + 2,489,730 stalled-cycles-frontend:u # 0.05% frontend cycles idle (75.01%) + 792,720,433 stalled-cycles-backend:u # 15.71% backend cycles idle (75.11%) + 13,852,579,012 instructions:u # 2.75 insn per cycle + # 0.06 stalled cycles per insn (75.11%) + 1.447924472 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1130) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482734618697 -Relative difference = 5.099411406595165e-07 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd1/check.exe -p 64 256 10 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.952614e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.030072e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.030072e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.861728 sec - 2,610,889,602 cycles # 3.013 GHz - 7,352,459,897 instructions # 2.82 insn per cycle - 0.877694167 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2957) (avx2: 0) (512y: 0) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482734618697 -Relative difference = 5.099411406595165e-07 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd1/check.exe -p 64 256 10 OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.319451e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.546025e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.546025e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.516383 sec - 1,474,990,397 cycles # 2.829 GHz - 3,084,581,300 instructions # 2.09 insn per cycle - 0.532151264 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2986) (512y: 0) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. 
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482643254802 -Relative difference = 5.163537715318965e-07 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd1/check.exe -p 64 256 10 OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.830634e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.128730e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.128730e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.450724 sec - 1,286,646,934 cycles # 2.823 GHz - 2,874,817,797 instructions # 2.23 insn per cycle - 0.463104569 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2636) (512y: 96) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482643254802 -Relative difference = 5.163537715318965e-07 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd1/check.exe -p 64 256 10 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.498045e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.624903e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.624903e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.681753 sec - 1,315,133,016 cycles # 1.917 GHz - 1,915,783,936 instructions # 1.46 insn per cycle - 0.692521331 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1476) (512y: 70) (512z: 2164) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482643254802 -Relative difference = 5.163537715318965e-07 -OK (relative difference <= 5E-3) -========================================================================= - -TEST COMPLETED +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/runTest.exe +Memory access fault by GPU node-4 (Agent handle: 0x6664d0) on address 0x14ef36f79000. Reason: Unknown. 
From e8ea9a07aef5543b65cb1e06a153b4fb3cf89215 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Sun, 28 Jan 2024 17:47:59 +0200 Subject: [PATCH 73/96] [jt774] first execution of all 18 tmad tests on LUMI (on CPUs and AMD GPUs) - several failures in ggttq #806 STARTED AT Sun 28 Jan 2024 01:57:16 PM EET ENDED AT Sun 28 Jan 2024 05:33:56 PM EET Status=0 16 /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt 16 /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt 16 /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt 16 /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt 16 /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt 16 /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt 16 /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt 16 /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt 16 /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt 16 /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt 16 /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt 16 /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt 16 /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt 16 /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt 16 /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt 12 /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt 12 /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt 12 /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt --- .../log_eemumu_mad_d_inl0_hrd0.txt | 390 +++++---------- .../log_eemumu_mad_f_inl0_hrd0.txt | 420 ++++++----------- .../log_eemumu_mad_m_inl0_hrd0.txt | 394 +++++----------- .../log_ggtt_mad_d_inl0_hrd0.txt | 392 +++++----------- .../log_ggtt_mad_f_inl0_hrd0.txt | 416 ++++++---------- .../log_ggtt_mad_m_inl0_hrd0.txt | 390 +++++---------- .../log_ggttg_mad_d_inl0_hrd0.txt | 390 +++++---------- .../log_ggttg_mad_f_inl0_hrd0.txt | 416 ++++++---------- .../log_ggttg_mad_m_inl0_hrd0.txt | 390 +++++---------- .../log_ggttgg_mad_d_inl0_hrd0.txt | 388 +++++---------- .../log_ggttgg_mad_f_inl0_hrd0.txt | 414 ++++++---------- .../log_ggttgg_mad_m_inl0_hrd0.txt | 390 +++++---------- .../log_ggttggg_mad_d_inl0_hrd0.txt | 392 +++++----------- .../log_ggttggg_mad_f_inl0_hrd0.txt | 414 ++++++---------- .../log_ggttggg_mad_m_inl0_hrd0.txt | 392 +++++----------- .../log_gqttq_mad_d_inl0_hrd0.txt | 419 ++++------------- .../log_gqttq_mad_f_inl0_hrd0.txt | 443 +++++------------- .../log_gqttq_mad_m_inl0_hrd0.txt | 417 ++++------------- 18 files changed, 2177 insertions(+), 5090 deletions(-) diff --git a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt index 2c75ea3bc5..6e36ff4f89 
100644 --- a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +Working directory (build): /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum CUDACPP_BUILDDIR='.' make USEBUILDDIR=1 AVX=none make USEBUILDDIR=1 AVX=sse4 - +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 + make USEBUILDDIR=1 AVX=512y +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' OMP_NUM_THREADS= -DATE: 2024-01-27_19:44:10 +DATE: 2024-01-28_14:10:41 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +Working directory (run): /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -50,8 +50,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/avalassi/output_eemumu_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./madevent_fortran < /tmp/valassia/input_eemumu_x1_fortran > /tmp/valassia/output_eemumu_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169064681776] fbridge_mode=0 [UNWEIGHT] Wrote 3893 events (found 7395 events) - [COUNTERS] PROGRAM TOTAL : 0.6110s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6027s - [COUNTERS] Fortran MEs ( 1 ) : 0.0083s for 8192 events => throughput is 9.87E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4854s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4783s + [COUNTERS] Fortran MEs ( 1 ) : 0.0071s for 8192 events => throughput is 1.15E+06 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -75,8 +75,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/avalassi/output_eemumu_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./madevent_fortran < /tmp/valassia/input_eemumu_x1_fortran > /tmp/valassia/output_eemumu_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169064681776] fbridge_mode=0 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1731s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1650s - [COUNTERS] Fortran MEs ( 1 ) : 0.0080s for 8192 events => throughput is 1.02E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1406s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1335s + [COUNTERS] Fortran MEs ( 1 ) : 0.0071s for 8192 events => throughput is 1.15E+06 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -100,8 +100,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! 
Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x10_fortran > /tmp/avalassi/output_eemumu_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./madevent_fortran < /tmp/valassia/input_eemumu_x10_fortran > /tmp/valassia/output_eemumu_x10_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x10_fortran > /tmp/a [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501919904813669E-002] fbridge_mode=0 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.4157s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3266s - [COUNTERS] Fortran MEs ( 1 ) : 0.0891s for 90112 events => throughput is 1.01E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3372s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2609s + [COUNTERS] Fortran MEs ( 1 ) : 0.0763s for 90112 events => throughput is 1.18E+06 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -125,8 +125,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x1_cudacpp > /tmp/valassia/output_eemumu_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -134,9 +134,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169064681779] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1805s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1737s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0068s for 8192 events => throughput is 1.20E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1478s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1419s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0059s for 8192 events => throughput is 1.38E+06 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -158,8 +158,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x10_cudacpp > /tmp/valassia/output_eemumu_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -167,9 +167,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501919904813669E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.4118s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3360s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0758s for 90112 events => throughput is 1.19E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3339s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2687s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0652s for 90112 events => throughput is 1.38E+06 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -180,14 +180,14 @@ OK! xsec from fortran (9.1501919904813669E-002) and cpp (9.1501919904813669E-002 OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.120640e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.415290e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.164378e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.439208e+06 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -201,8 +201,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x1_cudacpp > /tmp/valassia/output_eemumu_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -210,9 +210,9 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169064681779] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1740s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1700s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0040s for 8192 events => throughput is 2.04E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1436s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1401s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0035s for 8192 events => throughput is 2.33E+06 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -234,8 +234,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x10_cudacpp > /tmp/valassia/output_eemumu_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -243,9 +243,9 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501919904813656E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.3753s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3312s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0441s for 90112 events => throughput is 2.04E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3044s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2659s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0385s for 90112 events => throughput is 2.34E+06 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -256,14 +256,14 @@ OK! xsec from fortran (9.1501919904813669E-002) and cpp (9.1501919904813656E-002 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.983144e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.384930e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.061610e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.430126e+06 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -277,8 +277,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x1_cudacpp > /tmp/valassia/output_eemumu_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -286,9 +286,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169064681776] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1752s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1722s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0029s for 8192 events => throughput is 2.79E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1405s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1381s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0025s for 8192 events => throughput is 3.34E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -310,8 +310,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x10_cudacpp > /tmp/valassia/output_eemumu_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -319,9 +319,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501919904813669E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.3655s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3325s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0330s for 90112 events => throughput is 2.73E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.2972s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2702s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0271s for 90112 events => throughput is 3.33E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -332,166 +332,18 @@ OK! xsec from fortran (9.1501919904813669E-002) and cpp (9.1501919904813669E-002 OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.615407e+06 ) sec^-1 - -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.773647e+06 ) sec^-1 - -*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 4/16 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2175 [0.21747169064681776] fbridge_mode=1 - [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1713s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1685s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0027s for 8192 events => throughput is 3.02E+06 events/s - -*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (0.21747169064681776) and cpp (0.21747169064681776) differ by less than 2E-14 (0.0) - -*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical - -*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -81920 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 4/16 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.0915 [9.1501919904813669E-002] fbridge_mode=1 - [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.3653s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3344s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0309s for 90112 events => throughput is 2.92E+06 events/s - -*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (9.1501919904813669E-002) and cpp (9.1501919904813669E-002) differ by less than 2E-14 (0.0) - -*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical - -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.802242e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.422146e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.082983e+06 ) sec^-1 - -*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 4/16 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2175 [0.21747169064681776] fbridge_mode=1 - [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1758s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1723s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0036s for 8192 events => throughput is 2.30E+06 events/s - -*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (0.21747169064681776) and cpp (0.21747169064681776) differ by less than 2E-14 (0.0) - -*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical - -*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -81920 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 4/16 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.0915 [9.1501919904813669E-002] fbridge_mode=1 - [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.3730s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3371s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0359s for 90112 events => throughput is 2.51E+06 events/s - -*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (9.1501919904813669E-002) and cpp (9.1501919904813669E-002) differ by less than 2E-14 (0.0) +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.551254e+06 ) sec^-1 -*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** +*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) *** -OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical - -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.322806e+06 ) sec^-1 - -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.501885e+06 ) sec^-1 +*** (2-512z) WARNING! 
SKIP MADEVENT_CPP (512z is not supported on this node) *** *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -505,18 +357,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 4/16 +Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/valassia/input_eemumu_x1_cudacpp > /tmp/valassia/output_eemumu_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169064681776] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.6076s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6070s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0005s for 8192 events => throughput is 1.55E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.4130s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4126s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0004s for 8192 events => throughput is 2.02E+07 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -538,65 +390,65 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 4/16 +Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/valassia/input_eemumu_x10_cudacpp > /tmp/valassia/output_eemumu_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.0915 [9.1501919904813656E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.0915 [9.1501919904813669E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.7848s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7799s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0049s for 90112 events => throughput is 1.84E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.5450s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5405s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0045s for 90112 events => throughput is 2.02E+07 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.1501919904813669E-002) and cpp (9.1501919904813656E-002) differ by less than 2E-14 (1.1102230246251565e-16) +OK! xsec from fortran (9.1501919904813669E-002) and cpp (9.1501919904813669E-002) differ by less than 2E-14 (0.0) *** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.938455e+07 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.227145e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.994935e+08 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.604349e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.746082e+07 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.300423e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.494025e+08 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.897629e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.728254e+07 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.338370e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.067943e+08 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.949984e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.726308e+07 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.235340e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK 
-EvtsPerSec[MECalcOnly] (3a) = ( 2.130699e+08 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.550220e+07 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt index 990ba27411..af7e96ea68 100644 --- a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +Working directory (build): /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum CUDACPP_BUILDDIR='.' - make USEBUILDDIR=1 AVX=none -make USEBUILDDIR=1 AVX=sse4 +make USEBUILDDIR=1 AVX=sse4 +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 + make USEBUILDDIR=1 AVX=512y +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' OMP_NUM_THREADS= -DATE: 2024-01-27_19:44:27 +DATE: 2024-01-28_14:10:55 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +Working directory (run): /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -50,8 +50,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/avalassi/output_eemumu_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./madevent_fortran < /tmp/valassia/input_eemumu_x1_fortran > /tmp/valassia/output_eemumu_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169064681776] fbridge_mode=0 [UNWEIGHT] Wrote 3893 events (found 7395 events) - [COUNTERS] PROGRAM TOTAL : 0.6069s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5988s - [COUNTERS] Fortran MEs ( 1 ) : 0.0081s for 8192 events => throughput is 1.01E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.4650s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4578s + [COUNTERS] Fortran MEs ( 1 ) : 0.0072s for 8192 events => throughput is 1.15E+06 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -75,8 +75,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/avalassi/output_eemumu_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./madevent_fortran < /tmp/valassia/input_eemumu_x1_fortran > /tmp/valassia/output_eemumu_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169064681776] fbridge_mode=0 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1733s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1651s - [COUNTERS] Fortran MEs ( 1 ) : 0.0082s for 8192 events => throughput is 1.00E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1413s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1342s + [COUNTERS] Fortran MEs ( 1 ) : 0.0071s for 8192 events => throughput is 1.15E+06 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -100,8 +100,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x10_fortran > /tmp/avalassi/output_eemumu_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./madevent_fortran < /tmp/valassia/input_eemumu_x10_fortran > /tmp/valassia/output_eemumu_x10_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x10_fortran > /tmp/a [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501919904813669E-002] fbridge_mode=0 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.4192s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3300s - [COUNTERS] Fortran MEs ( 1 ) : 0.0892s for 90112 events => throughput is 1.01E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3373s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2610s + [COUNTERS] Fortran MEs ( 1 ) : 0.0764s for 90112 events => throughput is 1.18E+06 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -125,22 +125,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x1_cudacpp > /tmp/valassia/output_eemumu_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2175 [0.21747165492032638] fbridge_mode=1 + [XSECTION] Cross section = 0.2175 [0.21747165804194712] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1785s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1721s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0064s for 8192 events => throughput is 1.28E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1458s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1406s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0052s for 8192 events => throughput is 1.59E+06 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.21747169064681776) and cpp (0.21747165492032638) differ by less than 4E-4 (1.6428111293542713e-07) +OK! xsec from fortran (0.21747169064681776) and cpp (0.21747165804194712) differ by less than 4E-4 (1.4992696539817274e-07) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -158,36 +158,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x10_cudacpp > /tmp/valassia/output_eemumu_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.0915 [9.1501905274264717E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.0915 [9.1501906417650977E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.4078s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3356s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0722s for 90112 events => throughput is 1.25E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3287s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2722s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0565s for 90112 events => throughput is 1.60E+06 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.1501919904813669E-002) and cpp (9.1501905274264717E-002) differ by less than 4E-4 (1.5989335488963974e-07) +OK! xsec from fortran (9.1501919904813669E-002) and cpp (9.1501906417650977E-002) differ by less than 4E-4 (1.473975923538262e-07) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.252721e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.646088e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.203917e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.666997e+06 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -201,22 +201,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x1_cudacpp > /tmp/valassia/output_eemumu_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2175 [0.21747165570339780] fbridge_mode=1 + [XSECTION] Cross section = 0.2175 [0.21747170102104579] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1716s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1691s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0025s for 8192 events => throughput is 3.30E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1404s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1383s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0021s for 8192 events => throughput is 3.90E+06 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.21747169064681776) and cpp (0.21747165570339780) differ by less than 4E-4 (1.6068031594151932e-07) +OK! xsec from fortran (0.21747169064681776) and cpp (0.21747170102104579) differ by less than 4E-4 (4.77038091251103e-08) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -234,36 +234,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x10_cudacpp > /tmp/valassia/output_eemumu_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.0915 [9.1501905322826635E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.0915 [9.1501924220365086E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.3580s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3306s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0275s for 90112 events => throughput is 3.28E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.2878s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2644s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0234s for 90112 events => throughput is 3.85E+06 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.1501919904813669E-002) and cpp (9.1501905322826635E-002) differ by less than 4E-4 (1.5936263464411127e-07) +OK! xsec from fortran (9.1501919904813669E-002) and cpp (9.1501924220365086E-002) differ by less than 4E-4 (4.716350665567859e-08) *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.171921e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.048771e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.403675e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.232321e+06 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -277,22 +277,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x1_cudacpp > /tmp/valassia/output_eemumu_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2175 [0.21747165593922979] fbridge_mode=1 + [XSECTION] Cross section = 0.2175 [0.21747170107722075] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1713s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1689s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0023s for 8192 events => throughput is 3.54E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1388s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1370s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0018s for 8192 events => throughput is 4.66E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.21747169064681776) and cpp (0.21747165593922979) differ by less than 4E-4 (1.5959588972602745e-07) +OK! xsec from fortran (0.21747169064681776) and cpp (0.21747170107722075) differ by less than 4E-4 (4.7962118499000894e-08) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -310,188 +310,40 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x10_cudacpp > /tmp/valassia/output_eemumu_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.0915 [9.1501905316084181E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.0915 [9.1501924223714337E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.3566s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3324s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0242s for 90112 events => throughput is 3.72E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.2845s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2651s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0194s for 90112 events => throughput is 4.64E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.1501919904813669E-002) and cpp (9.1501905316084181E-002) differ by less than 4E-4 (1.5943632114545636e-07) +OK! xsec from fortran (9.1501919904813669E-002) and cpp (9.1501924223714337E-002) differ by less than 4E-4 (4.720010982062206e-08) *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.720332e+06 ) sec^-1 - -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.793683e+06 ) sec^-1 - -*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 4/16 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2175 [0.21747165593922979] fbridge_mode=1 - [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1696s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1675s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0020s for 8192 events => throughput is 4.00E+06 events/s - -*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (0.21747169064681776) and cpp (0.21747165593922979) differ by less than 4E-4 (1.5959588972602745e-07) - -*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical - -*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -81920 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 4/16 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.0915 [9.1501905316084181E-002] fbridge_mode=1 - [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.3526s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3301s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0226s for 90112 events => throughput is 3.99E+06 events/s - -*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (9.1501919904813669E-002) and cpp (9.1501905316084181E-002) differ by less than 4E-4 (1.5943632114545636e-07) - -*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical - -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.903694e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.878508e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.107507e+06 ) sec^-1 - -*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 4/16 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2175 [0.21747166446533123] fbridge_mode=1 - [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1730s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1710s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0021s for 8192 events => throughput is 3.93E+06 events/s +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.048712e+06 ) sec^-1 -*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** +*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) *** -OK! 
xsec from fortran (0.21747169064681776) and cpp (0.21747166446533123) differ by less than 4E-4 (1.2039032049049325e-07) - -*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical - -*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -81920 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 4/16 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.0915 [9.1501908990866423E-002] fbridge_mode=1 - [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.3583s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3343s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0241s for 90112 events => throughput is 3.74E+06 events/s - -*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (9.1501919904813669E-002) and cpp (9.1501908990866423E-002) differ by less than 4E-4 (1.1927560927826875e-07) - -*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical - -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.576234e+06 ) sec^-1 - -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.897428e+06 ) sec^-1 +*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) *** *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -505,22 +357,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 4/16 +Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/valassia/input_eemumu_x1_cudacpp > /tmp/valassia/output_eemumu_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2175 [0.21747166823487174] fbridge_mode=1 + [XSECTION] Cross section = 0.2175 [0.21747166473699145] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.5913s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5909s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0005s for 8192 events => throughput is 1.73E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.4212s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4209s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0003s for 8192 events => throughput is 2.93E+07 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.21747169064681776) and cpp (0.21747166823487174) differ by less than 4E-4 (1.0305684361444634e-07) +OK! xsec from fortran (0.21747169064681776) and cpp (0.21747166473699145) differ by less than 4E-4 (1.1914114539379739e-07) *** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -538,65 +390,65 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 4/16 +Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/valassia/input_eemumu_x10_cudacpp > /tmp/valassia/output_eemumu_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.0915 [9.1501910542849674E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.0915 [9.1501909133729520E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.7560s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7514s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0045s for 90112 events => throughput is 1.99E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.5440s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5410s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0030s for 90112 events => throughput is 3.01E+07 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.1501919904813669E-002) and cpp (9.1501910542849674E-002) differ by less than 4E-4 (1.0231439961927435e-07) +OK! xsec from fortran (9.1501919904813669E-002) and cpp (9.1501909133729520E-002) differ by less than 4E-4 (1.1771429675455636e-07) *** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.588173e+07 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.778755e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.068280e+08 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.253473e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.025338e+07 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.336334e+08 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.843391e+09 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.699323e+08 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.945293e+07 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.325474e+08 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.084064e+09 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.852032e+08 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.406384e+07 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.106896e+08 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK 
-EvtsPerSec[MECalcOnly] (3a) = ( 6.442610e+08 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.950493e+07 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt index d8843cfcaf..cf5ccd68a6 100644 --- a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +Working directory (build): /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum CUDACPP_BUILDDIR='.' - make USEBUILDDIR=1 AVX=none -make USEBUILDDIR=1 AVX=sse4 +make USEBUILDDIR=1 AVX=sse4 +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 + make USEBUILDDIR=1 AVX=512y +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Nothing to be done for 'all'. 
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' OMP_NUM_THREADS= -DATE: 2024-01-27_19:44:44 +DATE: 2024-01-28_14:11:09 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +Working directory (run): /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -50,8 +50,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/avalassi/output_eemumu_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./madevent_fortran < /tmp/valassia/input_eemumu_x1_fortran > /tmp/valassia/output_eemumu_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169064681776] fbridge_mode=0 [UNWEIGHT] Wrote 3893 events (found 7395 events) - [COUNTERS] PROGRAM TOTAL : 0.6055s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5973s - [COUNTERS] Fortran MEs ( 1 ) : 0.0083s for 8192 events => throughput is 9.92E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4758s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4687s + [COUNTERS] Fortran MEs ( 1 ) : 0.0071s for 8192 events => throughput is 1.15E+06 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -75,8 +75,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/avalassi/output_eemumu_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./madevent_fortran < /tmp/valassia/input_eemumu_x1_fortran > /tmp/valassia/output_eemumu_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169064681776] fbridge_mode=0 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1734s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1653s - [COUNTERS] Fortran MEs ( 1 ) : 0.0081s for 8192 events => throughput is 1.01E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1415s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1343s + [COUNTERS] Fortran MEs ( 1 ) : 0.0071s for 8192 events => throughput is 1.15E+06 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -100,8 +100,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x10_fortran > /tmp/avalassi/output_eemumu_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./madevent_fortran < /tmp/valassia/input_eemumu_x10_fortran > /tmp/valassia/output_eemumu_x10_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x10_fortran > /tmp/a [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501919904813669E-002] fbridge_mode=0 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.4103s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3232s - [COUNTERS] Fortran MEs ( 1 ) : 0.0871s for 90112 events => throughput is 1.03E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3384s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2620s + [COUNTERS] Fortran MEs ( 1 ) : 0.0763s for 90112 events => throughput is 1.18E+06 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -125,8 +125,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x1_cudacpp > /tmp/valassia/output_eemumu_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -134,9 +134,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169074211736] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1792s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1723s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0070s for 8192 events => throughput is 1.17E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1475s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1415s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0060s for 8192 events => throughput is 1.38E+06 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -158,8 +158,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x10_cudacpp > /tmp/valassia/output_eemumu_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -167,9 +167,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501919915927155E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.4114s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3347s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0768s for 90112 events => throughput is 1.17E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3333s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2679s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0655s for 90112 events => throughput is 1.38E+06 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -180,14 +180,14 @@ OK! xsec from fortran (9.1501919904813669E-002) and cpp (9.1501919915927155E-002 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.105584e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.418226e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.117429e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.426804e+06 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -201,8 +201,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x1_cudacpp > /tmp/valassia/output_eemumu_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -210,9 +210,9 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169074211734] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1775s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1736s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0039s for 8192 events => throughput is 2.09E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1425s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1390s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0035s for 8192 events => throughput is 2.34E+06 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -234,8 +234,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x10_cudacpp > /tmp/valassia/output_eemumu_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -243,9 +243,9 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501919915927155E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.3970s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3513s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0457s for 90112 events => throughput is 1.97E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3047s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2664s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0383s for 90112 events => throughput is 2.35E+06 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -256,14 +256,14 @@ OK! xsec from fortran (9.1501919904813669E-002) and cpp (9.1501919915927155E-002 OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.981868e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.503318e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.039596e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.509365e+06 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -277,8 +277,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x1_cudacpp > /tmp/valassia/output_eemumu_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -286,9 +286,9 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169063975949] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1814s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1781s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0033s for 8192 events => throughput is 2.45E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1441s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1416s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0026s for 8192 events => throughput is 3.20E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -310,8 +310,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x10_cudacpp > /tmp/valassia/output_eemumu_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -319,9 +319,9 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501919908700741E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.3748s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3394s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0355s for 90112 events => throughput is 2.54E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.2933s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2653s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0280s for 90112 events => throughput is 3.22E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -332,166 +332,18 @@ OK! xsec from fortran (9.1501919904813669E-002) and cpp (9.1501919908700741E-002 OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.559591e+06 ) sec^-1 - -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.712795e+06 ) sec^-1 - -*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! 
Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 4/16 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2175 [0.21747169063975949] fbridge_mode=1 - [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1737s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1708s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0029s for 8192 events => throughput is 2.79E+06 events/s - -*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (0.21747169064681776) and cpp (0.21747169063975949) differ by less than 2E-4 (3.24560378572869e-11) - -*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical - -*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -81920 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 4/16 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.0915 [9.1501919908700741E-002] fbridge_mode=1 - [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.3602s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3288s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0314s for 90112 events => throughput is 2.87E+06 events/s - -*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (9.1501919904813669E-002) and cpp (9.1501919908700741E-002) differ by less than 2E-4 (4.248068563583729e-11) - -*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical - -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.762723e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.355475e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.892375e+06 ) sec^-1 - -*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 4/16 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2175 [0.21747169063975949] fbridge_mode=1 - [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1750s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1717s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0033s for 8192 events => throughput is 2.52E+06 events/s - -*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (0.21747169064681776) and cpp (0.21747169063975949) differ by less than 2E-4 (3.24560378572869e-11) - -*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical - -*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -81920 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 4/16 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.0915 [9.1501919908700741E-002] fbridge_mode=1 - [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.3687s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3338s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0349s for 90112 events => throughput is 2.58E+06 events/s - -*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.423534e+06 ) sec^-1 -OK! xsec from fortran (9.1501919904813669E-002) and cpp (9.1501919908700741E-002) differ by less than 2E-4 (4.248068563583729e-11) +*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) *** -*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical - -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.336676e+06 ) sec^-1 - -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.591242e+06 ) sec^-1 +*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) *** *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -505,18 +357,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 4/16 +Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/valassia/input_eemumu_x1_cudacpp > /tmp/valassia/output_eemumu_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169066587257] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.5862s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5857s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0005s for 8192 events => throughput is 1.61E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.4127s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4122s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0004s for 8192 events => throughput is 1.96E+07 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -538,65 +390,65 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 4/16 +Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/valassia/input_eemumu_x10_cudacpp > /tmp/valassia/output_eemumu_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.0915 [9.1501919911173610E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.0915 [9.1501919911173596E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.7518s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7469s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0049s for 90112 events => throughput is 1.84E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.5439s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5395s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0044s for 90112 events => throughput is 2.07E+07 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.1501919904813669E-002) and cpp (9.1501919911173610E-002) differ by less than 2E-4 (6.95061785904727e-11) +OK! xsec from fortran (9.1501919904813669E-002) and cpp (9.1501919911173596E-002) differ by less than 2E-4 (6.950595654586778e-11) *** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.180766e+07 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.213267e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.997610e+08 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.613214e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.732650e+07 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.277391e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.502916e+08 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.902459e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 
--bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.713329e+07 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.271988e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.957660e+08 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.955276e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.723587e+07 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.232579e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.165534e+08 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.550864e+07 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt index e36af37e5f..da4f7b996b 100644 --- a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +Working directory (build): /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx CUDACPP_BUILDDIR='.' 
- - - make USEBUILDDIR=1 AVX=none + make USEBUILDDIR=1 AVX=sse4 +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + make USEBUILDDIR=1 AVX=avx2 + make USEBUILDDIR=1 AVX=512y +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' OMP_NUM_THREADS= -DATE: 2024-01-27_19:45:01 +DATE: 2024-01-28_14:11:24 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +Working directory (run): /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -50,8 +50,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/avalassi/output_ggtt_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./madevent_fortran < /tmp/valassia/input_ggtt_x1_fortran > /tmp/valassia/output_ggtt_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/aval [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690708277600116] fbridge_mode=0 [UNWEIGHT] Wrote 420 events (found 1577 events) - [COUNTERS] PROGRAM TOTAL : 0.3611s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3195s - [COUNTERS] Fortran MEs ( 1 ) : 0.0416s for 8192 events => throughput is 1.97E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3530s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3174s + [COUNTERS] Fortran MEs ( 1 ) : 0.0356s for 8192 events => throughput is 2.30E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -75,8 +75,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/avalassi/output_ggtt_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./madevent_fortran < /tmp/valassia/input_ggtt_x1_fortran > /tmp/valassia/output_ggtt_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/aval [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690708277600116] fbridge_mode=0 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.3129s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2721s - [COUNTERS] Fortran MEs ( 1 ) : 0.0409s for 8192 events => throughput is 2.00E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2591s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2234s + [COUNTERS] Fortran MEs ( 1 ) : 0.0356s for 8192 events => throughput is 2.30E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -100,8 +100,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! 
Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x10_fortran > /tmp/avalassi/output_ggtt_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./madevent_fortran < /tmp/valassia/input_ggtt_x10_fortran > /tmp/valassia/output_ggtt_x10_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x10_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 46.22 [46.223782291775372] fbridge_mode=0 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.7588s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3044s - [COUNTERS] Fortran MEs ( 1 ) : 0.4544s for 90112 events => throughput is 1.98E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.4005s + [COUNTERS] Fortran Overhead ( 0 ) : 1.0114s + [COUNTERS] Fortran MEs ( 1 ) : 0.3891s for 90112 events => throughput is 2.32E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -125,8 +125,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x1_cudacpp > /tmp/valassia/output_ggtt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -134,9 +134,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690708277600102] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.3453s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3089s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0363s for 8192 events => throughput is 2.25E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2907s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2587s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0320s for 8192 events => throughput is 2.56E+05 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -158,8 +158,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x10_cudacpp > /tmp/valassia/output_ggtt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -167,9 +167,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 46.22 [46.223782291775379] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.7197s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3176s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.4021s for 90112 events => throughput is 2.24E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.3973s + [COUNTERS] Fortran Overhead ( 0 ) : 1.0452s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3521s for 90112 events => throughput is 2.56E+05 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -180,14 +180,14 @@ OK! xsec from fortran (46.223782291775372) and cpp (46.223782291775379) differ b OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.232130e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.583416e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.251215e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.597741e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -201,8 +201,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x1_cudacpp > /tmp/valassia/output_ggtt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -210,9 +210,9 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690708277600109] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.3154s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2942s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0212s for 8192 events => throughput is 3.86E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2630s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2453s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0177s for 8192 events => throughput is 4.63E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -234,8 +234,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x10_cudacpp > /tmp/valassia/output_ggtt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -243,9 +243,9 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 46.22 [46.223782291775379] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.5256s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2953s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2303s for 90112 events => throughput is 3.91E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.2297s + [COUNTERS] Fortran Overhead ( 0 ) : 1.0339s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1958s for 90112 events => throughput is 4.60E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -256,14 +256,14 @@ OK! xsec from fortran (46.223782291775372) and cpp (46.223782291775379) differ b OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.814370e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.723096e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.785236e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.742482e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -277,8 +277,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x1_cudacpp > /tmp/valassia/output_ggtt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -286,9 +286,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690708277600109] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.2969s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2840s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0130s for 8192 events => throughput is 6.32E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2471s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2369s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0102s for 8192 events => throughput is 7.99E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -310,8 +310,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x10_cudacpp > /tmp/valassia/output_ggtt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -319,9 +319,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 46.22 [46.223782291775393] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.4419s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2958s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1460s for 90112 events => throughput is 6.17E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.1362s + [COUNTERS] Fortran Overhead ( 0 ) : 1.0239s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1123s for 90112 events => throughput is 8.02E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -332,166 +332,18 @@ OK! xsec from fortran (46.223782291775372) and cpp (46.223782291775393) differ b OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.858187e+05 ) sec^-1 - -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.010499e+05 ) sec^-1 - -*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/16 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.69 [47.690708277600109] fbridge_mode=1 - [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.2943s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2828s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0115s for 8192 events => throughput is 7.12E+05 events/s - -*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (47.690708277600116) and cpp (47.690708277600109) differ by less than 2E-14 (1.1102230246251565e-16) - -*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical - -*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -81920 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/16 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 46.22 [46.223782291775393] fbridge_mode=1 - [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.4134s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2891s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1243s for 90112 events => throughput is 7.25E+05 events/s - -*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (46.223782291775372) and cpp (46.223782291775393) differ by less than 2E-14 (4.440892098500626e-16) - -*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical - -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.053396e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.230738e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.009768e+05 ) sec^-1 - -*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/16 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.69 [47.690708277600109] fbridge_mode=1 - [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.3049s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2876s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0173s for 8192 events => throughput is 4.75E+05 events/s - -*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (47.690708277600116) and cpp (47.690708277600109) differ by less than 2E-14 (1.1102230246251565e-16) - -*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical - -*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -81920 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/16 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 46.22 [46.223782291775393] fbridge_mode=1 - [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.4908s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2991s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1917s for 90112 events => throughput is 4.70E+05 events/s +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.316019e+05 ) sec^-1 -*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** +*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) *** -OK! xsec from fortran (46.223782291775372) and cpp (46.223782291775393) differ by less than 2E-14 (4.440892098500626e-16) - -*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical - -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.480612e+05 ) sec^-1 - -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.440487e+05 ) sec^-1 +*** (2-512z) WARNING! 
SKIP MADEVENT_CPP (512z is not supported on this node) *** *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -505,22 +357,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggtt_x1_cudacpp > /tmp/valassia/output_ggtt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.69 [47.690708277600109] fbridge_mode=1 + [XSECTION] Cross section = 47.69 [47.690708277600102] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.6959s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6953s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0006s for 8192 events => throughput is 1.48E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.5082s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5076s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.20E+07 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.690708277600116) and cpp (47.690708277600109) differ by less than 2E-14 (1.1102230246251565e-16) +OK! xsec from fortran (47.690708277600116) and cpp (47.690708277600102) differ by less than 2E-14 (3.3306690738754696e-16) *** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -538,65 +390,65 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggtt_x10_cudacpp > /tmp/valassia/output_ggtt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 46.22 [46.223782291775393] fbridge_mode=1 + [XSECTION] Cross section = 46.22 [46.223782291775379] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.7123s - [COUNTERS] Fortran Overhead ( 0 ) : 1.7059s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0064s for 90112 events => throughput is 1.41E+07 events/s + [COUNTERS] PROGRAM TOTAL : 1.3011s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2934s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0076s for 90112 events => throughput is 1.18E+07 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (46.223782291775372) and cpp (46.223782291775393) differ by less than 2E-14 (4.440892098500626e-16) +OK! xsec from fortran (46.223782291775372) and cpp (46.223782291775379) differ by less than 2E-14 (2.220446049250313e-16) *** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.071758e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.602602e+06 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.732801e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.046178e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.007423e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.787613e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.069820e+08 ) sec^-1 +Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.754824e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.015390e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.790261e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.148579e+08 ) sec^-1 +Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.940647e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.007149e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.756503e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.022635e+07 ) sec^-1 +Process = 
SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.140255e+07 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt index 7cae848e03..5e558ca142 100644 --- a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +Working directory (build): /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx CUDACPP_BUILDDIR='.' - make USEBUILDDIR=1 AVX=none - make USEBUILDDIR=1 AVX=sse4 +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + make USEBUILDDIR=1 AVX=avx2 + make USEBUILDDIR=1 AVX=512y +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' OMP_NUM_THREADS= -DATE: 2024-01-27_19:45:27 +DATE: 2024-01-28_14:11:44 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +Working directory (run): /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -50,8 +50,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/avalassi/output_ggtt_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./madevent_fortran < /tmp/valassia/input_ggtt_x1_fortran > /tmp/valassia/output_ggtt_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/aval [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690708277600116] fbridge_mode=0 [UNWEIGHT] Wrote 420 events (found 1577 events) - [COUNTERS] PROGRAM TOTAL : 0.3533s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3124s - [COUNTERS] Fortran MEs ( 1 ) : 0.0409s for 8192 events => throughput is 2.00E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2922s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2567s + [COUNTERS] Fortran MEs ( 1 ) : 0.0355s for 8192 events => throughput is 2.31E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -75,8 +75,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/avalassi/output_ggtt_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./madevent_fortran < /tmp/valassia/input_ggtt_x1_fortran > /tmp/valassia/output_ggtt_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/aval [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690708277600116] fbridge_mode=0 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.3125s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2713s - [COUNTERS] Fortran MEs ( 1 ) : 0.0411s for 8192 events => throughput is 1.99E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2631s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2275s + [COUNTERS] Fortran MEs ( 1 ) : 0.0355s for 8192 events => throughput is 2.30E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -100,8 +100,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x10_fortran > /tmp/avalassi/output_ggtt_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./madevent_fortran < /tmp/valassia/input_ggtt_x10_fortran > /tmp/valassia/output_ggtt_x10_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x10_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 46.22 [46.223782291775372] fbridge_mode=0 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.7516s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3006s - [COUNTERS] Fortran MEs ( 1 ) : 0.4510s for 90112 events => throughput is 2.00E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.4061s + [COUNTERS] Fortran Overhead ( 0 ) : 1.0173s + [COUNTERS] Fortran MEs ( 1 ) : 0.3888s for 90112 events => throughput is 2.32E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -125,22 +125,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x1_cudacpp > /tmp/valassia/output_ggtt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.69 [47.690703999052587] fbridge_mode=1 + [XSECTION] Cross section = 47.69 [47.690704859565443] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.3394s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3051s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0343s for 8192 events => throughput is 2.39E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2843s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2568s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0275s for 8192 events => throughput is 2.98E+05 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.690708277600116) and cpp (47.690703999052587) differ by less than 4E-4 (8.971448917094449e-08) +OK! xsec from fortran (47.690708277600116) and cpp (47.690704859565443) differ by less than 4E-4 (7.167087245907311e-08) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -158,36 +158,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x10_cudacpp > /tmp/valassia/output_ggtt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 46.22 [46.223780103711483] fbridge_mode=1 + [XSECTION] Cross section = 46.22 [46.223780988783773] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.7178s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3353s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3825s for 90112 events => throughput is 2.36E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.3409s + [COUNTERS] Fortran Overhead ( 0 ) : 1.0385s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3024s for 90112 events => throughput is 2.98E+05 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (46.223782291775372) and cpp (46.223780103711483) differ by less than 4E-4 (4.733632297249102e-08) +OK! xsec from fortran (46.223782291775372) and cpp (46.223780988783773) differ by less than 4E-4 (2.818877065102754e-08) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.325661e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.089416e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.297389e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.098377e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -201,22 +201,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x1_cudacpp > /tmp/valassia/output_ggtt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.69 [47.690699958440689] fbridge_mode=1 + [XSECTION] Cross section = 47.69 [47.690703261737937] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.3049s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2907s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0142s for 8192 events => throughput is 5.75E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2530s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2406s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0124s for 8192 events => throughput is 6.60E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.690708277600116) and cpp (47.690699958440689) differ by less than 4E-4 (1.744398380187917e-07) +OK! xsec from fortran (47.690708277600116) and cpp (47.690703261737937) differ by less than 4E-4 (1.0517483095551228e-07) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -234,36 +234,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x10_cudacpp > /tmp/valassia/output_ggtt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 46.22 [46.223776162337749] fbridge_mode=1 + [XSECTION] Cross section = 46.22 [46.223779141681696] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.4512s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2955s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1558s for 90112 events => throughput is 5.79E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.1645s + [COUNTERS] Fortran Overhead ( 0 ) : 1.0282s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1363s for 90112 events => throughput is 6.61E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (46.223782291775372) and cpp (46.223776162337749) differ by less than 4E-4 (1.326035499182865e-07) +OK! xsec from fortran (46.223782291775372) and cpp (46.223779141681696) differ by less than 4E-4 (6.814876496452626e-08) *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.252620e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.730231e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.683567e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.517443e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -277,22 +277,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x1_cudacpp > /tmp/valassia/output_ggtt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.69 [47.690691653203835] fbridge_mode=1 + [XSECTION] Cross section = 47.69 [47.690694815027769] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.2857s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2780s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0077s for 8192 events => throughput is 1.06E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.2387s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2325s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0062s for 8192 events => throughput is 1.32E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.690708277600116) and cpp (47.690691653203835) differ by less than 4E-4 (3.48587741338946e-07) +OK! xsec from fortran (47.690708277600116) and cpp (47.690694815027769) differ by less than 4E-4 (2.822892096743246e-07) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -310,188 +310,40 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x10_cudacpp > /tmp/valassia/output_ggtt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 46.22 [46.223773576247488] fbridge_mode=1 + [XSECTION] Cross section = 46.22 [46.223776468660162] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.3745s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2887s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0859s for 90112 events => throughput is 1.05E+06 events/s + [COUNTERS] PROGRAM TOTAL : 1.0930s + [COUNTERS] Fortran Overhead ( 0 ) : 1.0245s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0685s for 90112 events => throughput is 1.32E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (46.223782291775372) and cpp (46.223773576247488) differ by less than 4E-4 (1.885507298071687e-07) +OK! xsec from fortran (46.223782291775372) and cpp (46.223776468660162) differ by less than 4E-4 (1.2597660603574923e-07) *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.061022e+06 ) sec^-1 - -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.028900e+06 ) sec^-1 - -*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/16 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.69 [47.690691653203835] fbridge_mode=1 - [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.2886s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2814s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0072s for 8192 events => throughput is 1.13E+06 events/s - -*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (47.690708277600116) and cpp (47.690691653203835) differ by less than 4E-4 (3.48587741338946e-07) - -*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical - -*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -81920 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/16 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 46.22 [46.223773576247488] fbridge_mode=1 - [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.3627s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2856s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0770s for 90112 events => throughput is 1.17E+06 events/s - -*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (46.223782291775372) and cpp (46.223773576247488) differ by less than 4E-4 (1.885507298071687e-07) - -*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical - -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.158067e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.369414e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.165706e+06 ) sec^-1 - -*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/16 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.69 [47.690698822141186] fbridge_mode=1 - [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.2918s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2821s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0097s for 8192 events => throughput is 8.41E+05 events/s - -*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (47.690708277600116) and cpp (47.690698822141186) differ by less than 4E-4 (1.982662718447159e-07) - -*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.383290e+06 ) sec^-1 -OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical +*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) *** -*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -81920 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/16 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 46.22 [46.223780266165058] fbridge_mode=1 - [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.4029s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2948s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1081s for 90112 events => throughput is 8.34E+05 events/s - -*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (46.223782291775372) and cpp (46.223780266165058) differ by less than 4E-4 (4.382182106077437e-08) - -*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical - -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.810936e+05 ) sec^-1 - -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.099694e+05 ) sec^-1 +*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) *** *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -505,22 +357,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggtt_x1_cudacpp > /tmp/valassia/output_ggtt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.69 [47.690703397697987] fbridge_mode=1 + [XSECTION] Cross section = 47.69 [47.690697792016209] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.6987s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6982s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0005s for 8192 events => throughput is 1.50E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.5094s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5090s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0004s for 8192 events => throughput is 2.04E+07 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.690708277600116) and cpp (47.690703397697987) differ by less than 4E-4 (1.0232396008280631e-07) +OK! xsec from fortran (47.690708277600116) and cpp (47.690697792016209) differ by less than 4E-4 (2.1986639087145932e-07) *** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -538,65 +390,65 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggtt_x10_cudacpp > /tmp/valassia/output_ggtt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 46.22 [46.223786763175951] fbridge_mode=1 + [XSECTION] Cross section = 46.22 [46.223779043453291] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.7137s - [COUNTERS] Fortran Overhead ( 0 ) : 1.7083s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0054s for 90112 events => throughput is 1.68E+07 events/s + [COUNTERS] PROGRAM TOTAL : 1.3022s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2984s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0038s for 90112 events => throughput is 2.38E+07 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (46.223782291775372) and cpp (46.223786763175951) differ by less than 4E-4 (9.673376677454826e-08) +OK! xsec from fortran (46.223782291775372) and cpp (46.223779043453291) differ by less than 4E-4 (7.027382697977202e-08) *** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.269476e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.878193e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.034760e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.214469e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.781597e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.092154e+08 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.784418e+08 ) sec^-1 +Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.038194e+08 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.842537e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.066187e+08 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.885569e+08 ) sec^-1 +Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.098451e+08 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.378680e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.314044e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.411797e+07 ) sec^-1 +Process = 
SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.528445e+07 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt index 2815e7f120..48535fd982 100644 --- a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +Working directory (build): /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx CUDACPP_BUILDDIR='.' - make USEBUILDDIR=1 AVX=none - make USEBUILDDIR=1 AVX=sse4 +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + make USEBUILDDIR=1 AVX=avx2 + make USEBUILDDIR=1 AVX=512y +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' OMP_NUM_THREADS= -DATE: 2024-01-27_19:45:53 +DATE: 2024-01-28_14:12:04 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +Working directory (run): /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -50,8 +50,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/avalassi/output_ggtt_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./madevent_fortran < /tmp/valassia/input_ggtt_x1_fortran > /tmp/valassia/output_ggtt_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/aval [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690708277600116] fbridge_mode=0 [UNWEIGHT] Wrote 420 events (found 1577 events) - [COUNTERS] PROGRAM TOTAL : 0.3539s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3130s - [COUNTERS] Fortran MEs ( 1 ) : 0.0409s for 8192 events => throughput is 2.00E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2926s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2570s + [COUNTERS] Fortran MEs ( 1 ) : 0.0355s for 8192 events => throughput is 2.31E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -75,8 +75,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/avalassi/output_ggtt_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./madevent_fortran < /tmp/valassia/input_ggtt_x1_fortran > /tmp/valassia/output_ggtt_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/aval [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690708277600116] fbridge_mode=0 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.3146s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2734s - [COUNTERS] Fortran MEs ( 1 ) : 0.0412s for 8192 events => throughput is 1.99E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2602s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2247s + [COUNTERS] Fortran MEs ( 1 ) : 0.0355s for 8192 events => throughput is 2.31E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -100,8 +100,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x10_fortran > /tmp/avalassi/output_ggtt_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./madevent_fortran < /tmp/valassia/input_ggtt_x10_fortran > /tmp/valassia/output_ggtt_x10_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x10_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 46.22 [46.223782291775372] fbridge_mode=0 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.7482s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2971s - [COUNTERS] Fortran MEs ( 1 ) : 0.4511s for 90112 events => throughput is 2.00E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.4015s + [COUNTERS] Fortran Overhead ( 0 ) : 1.0124s + [COUNTERS] Fortran MEs ( 1 ) : 0.3891s for 90112 events => throughput is 2.32E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -125,8 +125,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x1_cudacpp > /tmp/valassia/output_ggtt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -134,9 +134,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690709601032019] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.3469s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3100s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0369s for 8192 events => throughput is 2.22E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2916s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2585s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0330s for 8192 events => throughput is 2.48E+05 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -158,8 +158,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x10_cudacpp > /tmp/valassia/output_ggtt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -167,9 +167,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 46.22 [46.223783635280974] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.7258s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3173s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.4085s for 90112 events => throughput is 2.21E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.4064s + [COUNTERS] Fortran Overhead ( 0 ) : 1.0435s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3629s for 90112 events => throughput is 2.48E+05 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -180,14 +180,14 @@ OK! xsec from fortran (46.223782291775372) and cpp (46.223783635280974) differ b OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.184678e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.545875e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.165824e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.552634e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -201,8 +201,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x1_cudacpp > /tmp/valassia/output_ggtt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -210,9 +210,9 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690709601032026] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.3191s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2978s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0212s for 8192 events => throughput is 3.86E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2622s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2449s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0173s for 8192 events => throughput is 4.73E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -234,8 +234,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x10_cudacpp > /tmp/valassia/output_ggtt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -243,9 +243,9 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 46.22 [46.223783635280974] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.5422s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3131s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2290s for 90112 events => throughput is 3.93E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.2218s + [COUNTERS] Fortran Overhead ( 0 ) : 1.0319s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1899s for 90112 events => throughput is 4.75E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -256,14 +256,14 @@ OK! xsec from fortran (46.223782291775372) and cpp (46.223783635280974) differ b OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.941267e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.798699e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.879138e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.790478e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -277,8 +277,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x1_cudacpp > /tmp/valassia/output_ggtt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -286,9 +286,9 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690709643441508] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.3149s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3011s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0138s for 8192 events => throughput is 5.95E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2482s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2382s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0100s for 8192 events => throughput is 8.17E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -310,8 +310,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x10_cudacpp > /tmp/valassia/output_ggtt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -319,9 +319,9 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 46.22 [46.223783660238851] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.4314s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2889s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1424s for 90112 events => throughput is 6.33E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.1353s + [COUNTERS] Fortran Overhead ( 0 ) : 1.0253s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1100s for 90112 events => throughput is 8.19E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -332,166 +332,18 @@ OK! xsec from fortran (46.223782291775372) and cpp (46.223783660238851) differ b OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.012934e+05 ) sec^-1 - -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.079905e+05 ) sec^-1 - -*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. 
use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/16 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.69 [47.690709643441508] fbridge_mode=1 - [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.2965s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2852s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0112s for 8192 events => throughput is 7.30E+05 events/s - -*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (47.690708277600116) and cpp (47.690709643441508) differ by less than 2E-4 (2.863957027088304e-08) - -*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical - -*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -81920 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/16 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 46.22 [46.223783660238851] fbridge_mode=1 - [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.4615s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3338s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1277s for 90112 events => throughput is 7.06E+05 events/s - -*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (46.223782291775372) and cpp (46.223783660238851) differ by less than 2E-4 (2.9605181861569463e-08) - -*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical - -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.121805e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.461642e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.148011e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.506859e+05 ) sec^-1 -*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/16 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.69 [47.690709643441508] fbridge_mode=1 - [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.3039s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2870s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0170s for 8192 events => throughput is 4.83E+05 events/s - -*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (47.690708277600116) and cpp (47.690709643441508) differ by less than 2E-4 (2.863957027088304e-08) - -*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical +*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) *** -*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -81920 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/16 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 46.22 [46.223783660238851] fbridge_mode=1 - [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.4901s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3023s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1879s for 90112 events => throughput is 4.80E+05 events/s - -*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (46.223782291775372) and cpp (46.223783660238851) differ by less than 2E-4 (2.9605181861569463e-08) - -*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical - -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.564065e+05 ) sec^-1 - -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.610241e+05 ) sec^-1 +*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) *** *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -505,22 +357,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggtt_x1_cudacpp > /tmp/valassia/output_ggtt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.69 [47.690708266690706] fbridge_mode=1 + [XSECTION] Cross section = 47.69 [47.690708266690713] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.6981s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6975s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0006s for 8192 events => throughput is 1.36E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.5105s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5098s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.18E+07 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.690708277600116) and cpp (47.690708266690706) differ by less than 2E-4 (2.2875334959593374e-10) +OK! xsec from fortran (47.690708277600116) and cpp (47.690708266690713) differ by less than 2E-4 (2.2875323857363128e-10) *** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -538,65 +390,65 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! 
Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggtt_x10_cudacpp > /tmp/valassia/output_ggtt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 46.22 [46.223782303744791] fbridge_mode=1 + [XSECTION] Cross section = 46.22 [46.223782303744798] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.7132s - [COUNTERS] Fortran Overhead ( 0 ) : 1.7068s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0064s for 90112 events => throughput is 1.41E+07 events/s + [COUNTERS] PROGRAM TOTAL : 1.3005s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2929s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0076s for 90112 events => throughput is 1.18E+07 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (46.223782291775372) and cpp (46.223782303744791) differ by less than 2E-4 (2.5894508759449764e-10) +OK! xsec from fortran (46.223782291775372) and cpp (46.223782303744798) differ by less than 2E-4 (2.5894508759449764e-10) *** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.073063e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.621016e+06 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.653827e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.051466e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.014949e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.842315e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.052887e+08 ) sec^-1 +Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.799914e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 
--bridge *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.019887e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.799451e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.139444e+08 ) sec^-1 +Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.032122e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.021175e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.783243e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.985737e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.161657e+07 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt index 850335f2f4..e6ba098a45 100644 --- a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +Working directory (build): /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg CUDACPP_BUILDDIR='.' 
- make USEBUILDDIR=1 AVX=none - make USEBUILDDIR=1 AVX=sse4 +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' + make USEBUILDDIR=1 AVX=avx2 + make USEBUILDDIR=1 AVX=512y +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' OMP_NUM_THREADS= -DATE: 2024-01-27_19:46:20 +DATE: 2024-01-28_14:12:24 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +Working directory (run): /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -50,8 +50,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/avalassi/output_ggttg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./madevent_fortran < /tmp/valassia/input_ggttg_x1_fortran > /tmp/valassia/output_ggttg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0972 [9.7196357922470764E-002] fbridge_mode=0 [UNWEIGHT] Wrote 42 events (found 469 events) - [COUNTERS] PROGRAM TOTAL : 0.5540s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2283s - [COUNTERS] Fortran MEs ( 1 ) : 0.3257s for 8192 events => throughput is 2.51E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.5756s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3051s + [COUNTERS] Fortran MEs ( 1 ) : 0.2704s for 8192 events => throughput is 3.03E+04 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -75,8 +75,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/avalassi/output_ggttg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./madevent_fortran < /tmp/valassia/input_ggttg_x1_fortran > /tmp/valassia/output_ggttg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0972 [9.7196357922470764E-002] fbridge_mode=0 [UNWEIGHT] Wrote 41 events (found 467 events) - [COUNTERS] PROGRAM TOTAL : 0.5631s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2312s - [COUNTERS] Fortran MEs ( 1 ) : 0.3319s for 8192 events => throughput is 2.47E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.4621s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1916s + [COUNTERS] Fortran MEs ( 1 ) : 0.2705s for 8192 events => throughput is 3.03E+04 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -100,8 +100,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! 
Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x10_fortran > /tmp/avalassi/output_ggttg_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./madevent_fortran < /tmp/valassia/input_ggttg_x10_fortran > /tmp/valassia/output_ggttg_x10_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x10_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.08131 [8.1310872077655569E-002] fbridge_mode=0 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 5.0352s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4496s - [COUNTERS] Fortran MEs ( 1 ) : 3.5856s for 90112 events => throughput is 2.51E+04 events/s + [COUNTERS] PROGRAM TOTAL : 4.1239s + [COUNTERS] Fortran Overhead ( 0 ) : 1.1526s + [COUNTERS] Fortran MEs ( 1 ) : 2.9713s for 90112 events => throughput is 3.03E+04 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -125,8 +125,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x1_cudacpp > /tmp/valassia/output_ggttg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -134,9 +134,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0972 [9.7196357922470777E-002] fbridge_mode=1 [UNWEIGHT] Wrote 41 events (found 467 events) - [COUNTERS] PROGRAM TOTAL : 0.8598s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5399s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3199s for 8192 events => throughput is 2.56E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.7513s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4690s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2822s for 8192 events => throughput is 2.90E+04 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -158,8 +158,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x10_cudacpp > /tmp/valassia/output_ggttg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -167,9 +167,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.08131 [8.1310872077655597E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 5.3221s - [COUNTERS] Fortran Overhead ( 0 ) : 1.7954s - [COUNTERS] CudaCpp MEs ( 2 ) : 3.5267s for 90112 events => throughput is 2.56E+04 events/s + [COUNTERS] PROGRAM TOTAL : 4.5538s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4326s + [COUNTERS] CudaCpp MEs ( 2 ) : 3.1212s for 90112 events => throughput is 2.89E+04 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -180,14 +180,14 @@ OK! xsec from fortran (8.1310872077655569E-002) and cpp (8.1310872077655597E-002 OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.617618e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.975541e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.613965e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.988809e+04 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -201,8 +201,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x1_cudacpp > /tmp/valassia/output_ggttg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -210,9 +210,9 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0972 [9.7196357922470777E-002] fbridge_mode=1 [UNWEIGHT] Wrote 41 events (found 467 events) - [COUNTERS] PROGRAM TOTAL : 0.5563s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3905s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1658s for 8192 events => throughput is 4.94E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.4640s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3283s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1357s for 8192 events => throughput is 6.04E+04 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -234,8 +234,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x10_cudacpp > /tmp/valassia/output_ggttg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -243,9 +243,9 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.08131 [8.1310872077655555E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 3.4634s - [COUNTERS] Fortran Overhead ( 0 ) : 1.6410s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.8224s for 90112 events => throughput is 4.94E+04 events/s + [COUNTERS] PROGRAM TOTAL : 2.7692s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2857s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.4836s for 90112 events => throughput is 6.07E+04 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -256,14 +256,14 @@ OK! xsec from fortran (8.1310872077655569E-002) and cpp (8.1310872077655555E-002 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.996868e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.146640e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.053710e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.143050e+04 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -277,8 +277,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x1_cudacpp > /tmp/valassia/output_ggttg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -286,9 +286,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0972 [9.7196357922470750E-002] fbridge_mode=1 [UNWEIGHT] Wrote 41 events (found 467 events) - [COUNTERS] PROGRAM TOTAL : 0.3922s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3088s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0834s for 8192 events => throughput is 9.82E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.3267s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2594s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0672s for 8192 events => throughput is 1.22E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -310,8 +310,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x10_cudacpp > /tmp/valassia/output_ggttg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -319,9 +319,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.08131 [8.1310872077655555E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 2.5000s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5700s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.9299s for 90112 events => throughput is 9.69E+04 events/s + [COUNTERS] PROGRAM TOTAL : 1.9501s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2143s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.7358s for 90112 events => throughput is 1.22E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -332,166 +332,18 @@ OK! xsec from fortran (8.1310872077655569E-002) and cpp (8.1310872077655555E-002 OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.422787e+04 ) sec^-1 - -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.441130e+04 ) sec^-1 - -*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 32/32 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.0972 [9.7196357922470750E-002] fbridge_mode=1 - [UNWEIGHT] Wrote 41 events (found 467 events) - [COUNTERS] PROGRAM TOTAL : 0.3732s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2998s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0733s for 8192 events => throughput is 1.12E+05 events/s - -*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (9.7196357922470764E-002) and cpp (9.7196357922470750E-002) differ by less than 2E-14 (1.1102230246251565e-16) - -*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical - -*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -81920 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 32/32 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.08131 [8.1310872077655555E-002] fbridge_mode=1 - [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 2.3727s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5593s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.8134s for 90112 events => throughput is 1.11E+05 events/s - -*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (8.1310872077655569E-002) and cpp (8.1310872077655555E-002) differ by less than 2E-14 (2.220446049250313e-16) - -*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical - -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.138049e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.271439e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.127596e+05 ) sec^-1 - -*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 32/32 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.0972 [9.7196357922470750E-002] fbridge_mode=1 - [UNWEIGHT] Wrote 41 events (found 467 events) - [COUNTERS] PROGRAM TOTAL : 0.4269s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3276s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0993s for 8192 events => throughput is 8.25E+04 events/s +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.275338e+05 ) sec^-1 -*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** +*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) *** -OK! xsec from fortran (9.7196357922470764E-002) and cpp (9.7196357922470750E-002) differ by less than 2E-14 (1.1102230246251565e-16) - -*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical - -*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -81920 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 32/32 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.08131 [8.1310872077655555E-002] fbridge_mode=1 - [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 2.7063s - [COUNTERS] Fortran Overhead ( 0 ) : 1.6040s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.1023s for 90112 events => throughput is 8.17E+04 events/s - -*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (8.1310872077655569E-002) and cpp (8.1310872077655555E-002) differ by less than 2E-14 (2.220446049250313e-16) - -*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical - -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.583150e+04 ) sec^-1 - -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.778299e+04 ) sec^-1 +*** (2-512z) WARNING! 
SKIP MADEVENT_CPP (512z is not supported on this node) *** *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -505,22 +357,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggttg_x1_cudacpp > /tmp/valassia/output_ggttg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.0972 [9.7196357922470764E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.0972 [9.7196357922470750E-002] fbridge_mode=1 [UNWEIGHT] Wrote 41 events (found 467 events) - [COUNTERS] PROGRAM TOTAL : 0.6826s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6772s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0054s for 8192 events => throughput is 1.52E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.4985s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4910s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0075s for 8192 events => throughput is 1.09E+06 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.7196357922470764E-002) and cpp (9.7196357922470764E-002) differ by less than 2E-14 (0.0) +OK! xsec from fortran (9.7196357922470764E-002) and cpp (9.7196357922470750E-002) differ by less than 2E-14 (1.1102230246251565e-16) *** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -538,65 +390,65 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggttg_x10_cudacpp > /tmp/valassia/output_ggttg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.08131 [8.1310872077655610E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.08131 [8.1310872077655555E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 1.9316s - [COUNTERS] Fortran Overhead ( 0 ) : 1.9088s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0228s for 90112 events => throughput is 3.95E+06 events/s + [COUNTERS] PROGRAM TOTAL : 1.5299s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4472s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0826s for 90112 events => throughput is 1.09E+06 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (8.1310872077655569E-002) and cpp (8.1310872077655610E-002) differ by less than 2E-14 (4.440892098500626e-16) +OK! 
xsec from fortran (8.1310872077655569E-002) and cpp (8.1310872077655555E-002) differ by less than 2E-14 (2.220446049250313e-16) *** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.631849e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.137973e+06 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.320240e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.164776e+06 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.650532e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.677550e+06 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.243044e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.306439e+06 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.668471e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.677909e+06 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.251938e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.842160e+06 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.686782e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.664344e+06 ) sec^-1 *** 
EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.760801e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.408517e+05 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt index c136750f78..4aced905e9 100644 --- a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +Working directory (build): /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg CUDACPP_BUILDDIR='.' - - - make USEBUILDDIR=1 AVX=none + make USEBUILDDIR=1 AVX=sse4 +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' + make USEBUILDDIR=1 AVX=avx2 + make USEBUILDDIR=1 AVX=512y +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Nothing to be done for 'all'.
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'

OMP_NUM_THREADS=

-DATE: 2024-01-27_19:47:02
+DATE: 2024-01-28_14:12:59

-On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
-Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg
+On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]:
+Working directory (run): /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg

*** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) ***
--------------------
@@ -50,8 +50,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
0 ! Helicity Sum/event 0=exact
1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
--------------------
-Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/avalassi/output_ggttg_x1_fortran'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./madevent_fortran < /tmp/valassia/input_ggttg_x1_fortran > /tmp/valassia/output_ggttg_x1_fortran'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
 [NGOODHEL] ngoodhel/ncomb = 32/32
 [XSECTION] VECSIZE_USED = 8192
 [XSECTION] MultiChannel = TRUE
@@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/ava
 [XSECTION] ChannelId = 1
 [XSECTION] Cross section = 0.0972 [9.7196357922470764E-002] fbridge_mode=0
 [UNWEIGHT] Wrote 42 events (found 469 events)
- [COUNTERS] PROGRAM TOTAL : 0.5520s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.2244s
- [COUNTERS] Fortran MEs ( 1 ) : 0.3276s for 8192 events => throughput is 2.50E+04 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.4651s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.1945s
+ [COUNTERS] Fortran MEs ( 1 ) : 0.2707s for 8192 events => throughput is 3.03E+04 events/s

*** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) ***
--------------------
@@ -75,8 +75,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
0 ! Helicity Sum/event 0=exact
1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
--------------------
-Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/avalassi/output_ggttg_x1_fortran'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./madevent_fortran < /tmp/valassia/input_ggttg_x1_fortran > /tmp/valassia/output_ggttg_x1_fortran'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
 [NGOODHEL] ngoodhel/ncomb = 32/32
 [XSECTION] VECSIZE_USED = 8192
 [XSECTION] MultiChannel = TRUE
@@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/ava
 [XSECTION] ChannelId = 1
 [XSECTION] Cross section = 0.0972 [9.7196357922470764E-002] fbridge_mode=0
 [UNWEIGHT] Wrote 41 events (found 467 events)
- [COUNTERS] PROGRAM TOTAL : 0.5508s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.2222s
- [COUNTERS] Fortran MEs ( 1 ) : 0.3287s for 8192 events => throughput is 2.49E+04 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.4641s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.1936s
+ [COUNTERS] Fortran MEs ( 1 ) : 0.2705s for 8192 events => throughput is 3.03E+04 events/s

*** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) ***
--------------------
@@ -100,8 +100,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
0 ! Helicity Sum/event 0=exact
1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
--------------------
-Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x10_fortran > /tmp/avalassi/output_ggttg_x10_fortran'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./madevent_fortran < /tmp/valassia/input_ggttg_x10_fortran > /tmp/valassia/output_ggttg_x10_fortran'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
 [NGOODHEL] ngoodhel/ncomb = 32/32
 [XSECTION] VECSIZE_USED = 8192
 [XSECTION] MultiChannel = TRUE
@@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x10_fortran > /tmp/av
 [XSECTION] ChannelId = 1
 [XSECTION] Cross section = 0.08131 [8.1310872077655569E-002] fbridge_mode=0
 [UNWEIGHT] Wrote 679 events (found 1787 events)
- [COUNTERS] PROGRAM TOTAL : 5.0147s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.4381s
- [COUNTERS] Fortran MEs ( 1 ) : 3.5766s for 90112 events => throughput is 2.52E+04 events/s
+ [COUNTERS] PROGRAM TOTAL : 4.1307s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.1595s
+ [COUNTERS] Fortran MEs ( 1 ) : 2.9712s for 90112 events => throughput is 3.03E+04 events/s

*** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
--------------------
@@ -125,22 +125,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
0 ! Helicity Sum/event 0=exact
1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
--------------------
-Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x1_cudacpp > /tmp/valassia/output_ggttg_x1_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
 [NGOODHEL] ngoodhel/ncomb = 32/32
 [XSECTION] VECSIZE_USED = 8192
 [XSECTION] MultiChannel = TRUE
 [XSECTION] Configuration = 1
 [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 0.0972 [9.7196347758884971E-002] fbridge_mode=1
+ [XSECTION] Cross section = 0.0972 [9.7196349351077960E-002] fbridge_mode=1
 [UNWEIGHT] Wrote 41 events (found 467 events)
- [COUNTERS] PROGRAM TOTAL : 0.8287s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.5252s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.3035s for 8192 events => throughput is 2.70E+04 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.7014s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.4439s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.2575s for 8192 events => throughput is 3.18E+04 events/s

*** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***

-OK! xsec from fortran (9.7196357922470764E-002) and cpp (9.7196347758884971E-002) differ by less than 4E-4 (1.0456755794585604e-07)
+OK! xsec from fortran (9.7196357922470764E-002) and cpp (9.7196349351077960E-002) differ by less than 4E-4 (8.818635788276907e-08)

*** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
@@ -158,36 +158,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
0 ! Helicity Sum/event 0=exact
1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
--------------------
-Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x10_cudacpp > /tmp/valassia/output_ggttg_x10_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
 [NGOODHEL] ngoodhel/ncomb = 32/32
 [XSECTION] VECSIZE_USED = 8192
 [XSECTION] MultiChannel = TRUE
 [XSECTION] Configuration = 1
 [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 0.08131 [8.1310858119443913E-002] fbridge_mode=1
+ [XSECTION] Cross section = 0.08131 [8.1310859412953768E-002] fbridge_mode=1
 [UNWEIGHT] Wrote 679 events (found 1787 events)
- [COUNTERS] PROGRAM TOTAL : 5.0655s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.7727s
- [COUNTERS] CudaCpp MEs ( 2 ) : 3.2929s for 90112 events => throughput is 2.74E+04 events/s
+ [COUNTERS] PROGRAM TOTAL : 4.2334s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.4007s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 2.8326s for 90112 events => throughput is 3.18E+04 events/s

*** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***

-OK! xsec from fortran (8.1310872077655569E-002) and cpp (8.1310858119443913E-002) differ by less than 4E-4 (1.7166476384833373e-07)
+OK! xsec from fortran (8.1310872077655569E-002) and cpp (8.1310859412953768E-002) differ by less than 4E-4 (1.5575656098221202e-07)

*** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***

OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical

*** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.841164e+04 ) sec^-1
+Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 3.291524e+04 ) sec^-1

*** EXECUTE CHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.820269e+04 ) sec^-1
+Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 3.300074e+04 ) sec^-1

*** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
--------------------
@@ -201,22 +201,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
0 ! Helicity Sum/event 0=exact
1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
--------------------
-Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x1_cudacpp > /tmp/valassia/output_ggttg_x1_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
 [NGOODHEL] ngoodhel/ncomb = 32/32
 [XSECTION] VECSIZE_USED = 8192
 [XSECTION] MultiChannel = TRUE
 [XSECTION] Configuration = 1
 [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 0.0972 [9.7196323434217816E-002] fbridge_mode=1
+ [XSECTION] Cross section = 0.0972 [9.7196335877214046E-002] fbridge_mode=1
 [UNWEIGHT] Wrote 41 events (found 467 events)
- [COUNTERS] PROGRAM TOTAL : 0.4068s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.3160s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0908s for 8192 events => throughput is 9.02E+04 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.3440s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.2686s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0754s for 8192 events => throughput is 1.09E+05 events/s

*** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***

-OK! xsec from fortran (9.7196357922470764E-002) and cpp (9.7196323434217816E-002) differ by less than 4E-4 (3.548307125900152e-07)
+OK! xsec from fortran (9.7196357922470764E-002) and cpp (9.7196335877214046E-002) differ by less than 4E-4 (2.2681155120718444e-07)

*** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
@@ -234,36 +234,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
0 ! Helicity Sum/event 0=exact
1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
--------------------
-Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x10_cudacpp > /tmp/valassia/output_ggttg_x10_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
 [NGOODHEL] ngoodhel/ncomb = 32/32
 [XSECTION] VECSIZE_USED = 8192
 [XSECTION] MultiChannel = TRUE
 [XSECTION] Configuration = 1
 [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 0.08131 [8.1310842598054087E-002] fbridge_mode=1
+ [XSECTION] Cross section = 0.08131 [8.1310850963848921E-002] fbridge_mode=1
 [UNWEIGHT] Wrote 679 events (found 1787 events)
- [COUNTERS] PROGRAM TOTAL : 2.6104s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.5891s
- [COUNTERS] CudaCpp MEs ( 2 ) : 1.0213s for 90112 events => throughput is 8.82E+04 events/s
+ [COUNTERS] PROGRAM TOTAL : 2.0566s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.2267s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.8299s for 90112 events => throughput is 1.09E+05 events/s

*** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***

-OK! xsec from fortran (8.1310872077655569E-002) and cpp (8.1310842598054087E-002) differ by less than 4E-4 (3.625542406293647e-07)
+OK! xsec from fortran (8.1310872077655569E-002) and cpp (8.1310850963848921E-002) differ by less than 4E-4 (2.596676940136433e-07)

*** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***

OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical

*** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 8.971196e+04 ) sec^-1
+Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 1.105529e+05 ) sec^-1

*** EXECUTE CHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 9.175070e+04 ) sec^-1
+Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 1.107065e+05 ) sec^-1

*** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
--------------------
@@ -277,22 +277,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
0 ! Helicity Sum/event 0=exact
1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
--------------------
-Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x1_cudacpp > /tmp/valassia/output_ggttg_x1_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
 [NGOODHEL] ngoodhel/ncomb = 32/32
 [XSECTION] VECSIZE_USED = 8192
 [XSECTION] MultiChannel = TRUE
 [XSECTION] Configuration = 1
 [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 0.0972 [9.7196325695161859E-002] fbridge_mode=1
+ [XSECTION] Cross section = 0.0972 [9.7196334589088509E-002] fbridge_mode=1
 [UNWEIGHT] Wrote 41 events (found 467 events)
- [COUNTERS] PROGRAM TOTAL : 0.3141s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.2702s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0439s for 8192 events => throughput is 1.87E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.2611s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.2269s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0342s for 8192 events => throughput is 2.40E+05 events/s

*** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***

-OK! xsec from fortran (9.7196357922470764E-002) and cpp (9.7196325695161859E-002) differ by less than 4E-4 (3.3156909984288774e-07)
+OK! xsec from fortran (9.7196357922470764E-002) and cpp (9.7196334589088509E-002) differ by less than 4E-4 (2.400643681621517e-07)

*** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
@@ -310,188 +310,40 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
0 ! Helicity Sum/event 0=exact
1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
--------------------
-Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x10_cudacpp > /tmp/valassia/output_ggttg_x10_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
 [NGOODHEL] ngoodhel/ncomb = 32/32
 [XSECTION] VECSIZE_USED = 8192
 [XSECTION] MultiChannel = TRUE
 [XSECTION] Configuration = 1
 [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 0.08131 [8.1310842393515825E-002] fbridge_mode=1
+ [XSECTION] Cross section = 0.08131 [8.1310850363433287E-002] fbridge_mode=1
 [UNWEIGHT] Wrote 679 events (found 1787 events)
- [COUNTERS] PROGRAM TOTAL : 2.0125s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.5327s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.4797s for 90112 events => throughput is 1.88E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 1.5565s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.1812s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.3753s for 90112 events => throughput is 2.40E+05 events/s

*** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***

-OK! xsec from fortran (8.1310872077655569E-002) and cpp (8.1310842393515825E-002) differ by less than 4E-4 (3.650697499857358e-07)
+OK! xsec from fortran (8.1310872077655569E-002) and cpp (8.1310850363433287E-002) differ by less than 4E-4 (2.6705189259956796e-07)

*** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***

OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical

*** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.933582e+05 ) sec^-1
-
-*** EXECUTE CHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.916894e+05 ) sec^-1
-
-*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
---------------------
-CUDACPP_RUNTIME_FBRIDGEMODE = (not set)
-CUDACPP_RUNTIME_VECSIZEUSED = 8192
---------------------
-8192 1 1 ! Number of events and max and min iterations
-0.000001 ! Accuracy (ignored because max iterations = min iterations)
-0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present)
-1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement)
-0 ! Helicity Sum/event 0=exact
-1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
---------------------
-Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
- [NGOODHEL] ngoodhel/ncomb = 32/32
- [XSECTION] VECSIZE_USED = 8192
- [XSECTION] MultiChannel = TRUE
- [XSECTION] Configuration = 1
- [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 0.0972 [9.7196325695161859E-002] fbridge_mode=1
- [UNWEIGHT] Wrote 41 events (found 467 events)
- [COUNTERS] PROGRAM TOTAL : 0.3014s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.2634s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0380s for 8192 events => throughput is 2.16E+05 events/s
-
-*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
-
-OK! xsec from fortran (9.7196357922470764E-002) and cpp (9.7196325695161859E-002) differ by less than 4E-4 (3.3156909984288774e-07)
-
-*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
-
-OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical
-
-*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) ***
---------------------
-CUDACPP_RUNTIME_FBRIDGEMODE = (not set)
-CUDACPP_RUNTIME_VECSIZEUSED = 8192
---------------------
-81920 1 1 ! Number of events and max and min iterations
-0.000001 ! Accuracy (ignored because max iterations = min iterations)
-0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present)
-1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement)
-0 ! Helicity Sum/event 0=exact
-1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
---------------------
-Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
- [NGOODHEL] ngoodhel/ncomb = 32/32
- [XSECTION] VECSIZE_USED = 8192
- [XSECTION] MultiChannel = TRUE
- [XSECTION] Configuration = 1
- [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 0.08131 [8.1310842393515825E-002] fbridge_mode=1
- [UNWEIGHT] Wrote 679 events (found 1787 events)
- [COUNTERS] PROGRAM TOTAL : 1.9418s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.5246s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.4172s for 90112 events => throughput is 2.16E+05 events/s
-
-*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
-
-OK! xsec from fortran (8.1310872077655569E-002) and cpp (8.1310842393515825E-002) differ by less than 4E-4 (3.650697499857358e-07)
-
-*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
-
-OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
-
-*** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.157480e+05 ) sec^-1
+Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 2.459238e+05 ) sec^-1

*** EXECUTE CHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.150998e+05 ) sec^-1
-
-*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
---------------------
-CUDACPP_RUNTIME_FBRIDGEMODE = (not set)
-CUDACPP_RUNTIME_VECSIZEUSED = 8192
---------------------
-8192 1 1 ! Number of events and max and min iterations
-0.000001 ! Accuracy (ignored because max iterations = min iterations)
-0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present)
-1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement)
-0 ! Helicity Sum/event 0=exact
-1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
---------------------
-Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
- [NGOODHEL] ngoodhel/ncomb = 32/32
- [XSECTION] VECSIZE_USED = 8192
- [XSECTION] MultiChannel = TRUE
- [XSECTION] Configuration = 1
- [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 0.0972 [9.7196344080460087E-002] fbridge_mode=1
- [UNWEIGHT] Wrote 41 events (found 467 events)
- [COUNTERS] PROGRAM TOTAL : 0.3283s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.2785s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0498s for 8192 events => throughput is 1.64E+05 events/s
-
-*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
-
-OK! xsec from fortran (9.7196357922470764E-002) and cpp (9.7196344080460087E-002) differ by less than 4E-4 (1.4241285339888776e-07)
-
-*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
+Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 2.460557e+05 ) sec^-1
-OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical
+*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) ***
-*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) ***
---------------------
-CUDACPP_RUNTIME_FBRIDGEMODE = (not set)
-CUDACPP_RUNTIME_VECSIZEUSED = 8192
---------------------
-81920 1 1 ! Number of events and max and min iterations
-0.000001 ! Accuracy (ignored because max iterations = min iterations)
-0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present)
-1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement)
-0 ! Helicity Sum/event 0=exact
-1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
---------------------
-Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
- [NGOODHEL] ngoodhel/ncomb = 32/32
- [XSECTION] VECSIZE_USED = 8192
- [XSECTION] MultiChannel = TRUE
- [XSECTION] Configuration = 1
- [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 0.08131 [8.1310857813116089E-002] fbridge_mode=1
- [UNWEIGHT] Wrote 679 events (found 1787 events)
- [COUNTERS] PROGRAM TOTAL : 2.0744s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.5309s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.5435s for 90112 events => throughput is 1.66E+05 events/s
-
-*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
-
-OK! xsec from fortran (8.1310872077655569E-002) and cpp (8.1310857813116089E-002) differ by less than 4E-4 (1.754321300451167e-07)
-
-*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
-
-OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
-
-*** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.655429e+05 ) sec^-1
-
-*** EXECUTE CHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.665602e+05 ) sec^-1
+*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) ***

*** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) ***
--------------------
@@ -505,22 +357,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
0 ! Helicity Sum/event 0=exact
1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
--------------------
-Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggttg_x1_cudacpp > /tmp/valassia/output_ggttg_x1_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
 [NGOODHEL] ngoodhel/ncomb = 32/32
 [XSECTION] VECSIZE_USED = 8192
 [XSECTION] MultiChannel = TRUE
 [XSECTION] Configuration = 1
 [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 0.0972 [9.7196349366366022E-002] fbridge_mode=1
+ [XSECTION] Cross section = 0.0972 [9.7196347207304232E-002] fbridge_mode=1
 [UNWEIGHT] Wrote 41 events (found 467 events)
- [COUNTERS] PROGRAM TOTAL : 0.6500s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.6492s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0008s for 8192 events => throughput is 9.77E+06 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.4824s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.4804s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0020s for 8192 events => throughput is 4.18E+06 events/s

*** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec ***

-OK! xsec from fortran (9.7196357922470764E-002) and cpp (9.7196349366366022E-002) differ by less than 4E-4 (8.802906736882221e-08)
+OK! xsec from fortran (9.7196357922470764E-002) and cpp (9.7196347207304232E-002) differ by less than 4E-4 (1.1024246959756567e-07)

*** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
@@ -538,65 +390,65 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
0 ! Helicity Sum/event 0=exact
1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
--------------------
-Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggttg_x10_cudacpp > /tmp/valassia/output_ggttg_x10_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
 [NGOODHEL] ngoodhel/ncomb = 32/32
 [XSECTION] VECSIZE_USED = 8192
 [XSECTION] MultiChannel = TRUE
 [XSECTION] Configuration = 1
 [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 0.08131 [8.1310864949473954E-002] fbridge_mode=1
+ [XSECTION] Cross section = 0.08131 [8.1310859763686641E-002] fbridge_mode=1
 [UNWEIGHT] Wrote 679 events (found 1787 events)
- [COUNTERS] PROGRAM TOTAL : 1.9168s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.9072s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0096s for 90112 events => throughput is 9.43E+06 events/s
+ [COUNTERS] PROGRAM TOTAL : 1.4785s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.4568s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0217s for 90112 events => throughput is 4.15E+06 events/s

*** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec ***

-OK! xsec from fortran (8.1310872077655569E-002) and cpp (8.1310864949473954E-002) differ by less than 4E-4 (8.766578729613173e-08)
+OK! xsec from fortran (8.1310872077655569E-002) and cpp (8.1310859763686641E-002) differ by less than 4E-4 (1.5144308029846343e-07)

*** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***

OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical

*** EXECUTE GCHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.318884e+07 ) sec^-1
+Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 4.767897e+06 ) sec^-1

*** EXECUTE GCHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.859848e+07 ) sec^-1
+Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 4.558247e+06 ) sec^-1

*** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge ***
-Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.636314e+07 ) sec^-1
+Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 1.470164e+07 ) sec^-1

*** EXECUTE GCHECK(MAX) -p 16384 32 1 ***
-Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 4.386714e+07 ) sec^-1
+Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 1.087387e+07 ) sec^-1

*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge ***
-Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.667329e+07 ) sec^-1
+Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 1.452879e+07 ) sec^-1

*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 ***
-Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 4.485862e+07 ) sec^-1
+Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 1.637837e+07 ) sec^-1

*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge ***
-Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.522474e+07 ) sec^-1
+Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 1.425767e+07 ) sec^-1

*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 ***
-Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.621873e+07 ) sec^-1
+Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 4.262697e+06 ) sec^-1

TEST COMPLETED

diff --git a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt
index b6c36d66b2..1f4f9c704d 100644
--- a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt
+++ b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt
@@ -1,42 +1,42 @@
-Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg
+Working directory (build): /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg

CUDACPP_BUILDDIR='.'
-
make USEBUILDDIR=1 AVX=none
-make USEBUILDDIR=1 AVX=sse4
+make USEBUILDDIR=1 AVX=sse4
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
make USEBUILDDIR=1 AVX=avx2
+ make USEBUILDDIR=1 AVX=512y
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
make USEBUILDDIR=1 AVX=512z
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
-CUDACPP_BUILDDIR='build.none_m_inl0_hrd0'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0'
-CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0'
CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0'
+CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0'
+CUDACPP_BUILDDIR='build.none_m_inl0_hrd0'
CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'

OMP_NUM_THREADS=

-DATE: 2024-01-27_19:47:40
+DATE: 2024-01-28_14:13:29

-On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
-Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg
+On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]:
+Working directory (run): /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg

*** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) ***
--------------------
@@ -50,8 +50,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
0 ! Helicity Sum/event 0=exact
1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
--------------------
-Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/avalassi/output_ggttg_x1_fortran'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./madevent_fortran < /tmp/valassia/input_ggttg_x1_fortran > /tmp/valassia/output_ggttg_x1_fortran'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
 [NGOODHEL] ngoodhel/ncomb = 32/32
 [XSECTION] VECSIZE_USED = 8192
 [XSECTION] MultiChannel = TRUE
@@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/ava
 [XSECTION] ChannelId = 1
 [XSECTION] Cross section = 0.0972 [9.7196357922470764E-002] fbridge_mode=0
 [UNWEIGHT] Wrote 42 events (found 469 events)
- [COUNTERS] PROGRAM TOTAL : 0.5608s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.2342s
- [COUNTERS] Fortran MEs ( 1 ) : 0.3266s for 8192 events => throughput is 2.51E+04 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.4648s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.1944s
+ [COUNTERS] Fortran MEs ( 1 ) : 0.2704s for 8192 events => throughput is 3.03E+04 events/s

*** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) ***
--------------------
@@ -75,8 +75,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
0 ! Helicity Sum/event 0=exact
1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
--------------------
-Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/avalassi/output_ggttg_x1_fortran'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./madevent_fortran < /tmp/valassia/input_ggttg_x1_fortran > /tmp/valassia/output_ggttg_x1_fortran'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
 [NGOODHEL] ngoodhel/ncomb = 32/32
 [XSECTION] VECSIZE_USED = 8192
 [XSECTION] MultiChannel = TRUE
@@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/ava
 [XSECTION] ChannelId = 1
 [XSECTION] Cross section = 0.0972 [9.7196357922470764E-002] fbridge_mode=0
 [UNWEIGHT] Wrote 41 events (found 467 events)
- [COUNTERS] PROGRAM TOTAL : 0.5478s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.2222s
- [COUNTERS] Fortran MEs ( 1 ) : 0.3256s for 8192 events => throughput is 2.52E+04 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.4660s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.1951s
+ [COUNTERS] Fortran MEs ( 1 ) : 0.2708s for 8192 events => throughput is 3.02E+04 events/s

*** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) ***
--------------------
@@ -100,8 +100,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
0 ! Helicity Sum/event 0=exact
1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
--------------------
-Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x10_fortran > /tmp/avalassi/output_ggttg_x10_fortran'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./madevent_fortran < /tmp/valassia/input_ggttg_x10_fortran > /tmp/valassia/output_ggttg_x10_fortran'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
 [NGOODHEL] ngoodhel/ncomb = 32/32
 [XSECTION] VECSIZE_USED = 8192
 [XSECTION] MultiChannel = TRUE
@@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x10_fortran > /tmp/av
 [XSECTION] ChannelId = 1
 [XSECTION] Cross section = 0.08131 [8.1310872077655569E-002] fbridge_mode=0
 [UNWEIGHT] Wrote 679 events (found 1787 events)
- [COUNTERS] PROGRAM TOTAL : 5.0324s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.4446s
- [COUNTERS] Fortran MEs ( 1 ) : 3.5877s for 90112 events => throughput is 2.51E+04 events/s
+ [COUNTERS] PROGRAM TOTAL : 4.1289s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.1582s
+ [COUNTERS] Fortran MEs ( 1 ) : 2.9707s for 90112 events => throughput is 3.03E+04 events/s

*** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
--------------------
@@ -125,8 +125,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
0 ! Helicity Sum/event 0=exact
1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
--------------------
-Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x1_cudacpp > /tmp/valassia/output_ggttg_x1_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
 [NGOODHEL] ngoodhel/ncomb = 32/32
 [XSECTION] VECSIZE_USED = 8192
 [XSECTION] MultiChannel = TRUE
@@ -134,9 +134,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1
 [XSECTION] ChannelId = 1
 [XSECTION] Cross section = 0.0972 [9.7196358763382021E-002] fbridge_mode=1
 [UNWEIGHT] Wrote 41 events (found 467 events)
- [COUNTERS] PROGRAM TOTAL : 0.8800s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.5505s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.3295s for 8192 events => throughput is 2.49E+04 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.7677s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.4784s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.2893s for 8192 events => throughput is 2.83E+04 events/s

*** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
@@ -158,8 +158,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
0 ! Helicity Sum/event 0=exact
1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
--------------------
-Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x10_cudacpp > /tmp/valassia/output_ggttg_x10_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
 [NGOODHEL] ngoodhel/ncomb = 32/32
 [XSECTION] VECSIZE_USED = 8192
 [XSECTION] MultiChannel = TRUE
@@ -167,9 +167,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1
 [XSECTION] ChannelId = 1
 [XSECTION] Cross section = 0.08131 [8.1310872835011053E-002] fbridge_mode=1
 [UNWEIGHT] Wrote 679 events (found 1787 events)
- [COUNTERS] PROGRAM TOTAL : 5.5826s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.8660s
- [COUNTERS] CudaCpp MEs ( 2 ) : 3.7166s for 90112 events => throughput is 2.42E+04 events/s
+ [COUNTERS] PROGRAM TOTAL : 4.5962s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.4302s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 3.1661s for 90112 events => throughput is 2.85E+04 events/s

*** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
@@ -180,14 +180,14 @@ OK! xsec from fortran (8.1310872077655569E-002) and cpp (8.1310872835011053E-002
OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical

*** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.565163e+04 ) sec^-1
+Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 2.921173e+04 ) sec^-1

*** EXECUTE CHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.555528e+04 ) sec^-1
+Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 2.914494e+04 ) sec^-1

*** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
--------------------
@@ -201,8 +201,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
0 ! Helicity Sum/event 0=exact
1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
--------------------
-Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x1_cudacpp > /tmp/valassia/output_ggttg_x1_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
 [NGOODHEL] ngoodhel/ncomb = 32/32
 [XSECTION] VECSIZE_USED = 8192
 [XSECTION] MultiChannel = TRUE
@@ -210,9 +210,9 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1
 [XSECTION] ChannelId = 1
 [XSECTION] Cross section = 0.0972 [9.7196358804670424E-002] fbridge_mode=1
 [UNWEIGHT] Wrote 41 events (found 467 events)
- [COUNTERS] PROGRAM TOTAL : 0.5536s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.3890s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.1646s for 8192 events => throughput is 4.98E+04 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.4632s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.3274s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.1357s for 8192 events => throughput is 6.03E+04 events/s

*** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
@@ -234,8 +234,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
0 ! Helicity Sum/event 0=exact
1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
--------------------
-Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x10_cudacpp > /tmp/valassia/output_ggttg_x10_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
 [NGOODHEL] ngoodhel/ncomb = 32/32
 [XSECTION] VECSIZE_USED = 8192
 [XSECTION] MultiChannel = TRUE
@@ -243,9 +243,9 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1
 [XSECTION] ChannelId = 1
 [XSECTION] Cross section = 0.08131 [8.1310872836789727E-002] fbridge_mode=1
 [UNWEIGHT] Wrote 679 events (found 1787 events)
- [COUNTERS] PROGRAM TOTAL : 3.4702s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.6500s
- [COUNTERS] CudaCpp MEs ( 2 ) : 1.8202s for 90112 events => throughput is 4.95E+04 events/s
+ [COUNTERS] PROGRAM TOTAL : 2.7762s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.2846s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 1.4916s for 90112 events => throughput is 6.04E+04 events/s

*** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
@@ -256,14 +256,14 @@ OK! xsec from fortran (8.1310872077655569E-002) and cpp (8.1310872836789727E-002
OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical

*** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 5.041058e+04 ) sec^-1
+Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 6.188450e+04 ) sec^-1

*** EXECUTE CHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 5.031890e+04 ) sec^-1
+Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 6.198899e+04 ) sec^-1

*** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
--------------------
@@ -277,8 +277,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
0 ! Helicity Sum/event 0=exact
1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
--------------------
-Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x1_cudacpp > /tmp/valassia/output_ggttg_x1_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
 [NGOODHEL] ngoodhel/ncomb = 32/32
 [XSECTION] VECSIZE_USED = 8192
 [XSECTION] MultiChannel = TRUE
@@ -286,9 +286,9 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1
 [XSECTION] ChannelId = 1
 [XSECTION] Cross section = 0.0972 [9.7196358586501386E-002] fbridge_mode=1
 [UNWEIGHT] Wrote 41 events (found 467 events)
- [COUNTERS] PROGRAM TOTAL : 0.4037s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.3199s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0837s for 8192 events => throughput is 9.79E+04 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.3249s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.2592s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0657s for 8192 events => throughput is 1.25E+05 events/s

*** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
@@ -310,8 +310,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
0 ! Helicity Sum/event 0=exact
1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
--------------------
-Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x10_cudacpp > /tmp/valassia/output_ggttg_x10_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
 [NGOODHEL] ngoodhel/ncomb = 32/32
 [XSECTION] VECSIZE_USED = 8192
 [XSECTION] MultiChannel = TRUE
@@ -319,9 +319,9 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1
 [XSECTION] ChannelId = 1
 [XSECTION] Cross section = 0.08131 [8.1310872708918305E-002] fbridge_mode=1
 [UNWEIGHT] Wrote 679 events (found 1787 events)
- [COUNTERS] PROGRAM TOTAL : 2.5175s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.5787s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.9387s for 90112 events => throughput is 9.60E+04 events/s
+ [COUNTERS] PROGRAM TOTAL : 1.9390s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.2156s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.7234s for 90112 events => throughput is 1.25E+05 events/s

*** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
@@ -332,166 +332,18 @@ OK! xsec from fortran (8.1310872077655569E-002) and cpp (8.1310872708918305E-002
OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical

*** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 9.991486e+04 ) sec^-1
-
-*** EXECUTE CHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.002523e+05 ) sec^-1
-
-*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
---------------------
-CUDACPP_RUNTIME_FBRIDGEMODE = (not set)
-CUDACPP_RUNTIME_VECSIZEUSED = 8192
---------------------
-8192 1 1 ! Number of events and max and min iterations
-0.000001 ! Accuracy (ignored because max iterations = min iterations)
-0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present)
-1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement)
-0 ! Helicity Sum/event 0=exact
-1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
---------------------
-Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
- [NGOODHEL] ngoodhel/ncomb = 32/32
- [XSECTION] VECSIZE_USED = 8192
- [XSECTION] MultiChannel = TRUE
- [XSECTION] Configuration = 1
- [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 0.0972 [9.7196358586501386E-002] fbridge_mode=1
- [UNWEIGHT] Wrote 41 events (found 467 events)
- [COUNTERS] PROGRAM TOTAL : 0.3693s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.2972s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0720s for 8192 events => throughput is 1.14E+05 events/s
-
-*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
-
-OK! xsec from fortran (9.7196357922470764E-002) and cpp (9.7196358586501386E-002) differ by less than 2E-4 (6.831846643962081e-09)
-
-*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
-
-OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical
-
-*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) ***
---------------------
-CUDACPP_RUNTIME_FBRIDGEMODE = (not set)
-CUDACPP_RUNTIME_VECSIZEUSED = 8192
---------------------
-81920 1 1 ! Number of events and max and min iterations
-0.000001 ! Accuracy (ignored because max iterations = min iterations)
-0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present)
-1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement)
-0 ! Helicity Sum/event 0=exact
-1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
---------------------
-Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
- [NGOODHEL] ngoodhel/ncomb = 32/32
- [XSECTION] VECSIZE_USED = 8192
- [XSECTION] MultiChannel = TRUE
- [XSECTION] Configuration = 1
- [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 0.08131 [8.1310872708918305E-002] fbridge_mode=1
- [UNWEIGHT] Wrote 679 events (found 1787 events)
- [COUNTERS] PROGRAM TOTAL : 2.3486s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.5473s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.8012s for 90112 events => throughput is 1.12E+05 events/s
-
-*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
-
-OK! xsec from fortran (8.1310872077655569E-002) and cpp (8.1310872708918305E-002) differ by less than 2E-4 (7.763571119312473e-09)
-
-*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
-
-OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
-
-*** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.143636e+05 ) sec^-1
+Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 1.277475e+05 ) sec^-1

*** EXECUTE CHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.161564e+05 ) sec^-1
+Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 1.279177e+05 ) sec^-1
-*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
---------------------
-CUDACPP_RUNTIME_FBRIDGEMODE = (not set)
-CUDACPP_RUNTIME_VECSIZEUSED = 8192
---------------------
-8192 1 1 ! Number of events and max and min iterations
-0.000001 ! Accuracy (ignored because max iterations = min iterations)
-0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present)
-1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement)
-0 ! Helicity Sum/event 0=exact
-1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
---------------------
-Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
- [NGOODHEL] ngoodhel/ncomb = 32/32
- [XSECTION] VECSIZE_USED = 8192
- [XSECTION] MultiChannel = TRUE
- [XSECTION] Configuration = 1
- [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 0.0972 [9.7196358586501386E-002] fbridge_mode=1
- [UNWEIGHT] Wrote 41 events (found 467 events)
- [COUNTERS] PROGRAM TOTAL : 0.4314s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.3289s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.1025s for 8192 events => throughput is 7.99E+04 events/s
-
-*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
-
-OK! xsec from fortran (9.7196357922470764E-002) and cpp (9.7196358586501386E-002) differ by less than 2E-4 (6.831846643962081e-09)
-
-*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
-
-OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical
+*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) ***
-*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) ***
---------------------
-CUDACPP_RUNTIME_FBRIDGEMODE = (not set)
-CUDACPP_RUNTIME_VECSIZEUSED = 8192
---------------------
-81920 1 1 ! Number of events and max and min iterations
-0.000001 ! Accuracy (ignored because max iterations = min iterations)
-0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present)
-1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement)
-0 ! Helicity Sum/event 0=exact
-1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
--------------------- -Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 32/32 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.08131 [8.1310872708918305E-002] fbridge_mode=1 - [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 2.7050s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5842s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.1208s for 90112 events => throughput is 8.04E+04 events/s - -*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (8.1310872077655569E-002) and cpp (8.1310872708918305E-002) differ by less than 2E-4 (7.763571119312473e-09) - -*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical - -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.096015e+04 ) sec^-1 - -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.161407e+04 ) sec^-1 +*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) *** *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -505,22 +357,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggttg_x1_cudacpp > /tmp/valassia/output_ggttg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.0972 [9.7196358102981231E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.0972 [9.7196358102981259E-002] fbridge_mode=1 [UNWEIGHT] Wrote 41 events (found 467 events) - [COUNTERS] PROGRAM TOTAL : 0.6648s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6594s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0053s for 8192 events => throughput is 1.53E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.4978s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4903s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0075s for 8192 events => throughput is 1.09E+06 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.7196357922470764E-002) and cpp (9.7196358102981231E-002) differ by less than 2E-4 (1.8571730819871846e-09) +OK! xsec from fortran (9.7196357922470764E-002) and cpp (9.7196358102981259E-002) differ by less than 2E-4 (1.8571735260763944e-09) *** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -538,65 +390,65 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! 
Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggttg_x10_cudacpp > /tmp/valassia/output_ggttg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.08131 [8.1310872068634160E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.08131 [8.1310872068634174E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 1.9297s - [COUNTERS] Fortran Overhead ( 0 ) : 1.9069s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0227s for 90112 events => throughput is 3.96E+06 events/s + [COUNTERS] PROGRAM TOTAL : 1.5310s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4486s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0824s for 90112 events => throughput is 1.09E+06 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (8.1310872077655569E-002) and cpp (8.1310872068634160E-002) differ by less than 2E-4 (1.109495828544027e-10) +OK! xsec from fortran (8.1310872077655569E-002) and cpp (8.1310872068634174E-002) differ by less than 2E-4 (1.1094947183210024e-10) *** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.616391e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.136935e+06 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.900480e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.155884e+06 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.604902e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.677853e+06 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.230519e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.303184e+06 ) sec^-1 *** 
EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.596050e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.678438e+06 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.240430e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.841214e+06 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.614528e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.663129e+06 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.710810e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.401211e+05 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt index 460087d609..61ceb23aba 100644 --- a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +Working directory (build): /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg CUDACPP_BUILDDIR='.' 
- make USEBUILDDIR=1 AVX=none - make USEBUILDDIR=1 AVX=sse4 +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + make USEBUILDDIR=1 AVX=avx2 + make USEBUILDDIR=1 AVX=512y +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' OMP_NUM_THREADS= -DATE: 2024-01-27_19:48:23 +DATE: 2024-01-28_14:14:03 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +Working directory (run): /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -50,8 +50,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/avalassi/output_ggttgg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./madevent_fortran < /tmp/valassia/input_ggttgg_x1_fortran > /tmp/valassia/output_ggttgg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/av [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [3.6277277311352982E-004] fbridge_mode=0 [UNWEIGHT] Wrote 48 events (found 439 events) - [COUNTERS] PROGRAM TOTAL : 4.5619s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2948s - [COUNTERS] Fortran MEs ( 1 ) : 4.2671s for 8192 events => throughput is 1.92E+03 events/s + [COUNTERS] PROGRAM TOTAL : 3.6756s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3091s + [COUNTERS] Fortran MEs ( 1 ) : 3.3665s for 8192 events => throughput is 2.43E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -75,8 +75,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/avalassi/output_ggttgg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./madevent_fortran < /tmp/valassia/input_ggttgg_x1_fortran > /tmp/valassia/output_ggttgg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/av [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [3.6277277311352982E-004] fbridge_mode=0 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 4.5685s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2929s - [COUNTERS] Fortran MEs ( 1 ) : 4.2755s for 8192 events => throughput is 1.92E+03 events/s + [COUNTERS] PROGRAM TOTAL : 3.5983s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2320s + [COUNTERS] Fortran MEs ( 1 ) : 3.3663s for 8192 events => throughput is 2.43E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -100,8 +100,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! 
Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x10_fortran > /tmp/avalassi/output_ggttgg_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./madevent_fortran < /tmp/valassia/input_ggttgg_x10_fortran > /tmp/valassia/output_ggttgg_x10_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x10_fortran > /tmp/a [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000158 [1.5803725748421158E-004] fbridge_mode=0 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 48.4190s - [COUNTERS] Fortran Overhead ( 0 ) : 2.0019s - [COUNTERS] Fortran MEs ( 1 ) : 46.4171s for 90112 events => throughput is 1.94E+03 events/s + [COUNTERS] PROGRAM TOTAL : 38.5532s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5391s + [COUNTERS] Fortran MEs ( 1 ) : 37.0141s for 90112 events => throughput is 2.43E+03 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -125,8 +125,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x1_cudacpp > /tmp/valassia/output_ggttgg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -134,9 +134,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [3.6277277311352993E-004] fbridge_mode=1 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 9.2836s - [COUNTERS] Fortran Overhead ( 0 ) : 4.6935s - [COUNTERS] CudaCpp MEs ( 2 ) : 4.5901s for 8192 events => throughput is 1.78E+03 events/s + [COUNTERS] PROGRAM TOTAL : 7.6872s + [COUNTERS] Fortran Overhead ( 0 ) : 3.9109s + [COUNTERS] CudaCpp MEs ( 2 ) : 3.7763s for 8192 events => throughput is 2.17E+03 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -158,8 +158,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x10_cudacpp > /tmp/valassia/output_ggttgg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -167,9 +167,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000158 [1.5803725748421150E-004] fbridge_mode=1 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 54.7612s - [COUNTERS] Fortran Overhead ( 0 ) : 6.3690s - [COUNTERS] CudaCpp MEs ( 2 ) : 48.3922s for 90112 events => throughput is 1.86E+03 events/s + [COUNTERS] PROGRAM TOTAL : 46.7704s + [COUNTERS] Fortran Overhead ( 0 ) : 5.1883s + [COUNTERS] CudaCpp MEs ( 2 ) : 41.5821s for 90112 events => throughput is 2.17E+03 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -180,14 +180,14 @@ OK! xsec from fortran (1.5803725748421158E-004) and cpp (1.5803725748421150E-004 OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.924912e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.230275e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.925523e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.229678e+03 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -201,8 +201,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x1_cudacpp > /tmp/valassia/output_ggttgg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -210,9 +210,9 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [3.6277277311352998E-004] fbridge_mode=1 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 4.7140s - [COUNTERS] Fortran Overhead ( 0 ) : 2.4722s - [COUNTERS] CudaCpp MEs ( 2 ) : 2.2419s for 8192 events => throughput is 3.65E+03 events/s + [COUNTERS] PROGRAM TOTAL : 3.5098s + [COUNTERS] Fortran Overhead ( 0 ) : 1.8498s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.6599s for 8192 events => throughput is 4.94E+03 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -234,8 +234,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x10_cudacpp > /tmp/valassia/output_ggttgg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -243,9 +243,9 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000158 [1.5803725748421156E-004] fbridge_mode=1 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 28.9457s - [COUNTERS] Fortran Overhead ( 0 ) : 4.1382s - [COUNTERS] CudaCpp MEs ( 2 ) : 24.8075s for 90112 events => throughput is 3.63E+03 events/s + [COUNTERS] PROGRAM TOTAL : 22.3971s + [COUNTERS] Fortran Overhead ( 0 ) : 4.1665s + [COUNTERS] CudaCpp MEs ( 2 ) : 18.2306s for 90112 events => throughput is 4.94E+03 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -256,14 +256,14 @@ OK! xsec from fortran (1.5803725748421158E-004) and cpp (1.5803725748421156E-004 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.848056e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.069087e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.844500e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.072108e+03 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -277,8 +277,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x1_cudacpp > /tmp/valassia/output_ggttgg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -286,9 +286,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [3.6277277311353009E-004] fbridge_mode=1 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 2.2048s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2272s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.9776s for 8192 events => throughput is 8.38E+03 events/s + [COUNTERS] PROGRAM TOTAL : 1.6321s + [COUNTERS] Fortran Overhead ( 0 ) : 0.9441s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.6880s for 8192 events => throughput is 1.19E+04 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -310,8 +310,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x10_cudacpp > /tmp/valassia/output_ggttgg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -319,9 +319,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000158 [1.5803725748421158E-004] fbridge_mode=1 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 13.6330s - [COUNTERS] Fortran Overhead ( 0 ) : 2.8910s - [COUNTERS] CudaCpp MEs ( 2 ) : 10.7420s for 90112 events => throughput is 8.39E+03 events/s + [COUNTERS] PROGRAM TOTAL : 11.1120s + [COUNTERS] Fortran Overhead ( 0 ) : 3.5133s + [COUNTERS] CudaCpp MEs ( 2 ) : 7.5988s for 90112 events => throughput is 1.19E+04 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -332,166 +332,18 @@ OK! xsec from fortran (1.5803725748421158E-004) and cpp (1.5803725748421158E-004 OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.617417e+03 ) sec^-1 - -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.595180e+03 ) sec^-1 - -*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 64/64 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0003628 [3.6277277311353009E-004] fbridge_mode=1 - [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 1.9765s - [COUNTERS] Fortran Overhead ( 0 ) : 1.1174s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.8591s for 8192 events => throughput is 9.54E+03 events/s - -*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (3.6277277311352982E-004) and cpp (3.6277277311353009E-004) differ by less than 2E-14 (6.661338147750939e-16) - -*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical - -*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -81920 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 64/64 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000158 [1.5803725748421158E-004] fbridge_mode=1 - [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 12.2161s - [COUNTERS] Fortran Overhead ( 0 ) : 2.7678s - [COUNTERS] CudaCpp MEs ( 2 ) : 9.4483s for 90112 events => throughput is 9.54E+03 events/s - -*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (1.5803725748421158E-004) and cpp (1.5803725748421158E-004) differ by less than 2E-14 (0.0) - -*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical - -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.643052e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.214250e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.811387e+03 ) sec^-1 - -*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 64/64 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0003628 [3.6277277311353009E-004] fbridge_mode=1 - [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 2.4115s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3312s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.0803s for 8192 events => throughput is 7.58E+03 events/s - -*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (3.6277277311352982E-004) and cpp (3.6277277311353009E-004) differ by less than 2E-14 (6.661338147750939e-16) - -*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical - -*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -81920 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 64/64 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000158 [1.5803725748421158E-004] fbridge_mode=1 - [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 14.8837s - [COUNTERS] Fortran Overhead ( 0 ) : 3.0662s - [COUNTERS] CudaCpp MEs ( 2 ) : 11.8175s for 90112 events => throughput is 7.63E+03 events/s - -*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (1.5803725748421158E-004) and cpp (1.5803725748421158E-004) differ by less than 2E-14 (0.0) - -*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.218576e+04 ) sec^-1 -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.806363e+03 ) sec^-1 +*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) *** -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.813477e+03 ) sec^-1 +*** (2-512z) WARNING! 
SKIP MADEVENT_CPP (512z is not supported on this node) *** *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -505,22 +357,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggttgg_x1_cudacpp > /tmp/valassia/output_ggttgg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0003628 [3.6277277311352998E-004] fbridge_mode=1 + [XSECTION] Cross section = 0.0003628 [3.6277277311352993E-004] fbridge_mode=1 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 0.8100s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7769s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0330s for 8192 events => throughput is 2.48E+05 events/s + [COUNTERS] PROGRAM TOTAL : 3.9935s + [COUNTERS] Fortran Overhead ( 0 ) : 3.8815s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1121s for 8192 events => throughput is 7.31E+04 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (3.6277277311352982E-004) and cpp (3.6277277311352998E-004) differ by less than 2E-14 (4.440892098500626e-16) +OK! xsec from fortran (3.6277277311352982E-004) and cpp (3.6277277311352993E-004) differ by less than 2E-14 (2.220446049250313e-16) *** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -538,65 +390,65 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggttgg_x10_cudacpp > /tmp/valassia/output_ggttgg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000158 [1.5803725748421166E-004] fbridge_mode=1 + [XSECTION] Cross section = 0.000158 [1.5803725748421164E-004] fbridge_mode=1 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 2.7903s - [COUNTERS] Fortran Overhead ( 0 ) : 2.4265s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3638s for 90112 events => throughput is 2.48E+05 events/s + [COUNTERS] PROGRAM TOTAL : 3.8764s + [COUNTERS] Fortran Overhead ( 0 ) : 2.6410s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.2354s for 90112 events => throughput is 7.29E+04 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.5803725748421158E-004) and cpp (1.5803725748421166E-004) differ by less than 2E-14 (4.440892098500626e-16) +OK! 
xsec from fortran (1.5803725748421158E-004) and cpp (1.5803725748421164E-004) differ by less than 2E-14 (4.440892098500626e-16) *** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.291315e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.308550e+04 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.527796e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.526993e+04 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.105251e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.247957e+05 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.152979e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.040930e+04 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.106581e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.241118e+05 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.161275e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.230304e+05 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.111001e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.245366e+05 ) 
sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.431708e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.390859e+04 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt index 89beafa1ac..b70116e765 100644 --- a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +Working directory (build): /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg CUDACPP_BUILDDIR='.' make USEBUILDDIR=1 AVX=none - make USEBUILDDIR=1 AVX=sse4 +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 + make USEBUILDDIR=1 AVX=512y +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' OMP_NUM_THREADS= -DATE: 2024-01-27_19:52:40 +DATE: 2024-01-28_14:18:17 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +Working directory (run): /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -50,8 +50,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/avalassi/output_ggttgg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./madevent_fortran < /tmp/valassia/input_ggttgg_x1_fortran > /tmp/valassia/output_ggttgg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/av [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [3.6277277311352982E-004] fbridge_mode=0 [UNWEIGHT] Wrote 48 events (found 439 events) - [COUNTERS] PROGRAM TOTAL : 4.4218s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2799s - [COUNTERS] Fortran MEs ( 1 ) : 4.1419s for 8192 events => throughput is 1.98E+03 events/s + [COUNTERS] PROGRAM TOTAL : 3.6085s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2340s + [COUNTERS] Fortran MEs ( 1 ) : 3.3745s for 8192 events => throughput is 2.43E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -75,8 +75,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/avalassi/output_ggttgg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./madevent_fortran < /tmp/valassia/input_ggttgg_x1_fortran > /tmp/valassia/output_ggttgg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/av [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [3.6277277311352982E-004] fbridge_mode=0 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 4.4317s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2764s - [COUNTERS] Fortran MEs ( 1 ) : 4.1552s for 8192 events => throughput is 1.97E+03 events/s + [COUNTERS] PROGRAM TOTAL : 5.9463s + [COUNTERS] Fortran Overhead ( 0 ) : 2.5818s + [COUNTERS] Fortran MEs ( 1 ) : 3.3644s for 8192 events => throughput is 2.43E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -100,8 +100,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x10_fortran > /tmp/avalassi/output_ggttgg_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./madevent_fortran < /tmp/valassia/input_ggttgg_x10_fortran > /tmp/valassia/output_ggttgg_x10_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x10_fortran > /tmp/a [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000158 [1.5803725748421158E-004] fbridge_mode=0 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 47.8157s - [COUNTERS] Fortran Overhead ( 0 ) : 1.9738s - [COUNTERS] Fortran MEs ( 1 ) : 45.8419s for 90112 events => throughput is 1.97E+03 events/s + [COUNTERS] PROGRAM TOTAL : 38.5378s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5009s + [COUNTERS] Fortran MEs ( 1 ) : 37.0369s for 90112 events => throughput is 2.43E+03 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -125,22 +125,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x1_cudacpp > /tmp/valassia/output_ggttgg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0003628 [3.6277396352122325E-004] fbridge_mode=1 + [XSECTION] Cross section = 0.0003628 [3.6277396734396344E-004] fbridge_mode=1 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 8.1660s - [COUNTERS] Fortran Overhead ( 0 ) : 4.1682s - [COUNTERS] CudaCpp MEs ( 2 ) : 3.9978s for 8192 events => throughput is 2.05E+03 events/s + [COUNTERS] PROGRAM TOTAL : 6.9160s + [COUNTERS] Fortran Overhead ( 0 ) : 3.5393s + [COUNTERS] CudaCpp MEs ( 2 ) : 3.3767s for 8192 events => throughput is 2.43E+03 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (3.6277277311352982E-004) and cpp (3.6277396352122325E-004) differ by less than 4E-4 (3.2814141017745158e-06) +OK! xsec from fortran (3.6277277311352982E-004) and cpp (3.6277396734396344E-004) differ by less than 4E-4 (3.2919516625984357e-06) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -158,36 +158,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x10_cudacpp > /tmp/valassia/output_ggttgg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000158 [1.5803774048965294E-004] fbridge_mode=1 + [XSECTION] Cross section = 0.000158 [1.5803774245774590E-004] fbridge_mode=1 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 49.9973s - [COUNTERS] Fortran Overhead ( 0 ) : 5.8319s - [COUNTERS] CudaCpp MEs ( 2 ) : 44.1654s for 90112 events => throughput is 2.04E+03 events/s + [COUNTERS] PROGRAM TOTAL : 41.9342s + [COUNTERS] Fortran Overhead ( 0 ) : 4.8227s + [COUNTERS] CudaCpp MEs ( 2 ) : 37.1115s for 90112 events => throughput is 2.43E+03 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.5803725748421158E-004) and cpp (1.5803774048965294E-004) differ by less than 4E-4 (3.056275773571926e-06) +OK! xsec from fortran (1.5803725748421158E-004) and cpp (1.5803774245774590E-004) differ by less than 4E-4 (3.0687291214803736e-06) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.110751e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.484000e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.113663e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.486894e+03 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -201,22 +201,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x1_cudacpp > /tmp/valassia/output_ggttgg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0003628 [3.6277387698033752E-004] fbridge_mode=1 + [XSECTION] Cross section = 0.0003628 [3.6277390210387336E-004] fbridge_mode=1 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 2.4883s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3686s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.1197s for 8192 events => throughput is 7.32E+03 events/s + [COUNTERS] PROGRAM TOTAL : 1.8879s + [COUNTERS] Fortran Overhead ( 0 ) : 1.0502s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.8377s for 8192 events => throughput is 9.78E+03 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (3.6277277311352982E-004) and cpp (3.6277387698033752E-004) differ by less than 4E-4 (3.0428601303089664e-06) +OK! xsec from fortran (3.6277277311352982E-004) and cpp (3.6277390210387336E-004) differ by less than 4E-4 (3.1121143240220306e-06) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -234,36 +234,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x10_cudacpp > /tmp/valassia/output_ggttgg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000158 [1.5803770691658365E-004] fbridge_mode=1 + [XSECTION] Cross section = 0.000158 [1.5803772192716622E-004] fbridge_mode=1 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 15.6256s - [COUNTERS] Fortran Overhead ( 0 ) : 3.0669s - [COUNTERS] CudaCpp MEs ( 2 ) : 12.5587s for 90112 events => throughput is 7.18E+03 events/s + [COUNTERS] PROGRAM TOTAL : 11.5533s + [COUNTERS] Fortran Overhead ( 0 ) : 2.3160s + [COUNTERS] CudaCpp MEs ( 2 ) : 9.2373s for 90112 events => throughput is 9.76E+03 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.5803725748421158E-004) and cpp (1.5803770691658365E-004) differ by less than 4E-4 (2.8438380874629132e-06) +OK! xsec from fortran (1.5803725748421158E-004) and cpp (1.5803772192716622E-004) differ by less than 4E-4 (2.9388193774071425e-06) *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.593109e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.002165e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.603205e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.003724e+04 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -277,22 +277,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x1_cudacpp > /tmp/valassia/output_ggttgg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0003628 [3.6277388844638422E-004] fbridge_mode=1 + [XSECTION] Cross section = 0.0003628 [3.6277391351528001E-004] fbridge_mode=1 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 1.2497s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7566s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.4931s for 8192 events => throughput is 1.66E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.9282s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5762s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3520s for 8192 events => throughput is 2.33E+04 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (3.6277277311352982E-004) and cpp (3.6277388844638422E-004) differ by less than 4E-4 (3.074466820685018e-06) +OK! xsec from fortran (3.6277277311352982E-004) and cpp (3.6277391351528001E-004) differ by less than 4E-4 (3.1435703964355355e-06) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -310,188 +310,40 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x10_cudacpp > /tmp/valassia/output_ggttgg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000158 [1.5803773310773457E-004] fbridge_mode=1 + [XSECTION] Cross section = 0.000158 [1.5803774950753991E-004] fbridge_mode=1 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 7.8293s - [COUNTERS] Fortran Overhead ( 0 ) : 2.4062s - [COUNTERS] CudaCpp MEs ( 2 ) : 5.4231s for 90112 events => throughput is 1.66E+04 events/s + [COUNTERS] PROGRAM TOTAL : 5.7143s + [COUNTERS] Fortran Overhead ( 0 ) : 1.8410s + [COUNTERS] CudaCpp MEs ( 2 ) : 3.8733s for 90112 events => throughput is 2.33E+04 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.5803725748421158E-004) and cpp (1.5803773310773457E-004) differ by less than 4E-4 (3.0095657856943347e-06) +OK! xsec from fortran (1.5803725748421158E-004) and cpp (1.5803774950753991E-004) differ by less than 4E-4 (3.1133375519853956e-06) *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.703817e+04 ) sec^-1 - -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.707084e+04 ) sec^-1 - -*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 64/64 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0003628 [3.6277388844638422E-004] fbridge_mode=1 - [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 1.1297s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6970s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.4327s for 8192 events => throughput is 1.89E+04 events/s - -*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (3.6277277311352982E-004) and cpp (3.6277388844638422E-004) differ by less than 4E-4 (3.074466820685018e-06) - -*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical - -*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -81920 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 64/64 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000158 [1.5803773310773457E-004] fbridge_mode=1 - [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 7.1100s - [COUNTERS] Fortran Overhead ( 0 ) : 2.3460s - [COUNTERS] CudaCpp MEs ( 2 ) : 4.7640s for 90112 events => throughput is 1.89E+04 events/s - -*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (1.5803725748421158E-004) and cpp (1.5803773310773457E-004) differ by less than 4E-4 (3.0095657856943347e-06) - -*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical - -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.947996e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.394756e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.958466e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.383836e+04 ) sec^-1 -*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 64/64 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0003628 [3.6277396133530942E-004] fbridge_mode=1 - [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 1.3291s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8024s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.5268s for 8192 events => throughput is 1.56E+04 events/s - -*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** - -OK! 
xsec from fortran (3.6277277311352982E-004) and cpp (3.6277396133530942E-004) differ by less than 4E-4 (3.2753885288450135e-06) - -*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical +*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) *** -*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -81920 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 64/64 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000158 [1.5803777739454609E-004] fbridge_mode=1 - [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 8.2491s - [COUNTERS] Fortran Overhead ( 0 ) : 2.4431s - [COUNTERS] CudaCpp MEs ( 2 ) : 5.8060s for 90112 events => throughput is 1.55E+04 events/s - -*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (1.5803725748421158E-004) and cpp (1.5803777739454609E-004) differ by less than 4E-4 (3.2897959809652377e-06) - -*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical - -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.565172e+04 ) sec^-1 - -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.569351e+04 ) sec^-1 +*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) *** *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -505,22 +357,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggttgg_x1_cudacpp > /tmp/valassia/output_ggttgg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0003628 [3.6277400478491265E-004] fbridge_mode=1 + [XSECTION] Cross section = 0.0003628 [3.6277395812950292E-004] fbridge_mode=1 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 0.7749s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7535s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0215s for 8192 events => throughput is 3.82E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.6895s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6341s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0553s for 8192 events => throughput is 1.48E+05 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (3.6277277311352982E-004) and cpp (3.6277400478491265E-004) differ by less than 4E-4 (3.395159378305479e-06) +OK! xsec from fortran (3.6277277311352982E-004) and cpp (3.6277395812950292E-004) differ by less than 4E-4 (3.266551574121479e-06) *** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -538,65 +390,65 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggttgg_x10_cudacpp > /tmp/valassia/output_ggttgg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000158 [1.5803779990154892E-004] fbridge_mode=1 + [XSECTION] Cross section = 0.000158 [1.5803778304590137E-004] fbridge_mode=1 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 2.6309s - [COUNTERS] Fortran Overhead ( 0 ) : 2.3945s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2365s for 90112 events => throughput is 3.81E+05 events/s + [COUNTERS] PROGRAM TOTAL : 3.1923s + [COUNTERS] Fortran Overhead ( 0 ) : 2.5815s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.6107s for 90112 events => throughput is 1.48E+05 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.5803725748421158E-004) and cpp (1.5803779990154892E-004) differ by less than 4E-4 (3.432211783227501e-06) +OK! xsec from fortran (1.5803725748421158E-004) and cpp (1.5803778304590137E-004) differ by less than 4E-4 (3.325555619992926e-06) *** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.590982e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.476169e+05 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.929944e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.004299e+05 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.500320e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.720996e+05 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.738271e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.318368e+05 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.503432e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.714749e+05 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.635839e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.074459e+05 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.484746e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.709070e+05 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.527720e+05 
) sec^-1 +Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.434688e+04 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt index 0666a67fd8..f1516a5257 100644 --- a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +Working directory (build): /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg CUDACPP_BUILDDIR='.' - make USEBUILDDIR=1 AVX=none - make USEBUILDDIR=1 AVX=sse4 +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + make USEBUILDDIR=1 AVX=avx2 + make USEBUILDDIR=1 AVX=512y +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' -CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' OMP_NUM_THREADS= -DATE: 2024-01-27_19:55:58 +DATE: 2024-01-28_14:21:38 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +Working directory (run): /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -50,8 +50,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/avalassi/output_ggttgg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./madevent_fortran < /tmp/valassia/input_ggttgg_x1_fortran > /tmp/valassia/output_ggttgg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/av [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [3.6277277311352982E-004] fbridge_mode=0 [UNWEIGHT] Wrote 48 events (found 439 events) - [COUNTERS] PROGRAM TOTAL : 4.4689s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2805s - [COUNTERS] Fortran MEs ( 1 ) : 4.1884s for 8192 events => throughput is 1.96E+03 events/s + [COUNTERS] PROGRAM TOTAL : 3.5990s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2317s + [COUNTERS] Fortran MEs ( 1 ) : 3.3672s for 8192 events => throughput is 2.43E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -75,8 +75,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/avalassi/output_ggttgg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./madevent_fortran < /tmp/valassia/input_ggttgg_x1_fortran > /tmp/valassia/output_ggttgg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/av [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [3.6277277311352982E-004] fbridge_mode=0 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 4.4168s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2810s - [COUNTERS] Fortran MEs ( 1 ) : 4.1358s for 8192 events => throughput is 1.98E+03 events/s + [COUNTERS] PROGRAM TOTAL : 3.5984s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2322s + [COUNTERS] Fortran MEs ( 1 ) : 3.3662s for 8192 events => throughput is 2.43E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -100,8 +100,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x10_fortran > /tmp/avalassi/output_ggttgg_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./madevent_fortran < /tmp/valassia/input_ggttgg_x10_fortran > /tmp/valassia/output_ggttgg_x10_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x10_fortran > /tmp/a [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000158 [1.5803725748421158E-004] fbridge_mode=0 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 47.6464s - [COUNTERS] Fortran Overhead ( 0 ) : 1.9568s - [COUNTERS] Fortran MEs ( 1 ) : 45.6896s for 90112 events => throughput is 1.97E+03 events/s + [COUNTERS] PROGRAM TOTAL : 38.5218s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4972s + [COUNTERS] Fortran MEs ( 1 ) : 37.0246s for 90112 events => throughput is 2.43E+03 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -125,8 +125,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x1_cudacpp > /tmp/valassia/output_ggttgg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -134,9 +134,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [3.6277277432965013E-004] fbridge_mode=1 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 9.1405s - [COUNTERS] Fortran Overhead ( 0 ) : 4.7023s - [COUNTERS] CudaCpp MEs ( 2 ) : 4.4382s for 8192 events => throughput is 1.85E+03 events/s + [COUNTERS] PROGRAM TOTAL : 7.8050s + [COUNTERS] Fortran Overhead ( 0 ) : 3.9849s + [COUNTERS] CudaCpp MEs ( 2 ) : 3.8201s for 8192 events => throughput is 2.14E+03 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -158,8 +158,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x10_cudacpp > /tmp/valassia/output_ggttgg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -167,9 +167,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000158 [1.5803725813026107E-004] fbridge_mode=1 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 55.0635s - [COUNTERS] Fortran Overhead ( 0 ) : 6.2258s - [COUNTERS] CudaCpp MEs ( 2 ) : 48.8377s for 90112 events => throughput is 1.85E+03 events/s + [COUNTERS] PROGRAM TOTAL : 51.7126s + [COUNTERS] Fortran Overhead ( 0 ) : 9.5711s + [COUNTERS] CudaCpp MEs ( 2 ) : 42.1415s for 90112 events => throughput is 2.14E+03 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -180,14 +180,14 @@ OK! xsec from fortran (1.5803725748421158E-004) and cpp (1.5803725813026107E-004 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.896262e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.188928e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.907010e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.187259e+03 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -201,8 +201,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x1_cudacpp > /tmp/valassia/output_ggttgg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -210,9 +210,9 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [3.6277277430934459E-004] fbridge_mode=1 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 4.6732s - [COUNTERS] Fortran Overhead ( 0 ) : 2.4394s - [COUNTERS] CudaCpp MEs ( 2 ) : 2.2338s for 8192 events => throughput is 3.67E+03 events/s + [COUNTERS] PROGRAM TOTAL : 11.3935s + [COUNTERS] Fortran Overhead ( 0 ) : 9.7620s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.6315s for 8192 events => throughput is 5.02E+03 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -234,8 +234,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x10_cudacpp > /tmp/valassia/output_ggttgg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -243,9 +243,9 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000158 [1.5803725816246315E-004] fbridge_mode=1 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 28.6839s - [COUNTERS] Fortran Overhead ( 0 ) : 4.1051s - [COUNTERS] CudaCpp MEs ( 2 ) : 24.5787s for 90112 events => throughput is 3.67E+03 events/s + [COUNTERS] PROGRAM TOTAL : 26.6256s + [COUNTERS] Fortran Overhead ( 0 ) : 8.6133s + [COUNTERS] CudaCpp MEs ( 2 ) : 18.0123s for 90112 events => throughput is 5.00E+03 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -256,14 +256,14 @@ OK! xsec from fortran (1.5803725748421158E-004) and cpp (1.5803725816246315E-004 OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.776392e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.138707e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.785912e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.171098e+03 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -277,8 +277,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x1_cudacpp > /tmp/valassia/output_ggttgg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -286,9 +286,9 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [3.6277277419683297E-004] fbridge_mode=1 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 2.1949s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2253s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.9695s for 8192 events => throughput is 8.45E+03 events/s + [COUNTERS] PROGRAM TOTAL : 5.6899s + [COUNTERS] Fortran Overhead ( 0 ) : 5.0088s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.6811s for 8192 events => throughput is 1.20E+04 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -310,8 +310,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x10_cudacpp > /tmp/valassia/output_ggttgg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -319,9 +319,9 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000158 [1.5803725810769321E-004] fbridge_mode=1 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 13.6246s - [COUNTERS] Fortran Overhead ( 0 ) : 2.8910s - [COUNTERS] CudaCpp MEs ( 2 ) : 10.7336s for 90112 events => throughput is 8.40E+03 events/s + [COUNTERS] PROGRAM TOTAL : 13.1637s + [COUNTERS] Fortran Overhead ( 0 ) : 5.6687s + [COUNTERS] CudaCpp MEs ( 2 ) : 7.4950s for 90112 events => throughput is 1.20E+04 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -332,166 +332,18 @@ OK! xsec from fortran (1.5803725748421158E-004) and cpp (1.5803725810769321E-004 OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.640527e+03 ) sec^-1 - -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.678863e+03 ) sec^-1 - -*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! 
Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 64/64 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0003628 [3.6277277419683297E-004] fbridge_mode=1 - [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 1.9438s - [COUNTERS] Fortran Overhead ( 0 ) : 1.0984s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.8454s for 8192 events => throughput is 9.69E+03 events/s - -*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (3.6277277311352982E-004) and cpp (3.6277277419683297E-004) differ by less than 2E-4 (2.9861755290738756e-09) - -*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical - -*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -81920 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 64/64 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000158 [1.5803725810769321E-004] fbridge_mode=1 - [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 12.1386s - [COUNTERS] Fortran Overhead ( 0 ) : 2.7687s - [COUNTERS] CudaCpp MEs ( 2 ) : 9.3699s for 90112 events => throughput is 9.62E+03 events/s - -*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (1.5803725748421158E-004) and cpp (1.5803725810769321E-004) differ by less than 2E-4 (3.945155979678816e-09) - -*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical - -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.922714e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.236682e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.865641e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.238440e+04 ) sec^-1 -*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 64/64 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0003628 [3.6277277419683297E-004] fbridge_mode=1 - [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 2.4280s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3589s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.0691s for 8192 events => throughput is 7.66E+03 events/s - -*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (3.6277277311352982E-004) and cpp (3.6277277419683297E-004) differ by less than 2E-4 (2.9861755290738756e-09) - -*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical +*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) *** -*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -81920 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 64/64 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000158 [1.5803725810769321E-004] fbridge_mode=1 - [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 14.7988s - [COUNTERS] Fortran Overhead ( 0 ) : 2.9921s - [COUNTERS] CudaCpp MEs ( 2 ) : 11.8068s for 90112 events => throughput is 7.63E+03 events/s - -*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (1.5803725748421158E-004) and cpp (1.5803725810769321E-004) differ by less than 2E-4 (3.945155979678816e-09) - -*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical - -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.748515e+03 ) sec^-1 - -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.727003e+03 ) sec^-1 +*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) *** *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -505,22 +357,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggttgg_x1_cudacpp > /tmp/valassia/output_ggttgg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0003628 [3.6277277293084701E-004] fbridge_mode=1 + [XSECTION] Cross section = 0.0003628 [3.6277277293084696E-004] fbridge_mode=1 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 0.8110s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7780s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0330s for 8192 events => throughput is 2.48E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.3684s + [COUNTERS] Fortran Overhead ( 0 ) : 2.2556s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1128s for 8192 events => throughput is 7.27E+04 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (3.6277277311352982E-004) and cpp (3.6277277293084701E-004) differ by less than 2E-4 (5.03573627241849e-10) +OK! xsec from fortran (3.6277277311352982E-004) and cpp (3.6277277293084696E-004) differ by less than 2E-4 (5.035738492864539e-10) *** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -538,65 +390,65 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! 
Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggttgg_x10_cudacpp > /tmp/valassia/output_ggttgg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000158 [1.5803725738731039E-004] fbridge_mode=1 + [XSECTION] Cross section = 0.000158 [1.5803725738731031E-004] fbridge_mode=1 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 2.7891s - [COUNTERS] Fortran Overhead ( 0 ) : 2.4268s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3624s for 90112 events => throughput is 2.49E+05 events/s + [COUNTERS] PROGRAM TOTAL : 5.3501s + [COUNTERS] Fortran Overhead ( 0 ) : 4.1104s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.2398s for 90112 events => throughput is 7.27E+04 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.5803725748421158E-004) and cpp (1.5803725738731039E-004) differ by less than 2E-4 (6.131540830622839e-10) +OK! xsec from fortran (1.5803725748421158E-004) and cpp (1.5803725738731031E-004) differ by less than 2E-4 (6.131546381737962e-10) *** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.302267e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.296213e+04 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.526771e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.548555e+04 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.095977e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.245057e+05 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.154067e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.025106e+04 ) 
sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.104340e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.247798e+05 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.167057e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.229717e+05 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.099706e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.248252e+05 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.436803e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.380128e+04 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt index 52fdbbde9d..bbe2d399e3 100644 --- a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +Working directory (build): /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg CUDACPP_BUILDDIR='.' 
- - make USEBUILDDIR=1 AVX=none make USEBUILDDIR=1 AVX=sse4 +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' + make USEBUILDDIR=1 AVX=avx2 + make USEBUILDDIR=1 AVX=512y +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' OMP_NUM_THREADS= -DATE: 2024-01-27_20:01:42 +DATE: 2024-01-28_14:29:48 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +Working directory (run): /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -50,8 +50,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/avalassi/output_ggttggg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./madevent_fortran < /tmp/valassia/input_ggttggg_x1_fortran > /tmp/valassia/output_ggttggg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/a [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.169e-06 [1.1693100945435806E-006] fbridge_mode=0 [UNWEIGHT] Wrote 1 events (found 166 events) - [COUNTERS] PROGRAM TOTAL : 95.6185s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4524s - [COUNTERS] Fortran MEs ( 1 ) : 95.1661s for 8192 events => throughput is 8.61E+01 events/s + [COUNTERS] PROGRAM TOTAL : 73.0391s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4667s + [COUNTERS] Fortran MEs ( 1 ) : 72.5725s for 8192 events => throughput is 1.13E+02 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -75,8 +75,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/avalassi/output_ggttggg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./madevent_fortran < /tmp/valassia/input_ggttggg_x1_fortran > /tmp/valassia/output_ggttggg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/a [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.169e-06 [1.1693100945435806E-006] fbridge_mode=0 [UNWEIGHT] Wrote 15 events (found 163 events) - [COUNTERS] PROGRAM TOTAL : 95.6420s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4530s - [COUNTERS] Fortran MEs ( 1 ) : 95.1890s for 8192 events => throughput is 8.61E+01 events/s + [COUNTERS] PROGRAM TOTAL : 72.9463s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3446s + [COUNTERS] Fortran MEs ( 1 ) : 72.6017s for 8192 events => throughput is 1.13E+02 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -100,8 +100,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! 
Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x10_fortran > /tmp/avalassi/output_ggttggg_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./madevent_fortran < /tmp/valassia/input_ggttggg_x10_fortran > /tmp/valassia/output_ggttggg_x10_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x10_fortran > /tmp/ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.136e-07 [2.1358436158813979E-007] fbridge_mode=0 [UNWEIGHT] Wrote 84 events (found 808 events) - [COUNTERS] PROGRAM TOTAL : 1050.9211s - [COUNTERS] Fortran Overhead ( 0 ) : 4.0917s - [COUNTERS] Fortran MEs ( 1 ) : 1046.8295s for 90112 events => throughput is 8.61E+01 events/s + [COUNTERS] PROGRAM TOTAL : 800.6448s + [COUNTERS] Fortran Overhead ( 0 ) : 3.1320s + [COUNTERS] Fortran MEs ( 1 ) : 797.5128s for 90112 events => throughput is 1.13E+02 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -125,8 +125,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_x1_cudacpp > /tmp/valassia/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -134,9 +134,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.169e-06 [1.1693100945435831E-006] fbridge_mode=1 [UNWEIGHT] Wrote 15 events (found 163 events) - [COUNTERS] PROGRAM TOTAL : 214.0886s - [COUNTERS] Fortran Overhead ( 0 ) : 96.2530s - [COUNTERS] CudaCpp MEs ( 2 ) : 117.8356s for 8192 events => throughput is 6.95E+01 events/s + [COUNTERS] PROGRAM TOTAL : 175.0060s + [COUNTERS] Fortran Overhead ( 0 ) : 80.0186s + [COUNTERS] CudaCpp MEs ( 2 ) : 94.9874s for 8192 events => throughput is 8.62E+01 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -158,8 +158,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_x10_cudacpp > /tmp/valassia/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -167,9 +167,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.136e-07 [2.1358436158813950E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 808 events) - [COUNTERS] PROGRAM TOTAL : 1364.9336s - [COUNTERS] Fortran Overhead ( 0 ) : 99.9906s - [COUNTERS] CudaCpp MEs ( 2 ) : 1264.9430s for 90112 events => throughput is 7.12E+01 events/s + [COUNTERS] PROGRAM TOTAL : 1135.2325s + [COUNTERS] Fortran Overhead ( 0 ) : 82.5281s + [COUNTERS] CudaCpp MEs ( 2 ) : 1052.7045s for 90112 events => throughput is 8.56E+01 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -180,14 +180,14 @@ OK! xsec from fortran (2.1358436158813979E-007) and cpp (2.1358436158813950E-007 OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.423695e+01 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.031958e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.101960e+01 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.030043e+02 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -201,8 +201,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_x1_cudacpp > /tmp/valassia/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -210,9 +210,9 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.169e-06 [1.1693100945435831E-006] fbridge_mode=1 [UNWEIGHT] Wrote 15 events (found 163 events) - [COUNTERS] PROGRAM TOTAL : 107.5716s - [COUNTERS] Fortran Overhead ( 0 ) : 49.8673s - [COUNTERS] CudaCpp MEs ( 2 ) : 57.7042s for 8192 events => throughput is 1.42E+02 events/s + [COUNTERS] PROGRAM TOTAL : 96.4180s + [COUNTERS] Fortran Overhead ( 0 ) : 51.6087s + [COUNTERS] CudaCpp MEs ( 2 ) : 44.8093s for 8192 events => throughput is 1.83E+02 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -234,8 +234,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_x10_cudacpp > /tmp/valassia/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -243,9 +243,9 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.136e-07 [2.1358436158813958E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 808 events) - [COUNTERS] PROGRAM TOTAL : 690.6820s - [COUNTERS] Fortran Overhead ( 0 ) : 53.5916s - [COUNTERS] CudaCpp MEs ( 2 ) : 637.0904s for 90112 events => throughput is 1.41E+02 events/s + [COUNTERS] PROGRAM TOTAL : 530.6261s + [COUNTERS] Fortran Overhead ( 0 ) : 39.2305s + [COUNTERS] CudaCpp MEs ( 2 ) : 491.3956s for 90112 events => throughput is 1.83E+02 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -256,14 +256,14 @@ OK! xsec from fortran (2.1358436158813979E-007) and cpp (2.1358436158813958E-007 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.659541e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.249882e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.662982e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.266535e+02 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -277,8 +277,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_x1_cudacpp > /tmp/valassia/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -286,9 +286,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.169e-06 [1.1693100945435827E-006] fbridge_mode=1 [UNWEIGHT] Wrote 15 events (found 163 events) - [COUNTERS] PROGRAM TOTAL : 50.0765s - [COUNTERS] Fortran Overhead ( 0 ) : 23.3145s - [COUNTERS] CudaCpp MEs ( 2 ) : 26.7620s for 8192 events => throughput is 3.06E+02 events/s + [COUNTERS] PROGRAM TOTAL : 50.7969s + [COUNTERS] Fortran Overhead ( 0 ) : 31.7782s + [COUNTERS] CudaCpp MEs ( 2 ) : 19.0187s for 8192 events => throughput is 4.31E+02 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -310,8 +310,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_x10_cudacpp > /tmp/valassia/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -319,9 +319,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.136e-07 [2.1358436158813958E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 808 events) - [COUNTERS] PROGRAM TOTAL : 320.2875s - [COUNTERS] Fortran Overhead ( 0 ) : 27.1185s - [COUNTERS] CudaCpp MEs ( 2 ) : 293.1690s for 90112 events => throughput is 3.07E+02 events/s + [COUNTERS] PROGRAM TOTAL : 228.8024s + [COUNTERS] Fortran Overhead ( 0 ) : 18.4929s + [COUNTERS] CudaCpp MEs ( 2 ) : 210.3095s for 90112 events => throughput is 4.28E+02 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -332,166 +332,18 @@ OK! xsec from fortran (2.1358436158813979E-007) and cpp (2.1358436158813958E-007 OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.601129e+02 ) sec^-1 - -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.617049e+02 ) sec^-1 - -*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 128/128 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.169e-06 [1.1693100945435827E-006] fbridge_mode=1 - [UNWEIGHT] Wrote 15 events (found 163 events) - [COUNTERS] PROGRAM TOTAL : 44.0775s - [COUNTERS] Fortran Overhead ( 0 ) : 20.2382s - [COUNTERS] CudaCpp MEs ( 2 ) : 23.8393s for 8192 events => throughput is 3.44E+02 events/s - -*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (1.1693100945435806E-006) and cpp (1.1693100945435827E-006) differ by less than 2E-14 (1.7763568394002505e-15) - -*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical - -*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -81920 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 128/128 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.136e-07 [2.1358436158813958E-007] fbridge_mode=1 - [UNWEIGHT] Wrote 84 events (found 808 events) - [COUNTERS] PROGRAM TOTAL : 288.1186s - [COUNTERS] Fortran Overhead ( 0 ) : 23.9142s - [COUNTERS] CudaCpp MEs ( 2 ) : 264.2044s for 90112 events => throughput is 3.41E+02 events/s - -*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (2.1358436158813979E-007) and cpp (2.1358436158813958E-007) differ by less than 2E-14 (9.992007221626409e-16) - -*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical - -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.168880e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.297354e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.113271e+02 ) sec^-1 - -*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 128/128 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.169e-06 [1.1693100945435827E-006] fbridge_mode=1 - [UNWEIGHT] Wrote 15 events (found 163 events) - [COUNTERS] PROGRAM TOTAL : 45.3217s - [COUNTERS] Fortran Overhead ( 0 ) : 22.3591s - [COUNTERS] CudaCpp MEs ( 2 ) : 22.9626s for 8192 events => throughput is 3.57E+02 events/s - -*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.301798e+02 ) sec^-1 -OK! xsec from fortran (1.1693100945435806E-006) and cpp (1.1693100945435827E-006) differ by less than 2E-14 (1.7763568394002505e-15) +*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) *** -*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical - -*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -81920 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 128/128 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.136e-07 [2.1358436158813958E-007] fbridge_mode=1 - [UNWEIGHT] Wrote 84 events (found 808 events) - [COUNTERS] PROGRAM TOTAL : 280.5957s - [COUNTERS] Fortran Overhead ( 0 ) : 26.0901s - [COUNTERS] CudaCpp MEs ( 2 ) : 254.5056s for 90112 events => throughput is 3.54E+02 events/s - -*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (2.1358436158813979E-007) and cpp (2.1358436158813958E-007) differ by less than 2E-14 (9.992007221626409e-16) - -*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical - -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.791116e+02 ) sec^-1 - -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.767212e+02 ) sec^-1 +*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) *** *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -505,22 +357,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggttggg_x1_cudacpp > /tmp/valassia/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.169e-06 [1.1693100945435829E-006] fbridge_mode=1 + [XSECTION] Cross section = 1.169e-06 [1.1693100945435825E-006] fbridge_mode=1 [UNWEIGHT] Wrote 15 events (found 163 events) - [COUNTERS] PROGRAM TOTAL : 4.2015s - [COUNTERS] Fortran Overhead ( 0 ) : 3.1184s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.0831s for 8192 events => throughput is 7.56E+03 events/s + [COUNTERS] PROGRAM TOTAL : 11.1780s + [COUNTERS] Fortran Overhead ( 0 ) : 7.3812s + [COUNTERS] CudaCpp MEs ( 2 ) : 3.7969s for 8192 events => throughput is 2.16E+03 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.1693100945435806E-006) and cpp (1.1693100945435829E-006) differ by less than 2E-14 (1.9984014443252818e-15) +OK! xsec from fortran (1.1693100945435806E-006) and cpp (1.1693100945435825E-006) differ by less than 2E-14 (1.5543122344752192e-15) *** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -538,65 +390,65 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggttggg_x10_cudacpp > /tmp/valassia/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.136e-07 [2.1358436158813960E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.136e-07 [2.1358436158813958E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 808 events) - [COUNTERS] PROGRAM TOTAL : 18.6414s - [COUNTERS] Fortran Overhead ( 0 ) : 6.7249s - [COUNTERS] CudaCpp MEs ( 2 ) : 11.9165s for 90112 events => throughput is 7.56E+03 events/s + [COUNTERS] PROGRAM TOTAL : 51.7876s + [COUNTERS] Fortran Overhead ( 0 ) : 10.0015s + [COUNTERS] CudaCpp MEs ( 2 ) : 41.7860s for 90112 events => throughput is 2.16E+03 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.1358436158813979E-007) and cpp (2.1358436158813960E-007) differ by less than 2E-14 (8.881784197001252e-16) +OK! xsec from fortran (2.1358436158813979E-007) and cpp (2.1358436158813958E-007) differ by less than 2E-14 (9.992007221626409e-16) *** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.531134e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.170313e+03 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.257699e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.233689e+03 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 512 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.280873e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.570446e+03 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 512 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.583682e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.473042e+03 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 128 128 1 --bridge *** -Process = 
SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.232524e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.567032e+03 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 128 128 1 *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.452604e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.557123e+03 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.262155e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.564602e+03 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.246198e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.127454e+03 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt index fe3eae6140..8016b0a70e 100644 --- a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +Working directory (build): /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg CUDACPP_BUILDDIR='.' 
- make USEBUILDDIR=1 AVX=none - make USEBUILDDIR=1 AVX=sse4 +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' + make USEBUILDDIR=1 AVX=avx2 + make USEBUILDDIR=1 AVX=512y +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' OMP_NUM_THREADS= -DATE: 2024-01-27_21:27:23 +DATE: 2024-01-28_15:32:38 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +Working directory (run): /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -50,8 +50,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/avalassi/output_ggttggg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./madevent_fortran < /tmp/valassia/input_ggttggg_x1_fortran > /tmp/valassia/output_ggttggg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/a [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.169e-06 [1.1693100945435806E-006] fbridge_mode=0 [UNWEIGHT] Wrote 1 events (found 166 events) - [COUNTERS] PROGRAM TOTAL : 95.7877s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4488s - [COUNTERS] Fortran MEs ( 1 ) : 95.3389s for 8192 events => throughput is 8.59E+01 events/s + [COUNTERS] PROGRAM TOTAL : 72.8549s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3442s + [COUNTERS] Fortran MEs ( 1 ) : 72.5108s for 8192 events => throughput is 1.13E+02 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -75,8 +75,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/avalassi/output_ggttggg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./madevent_fortran < /tmp/valassia/input_ggttggg_x1_fortran > /tmp/valassia/output_ggttggg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/a [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.169e-06 [1.1693100945435806E-006] fbridge_mode=0 [UNWEIGHT] Wrote 15 events (found 163 events) - [COUNTERS] PROGRAM TOTAL : 96.0064s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4553s - [COUNTERS] Fortran MEs ( 1 ) : 95.5511s for 8192 events => throughput is 8.57E+01 events/s + [COUNTERS] PROGRAM TOTAL : 121.1698s + [COUNTERS] Fortran Overhead ( 0 ) : 48.6282s + [COUNTERS] Fortran MEs ( 1 ) : 72.5416s for 8192 events => throughput is 1.13E+02 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -100,8 +100,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! 
Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x10_fortran > /tmp/avalassi/output_ggttggg_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./madevent_fortran < /tmp/valassia/input_ggttggg_x10_fortran > /tmp/valassia/output_ggttggg_x10_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x10_fortran > /tmp/ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.136e-07 [2.1358436158813979E-007] fbridge_mode=0 [UNWEIGHT] Wrote 84 events (found 808 events) - [COUNTERS] PROGRAM TOTAL : 1055.8027s - [COUNTERS] Fortran Overhead ( 0 ) : 4.1162s - [COUNTERS] Fortran MEs ( 1 ) : 1051.6865s for 90112 events => throughput is 8.57E+01 events/s + [COUNTERS] PROGRAM TOTAL : 810.5414s + [COUNTERS] Fortran Overhead ( 0 ) : 13.4189s + [COUNTERS] Fortran MEs ( 1 ) : 797.1225s for 90112 events => throughput is 1.13E+02 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -125,22 +125,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_x1_cudacpp > /tmp/valassia/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.169e-06 [1.1694768374083672E-006] fbridge_mode=1 + [XSECTION] Cross section = 1.169e-06 [1.1694768412243468E-006] fbridge_mode=1 [UNWEIGHT] Wrote 15 events (found 163 events) - [COUNTERS] PROGRAM TOTAL : 192.0455s - [COUNTERS] Fortran Overhead ( 0 ) : 88.8202s - [COUNTERS] CudaCpp MEs ( 2 ) : 103.2254s for 8192 events => throughput is 7.94E+01 events/s + [COUNTERS] PROGRAM TOTAL : 162.3306s + [COUNTERS] Fortran Overhead ( 0 ) : 74.4438s + [COUNTERS] CudaCpp MEs ( 2 ) : 87.8867s for 8192 events => throughput is 9.32E+01 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.1693100945435806E-006) and cpp (1.1694768374083672E-006) differ by less than 4E-4 (0.00014259935458071915) +OK! xsec from fortran (1.1693100945435806E-006) and cpp (1.1694768412243468E-006) differ by less than 4E-4 (0.00014260261802601093) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -158,36 +158,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_x10_cudacpp > /tmp/valassia/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.136e-07 [2.1361435710758843E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.136e-07 [2.1361436028353404E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 808 events) - [COUNTERS] PROGRAM TOTAL : 1232.5237s - [COUNTERS] Fortran Overhead ( 0 ) : 92.7257s - [COUNTERS] CudaCpp MEs ( 2 ) : 1139.7980s for 90112 events => throughput is 7.91E+01 events/s + [COUNTERS] PROGRAM TOTAL : 1056.0720s + [COUNTERS] Fortran Overhead ( 0 ) : 90.0693s + [COUNTERS] CudaCpp MEs ( 2 ) : 966.0027s for 90112 events => throughput is 9.33E+01 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.1358436158813979E-007) and cpp (2.1361435710758843E-007) differ by less than 4E-4 (0.0001404387438554977) +OK! xsec from fortran (2.1358436158813979E-007) and cpp (2.1361436028353404E-007) differ by less than 4E-4 (0.0001404536136035972) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.282100e+01 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.111130e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.305531e+01 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.111642e+02 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -201,22 +201,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_x1_cudacpp > /tmp/valassia/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.169e-06 [1.1694765360831655E-006] fbridge_mode=1 + [XSECTION] Cross section = 1.169e-06 [1.1694767325083535E-006] fbridge_mode=1 [UNWEIGHT] Wrote 15 events (found 163 events) - [COUNTERS] PROGRAM TOTAL : 49.0764s - [COUNTERS] Fortran Overhead ( 0 ) : 23.2911s - [COUNTERS] CudaCpp MEs ( 2 ) : 25.7852s for 8192 events => throughput is 3.18E+02 events/s + [COUNTERS] PROGRAM TOTAL : 39.5912s + [COUNTERS] Fortran Overhead ( 0 ) : 18.2165s + [COUNTERS] CudaCpp MEs ( 2 ) : 21.3747s for 8192 events => throughput is 3.83E+02 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.1693100945435806E-006) and cpp (1.1694765360831655E-006) differ by less than 4E-4 (0.00014234165972015766) +OK! xsec from fortran (1.1693100945435806E-006) and cpp (1.1694767325083535E-006) differ by less than 4E-4 (0.00014250964355011497) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -234,36 +234,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_x10_cudacpp > /tmp/valassia/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.136e-07 [2.1361429212586563E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.136e-07 [2.1361431788761647E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 808 events) - [COUNTERS] PROGRAM TOTAL : 312.6919s - [COUNTERS] Fortran Overhead ( 0 ) : 26.8578s - [COUNTERS] CudaCpp MEs ( 2 ) : 285.8342s for 90112 events => throughput is 3.15E+02 events/s + [COUNTERS] PROGRAM TOTAL : 292.9265s + [COUNTERS] Fortran Overhead ( 0 ) : 55.9222s + [COUNTERS] CudaCpp MEs ( 2 ) : 237.0042s for 90112 events => throughput is 3.80E+02 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.1358436158813979E-007) and cpp (2.1361429212586563E-007) differ by less than 4E-4 (0.00014013450003202976) +OK! xsec from fortran (2.1358436158813979E-007) and cpp (2.1361431788761647E-007) differ by less than 4E-4 (0.00014025511631077237) *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.601653e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.620389e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.596532e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.592869e+02 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -277,22 +277,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_x1_cudacpp > /tmp/valassia/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.169e-06 [1.1694764906356561E-006] fbridge_mode=1 + [XSECTION] Cross section = 1.169e-06 [1.1694766288507467E-006] fbridge_mode=1 [UNWEIGHT] Wrote 15 events (found 163 events) - [COUNTERS] PROGRAM TOTAL : 25.2888s - [COUNTERS] Fortran Overhead ( 0 ) : 11.8515s - [COUNTERS] CudaCpp MEs ( 2 ) : 13.4373s for 8192 events => throughput is 6.10E+02 events/s + [COUNTERS] PROGRAM TOTAL : 23.5101s + [COUNTERS] Fortran Overhead ( 0 ) : 13.7778s + [COUNTERS] CudaCpp MEs ( 2 ) : 9.7323s for 8192 events => throughput is 8.42E+02 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.1693100945435806E-006) and cpp (1.1694764906356561E-006) differ by less than 4E-4 (0.0001423027927767162) +OK! xsec from fortran (1.1693100945435806E-006) and cpp (1.1694766288507467E-006) differ by less than 4E-4 (0.00014242099503225525) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -310,188 +310,40 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_x10_cudacpp > /tmp/valassia/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.136e-07 [2.1361429111797059E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.136e-07 [2.1361431260588202E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 808 events) - [COUNTERS] PROGRAM TOTAL : 162.4319s - [COUNTERS] Fortran Overhead ( 0 ) : 15.3978s - [COUNTERS] CudaCpp MEs ( 2 ) : 147.0342s for 90112 events => throughput is 6.13E+02 events/s + [COUNTERS] PROGRAM TOTAL : 116.7734s + [COUNTERS] Fortran Overhead ( 0 ) : 10.7208s + [COUNTERS] CudaCpp MEs ( 2 ) : 106.0525s for 90112 events => throughput is 8.50E+02 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.1358436158813979E-007) and cpp (2.1361429111797059E-007) differ by less than 4E-4 (0.00014012978107680318) +OK! xsec from fortran (2.1358436158813979E-007) and cpp (2.1361431260588202E-007) differ by less than 4E-4 (0.00014023038727883907) *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.206264e+02 ) sec^-1 - -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.127912e+02 ) sec^-1 - -*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 128/128 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.169e-06 [1.1694764906356561E-006] fbridge_mode=1 - [UNWEIGHT] Wrote 15 events (found 163 events) - [COUNTERS] PROGRAM TOTAL : 22.4910s - [COUNTERS] Fortran Overhead ( 0 ) : 10.5292s - [COUNTERS] CudaCpp MEs ( 2 ) : 11.9618s for 8192 events => throughput is 6.85E+02 events/s - -*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** - -OK! 
xsec from fortran (1.1693100945435806E-006) and cpp (1.1694764906356561E-006) differ by less than 4E-4 (0.0001423027927767162) - -*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical - -*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -81920 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 128/128 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.136e-07 [2.1361429111797059E-007] fbridge_mode=1 - [UNWEIGHT] Wrote 84 events (found 808 events) - [COUNTERS] PROGRAM TOTAL : 145.3442s - [COUNTERS] Fortran Overhead ( 0 ) : 14.0492s - [COUNTERS] CudaCpp MEs ( 2 ) : 131.2950s for 90112 events => throughput is 6.86E+02 events/s - -*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (2.1358436158813979E-007) and cpp (2.1361429111797059E-007) differ by less than 4E-4 (0.00014012978107680318) - -*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical - -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.195828e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.049493e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.228494e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.056012e+03 ) sec^-1 -*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 128/128 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.169e-06 [1.1694768276769753E-006] fbridge_mode=1 - [UNWEIGHT] Wrote 15 events (found 163 events) - [COUNTERS] PROGRAM TOTAL : 22.9016s - [COUNTERS] Fortran Overhead ( 0 ) : 11.4949s - [COUNTERS] CudaCpp MEs ( 2 ) : 11.4068s for 8192 events => throughput is 7.18E+02 events/s - -*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (1.1693100945435806E-006) and cpp (1.1694768276769753E-006) differ by less than 4E-4 (0.00014259103224434355) - -*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical +*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) *** -*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -81920 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 128/128 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.136e-07 [2.1361435948756818E-007] fbridge_mode=1 - [UNWEIGHT] Wrote 84 events (found 808 events) - [COUNTERS] PROGRAM TOTAL : 141.4330s - [COUNTERS] Fortran Overhead ( 0 ) : 14.9090s - [COUNTERS] CudaCpp MEs ( 2 ) : 126.5241s for 90112 events => throughput is 7.12E+02 events/s - -*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (2.1358436158813979E-007) and cpp (2.1361435948756818E-007) differ by less than 4E-4 (0.00014044988689865257) - -*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical - -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.594821e+02 ) sec^-1 - -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.588228e+02 ) sec^-1 +*** (2-512z) WARNING! 
SKIP MADEVENT_CPP (512z is not supported on this node) *** *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -505,22 +357,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggttggg_x1_cudacpp > /tmp/valassia/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.169e-06 [1.1694770708194997E-006] fbridge_mode=1 + [XSECTION] Cross section = 1.169e-06 [1.1694768512039880E-006] fbridge_mode=1 [UNWEIGHT] Wrote 15 events (found 163 events) - [COUNTERS] PROGRAM TOTAL : 2.4748s - [COUNTERS] Fortran Overhead ( 0 ) : 1.9740s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.5008s for 8192 events => throughput is 1.64E+04 events/s + [COUNTERS] PROGRAM TOTAL : 6.0902s + [COUNTERS] Fortran Overhead ( 0 ) : 4.2956s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.7947s for 8192 events => throughput is 4.56E+03 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.1693100945435806E-006) and cpp (1.1694770708194997E-006) differ by less than 4E-4 (0.00014279896898039546) +OK! xsec from fortran (1.1693100945435806E-006) and cpp (1.1694768512039880E-006) differ by less than 4E-4 (0.00014261115266633873) *** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -538,65 +390,65 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggttggg_x10_cudacpp > /tmp/valassia/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.136e-07 [2.1361443477565656E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.136e-07 [2.1361438292717214E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 808 events) - [COUNTERS] PROGRAM TOTAL : 10.9971s - [COUNTERS] Fortran Overhead ( 0 ) : 5.5646s - [COUNTERS] CudaCpp MEs ( 2 ) : 5.4326s for 90112 events => throughput is 1.66E+04 events/s + [COUNTERS] PROGRAM TOTAL : 26.7087s + [COUNTERS] Fortran Overhead ( 0 ) : 6.8261s + [COUNTERS] CudaCpp MEs ( 2 ) : 19.8826s for 90112 events => throughput is 4.53E+03 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.1358436158813979E-007) and cpp (2.1361443477565656E-007) differ by less than 4E-4 (0.00014080238503022535) +OK! 
xsec from fortran (2.1358436158813979E-007) and cpp (2.1361438292717214E-007) differ by less than 4E-4 (0.00014055963090697787) *** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.639284e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.539126e+03 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.641835e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.545967e+03 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 512 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.349279e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.382515e+03 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 512 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.430829e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.535405e+03 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 128 128 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.310161e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.397695e+03 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 128 128 1 *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.351327e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.072776e+03 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.298097e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 
7.409435e+03 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.390837e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.095250e+03 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt index e6abf766e6..a5bfae67ec 100644 --- a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +Working directory (build): /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg CUDACPP_BUILDDIR='.' - - make USEBUILDDIR=1 AVX=none + make USEBUILDDIR=1 AVX=sse4 +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=avx2 + make USEBUILDDIR=1 AVX=512y +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' -CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' -CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' OMP_NUM_THREADS= -DATE: 2024-01-27_22:32:05 +DATE: 2024-01-28_16:27:50 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +Working directory (run): /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -50,8 +50,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/avalassi/output_ggttggg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./madevent_fortran < /tmp/valassia/input_ggttggg_x1_fortran > /tmp/valassia/output_ggttggg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/a [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.169e-06 [1.1693100945435806E-006] fbridge_mode=0 [UNWEIGHT] Wrote 1 events (found 166 events) - [COUNTERS] PROGRAM TOTAL : 95.7215s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4454s - [COUNTERS] Fortran MEs ( 1 ) : 95.2761s for 8192 events => throughput is 8.60E+01 events/s + [COUNTERS] PROGRAM TOTAL : 73.1885s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3462s + [COUNTERS] Fortran MEs ( 1 ) : 72.8423s for 8192 events => throughput is 1.12E+02 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -75,8 +75,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/avalassi/output_ggttggg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./madevent_fortran < /tmp/valassia/input_ggttggg_x1_fortran > /tmp/valassia/output_ggttggg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/a [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.169e-06 [1.1693100945435806E-006] fbridge_mode=0 [UNWEIGHT] Wrote 15 events (found 163 events) - [COUNTERS] PROGRAM TOTAL : 95.7236s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4504s - [COUNTERS] Fortran MEs ( 1 ) : 95.2731s for 8192 events => throughput is 8.60E+01 events/s + [COUNTERS] PROGRAM TOTAL : 74.1853s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6252s + [COUNTERS] Fortran MEs ( 1 ) : 72.5601s for 8192 events => throughput is 1.13E+02 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -100,8 +100,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x10_fortran > /tmp/avalassi/output_ggttggg_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./madevent_fortran < /tmp/valassia/input_ggttggg_x10_fortran > /tmp/valassia/output_ggttggg_x10_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x10_fortran > /tmp/ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.136e-07 [2.1358436158813979E-007] fbridge_mode=0 [UNWEIGHT] Wrote 84 events (found 808 events) - [COUNTERS] PROGRAM TOTAL : 1054.9291s - [COUNTERS] Fortran Overhead ( 0 ) : 4.1216s - [COUNTERS] Fortran MEs ( 1 ) : 1050.8075s for 90112 events => throughput is 8.58E+01 events/s + [COUNTERS] PROGRAM TOTAL : 802.1686s + [COUNTERS] Fortran Overhead ( 0 ) : 2.9649s + [COUNTERS] Fortran MEs ( 1 ) : 799.2037s for 90112 events => throughput is 1.13E+02 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -125,8 +125,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_x1_cudacpp > /tmp/valassia/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -134,9 +134,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.169e-06 [1.1693101016896844E-006] fbridge_mode=1 [UNWEIGHT] Wrote 15 events (found 163 events) - [COUNTERS] PROGRAM TOTAL : 210.1883s - [COUNTERS] Fortran Overhead ( 0 ) : 97.1779s - [COUNTERS] CudaCpp MEs ( 2 ) : 113.0104s for 8192 events => throughput is 7.25E+01 events/s + [COUNTERS] PROGRAM TOTAL : 220.0110s + [COUNTERS] Fortran Overhead ( 0 ) : 83.6850s + [COUNTERS] CudaCpp MEs ( 2 ) : 136.3260s for 8192 events => throughput is 6.01E+01 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -158,8 +158,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_x10_cudacpp > /tmp/valassia/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -167,9 +167,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.136e-07 [2.1358436275882778E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 808 events) - [COUNTERS] PROGRAM TOTAL : 1342.5497s - [COUNTERS] Fortran Overhead ( 0 ) : 100.9547s - [COUNTERS] CudaCpp MEs ( 2 ) : 1241.5950s for 90112 events => throughput is 7.26E+01 events/s + [COUNTERS] PROGRAM TOTAL : 1142.6014s + [COUNTERS] Fortran Overhead ( 0 ) : 94.5505s + [COUNTERS] CudaCpp MEs ( 2 ) : 1048.0509s for 90112 events => throughput is 8.60E+01 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -180,14 +180,14 @@ OK! xsec from fortran (2.1358436158813979E-007) and cpp (2.1358436275882778E-007 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.476387e+01 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.027921e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.492210e+01 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.024344e+02 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -201,8 +201,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_x1_cudacpp > /tmp/valassia/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -210,9 +210,9 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.169e-06 [1.1693101020910778E-006] fbridge_mode=1 [UNWEIGHT] Wrote 15 events (found 163 events) - [COUNTERS] PROGRAM TOTAL : 110.3661s - [COUNTERS] Fortran Overhead ( 0 ) : 51.0384s - [COUNTERS] CudaCpp MEs ( 2 ) : 59.3277s for 8192 events => throughput is 1.38E+02 events/s + [COUNTERS] PROGRAM TOTAL : 78.7418s + [COUNTERS] Fortran Overhead ( 0 ) : 35.2581s + [COUNTERS] CudaCpp MEs ( 2 ) : 43.4836s for 8192 events => throughput is 1.88E+02 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -234,8 +234,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_x10_cudacpp > /tmp/valassia/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -243,9 +243,9 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.136e-07 [2.1358436284111587E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 808 events) - [COUNTERS] PROGRAM TOTAL : 707.2110s - [COUNTERS] Fortran Overhead ( 0 ) : 55.1542s - [COUNTERS] CudaCpp MEs ( 2 ) : 652.0568s for 90112 events => throughput is 1.38E+02 events/s + [COUNTERS] PROGRAM TOTAL : 521.0989s + [COUNTERS] Fortran Overhead ( 0 ) : 41.7682s + [COUNTERS] CudaCpp MEs ( 2 ) : 479.3307s for 90112 events => throughput is 1.88E+02 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -256,14 +256,14 @@ OK! xsec from fortran (2.1358436158813979E-007) and cpp (2.1358436284111587E-007 OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.635229e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.353681e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.631046e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.355520e+02 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -277,8 +277,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_x1_cudacpp > /tmp/valassia/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -286,9 +286,9 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.169e-06 [1.1693101021831069E-006] fbridge_mode=1 [UNWEIGHT] Wrote 15 events (found 163 events) - [COUNTERS] PROGRAM TOTAL : 47.8962s - [COUNTERS] Fortran Overhead ( 0 ) : 22.3291s - [COUNTERS] CudaCpp MEs ( 2 ) : 25.5671s for 8192 events => throughput is 3.20E+02 events/s + [COUNTERS] PROGRAM TOTAL : 34.2475s + [COUNTERS] Fortran Overhead ( 0 ) : 15.3222s + [COUNTERS] CudaCpp MEs ( 2 ) : 18.9253s for 8192 events => throughput is 4.33E+02 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -310,8 +310,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_x10_cudacpp > /tmp/valassia/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -319,9 +319,9 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.136e-07 [2.1358436281462147E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 808 events) - [COUNTERS] PROGRAM TOTAL : 307.3684s - [COUNTERS] Fortran Overhead ( 0 ) : 25.4999s - [COUNTERS] CudaCpp MEs ( 2 ) : 281.8685s for 90112 events => throughput is 3.20E+02 events/s + [COUNTERS] PROGRAM TOTAL : 240.9335s + [COUNTERS] Fortran Overhead ( 0 ) : 34.1423s + [COUNTERS] CudaCpp MEs ( 2 ) : 206.7912s for 90112 events => throughput is 4.36E+02 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -332,166 +332,18 @@ OK! xsec from fortran (2.1358436158813979E-007) and cpp (2.1358436281462147E-007 OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.835214e+02 ) sec^-1 - -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.800587e+02 ) sec^-1 - -*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! 
Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 128/128 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.169e-06 [1.1693101021831069E-006] fbridge_mode=1 - [UNWEIGHT] Wrote 15 events (found 163 events) - [COUNTERS] PROGRAM TOTAL : 42.0423s - [COUNTERS] Fortran Overhead ( 0 ) : 19.2394s - [COUNTERS] CudaCpp MEs ( 2 ) : 22.8029s for 8192 events => throughput is 3.59E+02 events/s - -*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (1.1693100945435806E-006) and cpp (1.1693101021831069E-006) differ by less than 2E-4 (6.533362073568583e-09) - -*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical - -*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -81920 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 128/128 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.136e-07 [2.1358436281462147E-007] fbridge_mode=1 - [UNWEIGHT] Wrote 84 events (found 808 events) - [COUNTERS] PROGRAM TOTAL : 275.6535s - [COUNTERS] Fortran Overhead ( 0 ) : 22.9689s - [COUNTERS] CudaCpp MEs ( 2 ) : 252.6846s for 90112 events => throughput is 3.57E+02 events/s - -*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (2.1358436158813979E-007) and cpp (2.1358436281462147E-007) differ by less than 2E-4 (5.7423759081132175e-09) - -*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical - -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.383037e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.593063e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.388150e+02 ) sec^-1 - -*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 128/128 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.169e-06 [1.1693101021831069E-006] fbridge_mode=1 - [UNWEIGHT] Wrote 15 events (found 163 events) - [COUNTERS] PROGRAM TOTAL : 44.1798s - [COUNTERS] Fortran Overhead ( 0 ) : 21.7720s - [COUNTERS] CudaCpp MEs ( 2 ) : 22.4079s for 8192 events => throughput is 3.66E+02 events/s - -*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (1.1693100945435806E-006) and cpp (1.1693101021831069E-006) differ by less than 2E-4 (6.533362073568583e-09) - -*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.582873e+02 ) sec^-1 -OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical +*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) *** -*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -81920 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 128/128 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.136e-07 [2.1358436281462147E-007] fbridge_mode=1 - [UNWEIGHT] Wrote 84 events (found 808 events) - [COUNTERS] PROGRAM TOTAL : 273.0007s - [COUNTERS] Fortran Overhead ( 0 ) : 25.3100s - [COUNTERS] CudaCpp MEs ( 2 ) : 247.6907s for 90112 events => throughput is 3.64E+02 events/s - -*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (2.1358436158813979E-007) and cpp (2.1358436281462147E-007) differ by less than 2E-4 (5.7423759081132175e-09) - -*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical - -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.889496e+02 ) sec^-1 - -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.892701e+02 ) sec^-1 +*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) *** *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -505,22 +357,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggttggg_x1_cudacpp > /tmp/valassia/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.169e-06 [1.1693100942770687E-006] fbridge_mode=1 + [XSECTION] Cross section = 1.169e-06 [1.1693100942770682E-006] fbridge_mode=1 [UNWEIGHT] Wrote 15 events (found 163 events) - [COUNTERS] PROGRAM TOTAL : 3.5395s - [COUNTERS] Fortran Overhead ( 0 ) : 2.6766s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.8629s for 8192 events => throughput is 9.49E+03 events/s + [COUNTERS] PROGRAM TOTAL : 119.9683s + [COUNTERS] Fortran Overhead ( 0 ) : 115.7986s + [COUNTERS] CudaCpp MEs ( 2 ) : 4.1696s for 8192 events => throughput is 1.96E+03 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.1693100945435806E-006) and cpp (1.1693100942770687E-006) differ by less than 2E-4 (2.279223476620018e-10) +OK! xsec from fortran (1.1693100945435806E-006) and cpp (1.1693100942770682E-006) differ by less than 2E-4 (2.279226807289092e-10) *** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -538,65 +390,65 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! 
Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggttggg_x10_cudacpp > /tmp/valassia/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.136e-07 [2.1358436157495368E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.136e-07 [2.1358436157495363E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 808 events) - [COUNTERS] PROGRAM TOTAL : 15.8536s - [COUNTERS] Fortran Overhead ( 0 ) : 6.3552s - [COUNTERS] CudaCpp MEs ( 2 ) : 9.4985s for 90112 events => throughput is 9.49E+03 events/s + [COUNTERS] PROGRAM TOTAL : 56.4992s + [COUNTERS] Fortran Overhead ( 0 ) : 10.4671s + [COUNTERS] CudaCpp MEs ( 2 ) : 46.0321s for 90112 events => throughput is 1.96E+03 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.1358436158813979E-007) and cpp (2.1358436157495368E-007) differ by less than 2E-4 (6.173717093105324e-11) +OK! xsec from fortran (2.1358436158813979E-007) and cpp (2.1358436157495363E-007) differ by less than 2E-4 (6.173750399796063e-11) *** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.420712e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.983806e+03 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.085781e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.003315e+03 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 512 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.110953e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.341441e+03 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 512 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.160771e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK 
+EvtsPerSec[MECalcOnly] (3a) = ( 2.390860e+03 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 128 128 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.113092e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.328426e+03 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 128 128 1 *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.109796e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.240330e+03 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.110480e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.338967e+03 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.644429e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.083611e+03 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt index 8eabbec827..7119205a5c 100644 --- a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu +Working directory (build): /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu CUDACPP_BUILDDIR='.' 
- make USEBUILDDIR=1 AVX=none - make USEBUILDDIR=1 AVX=sse4 +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' + make USEBUILDDIR=1 AVX=avx2 + make USEBUILDDIR=1 AVX=512y +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' OMP_NUM_THREADS= -DATE: 2024-01-27_20:00:14 +DATE: 2024-01-28_14:28:43 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu +On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +Working directory (run): /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -50,8 +50,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/avalassi/output_gqttq_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./madevent_fortran < /tmp/valassia/input_gqttq_x1_fortran > /tmp/valassia/output_gqttq_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050333309703716] fbridge_mode=0 [UNWEIGHT] Wrote 78 events (found 561 events) - [COUNTERS] PROGRAM TOTAL : 0.3143s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2439s - [COUNTERS] Fortran MEs ( 1 ) : 0.0704s for 8192 events => throughput is 1.16E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4291s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3683s + [COUNTERS] Fortran MEs ( 1 ) : 0.0608s for 8192 events => throughput is 1.35E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -75,8 +75,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/avalassi/output_gqttq_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./madevent_fortran < /tmp/valassia/input_gqttq_x1_fortran > /tmp/valassia/output_gqttq_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050333309703716] fbridge_mode=0 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.3054s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2351s - [COUNTERS] Fortran MEs ( 1 ) : 0.0703s for 8192 events => throughput is 1.16E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2600s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1993s + [COUNTERS] Fortran MEs ( 1 ) : 0.0608s for 8192 events => throughput is 1.35E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -100,8 +100,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! 
Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x10_fortran > /tmp/avalassi/output_gqttq_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./madevent_fortran < /tmp/valassia/input_gqttq_x10_fortran > /tmp/valassia/output_gqttq_x10_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x10_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.218 [0.21801182648615872] fbridge_mode=0 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 2.2457s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4837s - [COUNTERS] Fortran MEs ( 1 ) : 0.7619s for 90112 events => throughput is 1.18E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.8397s + [COUNTERS] Fortran Overhead ( 0 ) : 1.1741s + [COUNTERS] Fortran MEs ( 1 ) : 0.6656s for 90112 events => throughput is 1.35E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -125,8 +125,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x1_cudacpp > /tmp/valassia/output_gqttq_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -134,9 +134,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050333309703710] fbridge_mode=1 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.3917s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3152s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0765s for 8192 events => throughput is 1.07E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3408s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2711s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0697s for 8192 events => throughput is 1.18E+05 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -158,8 +158,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x10_cudacpp > /tmp/valassia/output_gqttq_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -167,9 +167,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.218 [0.21801182648615872] fbridge_mode=1 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 2.4447s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5965s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.8482s for 90112 events => throughput is 1.06E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.0097s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2428s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.7669s for 90112 events => throughput is 1.18E+05 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -180,14 +180,14 @@ OK! xsec from fortran (0.21801182648615872) and cpp (0.21801182648615872) differ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.048805e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.199596e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.069775e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.199561e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -201,8 +201,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
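The "OK! xsec ... differ by less than 2E-14" checks compare the Fortran and CudaCpp cross sections through a relative difference against a build-dependent tolerance (2E-14 in these double-precision logs, 4E-4 in the single-precision logs further below). The following C++ sketch illustrates such a check using values quoted in this log; it is a hedged illustration of the comparison, not the actual tmad script.

// Illustrative sketch of the xsec comparison in these logs: a relative
// difference against a mode-dependent tolerance (2E-14 for double builds,
// 4E-4 for float builds). Not the actual test-script implementation.
#include <cmath>
#include <cstdio>

bool xsecMatches( double xsecFortran, double xsecCudacpp, double tolerance )
{
  double relDiff = std::fabs( xsecCudacpp - xsecFortran ) / std::fabs( xsecFortran );
  std::printf( "%s! xsec from fortran (%.17g) and cpp (%.17g) differ by less than %.0E (%g)\n",
               relDiff < tolerance ? "OK" : "ERROR",
               xsecFortran, xsecCudacpp, tolerance, relDiff );
  return relDiff < tolerance;
}

int main()
{
  // Values taken from the double-precision log above
  return xsecMatches( 0.21801182648615872, 0.21801182648615869, 2E-14 ) ? 0 : 1;
}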
-------------------- -Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x1_cudacpp > /tmp/valassia/output_gqttq_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -210,9 +210,9 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050333309703727] fbridge_mode=1 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.3352s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2925s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0427s for 8192 events => throughput is 1.92E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2699s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2365s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0334s for 8192 events => throughput is 2.45E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -234,8 +234,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x10_cudacpp > /tmp/valassia/output_gqttq_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -243,9 +243,9 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.218 [0.21801182648615872] fbridge_mode=1 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 1.9882s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5515s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.4367s for 90112 events => throughput is 2.06E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.5755s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2065s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3690s for 90112 events => throughput is 2.44E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -256,14 +256,14 @@ OK! xsec from fortran (0.21801182648615872) and cpp (0.21801182648615872) differ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.077139e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.473704e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.055483e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.482626e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -277,8 +277,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x1_cudacpp > /tmp/valassia/output_gqttq_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -286,9 +286,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050333309703727] fbridge_mode=1 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.2859s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2622s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0237s for 8192 events => throughput is 3.46E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2371s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2198s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0173s for 8192 events => throughput is 4.74E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -310,8 +310,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x10_cudacpp > /tmp/valassia/output_gqttq_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -319,9 +319,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.218 [0.21801182648615869] fbridge_mode=1 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 1.7885s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5334s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2551s for 90112 events => throughput is 3.53E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.3894s + [COUNTERS] Fortran Overhead ( 0 ) : 1.1994s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1900s for 90112 events => throughput is 4.74E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -332,166 +332,18 @@ OK! xsec from fortran (0.21801182648615872) and cpp (0.21801182648615869) differ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.447216e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.838763e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.524839e+05 ) sec^-1 - -*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/32 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2605 [0.26050333309703727] fbridge_mode=1 - [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.2826s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2619s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0206s for 8192 events => throughput is 3.97E+05 events/s - -*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.851433e+05 ) sec^-1 -OK! 
xsec from fortran (0.26050333309703716) and cpp (0.26050333309703727) differ by less than 2E-14 (4.440892098500626e-16) - -*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** +*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) *** -OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical - -*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -81920 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/32 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.218 [0.21801182648615869] fbridge_mode=1 - [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 1.7547s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5306s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2241s for 90112 events => throughput is 4.02E+05 events/s - -*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (0.21801182648615872) and cpp (0.21801182648615869) differ by less than 2E-14 (1.1102230246251565e-16) - -*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical - -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.011517e+05 ) sec^-1 - -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.937850e+05 ) sec^-1 - -*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
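The "(2-512y)" and "(2-512z)" steps are skipped on this node because the AMD EPYC 7A53 does not implement AVX512, unlike the Intel Xeon of the reference log. As a hedged illustration only (the actual tmad scripts may detect the ISA differently), a runtime AVX512 probe in C++ with the GCC/clang builtin could look like this:

// Hedged sketch: one way a test driver could decide to skip the 512y/512z
// builds on hardware without AVX512 (such as the AMD EPYC node above).
// Uses the GCC/clang x86 builtin; not necessarily how the tmad scripts do it.
#include <cstdio>

int main()
{
  if( !__builtin_cpu_supports( "avx512f" ) )
  {
    std::puts( "*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) ***" );
    std::puts( "*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) ***" );
    return 0;
  }
  std::puts( "AVX512F available: 512y/512z builds can be exercised" );
  return 0;
}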
--------------------- -Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/32 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2605 [0.26050333309703727] fbridge_mode=1 - [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.2980s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2688s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0292s for 8192 events => throughput is 2.81E+05 events/s - -*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (0.26050333309703716) and cpp (0.26050333309703727) differ by less than 2E-14 (4.440892098500626e-16) - -*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical - -*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -81920 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/32 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.218 [0.21801182648615869] fbridge_mode=1 - [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 1.8787s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5475s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3312s for 90112 events => throughput is 2.72E+05 events/s - -*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (0.21801182648615872) and cpp (0.21801182648615869) differ by less than 2E-14 (1.1102230246251565e-16) - -*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical - -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.707317e+05 ) sec^-1 - -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.724189e+05 ) sec^-1 +*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) *** *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -505,98 +357,15 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! 
Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/32 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2605 [0.26050333309703733] fbridge_mode=1 - [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.6679s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6673s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.20E+07 events/s - -*** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (0.26050333309703716) and cpp (0.26050333309703733) differ by less than 2E-14 (6.661338147750939e-16) - -*** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical - -*** (3) EXECUTE MADEVENT_CUDA x10 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -81920 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/32 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.218 [0.21801182648615869] fbridge_mode=1 - [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 1.9463s - [COUNTERS] Fortran Overhead ( 0 ) : 1.9386s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0077s for 90112 events => throughput is 1.17E+07 events/s - -*** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (0.21801182648615872) and cpp (0.21801182648615869) differ by less than 2E-14 (1.1102230246251565e-16) - -*** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical - -*** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.531327e+07 ) sec^-1 - -*** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.025533e+07 ) sec^-1 - -*** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.381686e+07 ) sec^-1 - -*** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.530559e+07 ) sec^-1 - -*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.390744e+07 ) sec^-1 - -*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.773323e+07 ) sec^-1 - -*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.392670e+07 ) sec^-1 +Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/valassia/input_gqttq_x1_cudacpp > /tmp/valassia/output_gqttq_x1_cudacpp' +ERROR! ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/valassia/input_gqttq_x1_cudacpp > /tmp/valassia/output_gqttq_x1_cudacpp' failed + PDF set = nn23lo1 + alpha_s(Mz)= 0.1300 running at 2 loops. + alpha_s(Mz)= 0.1300 running at 2 loops. + Renormalization scale set on event-by-event basis + Factorization scale set on event-by-event basis -*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.778464e+07 ) sec^-1 -TEST COMPLETED + getting user params +Enter number of events and max and min iterations: + Number of events and iterations 8192 1 1 diff --git a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt index e7ce883183..828e2e75d4 100644 --- a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu +Working directory (build): /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu CUDACPP_BUILDDIR='.' 
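Note that on this MI200 node the madevent_cuda runs abort with only a generic "ERROR! ... failed" from the wrapper, after the PDF and scale-setting banner. As a hedged sketch (illustrative, not the repository's actual error handling), a failure like this can be made self-describing on the HIP side using only standard HIP runtime calls:

// Hedged sketch: surfacing a HIP runtime error with an explicit message, so a
// failure such as the generic "ERROR! ... madevent_cuda ... failed" above
// would at least name its cause. Helper names here are hypothetical.
#include <hip/hip_runtime.h>
#include <cstdio>
#include <cstdlib>

#define checkGpu( code ) assertGpu( code, __FILE__, __LINE__ )

inline void assertGpu( hipError_t code, const char* file, int line )
{
  if( code != hipSuccess )
  {
    std::fprintf( stderr, "ERROR! GPU failure: %s at %s:%d\n",
                  hipGetErrorString( code ), file, line );
    std::exit( EXIT_FAILURE );
  }
}

int main()
{
  int nDevices = 0;
  checkGpu( hipGetDeviceCount( &nDevices ) ); // fails early if no usable GPU/runtime
  std::printf( "Found %d HIP device(s)\n", nDevices );
  return 0;
}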
- make USEBUILDDIR=1 AVX=none make USEBUILDDIR=1 AVX=sse4 +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make USEBUILDDIR=1 AVX=avx2 + make USEBUILDDIR=1 AVX=512y +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' OMP_NUM_THREADS= -DATE: 2024-01-27_20:00:44 +DATE: 2024-01-28_14:29:09 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu +On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +Working directory (run): /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -50,8 +50,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/avalassi/output_gqttq_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./madevent_fortran < /tmp/valassia/input_gqttq_x1_fortran > /tmp/valassia/output_gqttq_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050333309703716] fbridge_mode=0 [UNWEIGHT] Wrote 78 events (found 561 events) - [COUNTERS] PROGRAM TOTAL : 0.3058s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2368s - [COUNTERS] Fortran MEs ( 1 ) : 0.0690s for 8192 events => throughput is 1.19E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2648s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2041s + [COUNTERS] Fortran MEs ( 1 ) : 0.0607s for 8192 events => throughput is 1.35E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -75,8 +75,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/avalassi/output_gqttq_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./madevent_fortran < /tmp/valassia/input_gqttq_x1_fortran > /tmp/valassia/output_gqttq_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050333309703716] fbridge_mode=0 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.3001s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2314s - [COUNTERS] Fortran MEs ( 1 ) : 0.0687s for 8192 events => throughput is 1.19E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2760s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2153s + [COUNTERS] Fortran MEs ( 1 ) : 0.0608s for 8192 events => throughput is 1.35E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -100,8 +100,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! 
Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x10_fortran > /tmp/avalassi/output_gqttq_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./madevent_fortran < /tmp/valassia/input_gqttq_x10_fortran > /tmp/valassia/output_gqttq_x10_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x10_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.218 [0.21801182648615872] fbridge_mode=0 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 2.2330s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4705s - [COUNTERS] Fortran MEs ( 1 ) : 0.7624s for 90112 events => throughput is 1.18E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.8359s + [COUNTERS] Fortran Overhead ( 0 ) : 1.1714s + [COUNTERS] Fortran MEs ( 1 ) : 0.6644s for 90112 events => throughput is 1.36E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -125,22 +125,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x1_cudacpp > /tmp/valassia/output_gqttq_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2605 [0.26050314903825744] fbridge_mode=1 + [XSECTION] Cross section = 0.2605 [0.26050315080224007] fbridge_mode=1 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.3755s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3063s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0691s for 8192 events => throughput is 1.19E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3172s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2605s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0567s for 8192 events => throughput is 1.44E+05 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.26050333309703716) and cpp (0.26050314903825744) differ by less than 4E-4 (7.065505747139156e-07) +OK! xsec from fortran (0.26050333309703716) and cpp (0.26050315080224007) differ by less than 4E-4 (6.997791349716564e-07) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -158,36 +158,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x10_cudacpp > /tmp/valassia/output_gqttq_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.218 [0.21801181770186087] fbridge_mode=1 + [XSECTION] Cross section = 0.218 [0.21801182183053122] fbridge_mode=1 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 2.3365s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5781s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.7584s for 90112 events => throughput is 1.19E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.8506s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2301s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.6205s for 90112 events => throughput is 1.45E+05 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.21801182648615872) and cpp (0.21801181770186087) differ by less than 4E-4 (4.0292758352045155e-08) +OK! xsec from fortran (0.21801182648615872) and cpp (0.21801182183053122) differ by less than 4E-4 (2.135493093469165e-08) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.186203e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.481535e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.205535e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.484510e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -201,22 +201,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x1_cudacpp > /tmp/valassia/output_gqttq_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2605 [0.26050310835231938] fbridge_mode=1 + [XSECTION] Cross section = 0.2605 [0.26050313441464579] fbridge_mode=1 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.2887s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2652s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0235s for 8192 events => throughput is 3.48E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2412s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2217s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0194s for 8192 events => throughput is 4.21E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.26050333309703716) and cpp (0.26050310835231938) differ by less than 4E-4 (8.627325996934943e-07) +OK! xsec from fortran (0.26050333309703716) and cpp (0.26050313441464579) differ by less than 4E-4 (7.626865614618339e-07) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -234,36 +234,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x10_cudacpp > /tmp/valassia/output_gqttq_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.218 [0.21801177817838580] fbridge_mode=1 + [XSECTION] Cross section = 0.218 [0.21801180175915433] fbridge_mode=1 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 1.7885s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5301s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2584s for 90112 events => throughput is 3.49E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.4109s + [COUNTERS] Fortran Overhead ( 0 ) : 1.1913s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2196s for 90112 events => throughput is 4.10E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.21801182648615872) and cpp (0.21801177817838580) differ by less than 4E-4 (2.2158326773435988e-07) +OK! xsec from fortran (0.21801182648615872) and cpp (0.21801180175915433) differ by less than 4E-4 (1.1342047256945875e-07) *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.410521e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.159063e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.312954e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.116683e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -277,22 +277,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x1_cudacpp > /tmp/valassia/output_gqttq_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2605 [0.26050310803492405] fbridge_mode=1 + [XSECTION] Cross section = 0.2605 [0.26050313305934997] fbridge_mode=1 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.2633s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2510s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0123s for 8192 events => throughput is 6.65E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2207s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2111s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0096s for 8192 events => throughput is 8.52E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.26050333309703716) and cpp (0.26050310803492405) differ by less than 4E-4 (8.639509921914978e-07) +OK! xsec from fortran (0.26050333309703716) and cpp (0.26050313305934997) differ by less than 4E-4 (7.678891660312104e-07) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -310,188 +310,40 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x10_cudacpp > /tmp/valassia/output_gqttq_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.218 [0.21801177493542723] fbridge_mode=1 + [XSECTION] Cross section = 0.218 [0.21801179852567595] fbridge_mode=1 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 1.6598s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5246s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1352s for 90112 events => throughput is 6.66E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.2865s + [COUNTERS] Fortran Overhead ( 0 ) : 1.1811s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1055s for 90112 events => throughput is 8.54E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.21801182648615872) and cpp (0.21801177493542723) differ by less than 4E-4 (2.364584175129636e-07) +OK! xsec from fortran (0.21801182648615872) and cpp (0.21801179852567595) differ by less than 4E-4 (1.2825213757672316e-07) *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.502420e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.738486e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.518865e+05 ) sec^-1 - -*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/32 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2605 [0.26050310803492405] fbridge_mode=1 - [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.2606s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2497s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0109s for 8192 events => throughput is 7.55E+05 events/s - -*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (0.26050333309703716) and cpp (0.26050310803492405) differ by less than 4E-4 (8.639509921914978e-07) - -*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical - -*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -81920 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/32 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.218 [0.21801177493542723] fbridge_mode=1 - [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 1.6476s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5254s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1222s for 90112 events => throughput is 7.37E+05 events/s - -*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.778124e+05 ) sec^-1 -OK! xsec from fortran (0.21801182648615872) and cpp (0.21801177493542723) differ by less than 4E-4 (2.364584175129636e-07) +*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) *** -*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical - -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.140387e+05 ) sec^-1 - -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.180727e+05 ) sec^-1 - -*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/32 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2605 [0.26050317064561834] fbridge_mode=1 - [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.2742s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2583s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0159s for 8192 events => throughput is 5.15E+05 events/s - -*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (0.26050333309703716) and cpp (0.26050317064561834) differ by less than 4E-4 (6.236059127973093e-07) - -*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical - -*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -81920 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/32 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.218 [0.21801182143140752] fbridge_mode=1 - [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 1.7363s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5620s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1743s for 90112 events => throughput is 5.17E+05 events/s - -*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (0.21801182648615872) and cpp (0.21801182143140752) differ by less than 4E-4 (2.3185674269399215e-08) - -*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical - -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.838710e+05 ) sec^-1 - -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.732174e+05 ) sec^-1 +*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) *** *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -505,98 +357,15 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/32 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2605 [0.26050319131407651] fbridge_mode=1 - [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.6668s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6662s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0005s for 8192 events => throughput is 1.53E+07 events/s - -*** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (0.26050333309703716) and cpp (0.26050319131407651) differ by less than 4E-4 (5.442654378295941e-07) - -*** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical - -*** (3) EXECUTE MADEVENT_CUDA x10 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -81920 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! 
Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/32 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.218 [0.21801186038252196] fbridge_mode=1 - [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 2.0126s - [COUNTERS] Fortran Overhead ( 0 ) : 2.0061s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0065s for 90112 events => throughput is 1.40E+07 events/s - -*** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (0.21801182648615872) and cpp (0.21801186038252196) differ by less than 4E-4 (1.5547946996541384e-07) - -*** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical - -*** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.753799e+07 ) sec^-1 - -*** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.525254e+07 ) sec^-1 - -*** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.832384e+07 ) sec^-1 - -*** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.703610e+08 ) sec^-1 - -*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.848857e+07 ) sec^-1 - -*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.802905e+08 ) sec^-1 - -*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.395978e+07 ) sec^-1 +Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/valassia/input_gqttq_x1_cudacpp > /tmp/valassia/output_gqttq_x1_cudacpp' +ERROR! ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/valassia/input_gqttq_x1_cudacpp > /tmp/valassia/output_gqttq_x1_cudacpp' failed + PDF set = nn23lo1 + alpha_s(Mz)= 0.1300 running at 2 loops. + alpha_s(Mz)= 0.1300 running at 2 loops. 
+ Renormalization scale set on event-by-event basis + Factorization scale set on event-by-event basis -*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.145333e+07 ) sec^-1 -TEST COMPLETED + getting user params +Enter number of events and max and min iterations: + Number of events and iterations 8192 1 1 diff --git a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt index ecf11d905f..d73dc7c80a 100644 --- a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu +Working directory (build): /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu CUDACPP_BUILDDIR='.' - - make USEBUILDDIR=1 AVX=none + make USEBUILDDIR=1 AVX=sse4 +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make USEBUILDDIR=1 AVX=avx2 + make USEBUILDDIR=1 AVX=512y +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' -CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' OMP_NUM_THREADS= -DATE: 2024-01-27_20:01:12 +DATE: 2024-01-28_14:29:33 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu +On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +Working directory (run): /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -50,8 +50,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/avalassi/output_gqttq_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./madevent_fortran < /tmp/valassia/input_gqttq_x1_fortran > /tmp/valassia/output_gqttq_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050333309703716] fbridge_mode=0 [UNWEIGHT] Wrote 78 events (found 561 events) - [COUNTERS] PROGRAM TOTAL : 0.3155s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2413s - [COUNTERS] Fortran MEs ( 1 ) : 0.0742s for 8192 events => throughput is 1.10E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2655s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2046s + [COUNTERS] Fortran MEs ( 1 ) : 0.0608s for 8192 events => throughput is 1.35E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -75,8 +75,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/avalassi/output_gqttq_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./madevent_fortran < /tmp/valassia/input_gqttq_x1_fortran > /tmp/valassia/output_gqttq_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050333309703716] fbridge_mode=0 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.3197s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2476s - [COUNTERS] Fortran MEs ( 1 ) : 0.0721s for 8192 events => throughput is 1.14E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2621s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2013s + [COUNTERS] Fortran MEs ( 1 ) : 0.0608s for 8192 events => throughput is 1.35E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -100,8 +100,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x10_fortran > /tmp/avalassi/output_gqttq_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./madevent_fortran < /tmp/valassia/input_gqttq_x10_fortran > /tmp/valassia/output_gqttq_x10_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x10_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.218 [0.21801182648615872] fbridge_mode=0 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 2.2369s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4748s - [COUNTERS] Fortran MEs ( 1 ) : 0.7621s for 90112 events => throughput is 1.18E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.8381s + [COUNTERS] Fortran Overhead ( 0 ) : 1.1735s + [COUNTERS] Fortran MEs ( 1 ) : 0.6646s for 90112 events => throughput is 1.36E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -125,8 +125,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x1_cudacpp > /tmp/valassia/output_gqttq_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -134,9 +134,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050333282657206] fbridge_mode=1 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.3928s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3153s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0776s for 8192 events => throughput is 1.06E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3436s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2742s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0694s for 8192 events => throughput is 1.18E+05 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -158,8 +158,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x10_cudacpp > /tmp/valassia/output_gqttq_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -167,9 +167,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.218 [0.21801182636608801] fbridge_mode=1 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 2.4489s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5943s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.8546s for 90112 events => throughput is 1.05E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.0075s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2444s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.7631s for 90112 events => throughput is 1.18E+05 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -180,14 +180,14 @@ OK! xsec from fortran (0.21801182648615872) and cpp (0.21801182636608801) differ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.058266e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.196578e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.067039e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.195632e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -201,8 +201,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x1_cudacpp > /tmp/valassia/output_gqttq_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -210,9 +210,9 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050333282657212] fbridge_mode=1 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.3162s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2769s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0393s for 8192 events => throughput is 2.08E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2696s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2368s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0328s for 8192 events => throughput is 2.50E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -234,8 +234,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x10_cudacpp > /tmp/valassia/output_gqttq_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -243,9 +243,9 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.218 [0.21801182636608804] fbridge_mode=1 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 2.0083s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5670s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.4414s for 90112 events => throughput is 2.04E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.5752s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2150s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3602s for 90112 events => throughput is 2.50E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -256,14 +256,14 @@ OK! xsec from fortran (0.21801182648615872) and cpp (0.21801182636608804) differ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.029172e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.505829e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.018217e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.512763e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -277,8 +277,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x1_cudacpp > /tmp/valassia/output_gqttq_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -286,9 +286,9 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050333291481387] fbridge_mode=1 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.2842s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2614s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0228s for 8192 events => throughput is 3.59E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2378s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2205s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0172s for 8192 events => throughput is 4.75E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -310,8 +310,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x10_cudacpp > /tmp/valassia/output_gqttq_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -319,9 +319,9 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.218 [0.21801182638680733] fbridge_mode=1 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 1.7845s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5315s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2530s for 90112 events => throughput is 3.56E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.3839s + [COUNTERS] Fortran Overhead ( 0 ) : 1.1946s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1893s for 90112 events => throughput is 4.76E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -332,166 +332,18 @@ OK! xsec from fortran (0.21801182648615872) and cpp (0.21801182638680733) differ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.609443e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.829459e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.566388e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.842597e+05 ) sec^-1 -*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/32 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2605 [0.26050333291481387] fbridge_mode=1 - [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.2776s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2578s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0199s for 8192 events => throughput is 4.12E+05 events/s - -*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** +*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) *** -OK! xsec from fortran (0.26050333309703716) and cpp (0.26050333291481387) differ by less than 2E-4 (6.99504676404672e-10) - -*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical - -*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -81920 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/32 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.218 [0.21801182638680733] fbridge_mode=1 - [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 1.7504s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5324s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2180s for 90112 events => throughput is 4.13E+05 events/s - -*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (0.21801182648615872) and cpp (0.21801182638680733) differ by less than 2E-4 (4.557155763862397e-10) - -*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical - -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.031661e+05 ) sec^-1 - -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.094627e+05 ) sec^-1 - -*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/32 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2605 [0.26050333291481387] fbridge_mode=1 - [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.2987s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2688s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0299s for 8192 events => throughput is 2.74E+05 events/s - -*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (0.26050333309703716) and cpp (0.26050333291481387) differ by less than 2E-4 (6.99504676404672e-10) - -*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical - -*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -81920 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! 
Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/32 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.218 [0.21801182638680733] fbridge_mode=1 - [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 1.8739s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5413s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3326s for 90112 events => throughput is 2.71E+05 events/s - -*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (0.21801182648615872) and cpp (0.21801182638680733) differ by less than 2E-4 (4.557155763862397e-10) - -*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical - -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.587817e+05 ) sec^-1 - -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.653156e+05 ) sec^-1 +*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) *** *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -505,98 +357,15 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/32 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2605 [0.26050333301029699] fbridge_mode=1 - [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.6688s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6681s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.19E+07 events/s - -*** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (0.26050333309703716) and cpp (0.26050333301029699) differ by less than 2E-4 (3.329714282074292e-10) - -*** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical - -*** (3) EXECUTE MADEVENT_CUDA x10 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -81920 1 1 ! Number of events and max and min iterations -0.000001 ! 
Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/32 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.218 [0.21801182637219937] fbridge_mode=1 - [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 1.9436s - [COUNTERS] Fortran Overhead ( 0 ) : 1.9359s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0077s for 90112 events => throughput is 1.17E+07 events/s - -*** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (0.21801182648615872) and cpp (0.21801182637219937) differ by less than 2E-4 (5.227208665914418e-10) - -*** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical - -*** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.531745e+07 ) sec^-1 - -*** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.050673e+07 ) sec^-1 - -*** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.398873e+07 ) sec^-1 - -*** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.543266e+07 ) sec^-1 - -*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.390552e+07 ) sec^-1 - -*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.836975e+07 ) sec^-1 - -*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.397476e+07 ) sec^-1 +Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/valassia/input_gqttq_x1_cudacpp > /tmp/valassia/output_gqttq_x1_cudacpp' +ERROR! 
' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/valassia/input_gqttq_x1_cudacpp > /tmp/valassia/output_gqttq_x1_cudacpp' failed + PDF set = nn23lo1 + alpha_s(Mz)= 0.1300 running at 2 loops. + alpha_s(Mz)= 0.1300 running at 2 loops. + Renormalization scale set on event-by-event basis + Factorization scale set on event-by-event basis -*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.780810e+07 ) sec^-1 -TEST COMPLETED + getting user params +Enter number of events and max and min iterations: + Number of events and iterations 8192 1 1 From 55a4f762013735279e15a98653d2d624dbd36f2a Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Mon, 29 Jan 2024 10:54:39 +0200 Subject: [PATCH 74/96] [jt774] in gq_ttq.mad cudacpp.mk, allow multi-word CXX (e.g. for the Cray compiler on LUMI #807) in HIP builds. Multi-word CXX are not allowed for CUDA because of the -ccbin option (see #505), but I had also disabled them on HIP and this is not necessary --- epochX/cudacpp/gq_ttq.mad/SubProcesses/cudacpp.mk | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gq_ttq.mad/SubProcesses/cudacpp.mk index eefac8ff0d..d94c3e5e6e 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/cudacpp.mk @@ -129,12 +129,11 @@ endif # While convoluted, this is currently necessary to allow disabling CUDA/HIP builds by setting CUDA_HOME or HIP_HOME to invalid paths. # This will (probably?) be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775). -# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA and HIP builds (issue #505) +# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) # This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside - $(warning CUDA and HIP builds are not supported for multi-word CXX "$(CXX)") + $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") override CUDA_HOME=disabled - override HIP_HOME=disabled endif # If CUDA_HOME is not set, try to set it from the path to nvcc From 68b589d49bcae9512d490089be7606e8a4a3f5b7 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Mon, 29 Jan 2024 09:59:26 +0100 Subject: [PATCH 75/96] [jt774] in CODEGEN (backport gq_ttq.mad) cudacpp.mk, allow multi-word CXX (e.g. for the Cray compiler on LUMI #807) in HIP builds. 
Multi-word CXX are not allowed for CUDA because of the -ccbin option (see #505), but I had also disabled them on HIP and this is not necessary --- .../madgraph/iolibs/template_files/gpu/cudacpp.mk | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk index 8b5fc00a83..ff53904f90 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk @@ -129,12 +129,11 @@ endif # While convoluted, this is currently necessary to allow disabling CUDA/HIP builds by setting CUDA_HOME or HIP_HOME to invalid paths. # This will (probably?) be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775). -# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA and HIP builds (issue #505) +# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) # This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside - $(warning CUDA and HIP builds are not supported for multi-word CXX "$(CXX)") + $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") override CUDA_HOME=disabled - override HIP_HOME=disabled endif # If CUDA_HOME is not set, try to set it from the path to nvcc From 707507f5914701354a4c6c6cf64f3e930c2de25e Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Mon, 29 Jan 2024 10:01:16 +0100 Subject: [PATCH 76/96] [jt774] manually fix cudacpp.mk in all processes for #807 and #505 for f in $(git ls-tree --name-only HEAD */SubProcesses/cudacpp.mk); do echo $f; \cp gq_ttq.mad/SubProcesses/cudacpp.mk $f; done --- epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk | 5 ++--- epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp.mk | 5 ++--- epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk | 5 ++--- epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk | 5 ++--- epochX/cudacpp/gg_tt01g.mad/SubProcesses/cudacpp.mk | 5 ++--- epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp.mk | 5 ++--- epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp.mk | 5 ++--- epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp.mk | 5 ++--- epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp.mk | 5 ++--- epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp.mk | 5 ++--- epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp.mk | 5 ++--- epochX/cudacpp/gq_ttq.sa/SubProcesses/cudacpp.mk | 5 ++--- epochX/cudacpp/heft_gg_h.sa/SubProcesses/cudacpp.mk | 5 ++--- epochX/cudacpp/pp_tt012j.mad/SubProcesses/cudacpp.mk | 5 ++--- 14 files changed, 28 insertions(+), 42 deletions(-) diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk index eefac8ff0d..d94c3e5e6e 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk @@ -129,12 +129,11 @@ endif # While convoluted, this is currently necessary to allow disabling CUDA/HIP builds by setting CUDA_HOME or HIP_HOME to invalid paths. # This will (probably?) be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775). 
-# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA and HIP builds (issue #505) +# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) # This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside - $(warning CUDA and HIP builds are not supported for multi-word CXX "$(CXX)") + $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") override CUDA_HOME=disabled - override HIP_HOME=disabled endif # If CUDA_HOME is not set, try to set it from the path to nvcc diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp.mk index eefac8ff0d..d94c3e5e6e 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp.mk @@ -129,12 +129,11 @@ endif # While convoluted, this is currently necessary to allow disabling CUDA/HIP builds by setting CUDA_HOME or HIP_HOME to invalid paths. # This will (probably?) be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775). -# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA and HIP builds (issue #505) +# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) # This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside - $(warning CUDA and HIP builds are not supported for multi-word CXX "$(CXX)") + $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") override CUDA_HOME=disabled - override HIP_HOME=disabled endif # If CUDA_HOME is not set, try to set it from the path to nvcc diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk index eefac8ff0d..d94c3e5e6e 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk @@ -129,12 +129,11 @@ endif # While convoluted, this is currently necessary to allow disabling CUDA/HIP builds by setting CUDA_HOME or HIP_HOME to invalid paths. # This will (probably?) be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775). -# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA and HIP builds (issue #505) +# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) # This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside - $(warning CUDA and HIP builds are not supported for multi-word CXX "$(CXX)") + $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") override CUDA_HOME=disabled - override HIP_HOME=disabled endif # If CUDA_HOME is not set, try to set it from the path to nvcc diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk index eefac8ff0d..d94c3e5e6e 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk @@ -129,12 +129,11 @@ endif # While convoluted, this is currently necessary to allow disabling CUDA/HIP builds by setting CUDA_HOME or HIP_HOME to invalid paths. # This will (probably?) be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775). 
-# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA and HIP builds (issue #505) +# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) # This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside - $(warning CUDA and HIP builds are not supported for multi-word CXX "$(CXX)") + $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") override CUDA_HOME=disabled - override HIP_HOME=disabled endif # If CUDA_HOME is not set, try to set it from the path to nvcc diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cudacpp.mk index eefac8ff0d..d94c3e5e6e 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cudacpp.mk @@ -129,12 +129,11 @@ endif # While convoluted, this is currently necessary to allow disabling CUDA/HIP builds by setting CUDA_HOME or HIP_HOME to invalid paths. # This will (probably?) be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775). -# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA and HIP builds (issue #505) +# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) # This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside - $(warning CUDA and HIP builds are not supported for multi-word CXX "$(CXX)") + $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") override CUDA_HOME=disabled - override HIP_HOME=disabled endif # If CUDA_HOME is not set, try to set it from the path to nvcc diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp.mk index eefac8ff0d..d94c3e5e6e 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp.mk @@ -129,12 +129,11 @@ endif # While convoluted, this is currently necessary to allow disabling CUDA/HIP builds by setting CUDA_HOME or HIP_HOME to invalid paths. # This will (probably?) be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775). -# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA and HIP builds (issue #505) +# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) # This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside - $(warning CUDA and HIP builds are not supported for multi-word CXX "$(CXX)") + $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") override CUDA_HOME=disabled - override HIP_HOME=disabled endif # If CUDA_HOME is not set, try to set it from the path to nvcc diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp.mk index eefac8ff0d..d94c3e5e6e 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp.mk @@ -129,12 +129,11 @@ endif # While convoluted, this is currently necessary to allow disabling CUDA/HIP builds by setting CUDA_HOME or HIP_HOME to invalid paths. # This will (probably?) 
be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775). -# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA and HIP builds (issue #505) +# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) # This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside - $(warning CUDA and HIP builds are not supported for multi-word CXX "$(CXX)") + $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") override CUDA_HOME=disabled - override HIP_HOME=disabled endif # If CUDA_HOME is not set, try to set it from the path to nvcc diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp.mk index eefac8ff0d..d94c3e5e6e 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp.mk @@ -129,12 +129,11 @@ endif # While convoluted, this is currently necessary to allow disabling CUDA/HIP builds by setting CUDA_HOME or HIP_HOME to invalid paths. # This will (probably?) be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775). -# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA and HIP builds (issue #505) +# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) # This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside - $(warning CUDA and HIP builds are not supported for multi-word CXX "$(CXX)") + $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") override CUDA_HOME=disabled - override HIP_HOME=disabled endif # If CUDA_HOME is not set, try to set it from the path to nvcc diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp.mk index eefac8ff0d..d94c3e5e6e 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp.mk @@ -129,12 +129,11 @@ endif # While convoluted, this is currently necessary to allow disabling CUDA/HIP builds by setting CUDA_HOME or HIP_HOME to invalid paths. # This will (probably?) be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775). -# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA and HIP builds (issue #505) +# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) # This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside - $(warning CUDA and HIP builds are not supported for multi-word CXX "$(CXX)") + $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") override CUDA_HOME=disabled - override HIP_HOME=disabled endif # If CUDA_HOME is not set, try to set it from the path to nvcc diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp.mk index eefac8ff0d..d94c3e5e6e 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp.mk @@ -129,12 +129,11 @@ endif # While convoluted, this is currently necessary to allow disabling CUDA/HIP builds by setting CUDA_HOME or HIP_HOME to invalid paths. 
# This will (probably?) be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775). -# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA and HIP builds (issue #505) +# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) # This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside - $(warning CUDA and HIP builds are not supported for multi-word CXX "$(CXX)") + $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") override CUDA_HOME=disabled - override HIP_HOME=disabled endif # If CUDA_HOME is not set, try to set it from the path to nvcc diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp.mk index eefac8ff0d..d94c3e5e6e 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp.mk @@ -129,12 +129,11 @@ endif # While convoluted, this is currently necessary to allow disabling CUDA/HIP builds by setting CUDA_HOME or HIP_HOME to invalid paths. # This will (probably?) be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775). -# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA and HIP builds (issue #505) +# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) # This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside - $(warning CUDA and HIP builds are not supported for multi-word CXX "$(CXX)") + $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") override CUDA_HOME=disabled - override HIP_HOME=disabled endif # If CUDA_HOME is not set, try to set it from the path to nvcc diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gq_ttq.sa/SubProcesses/cudacpp.mk index eefac8ff0d..d94c3e5e6e 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/cudacpp.mk @@ -129,12 +129,11 @@ endif # While convoluted, this is currently necessary to allow disabling CUDA/HIP builds by setting CUDA_HOME or HIP_HOME to invalid paths. # This will (probably?) be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775). 
-# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA and HIP builds (issue #505) +# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) # This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside - $(warning CUDA and HIP builds are not supported for multi-word CXX "$(CXX)") + $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") override CUDA_HOME=disabled - override HIP_HOME=disabled endif # If CUDA_HOME is not set, try to set it from the path to nvcc diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/cudacpp.mk index eefac8ff0d..d94c3e5e6e 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/cudacpp.mk @@ -129,12 +129,11 @@ endif # While convoluted, this is currently necessary to allow disabling CUDA/HIP builds by setting CUDA_HOME or HIP_HOME to invalid paths. # This will (probably?) be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775). -# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA and HIP builds (issue #505) +# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) # This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside - $(warning CUDA and HIP builds are not supported for multi-word CXX "$(CXX)") + $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") override CUDA_HOME=disabled - override HIP_HOME=disabled endif # If CUDA_HOME is not set, try to set it from the path to nvcc diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/cudacpp.mk index eefac8ff0d..d94c3e5e6e 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/cudacpp.mk @@ -129,12 +129,11 @@ endif # While convoluted, this is currently necessary to allow disabling CUDA/HIP builds by setting CUDA_HOME or HIP_HOME to invalid paths. # This will (probably?) be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775). 
-# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA and HIP builds (issue #505) +# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) # This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside - $(warning CUDA and HIP builds are not supported for multi-word CXX "$(CXX)") + $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") override CUDA_HOME=disabled - override HIP_HOME=disabled endif # If CUDA_HOME is not set, try to set it from the path to nvcc From 67a18a598d23e7256cf3b7f73ab2a9e2dc69a368 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Mon, 29 Jan 2024 11:48:54 +0200 Subject: [PATCH 77/96] [jt744] in gq_ttq.mad, remove -G from HIP debug builds on LUMI (#808) Note: to trigger debug builds, use "make -f cudacpp.mk debug -j; make -j" --- epochX/cudacpp/gq_ttq.mad/SubProcesses/cudacpp.mk | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gq_ttq.mad/SubProcesses/cudacpp.mk index d94c3e5e6e..117edc1782 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/cudacpp.mk @@ -218,7 +218,8 @@ else ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),) HIPINC = -I$(HIP_HOME)/include/ # Note: -DHIP_FAST_MATH is equivalent to -use_fast_math in HIP # (but only for single precision line 208: https://rocm-developer-tools.github.io/HIP/hcc__detail_2math__functions_8h_source.html) - GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -DHIP_FAST_MATH -DHIP_PLATFORM=amd -fPIC + # Note: CUOPTFLAGS should not be used for HIP, it had been added here but was then removed (#808) + GPUFLAGS = $(OPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -DHIP_FAST_MATH -DHIP_PLATFORM=amd -fPIC ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow GPUFLAGS += -std=c++17 ###GPUFLAGS+= --maxrregcount 255 # (AV: is this option valid on HIP and meaningful on AMD GPUs?) 
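
A recap of the net cudacpp.mk logic after the changes in patches 74-77, reassembled as a sketch from the "+" lines in the diffs above. This is GNU Make; OPTFLAGS, INCFLAGS, CUOPTFLAGS, HIPINC and HIPARCHFLAGS are assumed to be defined earlier in cudacpp.mk, outside the hunks shown here.

# Issues #505/#807: a multi-word CXX (e.g. "clang++ --gcc-toolchain ...") cannot be
# passed to nvcc via -ccbin, so it now disables only CUDA builds; HIP builds with
# hipcc (e.g. with the Cray compiler on LUMI) remain enabled.
ifneq ($(words $(subst ccache ,,$(CXX))),1)
  $(warning CUDA builds are not supported for multi-word CXX "$(CXX)")
  override CUDA_HOME=disabled
endif

# Issue #808: $(CUOPTFLAGS), which per the commit messages carries the -G debug
# flag, is no longer included in the HIP compiler flags; debug builds are instead
# triggered separately via "make -f cudacpp.mk debug -j; make -j".
GPUFLAGS = $(OPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -DHIP_FAST_MATH -DHIP_PLATFORM=amd -fPIC

Note on the first fragment: $(subst ccache ,,$(CXX)) strips an optional "ccache " prefix before counting words, so "CXX=ccache g++" from outside still counts as a single-word compiler and keeps CUDA builds enabled.
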
From 6572c5d0afe9c26675e78876c4e68cab5206b49e Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Mon, 29 Jan 2024 10:51:53 +0100 Subject: [PATCH 78/96] [jt744] in CODEGEN (backport gq_ttq.mad), remove -G from HIP debug builds on LUMI (#808) Note: to trigger debug builds, use "make -f cudacpp.mk debug -j; make -j" --- .../madgraph/iolibs/template_files/gpu/cudacpp.mk | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk index ff53904f90..4900a659b2 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk @@ -218,7 +218,8 @@ else ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),) HIPINC = -I$(HIP_HOME)/include/ # Note: -DHIP_FAST_MATH is equivalent to -use_fast_math in HIP # (but only for single precision line 208: https://rocm-developer-tools.github.io/HIP/hcc__detail_2math__functions_8h_source.html) - GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -DHIP_FAST_MATH -DHIP_PLATFORM=amd -fPIC + # Note: CUOPTFLAGS should not be used for HIP, it had been added here but was then removed (#808) + GPUFLAGS = $(OPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -DHIP_FAST_MATH -DHIP_PLATFORM=amd -fPIC ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow GPUFLAGS += -std=c++17 ###GPUFLAGS+= --maxrregcount 255 # (AV: is this option valid on HIP and meaningful on AMD GPUs?) From 05f5ecf80c89c9dc38140e6f6053fa428c66a412 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Mon, 29 Jan 2024 10:52:46 +0100 Subject: [PATCH 79/96] [jt774] regenerate gq_ttq.mad to check all is ok - actually Jorgen's name in COPYRIGHT has also changed --- .../gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt | 24 +++++++++---------- epochX/cudacpp/gq_ttq.mad/COPYRIGHT | 2 +- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt b/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt index 2a21d715bb..14f284d218 100644 --- a/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt +++ b/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt @@ -61,7 +61,7 @@ set zerowidth_tchannel F define q = u c d s u~ c~ d~ s~ INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005668163299560547  +DEBUG: model prefixing takes 0.005811214447021484  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -170,7 +170,7 @@ INFO: Crossed process found for g u~ > t t~ u~, reuse diagrams. INFO: Crossed process found for g c~ > t t~ c~, reuse diagrams. INFO: Crossed process found for g d~ > t t~ d~, reuse diagrams. INFO: Crossed process found for g s~ > t t~ s~, reuse diagrams. 
-8 processes with 40 diagrams generated in 0.079 s +8 processes with 40 diagrams generated in 0.083 s Total: 8 processes with 40 diagrams output madevent ../TMPOUT/CODEGEN_mad_gq_ttq --hel_recycling=False --vector_size=32 --me_exporter=standalone_cudacpp Load PLUGIN.CUDACPP_OUTPUT @@ -198,7 +198,7 @@ INFO: Combined process g d~ > t t~ d~ WEIGHTED<=3 @1 with process g u~ > t t~ u~ INFO: Combined process g s~ > t t~ s~ WEIGHTED<=3 @1 with process g u~ > t t~ u~ WEIGHTED<=3 @1 INFO: Creating files in directory P1_gu_ttxu DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -215,7 +215,7 @@ INFO: Generating Feynman diagrams for Process: g u > t t~ u WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gu_ttxu INFO: Creating files in directory P1_gux_ttxux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -230,17 +230,17 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: g u~ > t t~ u~ WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gux_ttxux -Generated helas calls for 2 subprocesses (10 diagrams) in 0.032 s -Wrote files for 32 helas calls in 0.229 s +Generated helas calls for 2 subprocesses (10 diagrams) in 0.033 s +Wrote files for 32 helas calls in 0.321 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVV1 routines -ALOHA: aloha creates 2 routines in 0.150 s +ALOHA: aloha creates 2 routines in 0.155 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVV1 routines -ALOHA: aloha creates 4 routines in 0.136 s +ALOHA: aloha creates 4 routines in 0.141 s FFV1 FFV1 FFV1 @@ -294,10 +294,10 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. quit -real 0m1.985s -user 0m1.757s -sys 0m0.224s -Code generation completed in 2 seconds +real 0m2.865s +user 0m1.808s +sys 0m0.282s +Code generation completed in 3 seconds ************************************************************ * * * W E L C O M E to * diff --git a/epochX/cudacpp/gq_ttq.mad/COPYRIGHT b/epochX/cudacpp/gq_ttq.mad/COPYRIGHT index 84a883fbb0..9036d9260a 100644 --- a/epochX/cudacpp/gq_ttq.mad/COPYRIGHT +++ b/epochX/cudacpp/gq_ttq.mad/COPYRIGHT @@ -15,7 +15,7 @@ The full development team currently includes the following authors : Stephan Hageboeck (CERN) Olivier Mattelaer (Universite Catholique de Louvain, original author) Stefan Roiser (CERN, original author) - Joergen Teig (CERN) + Jorgen Teig (CERN) Andrea Valassi (CERN, original author) Zenny Wettersten (CERN) See https://github.com/madgraph5/madgraph4gpu for more details. 
For the full From fcb359a50c44f85d1634f0679cb2f427525c1adb Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Mon, 29 Jan 2024 10:54:54 +0100 Subject: [PATCH 80/96] [jt774] manually fix all processes for cudacpp '-G' option in HIP #808 (and also for Jorgen's name in copyright) for f in $(git ls-tree --name-only HEAD */SubProcesses/cudacpp.mk); do echo $f; \cp gq_ttq.mad/SubProcesses/cudacpp.mk $f; done for f in $(git ls-tree --name-only HEAD */COPYRIGHT); do echo $f; \cp gq_ttq.mad/COPYRIGHT $f; done --- epochX/cudacpp/ee_mumu.mad/COPYRIGHT | 2 +- epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk | 3 ++- epochX/cudacpp/ee_mumu.sa/COPYRIGHT | 2 +- epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp.mk | 3 ++- epochX/cudacpp/gg_tt.mad/COPYRIGHT | 2 +- epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk | 3 ++- epochX/cudacpp/gg_tt.sa/COPYRIGHT | 2 +- epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk | 3 ++- epochX/cudacpp/gg_tt01g.mad/COPYRIGHT | 2 +- epochX/cudacpp/gg_tt01g.mad/SubProcesses/cudacpp.mk | 3 ++- epochX/cudacpp/gg_ttg.mad/COPYRIGHT | 2 +- epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp.mk | 3 ++- epochX/cudacpp/gg_ttg.sa/COPYRIGHT | 2 +- epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp.mk | 3 ++- epochX/cudacpp/gg_ttgg.mad/COPYRIGHT | 2 +- epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp.mk | 3 ++- epochX/cudacpp/gg_ttgg.sa/COPYRIGHT | 2 +- epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp.mk | 3 ++- epochX/cudacpp/gg_ttggg.mad/COPYRIGHT | 2 +- epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp.mk | 3 ++- epochX/cudacpp/gg_ttggg.sa/COPYRIGHT | 2 +- epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp.mk | 3 ++- epochX/cudacpp/gq_ttq.sa/COPYRIGHT | 2 +- epochX/cudacpp/gq_ttq.sa/SubProcesses/cudacpp.mk | 3 ++- epochX/cudacpp/heft_gg_h.sa/COPYRIGHT | 2 +- epochX/cudacpp/heft_gg_h.sa/SubProcesses/cudacpp.mk | 3 ++- epochX/cudacpp/pp_tt012j.mad/COPYRIGHT | 2 +- epochX/cudacpp/pp_tt012j.mad/SubProcesses/cudacpp.mk | 3 ++- 28 files changed, 42 insertions(+), 28 deletions(-) diff --git a/epochX/cudacpp/ee_mumu.mad/COPYRIGHT b/epochX/cudacpp/ee_mumu.mad/COPYRIGHT index 84a883fbb0..9036d9260a 100644 --- a/epochX/cudacpp/ee_mumu.mad/COPYRIGHT +++ b/epochX/cudacpp/ee_mumu.mad/COPYRIGHT @@ -15,7 +15,7 @@ The full development team currently includes the following authors : Stephan Hageboeck (CERN) Olivier Mattelaer (Universite Catholique de Louvain, original author) Stefan Roiser (CERN, original author) - Joergen Teig (CERN) + Jorgen Teig (CERN) Andrea Valassi (CERN, original author) Zenny Wettersten (CERN) See https://github.com/madgraph5/madgraph4gpu for more details. 
For the full diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk index d94c3e5e6e..117edc1782 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk @@ -218,7 +218,8 @@ else ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),) HIPINC = -I$(HIP_HOME)/include/ # Note: -DHIP_FAST_MATH is equivalent to -use_fast_math in HIP # (but only for single precision line 208: https://rocm-developer-tools.github.io/HIP/hcc__detail_2math__functions_8h_source.html) - GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -DHIP_FAST_MATH -DHIP_PLATFORM=amd -fPIC + # Note: CUOPTFLAGS should not be used for HIP, it had been added here but was then removed (#808) + GPUFLAGS = $(OPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -DHIP_FAST_MATH -DHIP_PLATFORM=amd -fPIC ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow GPUFLAGS += -std=c++17 ###GPUFLAGS+= --maxrregcount 255 # (AV: is this option valid on HIP and meaningful on AMD GPUs?) diff --git a/epochX/cudacpp/ee_mumu.sa/COPYRIGHT b/epochX/cudacpp/ee_mumu.sa/COPYRIGHT index 84a883fbb0..9036d9260a 100644 --- a/epochX/cudacpp/ee_mumu.sa/COPYRIGHT +++ b/epochX/cudacpp/ee_mumu.sa/COPYRIGHT @@ -15,7 +15,7 @@ The full development team currently includes the following authors : Stephan Hageboeck (CERN) Olivier Mattelaer (Universite Catholique de Louvain, original author) Stefan Roiser (CERN, original author) - Joergen Teig (CERN) + Jorgen Teig (CERN) Andrea Valassi (CERN, original author) Zenny Wettersten (CERN) See https://github.com/madgraph5/madgraph4gpu for more details. For the full diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp.mk index d94c3e5e6e..117edc1782 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp.mk @@ -218,7 +218,8 @@ else ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),) HIPINC = -I$(HIP_HOME)/include/ # Note: -DHIP_FAST_MATH is equivalent to -use_fast_math in HIP # (but only for single precision line 208: https://rocm-developer-tools.github.io/HIP/hcc__detail_2math__functions_8h_source.html) - GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -DHIP_FAST_MATH -DHIP_PLATFORM=amd -fPIC + # Note: CUOPTFLAGS should not be used for HIP, it had been added here but was then removed (#808) + GPUFLAGS = $(OPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -DHIP_FAST_MATH -DHIP_PLATFORM=amd -fPIC ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow GPUFLAGS += -std=c++17 ###GPUFLAGS+= --maxrregcount 255 # (AV: is this option valid on HIP and meaningful on AMD GPUs?) diff --git a/epochX/cudacpp/gg_tt.mad/COPYRIGHT b/epochX/cudacpp/gg_tt.mad/COPYRIGHT index 84a883fbb0..9036d9260a 100644 --- a/epochX/cudacpp/gg_tt.mad/COPYRIGHT +++ b/epochX/cudacpp/gg_tt.mad/COPYRIGHT @@ -15,7 +15,7 @@ The full development team currently includes the following authors : Stephan Hageboeck (CERN) Olivier Mattelaer (Universite Catholique de Louvain, original author) Stefan Roiser (CERN, original author) - Joergen Teig (CERN) + Jorgen Teig (CERN) Andrea Valassi (CERN, original author) Zenny Wettersten (CERN) See https://github.com/madgraph5/madgraph4gpu for more details. 
For the full diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk index d94c3e5e6e..117edc1782 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk @@ -218,7 +218,8 @@ else ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),) HIPINC = -I$(HIP_HOME)/include/ # Note: -DHIP_FAST_MATH is equivalent to -use_fast_math in HIP # (but only for single precision line 208: https://rocm-developer-tools.github.io/HIP/hcc__detail_2math__functions_8h_source.html) - GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -DHIP_FAST_MATH -DHIP_PLATFORM=amd -fPIC + # Note: CUOPTFLAGS should not be used for HIP, it had been added here but was then removed (#808) + GPUFLAGS = $(OPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -DHIP_FAST_MATH -DHIP_PLATFORM=amd -fPIC ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow GPUFLAGS += -std=c++17 ###GPUFLAGS+= --maxrregcount 255 # (AV: is this option valid on HIP and meaningful on AMD GPUs?) diff --git a/epochX/cudacpp/gg_tt.sa/COPYRIGHT b/epochX/cudacpp/gg_tt.sa/COPYRIGHT index 84a883fbb0..9036d9260a 100644 --- a/epochX/cudacpp/gg_tt.sa/COPYRIGHT +++ b/epochX/cudacpp/gg_tt.sa/COPYRIGHT @@ -15,7 +15,7 @@ The full development team currently includes the following authors : Stephan Hageboeck (CERN) Olivier Mattelaer (Universite Catholique de Louvain, original author) Stefan Roiser (CERN, original author) - Joergen Teig (CERN) + Jorgen Teig (CERN) Andrea Valassi (CERN, original author) Zenny Wettersten (CERN) See https://github.com/madgraph5/madgraph4gpu for more details. For the full diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk index d94c3e5e6e..117edc1782 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk @@ -218,7 +218,8 @@ else ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),) HIPINC = -I$(HIP_HOME)/include/ # Note: -DHIP_FAST_MATH is equivalent to -use_fast_math in HIP # (but only for single precision line 208: https://rocm-developer-tools.github.io/HIP/hcc__detail_2math__functions_8h_source.html) - GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -DHIP_FAST_MATH -DHIP_PLATFORM=amd -fPIC + # Note: CUOPTFLAGS should not be used for HIP, it had been added here but was then removed (#808) + GPUFLAGS = $(OPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -DHIP_FAST_MATH -DHIP_PLATFORM=amd -fPIC ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow GPUFLAGS += -std=c++17 ###GPUFLAGS+= --maxrregcount 255 # (AV: is this option valid on HIP and meaningful on AMD GPUs?) diff --git a/epochX/cudacpp/gg_tt01g.mad/COPYRIGHT b/epochX/cudacpp/gg_tt01g.mad/COPYRIGHT index 84a883fbb0..9036d9260a 100644 --- a/epochX/cudacpp/gg_tt01g.mad/COPYRIGHT +++ b/epochX/cudacpp/gg_tt01g.mad/COPYRIGHT @@ -15,7 +15,7 @@ The full development team currently includes the following authors : Stephan Hageboeck (CERN) Olivier Mattelaer (Universite Catholique de Louvain, original author) Stefan Roiser (CERN, original author) - Joergen Teig (CERN) + Jorgen Teig (CERN) Andrea Valassi (CERN, original author) Zenny Wettersten (CERN) See https://github.com/madgraph5/madgraph4gpu for more details. 
For the full diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cudacpp.mk index d94c3e5e6e..117edc1782 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cudacpp.mk @@ -218,7 +218,8 @@ else ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),) HIPINC = -I$(HIP_HOME)/include/ # Note: -DHIP_FAST_MATH is equivalent to -use_fast_math in HIP # (but only for single precision line 208: https://rocm-developer-tools.github.io/HIP/hcc__detail_2math__functions_8h_source.html) - GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -DHIP_FAST_MATH -DHIP_PLATFORM=amd -fPIC + # Note: CUOPTFLAGS should not be used for HIP, it had been added here but was then removed (#808) + GPUFLAGS = $(OPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -DHIP_FAST_MATH -DHIP_PLATFORM=amd -fPIC ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow GPUFLAGS += -std=c++17 ###GPUFLAGS+= --maxrregcount 255 # (AV: is this option valid on HIP and meaningful on AMD GPUs?) diff --git a/epochX/cudacpp/gg_ttg.mad/COPYRIGHT b/epochX/cudacpp/gg_ttg.mad/COPYRIGHT index 84a883fbb0..9036d9260a 100644 --- a/epochX/cudacpp/gg_ttg.mad/COPYRIGHT +++ b/epochX/cudacpp/gg_ttg.mad/COPYRIGHT @@ -15,7 +15,7 @@ The full development team currently includes the following authors : Stephan Hageboeck (CERN) Olivier Mattelaer (Universite Catholique de Louvain, original author) Stefan Roiser (CERN, original author) - Joergen Teig (CERN) + Jorgen Teig (CERN) Andrea Valassi (CERN, original author) Zenny Wettersten (CERN) See https://github.com/madgraph5/madgraph4gpu for more details. For the full diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp.mk index d94c3e5e6e..117edc1782 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp.mk @@ -218,7 +218,8 @@ else ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),) HIPINC = -I$(HIP_HOME)/include/ # Note: -DHIP_FAST_MATH is equivalent to -use_fast_math in HIP # (but only for single precision line 208: https://rocm-developer-tools.github.io/HIP/hcc__detail_2math__functions_8h_source.html) - GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -DHIP_FAST_MATH -DHIP_PLATFORM=amd -fPIC + # Note: CUOPTFLAGS should not be used for HIP, it had been added here but was then removed (#808) + GPUFLAGS = $(OPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -DHIP_FAST_MATH -DHIP_PLATFORM=amd -fPIC ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow GPUFLAGS += -std=c++17 ###GPUFLAGS+= --maxrregcount 255 # (AV: is this option valid on HIP and meaningful on AMD GPUs?) diff --git a/epochX/cudacpp/gg_ttg.sa/COPYRIGHT b/epochX/cudacpp/gg_ttg.sa/COPYRIGHT index 84a883fbb0..9036d9260a 100644 --- a/epochX/cudacpp/gg_ttg.sa/COPYRIGHT +++ b/epochX/cudacpp/gg_ttg.sa/COPYRIGHT @@ -15,7 +15,7 @@ The full development team currently includes the following authors : Stephan Hageboeck (CERN) Olivier Mattelaer (Universite Catholique de Louvain, original author) Stefan Roiser (CERN, original author) - Joergen Teig (CERN) + Jorgen Teig (CERN) Andrea Valassi (CERN, original author) Zenny Wettersten (CERN) See https://github.com/madgraph5/madgraph4gpu for more details. 
For the full diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp.mk index d94c3e5e6e..117edc1782 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp.mk @@ -218,7 +218,8 @@ else ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),) HIPINC = -I$(HIP_HOME)/include/ # Note: -DHIP_FAST_MATH is equivalent to -use_fast_math in HIP # (but only for single precision line 208: https://rocm-developer-tools.github.io/HIP/hcc__detail_2math__functions_8h_source.html) - GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -DHIP_FAST_MATH -DHIP_PLATFORM=amd -fPIC + # Note: CUOPTFLAGS should not be used for HIP, it had been added here but was then removed (#808) + GPUFLAGS = $(OPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -DHIP_FAST_MATH -DHIP_PLATFORM=amd -fPIC ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow GPUFLAGS += -std=c++17 ###GPUFLAGS+= --maxrregcount 255 # (AV: is this option valid on HIP and meaningful on AMD GPUs?) diff --git a/epochX/cudacpp/gg_ttgg.mad/COPYRIGHT b/epochX/cudacpp/gg_ttgg.mad/COPYRIGHT index 84a883fbb0..9036d9260a 100644 --- a/epochX/cudacpp/gg_ttgg.mad/COPYRIGHT +++ b/epochX/cudacpp/gg_ttgg.mad/COPYRIGHT @@ -15,7 +15,7 @@ The full development team currently includes the following authors : Stephan Hageboeck (CERN) Olivier Mattelaer (Universite Catholique de Louvain, original author) Stefan Roiser (CERN, original author) - Joergen Teig (CERN) + Jorgen Teig (CERN) Andrea Valassi (CERN, original author) Zenny Wettersten (CERN) See https://github.com/madgraph5/madgraph4gpu for more details. For the full diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp.mk index d94c3e5e6e..117edc1782 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp.mk @@ -218,7 +218,8 @@ else ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),) HIPINC = -I$(HIP_HOME)/include/ # Note: -DHIP_FAST_MATH is equivalent to -use_fast_math in HIP # (but only for single precision line 208: https://rocm-developer-tools.github.io/HIP/hcc__detail_2math__functions_8h_source.html) - GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -DHIP_FAST_MATH -DHIP_PLATFORM=amd -fPIC + # Note: CUOPTFLAGS should not be used for HIP, it had been added here but was then removed (#808) + GPUFLAGS = $(OPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -DHIP_FAST_MATH -DHIP_PLATFORM=amd -fPIC ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow GPUFLAGS += -std=c++17 ###GPUFLAGS+= --maxrregcount 255 # (AV: is this option valid on HIP and meaningful on AMD GPUs?) diff --git a/epochX/cudacpp/gg_ttgg.sa/COPYRIGHT b/epochX/cudacpp/gg_ttgg.sa/COPYRIGHT index 84a883fbb0..9036d9260a 100644 --- a/epochX/cudacpp/gg_ttgg.sa/COPYRIGHT +++ b/epochX/cudacpp/gg_ttgg.sa/COPYRIGHT @@ -15,7 +15,7 @@ The full development team currently includes the following authors : Stephan Hageboeck (CERN) Olivier Mattelaer (Universite Catholique de Louvain, original author) Stefan Roiser (CERN, original author) - Joergen Teig (CERN) + Jorgen Teig (CERN) Andrea Valassi (CERN, original author) Zenny Wettersten (CERN) See https://github.com/madgraph5/madgraph4gpu for more details. 
For the full diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp.mk index d94c3e5e6e..117edc1782 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp.mk @@ -218,7 +218,8 @@ else ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),) HIPINC = -I$(HIP_HOME)/include/ # Note: -DHIP_FAST_MATH is equivalent to -use_fast_math in HIP # (but only for single precision line 208: https://rocm-developer-tools.github.io/HIP/hcc__detail_2math__functions_8h_source.html) - GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -DHIP_FAST_MATH -DHIP_PLATFORM=amd -fPIC + # Note: CUOPTFLAGS should not be used for HIP, it had been added here but was then removed (#808) + GPUFLAGS = $(OPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -DHIP_FAST_MATH -DHIP_PLATFORM=amd -fPIC ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow GPUFLAGS += -std=c++17 ###GPUFLAGS+= --maxrregcount 255 # (AV: is this option valid on HIP and meaningful on AMD GPUs?) diff --git a/epochX/cudacpp/gg_ttggg.mad/COPYRIGHT b/epochX/cudacpp/gg_ttggg.mad/COPYRIGHT index 84a883fbb0..9036d9260a 100644 --- a/epochX/cudacpp/gg_ttggg.mad/COPYRIGHT +++ b/epochX/cudacpp/gg_ttggg.mad/COPYRIGHT @@ -15,7 +15,7 @@ The full development team currently includes the following authors : Stephan Hageboeck (CERN) Olivier Mattelaer (Universite Catholique de Louvain, original author) Stefan Roiser (CERN, original author) - Joergen Teig (CERN) + Jorgen Teig (CERN) Andrea Valassi (CERN, original author) Zenny Wettersten (CERN) See https://github.com/madgraph5/madgraph4gpu for more details. For the full diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp.mk index d94c3e5e6e..117edc1782 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp.mk @@ -218,7 +218,8 @@ else ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),) HIPINC = -I$(HIP_HOME)/include/ # Note: -DHIP_FAST_MATH is equivalent to -use_fast_math in HIP # (but only for single precision line 208: https://rocm-developer-tools.github.io/HIP/hcc__detail_2math__functions_8h_source.html) - GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -DHIP_FAST_MATH -DHIP_PLATFORM=amd -fPIC + # Note: CUOPTFLAGS should not be used for HIP, it had been added here but was then removed (#808) + GPUFLAGS = $(OPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -DHIP_FAST_MATH -DHIP_PLATFORM=amd -fPIC ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow GPUFLAGS += -std=c++17 ###GPUFLAGS+= --maxrregcount 255 # (AV: is this option valid on HIP and meaningful on AMD GPUs?) diff --git a/epochX/cudacpp/gg_ttggg.sa/COPYRIGHT b/epochX/cudacpp/gg_ttggg.sa/COPYRIGHT index 84a883fbb0..9036d9260a 100644 --- a/epochX/cudacpp/gg_ttggg.sa/COPYRIGHT +++ b/epochX/cudacpp/gg_ttggg.sa/COPYRIGHT @@ -15,7 +15,7 @@ The full development team currently includes the following authors : Stephan Hageboeck (CERN) Olivier Mattelaer (Universite Catholique de Louvain, original author) Stefan Roiser (CERN, original author) - Joergen Teig (CERN) + Jorgen Teig (CERN) Andrea Valassi (CERN, original author) Zenny Wettersten (CERN) See https://github.com/madgraph5/madgraph4gpu for more details. 
For the full diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp.mk index d94c3e5e6e..117edc1782 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp.mk @@ -218,7 +218,8 @@ else ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),) HIPINC = -I$(HIP_HOME)/include/ # Note: -DHIP_FAST_MATH is equivalent to -use_fast_math in HIP # (but only for single precision line 208: https://rocm-developer-tools.github.io/HIP/hcc__detail_2math__functions_8h_source.html) - GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -DHIP_FAST_MATH -DHIP_PLATFORM=amd -fPIC + # Note: CUOPTFLAGS should not be used for HIP, it had been added here but was then removed (#808) + GPUFLAGS = $(OPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -DHIP_FAST_MATH -DHIP_PLATFORM=amd -fPIC ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow GPUFLAGS += -std=c++17 ###GPUFLAGS+= --maxrregcount 255 # (AV: is this option valid on HIP and meaningful on AMD GPUs?) diff --git a/epochX/cudacpp/gq_ttq.sa/COPYRIGHT b/epochX/cudacpp/gq_ttq.sa/COPYRIGHT index 84a883fbb0..9036d9260a 100644 --- a/epochX/cudacpp/gq_ttq.sa/COPYRIGHT +++ b/epochX/cudacpp/gq_ttq.sa/COPYRIGHT @@ -15,7 +15,7 @@ The full development team currently includes the following authors : Stephan Hageboeck (CERN) Olivier Mattelaer (Universite Catholique de Louvain, original author) Stefan Roiser (CERN, original author) - Joergen Teig (CERN) + Jorgen Teig (CERN) Andrea Valassi (CERN, original author) Zenny Wettersten (CERN) See https://github.com/madgraph5/madgraph4gpu for more details. For the full diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gq_ttq.sa/SubProcesses/cudacpp.mk index d94c3e5e6e..117edc1782 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/cudacpp.mk @@ -218,7 +218,8 @@ else ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),) HIPINC = -I$(HIP_HOME)/include/ # Note: -DHIP_FAST_MATH is equivalent to -use_fast_math in HIP # (but only for single precision line 208: https://rocm-developer-tools.github.io/HIP/hcc__detail_2math__functions_8h_source.html) - GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -DHIP_FAST_MATH -DHIP_PLATFORM=amd -fPIC + # Note: CUOPTFLAGS should not be used for HIP, it had been added here but was then removed (#808) + GPUFLAGS = $(OPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -DHIP_FAST_MATH -DHIP_PLATFORM=amd -fPIC ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow GPUFLAGS += -std=c++17 ###GPUFLAGS+= --maxrregcount 255 # (AV: is this option valid on HIP and meaningful on AMD GPUs?) diff --git a/epochX/cudacpp/heft_gg_h.sa/COPYRIGHT b/epochX/cudacpp/heft_gg_h.sa/COPYRIGHT index 84a883fbb0..9036d9260a 100644 --- a/epochX/cudacpp/heft_gg_h.sa/COPYRIGHT +++ b/epochX/cudacpp/heft_gg_h.sa/COPYRIGHT @@ -15,7 +15,7 @@ The full development team currently includes the following authors : Stephan Hageboeck (CERN) Olivier Mattelaer (Universite Catholique de Louvain, original author) Stefan Roiser (CERN, original author) - Joergen Teig (CERN) + Jorgen Teig (CERN) Andrea Valassi (CERN, original author) Zenny Wettersten (CERN) See https://github.com/madgraph5/madgraph4gpu for more details. 
For the full diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/cudacpp.mk index d94c3e5e6e..117edc1782 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/cudacpp.mk @@ -218,7 +218,8 @@ else ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),) HIPINC = -I$(HIP_HOME)/include/ # Note: -DHIP_FAST_MATH is equivalent to -use_fast_math in HIP # (but only for single precision line 208: https://rocm-developer-tools.github.io/HIP/hcc__detail_2math__functions_8h_source.html) - GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -DHIP_FAST_MATH -DHIP_PLATFORM=amd -fPIC + # Note: CUOPTFLAGS should not be used for HIP, it had been added here but was then removed (#808) + GPUFLAGS = $(OPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -DHIP_FAST_MATH -DHIP_PLATFORM=amd -fPIC ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow GPUFLAGS += -std=c++17 ###GPUFLAGS+= --maxrregcount 255 # (AV: is this option valid on HIP and meaningful on AMD GPUs?) diff --git a/epochX/cudacpp/pp_tt012j.mad/COPYRIGHT b/epochX/cudacpp/pp_tt012j.mad/COPYRIGHT index 84a883fbb0..9036d9260a 100644 --- a/epochX/cudacpp/pp_tt012j.mad/COPYRIGHT +++ b/epochX/cudacpp/pp_tt012j.mad/COPYRIGHT @@ -15,7 +15,7 @@ The full development team currently includes the following authors : Stephan Hageboeck (CERN) Olivier Mattelaer (Universite Catholique de Louvain, original author) Stefan Roiser (CERN, original author) - Joergen Teig (CERN) + Jorgen Teig (CERN) Andrea Valassi (CERN, original author) Zenny Wettersten (CERN) See https://github.com/madgraph5/madgraph4gpu for more details. For the full diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/cudacpp.mk index d94c3e5e6e..117edc1782 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/cudacpp.mk @@ -218,7 +218,8 @@ else ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),) HIPINC = -I$(HIP_HOME)/include/ # Note: -DHIP_FAST_MATH is equivalent to -use_fast_math in HIP # (but only for single precision line 208: https://rocm-developer-tools.github.io/HIP/hcc__detail_2math__functions_8h_source.html) - GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -DHIP_FAST_MATH -DHIP_PLATFORM=amd -fPIC + # Note: CUOPTFLAGS should not be used for HIP, it had been added here but was then removed (#808) + GPUFLAGS = $(OPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -DHIP_FAST_MATH -DHIP_PLATFORM=amd -fPIC ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow GPUFLAGS += -std=c++17 ###GPUFLAGS+= --maxrregcount 255 # (AV: is this option valid on HIP and meaningful on AMD GPUs?) 
From 9ff021b319d14bdb46ef124d6dae1bfaa5f42cb8 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Tue, 30 Jan 2024 01:13:44 +0200 Subject: [PATCH 81/96] [jt774] in gg_tt.mad, fix mgOnGpuCxtypes.h so that cucomplex and thrust are only used for cuda and not for hip (#810) --- epochX/cudacpp/gg_tt.mad/src/mgOnGpuCxtypes.h | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/epochX/cudacpp/gg_tt.mad/src/mgOnGpuCxtypes.h b/epochX/cudacpp/gg_tt.mad/src/mgOnGpuCxtypes.h index 5532e22fa1..2f4a80fb85 100644 --- a/epochX/cudacpp/gg_tt.mad/src/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/gg_tt.mad/src/mgOnGpuCxtypes.h @@ -19,7 +19,7 @@ #include // Complex type in cuda: thrust or cucomplex or cxsmpl -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ // this must be __CUDAC__ (not MGONGPUCPP_GPUIMPL) #if defined MGONGPU_CUCXTYPE_THRUST #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wtautological-compare" // for icpx2021/clang13 (https://stackoverflow.com/a/15864661) @@ -31,7 +31,7 @@ #error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL #endif #else -// Complex type in c++: std::complex or cxsmpl +// Complex type in c++ or HIP: std::complex or cxsmpl #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX #include #elif not defined MGONGPU_CPPCXTYPE_CXSMPL @@ -222,7 +222,7 @@ namespace mg5amcCpu #endif { // --- Type definitions (complex type: cxtype) -#ifdef MGONGPUCPP_GPUIMPL // cuda +#ifdef __CUDACC__ // this must be __CUDAC__ (not MGONGPUCPP_GPUIMPL) #if defined MGONGPU_CUCXTYPE_THRUST typedef thrust::complex cxtype; #elif defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -307,7 +307,7 @@ namespace mg5amcCpu //========================================================================== -#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust +#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust (this must be __CUDAC__ and not MGONGPUCPP_GPUIMPL) //------------------------------ // CUDA - using thrust::complex @@ -343,11 +343,11 @@ namespace mg5amcCpu return c; } -#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST +#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST //========================================================================== -#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex +#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex (this must be __CUDAC__ and not MGONGPUCPP_GPUIMPL) //------------------------------ // CUDA - using cuComplex @@ -562,11 +562,11 @@ namespace mg5amcCpu return cxmake( c.real(), c.imag() ); } -#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX //========================================================================== -#if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex +#if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++/hip + stdcomplex (this must be __CUDAC__ and not MGONGPUCPP_GPUIMPL) //------------------------------ // C++ - using std::complex @@ -610,7 +610,7 @@ namespace mg5amcCpu } #endif -#endif // #if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX +#endif // #if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX //========================================================================== From 
5296b4089014f27d40a07acbfabe081cb303e2d1 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Tue, 30 Jan 2024 01:52:58 +0200 Subject: [PATCH 82/96] [jt774] in gg_tt.mad, fix complex type #ifdef's and fix "-x hip" compiler options for HIP (#810) --- .../gg_tt.mad/SubProcesses/P1_gg_ttx/check_sa.cc | 12 ++++++++---- epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk | 6 +++--- epochX/cudacpp/gg_tt.mad/src/cudacpp_src.mk | 4 ++-- epochX/cudacpp/gg_tt.mad/src/mgOnGpuConfig.h | 7 +++++-- epochX/cudacpp/gg_tt.mad/src/mgOnGpuCxtypes.h | 9 +++++++-- 5 files changed, 25 insertions(+), 13 deletions(-) diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/check_sa.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/check_sa.cc index aab490dc5b..6d0aea1166 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/check_sa.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/check_sa.cc @@ -789,7 +789,7 @@ main( int argc, char** argv ) #else wrkflwtxt += "???+"; // no path to this statement #endif /* clang-format on */ - // -- CUCOMPLEX or THRUST or STD complex numbers? + // -- CUCOMPLEX or THRUST or STD or CXSIMPLE complex numbers? #ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX wrkflwtxt += "CUX:"; @@ -801,7 +801,7 @@ main( int argc, char** argv ) wrkflwtxt += "???:"; // no path to this statement #endif #elif defined __HIPCC__ -#if defined MGONGPU_CUCXTYPE_CXSMPL +#if defined MGONGPU_HIPCXTYPE_CXSMPL wrkflwtxt += "CXS:"; #else wrkflwtxt += "???:"; // no path to this statement @@ -940,7 +940,9 @@ main( int argc, char** argv ) << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#elif defined MGONGPU_CUCXTYPE_CXSMPL +#elif defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL + << "Complex type = CXSIMPLE" << std::endl +#elif defined MGONGPU_CPPCXTYPE_STDCOMPLEX << "Complex type = STD::COMPLEX" << std::endl #else << "Complex type = ???" << std::endl // no path to this statement... @@ -1081,7 +1083,9 @@ main( int argc, char** argv ) << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#elif defined MGONGPU_CUCXTYPE_CXSMPL +#elif defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL + << "\"CXSIMPLE\"," << std::endl +#elif defined MGONGPU_CUCXTYPE_STDCOMPLEX << "\"STD::COMPLEX\"," << std::endl #else << "\"???\"," << std::endl // no path to this statement... diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk index 117edc1782..df74dfc284 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk @@ -169,7 +169,7 @@ ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). 
MADGRAPH_CUDA_ARCHITECTURE ?= 70 ###CUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 - ###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 + ###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 comma:=, CUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) CUINC = -I$(CUDA_HOME)/include/ @@ -224,7 +224,7 @@ else ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),) GPUFLAGS += -std=c++17 ###GPUFLAGS+= --maxrregcount 255 # (AV: is this option valid on HIP and meaningful on AMD GPUs?) CUBUILDRULEFLAGS = -fPIC -c - CCBUILDRULEFLAGS = -fPIC -c + CCBUILDRULEFLAGS = -fPIC -c -x hip else ifneq ($(origin REQUIRE_HIP),undefined) @@ -556,6 +556,7 @@ $(BUILDDIR)/.build.$(TAG): @touch $(BUILDDIR)/.build.$(TAG) # Generic target and build rules: objects from CUDA compilation +# NB: CCBUILDRULEFLAGS includes "-x cu" for nvcc and "-x hip" for hipcc (#810) ifneq ($(GPUCC),) $(BUILDDIR)/%.o : %.cu *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi @@ -565,7 +566,6 @@ $(BUILDDIR)/%_cu.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CCBUILDRULEFLAGS) $< -o $@ endif -# -x cu in line above # Generic target and build rules: objects from C++ compilation # (NB do not include CUINC here! 
add it only for NVTX or curand #679) diff --git a/epochX/cudacpp/gg_tt.mad/src/cudacpp_src.mk b/epochX/cudacpp/gg_tt.mad/src/cudacpp_src.mk index 159e19a46d..b2b9da5288 100644 --- a/epochX/cudacpp/gg_tt.mad/src/cudacpp_src.mk +++ b/epochX/cudacpp/gg_tt.mad/src/cudacpp_src.mk @@ -92,11 +92,11 @@ endif ###$(info OMPFLAGS=$(OMPFLAGS)) CXXFLAGS += $(OMPFLAGS) -# Add correct -DHIP_LATFORM when compiling for HIP +# Add correct flags for nvcc (-x cu) and hipcc (-x hip) for GPU code (see #810) ifeq ($(findstring nvcc,$(GPUCC)),nvcc) GPUFLAGS += -Xcompiler -fPIC -c -x cu else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) - GPUFLAGS += -fPIC -c + GPUFLAGS += -fPIC -c -x hip endif # Set the build flags appropriate to each AVX choice (example: "make AVX=none") diff --git a/epochX/cudacpp/gg_tt.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_tt.mad/src/mgOnGpuConfig.h index 69cee0085b..6bde4466d0 100644 --- a/epochX/cudacpp/gg_tt.mad/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/gg_tt.mad/src/mgOnGpuConfig.h @@ -70,16 +70,19 @@ ////#define MGONGPU_HARDCODE_PARAM 1 // Complex type in CUDA: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) +// (NB THIS IS MGONGPU_*CU*CXTYPE_xxx) #ifdef __CUDACC__ #define MGONGPU_CUCXTYPE_THRUST 1 // default (~1.15E9/double, ~3.2E9/float) //#define MGONGPU_CUCXTYPE_CUCOMPLEX 1 // ~10 percent slower (1.03E9/double, ~2.8E9/float) //#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) -// Complex type in HIP: cxsmpl (ONLY ONE OPTION POSSIBLE) +// Complex type in HIP: cxsmpl (ONLY ONE OPTION POSSIBLE? #810) +// (NB THIS IS MGONGPU_*HIP*CXTYPE_xxx) #elif defined __HIPCC__ -#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) +#define MGONGPU_HIPCXTYPE_CXSMPL 1 // default for HIP // Complex type in C++: std::complex or cxsmpl (CHOOSE ONLY ONE) +// (NB THIS IS MGONGPU_*CPP*CXTYPE_xxx) #else //#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) #define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) diff --git a/epochX/cudacpp/gg_tt.mad/src/mgOnGpuCxtypes.h b/epochX/cudacpp/gg_tt.mad/src/mgOnGpuCxtypes.h index 2f4a80fb85..7ede1dbfae 100644 --- a/epochX/cudacpp/gg_tt.mad/src/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/gg_tt.mad/src/mgOnGpuCxtypes.h @@ -30,6 +30,11 @@ #elif not defined MGONGPU_CUCXTYPE_CXSMPL #error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL #endif +// Complex type in HIP: cxsmpl +#elif defined __HIPCC__ +#if not defined MGONGPU_HIPCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_HIPCXTYPE_CXSMPL +#endif #else // Complex type in c++ or HIP: std::complex or cxsmpl #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX @@ -261,7 +266,7 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { -#if defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL +#if defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL //------------------------------ // CUDA or C++ - using cxsmpl @@ -303,7 +308,7 @@ namespace mg5amcCpu return cxmake( c.real(), c.imag() ); } -#endif // #if defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL +#endif // #if defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL //========================================================================== From 35a70d3f2c31af95869f610968d6addbe442d585 Mon Sep 17 00:00:00 2001 From: Andrea 
Valassi Date: Tue, 30 Jan 2024 01:00:21 +0100 Subject: [PATCH 83/96] [jt774] in CODEGEN (backport from gg_tt.mad), fix mgOnGpuCxtypes.h so that cucomplex and thrust are only used for cuda and not for hip (#810) and fix complex type #ifdef's and fix "-x hip" compiler options for HIP (#810) Also add minor clang formatting fixes --- .../iolibs/template_files/gpu/check_sa.cc | 14 ++++++---- .../iolibs/template_files/gpu/cudacpp.mk | 6 ++--- .../iolibs/template_files/gpu/cudacpp_src.mk | 8 +++--- .../iolibs/template_files/gpu/mgOnGpuConfig.h | 7 +++-- .../template_files/gpu/mgOnGpuCxtypes.h | 27 +++++++++++-------- 5 files changed, 37 insertions(+), 25 deletions(-) diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/check_sa.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/check_sa.cc index aab490dc5b..bde384c69e 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/check_sa.cc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/check_sa.cc @@ -789,7 +789,7 @@ main( int argc, char** argv ) #else wrkflwtxt += "???+"; // no path to this statement #endif /* clang-format on */ - // -- CUCOMPLEX or THRUST or STD complex numbers? + // -- CUCOMPLEX or THRUST or STD or CXSIMPLE complex numbers? #ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX wrkflwtxt += "CUX:"; @@ -801,7 +801,7 @@ main( int argc, char** argv ) wrkflwtxt += "???:"; // no path to this statement #endif #elif defined __HIPCC__ -#if defined MGONGPU_CUCXTYPE_CXSMPL +#if defined MGONGPU_HIPCXTYPE_CXSMPL wrkflwtxt += "CXS:"; #else wrkflwtxt += "???:"; // no path to this statement @@ -863,7 +863,7 @@ main( int argc, char** argv ) wrkflwtxt += "/sse4"; #endif #else - wrkflwtxt += "/????"; // no path to this statement + wrkflwtxt += "/????"; // no path to this statement #endif // -- Has cxtype_v::operator[] bracket with non-const reference? #if defined MGONGPU_CPPSIMD @@ -940,7 +940,9 @@ main( int argc, char** argv ) << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#elif defined MGONGPU_CUCXTYPE_CXSMPL +#elif defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL + << "Complex type = CXSIMPLE" << std::endl +#elif defined MGONGPU_CPPCXTYPE_STDCOMPLEX << "Complex type = STD::COMPLEX" << std::endl #else << "Complex type = ???" << std::endl // no path to this statement... @@ -1081,7 +1083,9 @@ main( int argc, char** argv ) << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#elif defined MGONGPU_CUCXTYPE_CXSMPL +#elif defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL + << "\"CXSIMPLE\"," << std::endl +#elif defined MGONGPU_CUCXTYPE_STDCOMPLEX << "\"STD::COMPLEX\"," << std::endl #else << "\"???\"," << std::endl // no path to this statement... 
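For reference, the three-way complex-type selection that patches 81-83 establish can be summarised in a minimal standalone C++ sketch (a hypothetical demo.cc, not part of the patches; the MGONGPU_* macro names are the ones used in mgOnGpuConfig.h and mgOnGpuCxtypes.h above):

    #include <iostream>
    // Mirror the mgOnGpuConfig.h defaults after #810: each compiler family
    // gets its own macro namespace (CU = CUDA, HIP = HIP, CPP = plain C++).
    #ifdef __CUDACC__
    #define MGONGPU_CUCXTYPE_THRUST 1 // CUDA default: thrust::complex
    #elif defined __HIPCC__
    #define MGONGPU_HIPCXTYPE_CXSMPL 1 // HIP: cxsmpl is the only option
    #else
    #define MGONGPU_CPPCXTYPE_CXSMPL 1 // C++ default: cxsmpl
    #endif
    int main()
    {
    #if defined MGONGPU_CUCXTYPE_THRUST
      std::cout << "Complex type = THRUST::COMPLEX" << std::endl;
    #elif defined MGONGPU_HIPCXTYPE_CXSMPL
      std::cout << "Complex type = CXSIMPLE (HIP)" << std::endl;
    #elif defined MGONGPU_CPPCXTYPE_CXSMPL
      std::cout << "Complex type = CXSIMPLE (C++)" << std::endl;
    #endif
      return 0;
    }

Compiled with "g++ demo.cc" this takes the CPP branch; "nvcc -x cu" and "hipcc -x hip" take the __CUDACC__ and __HIPCC__ branches respectively, which is the same split that the #error guards in mgOnGpuCxtypes.h now enforce.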
diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk index 4900a659b2..c570d2418c 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk @@ -169,7 +169,7 @@ ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). MADGRAPH_CUDA_ARCHITECTURE ?= 70 ###CUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 - ###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 + ###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 comma:=, CUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) CUINC = -I$(CUDA_HOME)/include/ @@ -224,7 +224,7 @@ else ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),) GPUFLAGS += -std=c++17 ###GPUFLAGS+= --maxrregcount 255 # (AV: is this option valid on HIP and meaningful on AMD GPUs?) CUBUILDRULEFLAGS = -fPIC -c - CCBUILDRULEFLAGS = -fPIC -c + CCBUILDRULEFLAGS = -fPIC -c -x hip else ifneq ($(origin REQUIRE_HIP),undefined) @@ -556,6 +556,7 @@ $(BUILDDIR)/.build.$(TAG): @touch $(BUILDDIR)/.build.$(TAG) # Generic target and build rules: objects from CUDA compilation +# NB: CCBUILDRULEFLAGS includes "-x cu" for nvcc and "-x hip" for hipcc (#810) ifneq ($(GPUCC),) $(BUILDDIR)/%%.o : %%.cu *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi @@ -565,7 +566,6 @@ $(BUILDDIR)/%%_cu.o : %%.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CCBUILDRULEFLAGS) $< -o $@ endif -# -x cu in line above # Generic target and build rules: objects from C++ compilation # (NB do not include CUINC here! 
add it only for NVTX or curand #679) diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp_src.mk b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp_src.mk index 2c084615d9..3d7ffb7db5 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp_src.mk +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp_src.mk @@ -92,11 +92,11 @@ endif ###$(info OMPFLAGS=$(OMPFLAGS)) CXXFLAGS += $(OMPFLAGS) -# Add correct -DHIP_LATFORM when compiling for HIP +# Add correct flags for nvcc (-x cu) and hipcc (-x hip) for GPU code (see #810) ifeq ($(findstring nvcc,$(GPUCC)),nvcc) GPUFLAGS += -Xcompiler -fPIC -c -x cu else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) - GPUFLAGS += -fPIC -c + GPUFLAGS += -fPIC -c -x hip endif # Set the build flags appropriate to each AVX choice (example: "make AVX=none") @@ -264,9 +264,9 @@ $(BUILDDIR)/%%_cu.o : %%.cc *.h $(BUILDDIR)/.build.$(TAG) #------------------------------------------------------------------------------- -cxx_objects=$(addprefix $(BUILDDIR)/, Parameters_%(model)s.o read_slha.o) +cxx_objects=$(addprefix $(BUILDDIR)/, Parameters_sm.o read_slha.o) ifneq ($(GPUCC),) -cu_objects=$(addprefix $(BUILDDIR)/, Parameters_%(model)s_cu.o) +cu_objects=$(addprefix $(BUILDDIR)/, Parameters_sm_cu.o) endif # Target (and build rules): common (src) library diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuConfig.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuConfig.h index 989b3f0eea..05013cf981 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuConfig.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuConfig.h @@ -70,16 +70,19 @@ ////#define MGONGPU_HARDCODE_PARAM 1 // Complex type in CUDA: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) +// (NB THIS IS MGONGPU_*CU*CXTYPE_xxx) #ifdef __CUDACC__ #define MGONGPU_CUCXTYPE_THRUST 1 // default (~1.15E9/double, ~3.2E9/float) //#define MGONGPU_CUCXTYPE_CUCOMPLEX 1 // ~10 percent slower (1.03E9/double, ~2.8E9/float) //#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) -// Complex type in HIP: cxsmpl (ONLY ONE OPTION POSSIBLE) +// Complex type in HIP: cxsmpl (ONLY ONE OPTION POSSIBLE? 
#810) +// (NB THIS IS MGONGPU_*HIP*CXTYPE_xxx) #elif defined __HIPCC__ -#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) +#define MGONGPU_HIPCXTYPE_CXSMPL 1 // default for HIP // Complex type in C++: std::complex or cxsmpl (CHOOSE ONLY ONE) +// (NB THIS IS MGONGPU_*CPP*CXTYPE_xxx) #else //#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) #define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuCxtypes.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuCxtypes.h index 5532e22fa1..7ede1dbfae 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuCxtypes.h @@ -19,7 +19,7 @@ #include // Complex type in cuda: thrust or cucomplex or cxsmpl -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ // this must be __CUDAC__ (not MGONGPUCPP_GPUIMPL) #if defined MGONGPU_CUCXTYPE_THRUST #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wtautological-compare" // for icpx2021/clang13 (https://stackoverflow.com/a/15864661) @@ -30,8 +30,13 @@ #elif not defined MGONGPU_CUCXTYPE_CXSMPL #error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL #endif +// Complex type in HIP: cxsmpl +#elif defined __HIPCC__ +#if not defined MGONGPU_HIPCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_HIPCXTYPE_CXSMPL +#endif #else -// Complex type in c++: std::complex or cxsmpl +// Complex type in c++ or HIP: std::complex or cxsmpl #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX #include #elif not defined MGONGPU_CPPCXTYPE_CXSMPL @@ -222,7 +227,7 @@ namespace mg5amcCpu #endif { // --- Type definitions (complex type: cxtype) -#ifdef MGONGPUCPP_GPUIMPL // cuda +#ifdef __CUDACC__ // this must be __CUDAC__ (not MGONGPUCPP_GPUIMPL) #if defined MGONGPU_CUCXTYPE_THRUST typedef thrust::complex cxtype; #elif defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -261,7 +266,7 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { -#if defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL +#if defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL //------------------------------ // CUDA or C++ - using cxsmpl @@ -303,11 +308,11 @@ namespace mg5amcCpu return cxmake( c.real(), c.imag() ); } -#endif // #if defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL +#endif // #if defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL //========================================================================== -#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust +#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust (this must be __CUDAC__ and not MGONGPUCPP_GPUIMPL) //------------------------------ // CUDA - using thrust::complex @@ -343,11 +348,11 @@ namespace mg5amcCpu return c; } -#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST +#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST //========================================================================== -#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex 
+#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex (this must be __CUDAC__ and not MGONGPUCPP_GPUIMPL) //------------------------------ // CUDA - using cuComplex @@ -562,11 +567,11 @@ namespace mg5amcCpu return cxmake( c.real(), c.imag() ); } -#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX //========================================================================== -#if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex +#if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++/hip + stdcomplex (this must be __CUDAC__ and not MGONGPUCPP_GPUIMPL) //------------------------------ // C++ - using std::complex @@ -610,7 +615,7 @@ namespace mg5amcCpu } #endif -#endif // #if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX +#endif // #if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX //========================================================================== From d7fd274fb1f8410404aab5839c716bd4f289664d Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Tue, 30 Jan 2024 01:02:35 +0100 Subject: [PATCH 84/96] [jt774] regenerate gg_tt.mad, all ok (including code formatting fixes) --- .../gg_tt.mad/CODEGEN_mad_gg_tt_log.txt | 20 +++++++++---------- .../SubProcesses/P1_gg_ttx/check_sa.cc | 2 +- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt b/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt index 9e8e783b82..111d57034e 100644 --- a/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt +++ b/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.0057849884033203125  +DEBUG: model prefixing takes 0.005807161331176758  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,7 +155,7 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=2: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ WEIGHTED<=2 @1 INFO: Process has 3 diagrams -1 processes with 3 diagrams generated in 0.008 s +1 processes with 3 diagrams generated in 0.009 s Total: 1 processes with 3 diagrams output madevent ../TMPOUT/CODEGEN_mad_gg_tt --hel_recycling=False --vector_size=32 --me_exporter=standalone_cudacpp Load PLUGIN.CUDACPP_OUTPUT @@ -175,7 +175,7 @@ INFO: Generating Helas calls for process: g g > t t~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t t~ @1 INFO: Creating files in directory P1_gg_ttx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -191,16 +191,16 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
INFO: Generating Feynman diagrams for Process: g g > t t~ WEIGHTED<=2 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttx Generated helas calls for 1 subprocesses (3 diagrams) in 0.006 s -Wrote files for 10 helas calls in 0.104 s +Wrote files for 10 helas calls in 0.108 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 2 routines in 0.148 s +ALOHA: aloha creates 2 routines in 0.156 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 4 routines in 0.135 s +ALOHA: aloha creates 4 routines in 0.140 s VVV1 FFV1 FFV1 @@ -237,10 +237,10 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. quit -real 0m1.738s -user 0m1.518s -sys 0m0.218s -Code generation completed in 2 seconds +real 0m1.803s +user 0m1.564s +sys 0m0.229s +Code generation completed in 1 seconds ************************************************************ * * * W E L C O M E to * diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/check_sa.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/check_sa.cc index 6d0aea1166..bde384c69e 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/check_sa.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/check_sa.cc @@ -863,7 +863,7 @@ main( int argc, char** argv ) wrkflwtxt += "/sse4"; #endif #else - wrkflwtxt += "/????"; // no path to this statement + wrkflwtxt += "/????"; // no path to this statement #endif // -- Has cxtype_v::operator[] bracket with non-const reference? 
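A side note on the "-x hip" build-rule change in patches 82-83 (#810): both nvcc and hipcc infer the source language from the file extension, so the .cc files compiled by the $(BUILDDIR)/%_cu.o rule must be forced into GPU mode explicitly. A hypothetical minimal example (not from the patches) of a .cc file that only builds under those flags:

    // kernel.cc - device code kept in a .cc file, as in the %_cu.o rule:
    //   nvcc  -x cu  -c kernel.cc   (CUDA mode, __CUDACC__ defined)
    //   hipcc -x hip -c kernel.cc   (HIP mode, __HIPCC__ defined)
    // Without "-x", both compilers treat .cc as plain host C++ and the
    // __global__ kernel below does not compile.
    #if defined __CUDACC__ || defined __HIPCC__
    __global__ void set42( int* out ) { out[0] = 42; }
    #else
    #error Compile this file with "nvcc -x cu" or "hipcc -x hip"
    #endif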
#if defined MGONGPU_CPPSIMD From b28dff097f32dae5b7eeecd11ebfc8a2fe3597d6 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Tue, 30 Jan 2024 01:07:03 +0100 Subject: [PATCH 85/96] [jt774] regenerate all processes --- .../ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt | 16 ++--- .../SubProcesses/P1_epem_mupmum/check_sa.cc | 14 +++-- .../ee_mumu.mad/SubProcesses/cudacpp.mk | 6 +- epochX/cudacpp/ee_mumu.mad/src/cudacpp_src.mk | 4 +- .../cudacpp/ee_mumu.mad/src/mgOnGpuConfig.h | 7 ++- .../cudacpp/ee_mumu.mad/src/mgOnGpuCxtypes.h | 27 +++++---- .../CODEGEN_cudacpp_ee_mumu_log.txt | 10 ++-- .../P1_Sigma_sm_epem_mupmum/check_sa.cc | 14 +++-- .../ee_mumu.sa/SubProcesses/cudacpp.mk | 6 +- epochX/cudacpp/ee_mumu.sa/src/cudacpp_src.mk | 4 +- epochX/cudacpp/ee_mumu.sa/src/mgOnGpuConfig.h | 7 ++- .../cudacpp/ee_mumu.sa/src/mgOnGpuCxtypes.h | 27 +++++---- .../gg_tt.mad/CODEGEN_mad_gg_tt_log.txt | 16 ++--- .../gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt | 12 ++-- .../P1_Sigma_sm_gg_ttx/check_sa.cc | 14 +++-- .../cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk | 6 +- epochX/cudacpp/gg_tt.sa/src/cudacpp_src.mk | 4 +- epochX/cudacpp/gg_tt.sa/src/mgOnGpuConfig.h | 7 ++- epochX/cudacpp/gg_tt.sa/src/mgOnGpuCxtypes.h | 27 +++++---- .../gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt | 22 +++---- .../SubProcesses/P1_gg_ttx/check_sa.cc | 14 +++-- .../SubProcesses/P2_gg_ttxg/check_sa.cc | 14 +++-- .../gg_tt01g.mad/SubProcesses/cudacpp.mk | 6 +- .../cudacpp/gg_tt01g.mad/src/cudacpp_src.mk | 4 +- .../cudacpp/gg_tt01g.mad/src/mgOnGpuConfig.h | 7 ++- .../cudacpp/gg_tt01g.mad/src/mgOnGpuCxtypes.h | 27 +++++---- .../gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt | 18 +++--- .../SubProcesses/P1_gg_ttxg/check_sa.cc | 14 +++-- .../gg_ttg.mad/SubProcesses/cudacpp.mk | 6 +- epochX/cudacpp/gg_ttg.mad/src/cudacpp_src.mk | 4 +- epochX/cudacpp/gg_ttg.mad/src/mgOnGpuConfig.h | 7 ++- .../cudacpp/gg_ttg.mad/src/mgOnGpuCxtypes.h | 27 +++++---- .../gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt | 14 ++--- .../P1_Sigma_sm_gg_ttxg/check_sa.cc | 14 +++-- .../cudacpp/gg_ttg.sa/SubProcesses/cudacpp.mk | 6 +- epochX/cudacpp/gg_ttg.sa/src/cudacpp_src.mk | 4 +- epochX/cudacpp/gg_ttg.sa/src/mgOnGpuConfig.h | 7 ++- epochX/cudacpp/gg_ttg.sa/src/mgOnGpuCxtypes.h | 27 +++++---- .../gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt | 20 +++---- .../SubProcesses/P1_gg_ttxgg/check_sa.cc | 14 +++-- .../gg_ttgg.mad/SubProcesses/cudacpp.mk | 6 +- epochX/cudacpp/gg_ttgg.mad/src/cudacpp_src.mk | 4 +- .../cudacpp/gg_ttgg.mad/src/mgOnGpuConfig.h | 7 ++- .../cudacpp/gg_ttgg.mad/src/mgOnGpuCxtypes.h | 27 +++++---- .../CODEGEN_cudacpp_gg_ttgg_log.txt | 14 ++--- .../P1_Sigma_sm_gg_ttxgg/check_sa.cc | 14 +++-- .../gg_ttgg.sa/SubProcesses/cudacpp.mk | 6 +- epochX/cudacpp/gg_ttgg.sa/src/cudacpp_src.mk | 4 +- epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuConfig.h | 7 ++- .../cudacpp/gg_ttgg.sa/src/mgOnGpuCxtypes.h | 27 +++++---- .../gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt | 22 +++---- .../SubProcesses/P1_gg_ttxggg/check_sa.cc | 14 +++-- .../gg_ttggg.mad/SubProcesses/cudacpp.mk | 6 +- .../cudacpp/gg_ttggg.mad/src/cudacpp_src.mk | 4 +- .../cudacpp/gg_ttggg.mad/src/mgOnGpuConfig.h | 7 ++- .../cudacpp/gg_ttggg.mad/src/mgOnGpuCxtypes.h | 27 +++++---- .../CODEGEN_cudacpp_gg_ttggg_log.txt | 14 ++--- .../P1_Sigma_sm_gg_ttxggg/check_sa.cc | 14 +++-- .../gg_ttggg.sa/SubProcesses/cudacpp.mk | 6 +- epochX/cudacpp/gg_ttggg.sa/src/cudacpp_src.mk | 4 +- .../cudacpp/gg_ttggg.sa/src/mgOnGpuConfig.h | 7 ++- .../cudacpp/gg_ttggg.sa/src/mgOnGpuCxtypes.h | 27 +++++---- .../gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt | 18 +++--- 
.../SubProcesses/P1_gu_ttxu/check_sa.cc | 14 +++-- .../SubProcesses/P1_gux_ttxux/check_sa.cc | 14 +++-- .../gq_ttq.mad/SubProcesses/cudacpp.mk | 6 +- epochX/cudacpp/gq_ttq.mad/src/cudacpp_src.mk | 4 +- epochX/cudacpp/gq_ttq.mad/src/mgOnGpuConfig.h | 7 ++- .../cudacpp/gq_ttq.mad/src/mgOnGpuCxtypes.h | 27 +++++---- .../gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt | 16 ++--- .../P1_Sigma_sm_gu_ttxu/check_sa.cc | 14 +++-- .../P1_Sigma_sm_gux_ttxux/check_sa.cc | 14 +++-- .../cudacpp/gq_ttq.sa/SubProcesses/cudacpp.mk | 6 +- epochX/cudacpp/gq_ttq.sa/src/cudacpp_src.mk | 4 +- epochX/cudacpp/gq_ttq.sa/src/mgOnGpuConfig.h | 7 ++- epochX/cudacpp/gq_ttq.sa/src/mgOnGpuCxtypes.h | 27 +++++---- .../CODEGEN_cudacpp_heft_gg_h_log.txt | 8 +-- .../P1_Sigma_heft_gg_h/check_sa.cc | 14 +++-- .../heft_gg_h.sa/SubProcesses/cudacpp.mk | 6 +- .../cudacpp/heft_gg_h.sa/src/cudacpp_src.mk | 8 +-- .../cudacpp/heft_gg_h.sa/src/mgOnGpuConfig.h | 7 ++- .../cudacpp/heft_gg_h.sa/src/mgOnGpuCxtypes.h | 27 +++++---- .../CODEGEN_mad_pp_tt012j_log.txt | 60 +++++++++---------- .../SubProcesses/P0_gg_ttx/check_sa.cc | 14 +++-- .../SubProcesses/P0_uux_ttx/check_sa.cc | 14 +++-- .../SubProcesses/P1_gg_ttxg/check_sa.cc | 14 +++-- .../SubProcesses/P1_gu_ttxu/check_sa.cc | 14 +++-- .../SubProcesses/P1_gux_ttxux/check_sa.cc | 14 +++-- .../SubProcesses/P1_uux_ttxg/check_sa.cc | 14 +++-- .../SubProcesses/P2_gg_ttxgg/check_sa.cc | 14 +++-- .../SubProcesses/P2_gg_ttxuux/check_sa.cc | 14 +++-- .../SubProcesses/P2_gu_ttxgu/check_sa.cc | 14 +++-- .../SubProcesses/P2_gux_ttxgux/check_sa.cc | 14 +++-- .../SubProcesses/P2_uc_ttxuc/check_sa.cc | 14 +++-- .../SubProcesses/P2_ucx_ttxucx/check_sa.cc | 14 +++-- .../SubProcesses/P2_uu_ttxuu/check_sa.cc | 14 +++-- .../SubProcesses/P2_uux_ttxccx/check_sa.cc | 14 +++-- .../SubProcesses/P2_uux_ttxgg/check_sa.cc | 14 +++-- .../SubProcesses/P2_uux_ttxuux/check_sa.cc | 14 +++-- .../SubProcesses/P2_uxcx_ttxuxcx/check_sa.cc | 14 +++-- .../SubProcesses/P2_uxux_ttxuxux/check_sa.cc | 14 +++-- .../pp_tt012j.mad/SubProcesses/cudacpp.mk | 6 +- .../cudacpp/pp_tt012j.mad/src/cudacpp_src.mk | 4 +- .../cudacpp/pp_tt012j.mad/src/mgOnGpuConfig.h | 7 ++- .../pp_tt012j.mad/src/mgOnGpuCxtypes.h | 27 +++++---- 105 files changed, 812 insertions(+), 564 deletions(-) diff --git a/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt b/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt index b9e8f6df36..b79a051c06 100644 --- a/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt +++ b/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt @@ -62,7 +62,7 @@ generate e+ e- > mu+ mu- No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005430698394775391  +DEBUG: model prefixing takes 0.00582122802734375  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -174,7 +174,7 @@ INFO: Generating Helas calls for process: e+ e- > mu+ mu- WEIGHTED<=4 @1 INFO: Processing color information for process: e+ e- > mu+ mu- @1 INFO: Creating files in directory P1_epem_mupmum DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -191,19 +191,19 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
INFO: Generating Feynman diagrams for Process: e+ e- > mu+ mu- WEIGHTED<=4 @1 INFO: Finding symmetric diagrams for subprocess group epem_mupmum Generated helas calls for 1 subprocesses (2 diagrams) in 0.004 s -Wrote files for 8 helas calls in 0.101 s +Wrote files for 8 helas calls in 0.104 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFV2 routines ALOHA: aloha creates FFV4 routines -ALOHA: aloha creates 3 routines in 0.205 s +ALOHA: aloha creates 3 routines in 0.214 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFV2 routines ALOHA: aloha creates FFV4 routines ALOHA: aloha creates FFV2_4 routines -ALOHA: aloha creates 7 routines in 0.260 s +ALOHA: aloha creates 7 routines in 0.272 s FFV1 FFV1 FFV2 @@ -248,9 +248,9 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. quit -real 0m1.923s -user 0m1.688s -sys 0m0.227s +real 0m1.986s +user 0m1.756s +sys 0m0.219s Code generation completed in 2 seconds ************************************************************ * * diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/check_sa.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/check_sa.cc index aab490dc5b..bde384c69e 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/check_sa.cc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/check_sa.cc @@ -789,7 +789,7 @@ main( int argc, char** argv ) #else wrkflwtxt += "???+"; // no path to this statement #endif /* clang-format on */ - // -- CUCOMPLEX or THRUST or STD complex numbers? + // -- CUCOMPLEX or THRUST or STD or CXSIMPLE complex numbers? #ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX wrkflwtxt += "CUX:"; @@ -801,7 +801,7 @@ main( int argc, char** argv ) wrkflwtxt += "???:"; // no path to this statement #endif #elif defined __HIPCC__ -#if defined MGONGPU_CUCXTYPE_CXSMPL +#if defined MGONGPU_HIPCXTYPE_CXSMPL wrkflwtxt += "CXS:"; #else wrkflwtxt += "???:"; // no path to this statement @@ -863,7 +863,7 @@ main( int argc, char** argv ) wrkflwtxt += "/sse4"; #endif #else - wrkflwtxt += "/????"; // no path to this statement + wrkflwtxt += "/????"; // no path to this statement #endif // -- Has cxtype_v::operator[] bracket with non-const reference? #if defined MGONGPU_CPPSIMD @@ -940,7 +940,9 @@ main( int argc, char** argv ) << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#elif defined MGONGPU_CUCXTYPE_CXSMPL +#elif defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL + << "Complex type = CXSIMPLE" << std::endl +#elif defined MGONGPU_CPPCXTYPE_STDCOMPLEX << "Complex type = STD::COMPLEX" << std::endl #else << "Complex type = ???" << std::endl // no path to this statement... @@ -1081,7 +1083,9 @@ main( int argc, char** argv ) << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#elif defined MGONGPU_CUCXTYPE_CXSMPL +#elif defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL + << "\"CXSIMPLE\"," << std::endl +#elif defined MGONGPU_CPPCXTYPE_STDCOMPLEX << "\"STD::COMPLEX\"," << std::endl #else << "\"???\"," << std::endl // no path to this statement...
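The check_sa.cc hunks above, which recur identically in every process directory below, rework the reporting of the complex-type choice: the three per-backend CXSMPL macros now share a single CXSIMPLE branch, and STD::COMPLEX is selected via MGONGPU_CPPCXTYPE_STDCOMPLEX. A minimal standalone sketch of the corrected #elif chain (not part of the patch; it uses only the macros shown in the hunks):
//--------------------------------------------------------------------------
// Sketch only: compile with e.g. "g++ -DMGONGPU_CPPCXTYPE_CXSMPL sketch.cc"
// to see which branch a given MGONGPU_*CXTYPE_* choice selects.
#include <iostream>
int main()
{
#if defined MGONGPU_CUCXTYPE_CUCOMPLEX
  std::cout << "Complex type = CUCOMPLEX" << std::endl;
#elif defined MGONGPU_CUCXTYPE_THRUST
  std::cout << "Complex type = THRUST::COMPLEX" << std::endl;
#elif defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL
  std::cout << "Complex type = CXSIMPLE" << std::endl;
#elif defined MGONGPU_CPPCXTYPE_STDCOMPLEX
  std::cout << "Complex type = STD::COMPLEX" << std::endl;
#else
  std::cout << "Complex type = ???" << std::endl; // no path to this statement in a valid configuration
#endif
  return 0;
}
//--------------------------------------------------------------------------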
diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk index 117edc1782..df74dfc284 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk @@ -169,7 +169,7 @@ ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). MADGRAPH_CUDA_ARCHITECTURE ?= 70 ###CUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 - ###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 + ###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 comma:=, CUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) CUINC = -I$(CUDA_HOME)/include/ @@ -224,7 +224,7 @@ else ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),) GPUFLAGS += -std=c++17 ###GPUFLAGS+= --maxrregcount 255 # (AV: is this option valid on HIP and meaningful on AMD GPUs?) CUBUILDRULEFLAGS = -fPIC -c - CCBUILDRULEFLAGS = -fPIC -c + CCBUILDRULEFLAGS = -fPIC -c -x hip else ifneq ($(origin REQUIRE_HIP),undefined) @@ -556,6 +556,7 @@ $(BUILDDIR)/.build.$(TAG): @touch $(BUILDDIR)/.build.$(TAG) # Generic target and build rules: objects from CUDA compilation +# NB: CCBUILDRULEFLAGS includes "-x cu" for nvcc and "-x hip" for hipcc (#810) ifneq ($(GPUCC),) $(BUILDDIR)/%.o : %.cu *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi @@ -565,7 +566,6 @@ $(BUILDDIR)/%_cu.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CCBUILDRULEFLAGS) $< -o $@ endif -# -x cu in line above # Generic target and build rules: objects from C++ compilation # (NB do not include CUINC here! 
add it only for NVTX or curand #679) diff --git a/epochX/cudacpp/ee_mumu.mad/src/cudacpp_src.mk b/epochX/cudacpp/ee_mumu.mad/src/cudacpp_src.mk index 159e19a46d..b2b9da5288 100644 --- a/epochX/cudacpp/ee_mumu.mad/src/cudacpp_src.mk +++ b/epochX/cudacpp/ee_mumu.mad/src/cudacpp_src.mk @@ -92,11 +92,11 @@ endif ###$(info OMPFLAGS=$(OMPFLAGS)) CXXFLAGS += $(OMPFLAGS) -# Add correct -DHIP_LATFORM when compiling for HIP +# Add correct flags for nvcc (-x cu) and hipcc (-x hip) for GPU code (see #810) ifeq ($(findstring nvcc,$(GPUCC)),nvcc) GPUFLAGS += -Xcompiler -fPIC -c -x cu else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) - GPUFLAGS += -fPIC -c + GPUFLAGS += -fPIC -c -x hip endif # Set the build flags appropriate to each AVX choice (example: "make AVX=none") diff --git a/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuConfig.h index 69cee0085b..6bde4466d0 100644 --- a/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuConfig.h @@ -70,16 +70,19 @@ ////#define MGONGPU_HARDCODE_PARAM 1 // Complex type in CUDA: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) +// (NB THIS IS MGONGPU_*CU*CXTYPE_xxx) #ifdef __CUDACC__ #define MGONGPU_CUCXTYPE_THRUST 1 // default (~1.15E9/double, ~3.2E9/float) //#define MGONGPU_CUCXTYPE_CUCOMPLEX 1 // ~10 percent slower (1.03E9/double, ~2.8E9/float) //#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) -// Complex type in HIP: cxsmpl (ONLY ONE OPTION POSSIBLE) +// Complex type in HIP: cxsmpl (ONLY ONE OPTION POSSIBLE? #810) +// (NB THIS IS MGONGPU_*HIP*CXTYPE_xxx) #elif defined __HIPCC__ -#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) +#define MGONGPU_HIPCXTYPE_CXSMPL 1 // default for HIP // Complex type in C++: std::complex or cxsmpl (CHOOSE ONLY ONE) +// (NB THIS IS MGONGPU_*CPP*CXTYPE_xxx) #else //#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) #define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) diff --git a/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuCxtypes.h b/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuCxtypes.h index 5532e22fa1..7ede1dbfae 100644 --- a/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuCxtypes.h @@ -19,7 +19,7 @@ #include // Complex type in cuda: thrust or cucomplex or cxsmpl -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) #if defined MGONGPU_CUCXTYPE_THRUST #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wtautological-compare" // for icpx2021/clang13 (https://stackoverflow.com/a/15864661) @@ -30,8 +30,13 @@ #elif not defined MGONGPU_CUCXTYPE_CXSMPL #error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL #endif +// Complex type in HIP: cxsmpl +#elif defined __HIPCC__ +#if not defined MGONGPU_HIPCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_HIPCXTYPE_CXSMPL +#endif #else -// Complex type in c++: std::complex or cxsmpl +// Complex type in c++ or HIP: std::complex or cxsmpl #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX #include #elif not defined MGONGPU_CPPCXTYPE_CXSMPL @@ -222,7 +227,7 @@ namespace mg5amcCpu #endif { // --- Type definitions (complex type: cxtype) -#ifdef MGONGPUCPP_GPUIMPL // cuda +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) #if defined MGONGPU_CUCXTYPE_THRUST typedef thrust::complex
cxtype; #elif defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -261,7 +266,7 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { -#if defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL +#if defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL //------------------------------ // CUDA or C++ - using cxsmpl @@ -303,11 +308,11 @@ namespace mg5amcCpu return cxmake( c.real(), c.imag() ); } -#endif // #if defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL +#endif // #if defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL //========================================================================== -#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust +#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust (this must be __CUDACC__ and not MGONGPUCPP_GPUIMPL) //------------------------------ // CUDA - using thrust::complex @@ -343,11 +348,11 @@ namespace mg5amcCpu return c; } -#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST +#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST //========================================================================== -#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex +#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex (this must be __CUDACC__ and not MGONGPUCPP_GPUIMPL) //------------------------------ // CUDA - using cuComplex @@ -562,11 +567,11 @@ namespace mg5amcCpu return cxmake( c.real(), c.imag() ); } -#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX //========================================================================== -#if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex +#if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++/hip + stdcomplex (this must be __CUDACC__ and not MGONGPUCPP_GPUIMPL) //------------------------------ // C++ - using std::complex @@ -610,7 +615,7 @@ namespace mg5amcCpu } #endif -#endif // #if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX +#endif // #if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX //========================================================================== diff --git a/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt b/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt index e9dbec802c..fe21d36197 100644 --- a/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt +++ b/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt @@ -62,7 +62,7 @@ generate e+ e- > mu+ mu- No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.0057604312896728516  +DEBUG: model prefixing takes 0.005848884582519531  INFO: Restrict model sm with file models/sm/restrict_default.dat .
DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -181,7 +181,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFV2 routines ALOHA: aloha creates FFV4 routines ALOHA: aloha creates FFV2_4 routines -ALOHA: aloha creates 4 routines in 0.277 s +ALOHA: aloha creates 4 routines in 0.282 s FFV1 FFV1 FFV2 @@ -200,7 +200,7 @@ INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/. quit -real 0m0.683s -user 0m0.604s -sys 0m0.071s +real 0m0.695s +user 0m0.628s +sys 0m0.057s Code generation completed in 1 seconds diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/check_sa.cc b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/check_sa.cc index aab490dc5b..bde384c69e 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/check_sa.cc +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/check_sa.cc @@ -789,7 +789,7 @@ main( int argc, char** argv ) #else wrkflwtxt += "???+"; // no path to this statement #endif /* clang-format on */ - // -- CUCOMPLEX or THRUST or STD complex numbers? + // -- CUCOMPLEX or THRUST or STD or CXSIMPLE complex numbers? #ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX wrkflwtxt += "CUX:"; @@ -801,7 +801,7 @@ main( int argc, char** argv ) wrkflwtxt += "???:"; // no path to this statement #endif #elif defined __HIPCC__ -#if defined MGONGPU_CUCXTYPE_CXSMPL +#if defined MGONGPU_HIPCXTYPE_CXSMPL wrkflwtxt += "CXS:"; #else wrkflwtxt += "???:"; // no path to this statement @@ -863,7 +863,7 @@ main( int argc, char** argv ) wrkflwtxt += "/sse4"; #endif #else - wrkflwtxt += "/????"; // no path to this statement + wrkflwtxt += "/????"; // no path to this statement #endif // -- Has cxtype_v::operator[] bracket with non-const reference? #if defined MGONGPU_CPPSIMD @@ -940,7 +940,9 @@ main( int argc, char** argv ) << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#elif defined MGONGPU_CUCXTYPE_CXSMPL +#elif defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL + << "Complex type = CXSIMPLE" << std::endl +#elif defined MGONGPU_CPPCXTYPE_STDCOMPLEX << "Complex type = STD::COMPLEX" << std::endl #else << "Complex type = ???" << std::endl // no path to this statement... @@ -1081,7 +1083,9 @@ main( int argc, char** argv ) << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#elif defined MGONGPU_CUCXTYPE_CXSMPL +#elif defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL + << "\"CXSIMPLE\"," << std::endl +#elif defined MGONGPU_CPPCXTYPE_STDCOMPLEX << "\"STD::COMPLEX\"," << std::endl #else << "\"???\"," << std::endl // no path to this statement... diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp.mk index 117edc1782..df74dfc284 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp.mk @@ -169,7 +169,7 @@ ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity).
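# (Sketch, not part of the patch.) MADGRAPH_CUDA_ARCHITECTURE below may also be a
# comma-separated list: the CUARCHFLAGS foreach splits it on commas and emits one
# pair of -gencode options per entry (the multi-GPU support of #533), e.g.
# MADGRAPH_CUDA_ARCHITECTURE=70,80 expands to
#   -gencode arch=compute_70,code=compute_70 -gencode arch=compute_70,code=sm_70
#   -gencode arch=compute_80,code=compute_80 -gencode arch=compute_80,code=sm_80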
MADGRAPH_CUDA_ARCHITECTURE ?= 70 ###CUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 - ###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 + ###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 comma:=, CUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) CUINC = -I$(CUDA_HOME)/include/ @@ -224,7 +224,7 @@ else ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),) GPUFLAGS += -std=c++17 ###GPUFLAGS+= --maxrregcount 255 # (AV: is this option valid on HIP and meaningful on AMD GPUs?) CUBUILDRULEFLAGS = -fPIC -c - CCBUILDRULEFLAGS = -fPIC -c + CCBUILDRULEFLAGS = -fPIC -c -x hip else ifneq ($(origin REQUIRE_HIP),undefined) @@ -556,6 +556,7 @@ $(BUILDDIR)/.build.$(TAG): @touch $(BUILDDIR)/.build.$(TAG) # Generic target and build rules: objects from CUDA compilation +# NB: CCBUILDRULEFLAGS includes "-x cu" for nvcc and "-x hip" for hipcc (#810) ifneq ($(GPUCC),) $(BUILDDIR)/%.o : %.cu *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi @@ -565,7 +566,6 @@ $(BUILDDIR)/%_cu.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CCBUILDRULEFLAGS) $< -o $@ endif -# -x cu in line above # Generic target and build rules: objects from C++ compilation # (NB do not include CUINC here! 
add it only for NVTX or curand #679) diff --git a/epochX/cudacpp/ee_mumu.sa/src/cudacpp_src.mk b/epochX/cudacpp/ee_mumu.sa/src/cudacpp_src.mk index 159e19a46d..b2b9da5288 100644 --- a/epochX/cudacpp/ee_mumu.sa/src/cudacpp_src.mk +++ b/epochX/cudacpp/ee_mumu.sa/src/cudacpp_src.mk @@ -92,11 +92,11 @@ endif ###$(info OMPFLAGS=$(OMPFLAGS)) CXXFLAGS += $(OMPFLAGS) -# Add correct -DHIP_LATFORM when compiling for HIP +# Add correct flags for nvcc (-x cu) and hipcc (-x hip) for GPU code (see #810) ifeq ($(findstring nvcc,$(GPUCC)),nvcc) GPUFLAGS += -Xcompiler -fPIC -c -x cu else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) - GPUFLAGS += -fPIC -c + GPUFLAGS += -fPIC -c -x hip endif # Set the build flags appropriate to each AVX choice (example: "make AVX=none") diff --git a/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuConfig.h b/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuConfig.h index 06787c1c5e..475749ca7c 100644 --- a/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuConfig.h @@ -70,16 +70,19 @@ ////#define MGONGPU_HARDCODE_PARAM 1 // Complex type in CUDA: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) +// (NB THIS IS MGONGPU_*CU*CXTYPE_xxx) #ifdef __CUDACC__ #define MGONGPU_CUCXTYPE_THRUST 1 // default (~1.15E9/double, ~3.2E9/float) //#define MGONGPU_CUCXTYPE_CUCOMPLEX 1 // ~10 percent slower (1.03E9/double, ~2.8E9/float) //#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) -// Complex type in HIP: cxsmpl (ONLY ONE OPTION POSSIBLE) +// Complex type in HIP: cxsmpl (ONLY ONE OPTION POSSIBLE? #810) +// (NB THIS IS MGONGPU_*HIP*CXTYPE_xxx) #elif defined __HIPCC__ -#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) +#define MGONGPU_HIPCXTYPE_CXSMPL 1 // default for HIP // Complex type in C++: std::complex or cxsmpl (CHOOSE ONLY ONE) +// (NB THIS IS MGONGPU_*CPP*CXTYPE_xxx) #else //#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) #define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) diff --git a/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuCxtypes.h b/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuCxtypes.h index 5532e22fa1..7ede1dbfae 100644 --- a/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuCxtypes.h @@ -19,7 +19,7 @@ #include // Complex type in cuda: thrust or cucomplex or cxsmpl -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) #if defined MGONGPU_CUCXTYPE_THRUST #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wtautological-compare" // for icpx2021/clang13 (https://stackoverflow.com/a/15864661) @@ -30,8 +30,13 @@ #elif not defined MGONGPU_CUCXTYPE_CXSMPL #error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL #endif +// Complex type in HIP: cxsmpl +#elif defined __HIPCC__ +#if not defined MGONGPU_HIPCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_HIPCXTYPE_CXSMPL +#endif #else -// Complex type in c++: std::complex or cxsmpl +// Complex type in c++ or HIP: std::complex or cxsmpl #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX #include #elif not defined MGONGPU_CPPCXTYPE_CXSMPL @@ -222,7 +227,7 @@ namespace mg5amcCpu #endif { // --- Type definitions (complex type: cxtype) -#ifdef MGONGPUCPP_GPUIMPL // cuda +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) #if defined MGONGPU_CUCXTYPE_THRUST typedef thrust::complex cxtype; #elif
defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -261,7 +266,7 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { -#if defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL +#if defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL //------------------------------ // CUDA or C++ - using cxsmpl @@ -303,11 +308,11 @@ namespace mg5amcCpu return cxmake( c.real(), c.imag() ); } -#endif // #if defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL +#endif // #if defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL //========================================================================== -#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust +#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust (this must be __CUDACC__ and not MGONGPUCPP_GPUIMPL) //------------------------------ // CUDA - using thrust::complex @@ -343,11 +348,11 @@ namespace mg5amcCpu return c; } -#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST +#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST //========================================================================== -#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex +#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex (this must be __CUDACC__ and not MGONGPUCPP_GPUIMPL) //------------------------------ // CUDA - using cuComplex @@ -562,11 +567,11 @@ namespace mg5amcCpu return cxmake( c.real(), c.imag() ); } -#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX //========================================================================== -#if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex +#if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++/hip + stdcomplex (this must be __CUDACC__ and not MGONGPUCPP_GPUIMPL) //------------------------------ // C++ - using std::complex @@ -610,7 +615,7 @@ namespace mg5amcCpu } #endif -#endif // #if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX +#endif // #if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX //========================================================================== diff --git a/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt b/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt index 111d57034e..83e06dd090 100644 --- a/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt +++ b/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005807161331176758  +DEBUG: model prefixing takes 0.0058765411376953125  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -175,7 +175,7 @@ INFO: Generating Helas calls for process: g g > t t~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t t~ @1 INFO: Creating files in directory P1_gg_ttx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory .
FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -195,12 +195,12 @@ Wrote files for 10 helas calls in 0.108 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 2 routines in 0.156 s +ALOHA: aloha creates 2 routines in 0.154 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 4 routines in 0.140 s +ALOHA: aloha creates 4 routines in 0.139 s VVV1 FFV1 FFV1 @@ -237,10 +237,10 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. quit -real 0m1.803s -user 0m1.564s -sys 0m0.229s -Code generation completed in 1 seconds +real 0m1.802s +user 0m1.568s +sys 0m0.224s +Code generation completed in 2 seconds ************************************************************ * * * W E L C O M E to * diff --git a/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt b/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt index 5091b00a3e..443c1e7506 100644 --- a/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt +++ b/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005614757537841797  +DEBUG: model prefixing takes 0.005812883377075195  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,7 +155,7 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=2: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ WEIGHTED<=2 @1 INFO: Process has 3 diagrams -1 processes with 3 diagrams generated in 0.008 s +1 processes with 3 diagrams generated in 0.009 s Total: 1 processes with 3 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gg_tt Load PLUGIN.CUDACPP_OUTPUT @@ -180,7 +180,7 @@ Generated helas calls for 1 subprocesses (3 diagrams) in 0.006 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 2 routines in 0.147 s +ALOHA: aloha creates 2 routines in 0.152 s VVV1 FFV1 FFV1 @@ -195,7 +195,7 @@ INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/. quit -real 0m0.552s -user 0m0.489s -sys 0m0.054s +real 0m0.561s +user 0m0.502s +sys 0m0.055s Code generation completed in 1 seconds diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/check_sa.cc b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/check_sa.cc index aab490dc5b..bde384c69e 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/check_sa.cc +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/check_sa.cc @@ -789,7 +789,7 @@ main( int argc, char** argv ) #else wrkflwtxt += "???+"; // no path to this statement #endif /* clang-format on */ - // -- CUCOMPLEX or THRUST or STD complex numbers? + // -- CUCOMPLEX or THRUST or STD or CXSIMPLE complex numbers? 
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX wrkflwtxt += "CUX:"; @@ -801,7 +801,7 @@ main( int argc, char** argv ) wrkflwtxt += "???:"; // no path to this statement #endif #elif defined __HIPCC__ -#if defined MGONGPU_CUCXTYPE_CXSMPL +#if defined MGONGPU_HIPCXTYPE_CXSMPL wrkflwtxt += "CXS:"; #else wrkflwtxt += "???:"; // no path to this statement @@ -863,7 +863,7 @@ main( int argc, char** argv ) wrkflwtxt += "/sse4"; #endif #else - wrkflwtxt += "/????"; // no path to this statement + wrkflwtxt += "/????"; // no path to this statement #endif // -- Has cxtype_v::operator[] bracket with non-const reference? #if defined MGONGPU_CPPSIMD @@ -940,7 +940,9 @@ main( int argc, char** argv ) << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#elif defined MGONGPU_CUCXTYPE_CXSMPL +#elif defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL + << "Complex type = CXSIMPLE" << std::endl +#elif defined MGONGPU_CPPCXTYPE_STDCOMPLEX << "Complex type = STD::COMPLEX" << std::endl #else << "Complex type = ???" << std::endl // no path to this statement... @@ -1081,7 +1083,9 @@ main( int argc, char** argv ) << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#elif defined MGONGPU_CUCXTYPE_CXSMPL +#elif defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL + << "\"CXSIMPLE\"," << std::endl +#elif defined MGONGPU_CPPCXTYPE_STDCOMPLEX << "\"STD::COMPLEX\"," << std::endl #else << "\"???\"," << std::endl // no path to this statement... diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk index 117edc1782..df74dfc284 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk @@ -169,7 +169,7 @@ ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). MADGRAPH_CUDA_ARCHITECTURE ?= 70 ###CUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 - ###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 + ###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 comma:=, CUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) CUINC = -I$(CUDA_HOME)/include/ @@ -224,7 +224,7 @@ else ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),) GPUFLAGS += -std=c++17 ###GPUFLAGS+= --maxrregcount 255 # (AV: is this option valid on HIP and meaningful on AMD GPUs?)
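# (Sketch, not part of the patch.) The "-x hip" added to CCBUILDRULEFLAGS just below
# forces hipcc to compile the shared .cc sources as HIP code; without it they are
# treated as plain host C++ and no device code is generated. This mirrors the nvcc
# branch, whose CCBUILDRULEFLAGS carries "-x cu" (see #810). Hypothetical invocations
# for one shared source file, following the %_cu.o rule further below:
#   nvcc  $(CPPFLAGS) $(GPUFLAGS) -Xcompiler -fPIC -c -x cu  CPPProcess.cc -o CPPProcess_cu.o
#   hipcc $(CPPFLAGS) $(GPUFLAGS) -fPIC -c -x hip CPPProcess.cc -o CPPProcess_cu.o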
CUBUILDRULEFLAGS = -fPIC -c - CCBUILDRULEFLAGS = -fPIC -c + CCBUILDRULEFLAGS = -fPIC -c -x hip else ifneq ($(origin REQUIRE_HIP),undefined) @@ -556,6 +556,7 @@ $(BUILDDIR)/.build.$(TAG): @touch $(BUILDDIR)/.build.$(TAG) # Generic target and build rules: objects from CUDA compilation +# NB: CCBUILDRULEFLAGS includes "-x cu" for nvcc and "-x hip" for hipcc (#810) ifneq ($(GPUCC),) $(BUILDDIR)/%.o : %.cu *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi @@ -565,7 +566,6 @@ $(BUILDDIR)/%_cu.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CCBUILDRULEFLAGS) $< -o $@ endif -# -x cu in line above # Generic target and build rules: objects from C++ compilation # (NB do not include CUINC here! add it only for NVTX or curand #679) diff --git a/epochX/cudacpp/gg_tt.sa/src/cudacpp_src.mk b/epochX/cudacpp/gg_tt.sa/src/cudacpp_src.mk index 159e19a46d..b2b9da5288 100644 --- a/epochX/cudacpp/gg_tt.sa/src/cudacpp_src.mk +++ b/epochX/cudacpp/gg_tt.sa/src/cudacpp_src.mk @@ -92,11 +92,11 @@ endif ###$(info OMPFLAGS=$(OMPFLAGS)) CXXFLAGS += $(OMPFLAGS) -# Add correct -DHIP_LATFORM when compiling for HIP +# Add correct flags for nvcc (-x cu) and hipcc (-x hip) for GPU code (see #810) ifeq ($(findstring nvcc,$(GPUCC)),nvcc) GPUFLAGS += -Xcompiler -fPIC -c -x cu else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) - GPUFLAGS += -fPIC -c + GPUFLAGS += -fPIC -c -x hip endif # Set the build flags appropriate to each AVX choice (example: "make AVX=none") diff --git a/epochX/cudacpp/gg_tt.sa/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_tt.sa/src/mgOnGpuConfig.h index 06787c1c5e..475749ca7c 100644 --- a/epochX/cudacpp/gg_tt.sa/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/gg_tt.sa/src/mgOnGpuConfig.h @@ -70,16 +70,19 @@ ////#define MGONGPU_HARDCODE_PARAM 1 // Complex type in CUDA: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) +// (NB THIS IS MGONGPU_*CU*CXTYPE_xxx) #ifdef __CUDACC__ #define MGONGPU_CUCXTYPE_THRUST 1 // default (~1.15E9/double, ~3.2E9/float) //#define MGONGPU_CUCXTYPE_CUCOMPLEX 1 // ~10 percent slower (1.03E9/double, ~2.8E9/float) //#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) -// Complex type in HIP: cxsmpl (ONLY ONE OPTION POSSIBLE) +// Complex type in HIP: cxsmpl (ONLY ONE OPTION POSSIBLE? 
#810) +// (NB THIS IS MGONGPU_*HIP*CXTYPE_xxx) #elif defined __HIPCC__ -#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) +#define MGONGPU_HIPCXTYPE_CXSMPL 1 // default for HIP // Complex type in C++: std::complex or cxsmpl (CHOOSE ONLY ONE) +// (NB THIS IS MGONGPU_*CPP*CXTYPE_xxx) #else //#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) #define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) diff --git a/epochX/cudacpp/gg_tt.sa/src/mgOnGpuCxtypes.h b/epochX/cudacpp/gg_tt.sa/src/mgOnGpuCxtypes.h index 5532e22fa1..7ede1dbfae 100644 --- a/epochX/cudacpp/gg_tt.sa/src/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/gg_tt.sa/src/mgOnGpuCxtypes.h @@ -19,7 +19,7 @@ #include // Complex type in cuda: thrust or cucomplex or cxsmpl -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) #if defined MGONGPU_CUCXTYPE_THRUST #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wtautological-compare" // for icpx2021/clang13 (https://stackoverflow.com/a/15864661) @@ -30,8 +30,13 @@ #elif not defined MGONGPU_CUCXTYPE_CXSMPL #error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL #endif +// Complex type in HIP: cxsmpl +#elif defined __HIPCC__ +#if not defined MGONGPU_HIPCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_HIPCXTYPE_CXSMPL +#endif #else -// Complex type in c++: std::complex or cxsmpl +// Complex type in c++ or HIP: std::complex or cxsmpl #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX #include #elif not defined MGONGPU_CPPCXTYPE_CXSMPL @@ -222,7 +227,7 @@ namespace mg5amcCpu #endif { // --- Type definitions (complex type: cxtype) -#ifdef MGONGPUCPP_GPUIMPL // cuda +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) #if defined MGONGPU_CUCXTYPE_THRUST typedef thrust::complex cxtype; #elif defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -261,7 +266,7 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { -#if defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL +#if defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL //------------------------------ // CUDA or C++ - using cxsmpl @@ -303,11 +308,11 @@ namespace mg5amcCpu return cxmake( c.real(), c.imag() ); } -#endif // #if defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL +#endif // #if defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL //========================================================================== -#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust +#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust (this must be __CUDACC__ and not MGONGPUCPP_GPUIMPL) //------------------------------ // CUDA - using thrust::complex @@ -343,11 +348,11 @@ namespace mg5amcCpu return c; } -#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST +#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST //========================================================================== -#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex +#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex (this must be __CUDACC__ and not MGONGPUCPP_GPUIMPL) //------------------------------ // CUDA - using cuComplex @@ -562,11 +567,11 @@
namespace mg5amcCpu return cxmake( c.real(), c.imag() ); } -#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX //========================================================================== -#if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex +#if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++/hip + stdcomplex (this must be __CUDACC__ and not MGONGPUCPP_GPUIMPL) //------------------------------ // C++ - using std::complex @@ -610,7 +615,7 @@ namespace mg5amcCpu } #endif -#endif // #if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX +#endif // #if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX //========================================================================== diff --git a/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt b/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt index 8042cf580a..c37391aef7 100644 --- a/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt +++ b/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.0056362152099609375  +DEBUG: model prefixing takes 0.005814790725708008  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -163,7 +163,7 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=3: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g WEIGHTED<=3 @2 INFO: Process has 16 diagrams -1 processes with 16 diagrams generated in 0.020 s +1 processes with 16 diagrams generated in 0.021 s Total: 2 processes with 19 diagrams output madevent ../TMPOUT/CODEGEN_mad_gg_tt01g --hel_recycling=False --vector_size=32 --me_exporter=standalone_cudacpp Load PLUGIN.CUDACPP_OUTPUT @@ -185,7 +185,7 @@ INFO: Generating Helas calls for process: g g > t t~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t t~ @1 INFO: Creating files in directory P2_gg_ttxg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -202,7 +202,7 @@ INFO: Generating Feynman diagrams for Process: g g > t t~ g WEIGHTED<=3 @2 INFO: Finding symmetric diagrams for subprocess group gg_ttxg INFO: Creating files in directory P1_gg_ttx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -217,15 +217,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./.
DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: g g > t t~ WEIGHTED<=2 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttx -Generated helas calls for 2 subprocesses (19 diagrams) in 0.043 s -Wrote files for 46 helas calls in 0.250 s +Generated helas calls for 2 subprocesses (19 diagrams) in 0.045 s +Wrote files for 46 helas calls in 0.257 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 5 routines in 0.334 s +ALOHA: aloha creates 5 routines in 0.346 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines @@ -233,7 +233,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 10 routines in 0.327 s +ALOHA: aloha creates 10 routines in 0.329 s VVV1 VVV1 FFV1 @@ -283,9 +283,9 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. quit -real 0m2.353s -user 0m2.114s -sys 0m0.226s +real 0m2.425s +user 0m2.147s +sys 0m0.279s Code generation completed in 2 seconds ************************************************************ * * diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/check_sa.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/check_sa.cc index aab490dc5b..bde384c69e 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/check_sa.cc +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/check_sa.cc @@ -789,7 +789,7 @@ main( int argc, char** argv ) #else wrkflwtxt += "???+"; // no path to this statement #endif /* clang-format on */ - // -- CUCOMPLEX or THRUST or STD complex numbers? + // -- CUCOMPLEX or THRUST or STD or CXSIMPLE complex numbers? #ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX wrkflwtxt += "CUX:"; @@ -801,7 +801,7 @@ main( int argc, char** argv ) wrkflwtxt += "???:"; // no path to this statement #endif #elif defined __HIPCC__ -#if defined MGONGPU_CUCXTYPE_CXSMPL +#if defined MGONGPU_HIPCXTYPE_CXSMPL wrkflwtxt += "CXS:"; #else wrkflwtxt += "???:"; // no path to this statement @@ -863,7 +863,7 @@ main( int argc, char** argv ) wrkflwtxt += "/sse4"; #endif #else - wrkflwtxt += "/????"; // no path to this statement + wrkflwtxt += "/????"; // no path to this statement #endif // -- Has cxtype_v::operator[] bracket with non-const reference? #if defined MGONGPU_CPPSIMD @@ -940,7 +940,9 @@ main( int argc, char** argv ) << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#elif defined MGONGPU_CUCXTYPE_CXSMPL +#elif defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL + << "Complex type = CXSIMPLE" << std::endl +#elif defined MGONGPU_CPPCXTYPE_STDCOMPLEX << "Complex type = STD::COMPLEX" << std::endl #else << "Complex type = ???" << std::endl // no path to this statement... 
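The mgOnGpuCxtypes.h hunks above give each compiler family its own macro namespace (MGONGPU_CU*, MGONGPU_HIP*, MGONGPU_CPP*) and its own #error guard, so a misconfigured build fails with a per-backend message. A minimal standalone sketch of the resulting cxtype selection (not part of the patch; the cxsmpl stand-in below is hypothetical, the real template lives in mgOnGpuCxtypes.h):
//--------------------------------------------------------------------------
// Sketch only: nvcc (__CUDACC__) may use thrust, cuComplex or cxsmpl; hipcc
// (__HIPCC__) is cxsmpl-only after #810; plain C++ uses std::complex or cxsmpl.
#include <complex>
template<typename FP>
struct cxsmpl { FP re, im; }; // hypothetical stand-in for the real cxsmpl class
#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST
#include <thrust/complex.h>
typedef thrust::complex<double> cxtype; // CUDA default
#elif defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX
#include <cuComplex.h>
typedef cuDoubleComplex cxtype; // CUDA alternative
#elif defined __HIPCC__ and defined MGONGPU_HIPCXTYPE_CXSMPL
typedef cxsmpl<double> cxtype; // HIP: the only option
#elif defined MGONGPU_CPPCXTYPE_STDCOMPLEX
typedef std::complex<double> cxtype; // C++ alternative
#else
typedef cxsmpl<double> cxtype; // cxsmpl default (also the CUDA/C++ CXSMPL case)
#endif
//--------------------------------------------------------------------------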
@@ -1081,7 +1083,9 @@ main( int argc, char** argv ) << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#elif defined MGONGPU_CUCXTYPE_CXSMPL +#elif defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL + << "\"CXSIMPLE\"," << std::endl +#elif defined MGONGPU_CPPCXTYPE_STDCOMPLEX << "\"STD::COMPLEX\"," << std::endl #else << "\"???\"," << std::endl // no path to this statement... diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/check_sa.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/check_sa.cc index aab490dc5b..bde384c69e 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/check_sa.cc +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/check_sa.cc @@ -789,7 +789,7 @@ main( int argc, char** argv ) #else wrkflwtxt += "???+"; // no path to this statement #endif /* clang-format on */ - // -- CUCOMPLEX or THRUST or STD complex numbers? + // -- CUCOMPLEX or THRUST or STD or CXSIMPLE complex numbers? #ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX wrkflwtxt += "CUX:"; @@ -801,7 +801,7 @@ main( int argc, char** argv ) wrkflwtxt += "???:"; // no path to this statement #endif #elif defined __HIPCC__ -#if defined MGONGPU_CUCXTYPE_CXSMPL +#if defined MGONGPU_HIPCXTYPE_CXSMPL wrkflwtxt += "CXS:"; #else wrkflwtxt += "???:"; // no path to this statement @@ -863,7 +863,7 @@ main( int argc, char** argv ) wrkflwtxt += "/sse4"; #endif #else - wrkflwtxt += "/????"; // no path to this statement + wrkflwtxt += "/????"; // no path to this statement #endif // -- Has cxtype_v::operator[] bracket with non-const reference? #if defined MGONGPU_CPPSIMD @@ -940,7 +940,9 @@ main( int argc, char** argv ) << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#elif defined MGONGPU_CUCXTYPE_CXSMPL +#elif defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL + << "Complex type = CXSIMPLE" << std::endl +#elif defined MGONGPU_CPPCXTYPE_STDCOMPLEX << "Complex type = STD::COMPLEX" << std::endl #else << "Complex type = ???" << std::endl // no path to this statement... @@ -1081,7 +1083,9 @@ main( int argc, char** argv ) << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#elif defined MGONGPU_CUCXTYPE_CXSMPL +#elif defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL + << "\"CXSIMPLE\"," << std::endl +#elif defined MGONGPU_CPPCXTYPE_STDCOMPLEX << "\"STD::COMPLEX\"," << std::endl #else << "\"???\"," << std::endl // no path to this statement... diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cudacpp.mk index 117edc1782..df74dfc284 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cudacpp.mk @@ -169,7 +169,7 @@ ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity).
MADGRAPH_CUDA_ARCHITECTURE ?= 70 ###CUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 - ###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 + ###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 comma:=, CUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) CUINC = -I$(CUDA_HOME)/include/ @@ -224,7 +224,7 @@ else ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),) GPUFLAGS += -std=c++17 ###GPUFLAGS+= --maxrregcount 255 # (AV: is this option valid on HIP and meaningful on AMD GPUs?) CUBUILDRULEFLAGS = -fPIC -c - CCBUILDRULEFLAGS = -fPIC -c + CCBUILDRULEFLAGS = -fPIC -c -x hip else ifneq ($(origin REQUIRE_HIP),undefined) @@ -556,6 +556,7 @@ $(BUILDDIR)/.build.$(TAG): @touch $(BUILDDIR)/.build.$(TAG) # Generic target and build rules: objects from CUDA compilation +# NB: CCBUILDRULEFLAGS includes "-x cu" for nvcc and "-x hip" for hipcc (#810) ifneq ($(GPUCC),) $(BUILDDIR)/%.o : %.cu *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi @@ -565,7 +566,6 @@ $(BUILDDIR)/%_cu.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CCBUILDRULEFLAGS) $< -o $@ endif -# -x cu in line above # Generic target and build rules: objects from C++ compilation # (NB do not include CUINC here! 
add it only for NVTX or curand #679) diff --git a/epochX/cudacpp/gg_tt01g.mad/src/cudacpp_src.mk b/epochX/cudacpp/gg_tt01g.mad/src/cudacpp_src.mk index 159e19a46d..b2b9da5288 100644 --- a/epochX/cudacpp/gg_tt01g.mad/src/cudacpp_src.mk +++ b/epochX/cudacpp/gg_tt01g.mad/src/cudacpp_src.mk @@ -92,11 +92,11 @@ endif ###$(info OMPFLAGS=$(OMPFLAGS)) CXXFLAGS += $(OMPFLAGS) -# Add correct -DHIP_LATFORM when compiling for HIP +# Add correct flags for nvcc (-x cu) and hipcc (-x hip) for GPU code (see #810) ifeq ($(findstring nvcc,$(GPUCC)),nvcc) GPUFLAGS += -Xcompiler -fPIC -c -x cu else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) - GPUFLAGS += -fPIC -c + GPUFLAGS += -fPIC -c -x hip endif # Set the build flags appropriate to each AVX choice (example: "make AVX=none") diff --git a/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuConfig.h index 69cee0085b..6bde4466d0 100644 --- a/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuConfig.h @@ -70,16 +70,19 @@ ////#define MGONGPU_HARDCODE_PARAM 1 // Complex type in CUDA: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) +// (NB THIS IS MGONGPU_*CU*CXTYPE_xxx) #ifdef __CUDACC__ #define MGONGPU_CUCXTYPE_THRUST 1 // default (~1.15E9/double, ~3.2E9/float) //#define MGONGPU_CUCXTYPE_CUCOMPLEX 1 // ~10 percent slower (1.03E9/double, ~2.8E9/float) //#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) -// Complex type in HIP: cxsmpl (ONLY ONE OPTION POSSIBLE) +// Complex type in HIP: cxsmpl (ONLY ONE OPTION POSSIBLE? #810) +// (NB THIS IS MGONGPU_*HIP*CXTYPE_xxx) #elif defined __HIPCC__ -#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) +#define MGONGPU_HIPCXTYPE_CXSMPL 1 // default for HIP // Complex type in C++: std::complex or cxsmpl (CHOOSE ONLY ONE) +// (NB THIS IS MGONGPU_*CPP*CXTYPE_xxx) #else //#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) #define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) diff --git a/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuCxtypes.h b/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuCxtypes.h index 5532e22fa1..7ede1dbfae 100644 --- a/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuCxtypes.h @@ -19,7 +19,7 @@ #include // Complex type in cuda: thrust or cucomplex or cxsmpl -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ // this must be __CUDAC__ (not MGONGPUCPP_GPUIMPL) #if defined MGONGPU_CUCXTYPE_THRUST #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wtautological-compare" // for icpx2021/clang13 (https://stackoverflow.com/a/15864661) @@ -30,8 +30,13 @@ #elif not defined MGONGPU_CUCXTYPE_CXSMPL #error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL #endif +// Complex type in HIP: cxsmpl +#elif defined __HIPCC__ +#if not defined MGONGPU_HIPCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_HIPCXTYPE_CXSMPL +#endif #else -// Complex type in c++: std::complex or cxsmpl +// Complex type in c++ or HIP: std::complex or cxsmpl #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX #include #elif not defined MGONGPU_CPPCXTYPE_CXSMPL @@ -222,7 +227,7 @@ namespace mg5amcCpu #endif { // --- Type definitions (complex type: cxtype) -#ifdef MGONGPUCPP_GPUIMPL // cuda +#ifdef __CUDACC__ // this must be __CUDAC__ (not MGONGPUCPP_GPUIMPL) #if defined MGONGPU_CUCXTYPE_THRUST typedef 
thrust::complex cxtype; #elif defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -261,7 +266,7 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { -#if defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL +#if defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL //------------------------------ // CUDA or C++ - using cxsmpl @@ -303,11 +308,11 @@ namespace mg5amcCpu return cxmake( c.real(), c.imag() ); } -#endif // #if defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL +#endif // #if defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL //========================================================================== -#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust +#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust (this must be __CUDAC__ and not MGONGPUCPP_GPUIMPL) //------------------------------ // CUDA - using thrust::complex @@ -343,11 +348,11 @@ namespace mg5amcCpu return c; } -#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST +#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST //========================================================================== -#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex +#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex (this must be __CUDAC__ and not MGONGPUCPP_GPUIMPL) //------------------------------ // CUDA - using cuComplex @@ -562,11 +567,11 @@ namespace mg5amcCpu return cxmake( c.real(), c.imag() ); } -#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX //========================================================================== -#if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex +#if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++/hip + stdcomplex (this must be __CUDAC__ and not MGONGPUCPP_GPUIMPL) //------------------------------ // C++ - using std::complex @@ -610,7 +615,7 @@ namespace mg5amcCpu } #endif -#endif // #if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX +#endif // #if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX //========================================================================== diff --git a/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt b/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt index 1dbabb94a0..7df8225f38 100644 --- a/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt +++ b/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005898475646972656  +DEBUG: model prefixing takes 0.005897045135498047  INFO: Restrict model sm with file models/sm/restrict_default.dat . 
DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -175,7 +175,7 @@ INFO: Generating Helas calls for process: g g > t t~ g WEIGHTED<=3 @1 INFO: Processing color information for process: g g > t t~ g @1 INFO: Creating files in directory P1_gg_ttxg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -191,14 +191,14 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. INFO: Generating Feynman diagrams for Process: g g > t t~ g WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxg Generated helas calls for 1 subprocesses (16 diagrams) in 0.040 s -Wrote files for 36 helas calls in 0.158 s +Wrote files for 36 helas calls in 0.157 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 5 routines in 0.348 s +ALOHA: aloha creates 5 routines in 0.344 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines @@ -206,7 +206,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 10 routines in 0.329 s +ALOHA: aloha creates 10 routines in 0.327 s VVV1 VVV1 FFV1 @@ -252,10 +252,10 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. quit -real 0m4.472s -user 0m2.033s -sys 0m0.236s -Code generation completed in 5 seconds +real 0m2.290s +user 0m2.041s +sys 0m0.249s +Code generation completed in 3 seconds ************************************************************ * * * W E L C O M E to * diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/check_sa.cc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/check_sa.cc index aab490dc5b..bde384c69e 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/check_sa.cc +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/check_sa.cc @@ -789,7 +789,7 @@ main( int argc, char** argv ) #else wrkflwtxt += "???+"; // no path to this statement #endif /* clang-format on */ - // -- CUCOMPLEX or THRUST or STD complex numbers? + // -- CUCOMPLEX or THRUST or STD or CXSIMPLE complex numbers? #ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX wrkflwtxt += "CUX:"; @@ -801,7 +801,7 @@ main( int argc, char** argv ) wrkflwtxt += "???:"; // no path to this statement #endif #elif defined __HIPCC__ -#if defined MGONGPU_CUCXTYPE_CXSMPL +#if defined MGONGPU_HIPCXTYPE_CXSMPL wrkflwtxt += "CXS:"; #else wrkflwtxt += "???:"; // no path to this statement @@ -863,7 +863,7 @@ main( int argc, char** argv ) wrkflwtxt += "/sse4"; #endif #else - wrkflwtxt += "/????"; // no path to this statement + wrkflwtxt += "/????"; // no path to this statement #endif // -- Has cxtype_v::operator[] bracket with non-const reference? 
#if defined MGONGPU_CPPSIMD @@ -940,7 +940,9 @@ main( int argc, char** argv ) << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#elif defined MGONGPU_CUCXTYPE_CXSMPL +#elif defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL + << "Complex type = CXSIMPLE" << std::endl +#elif defined MGONGPU_CPPCXTYPE_STDCOMPLEX << "Complex type = STD::COMPLEX" << std::endl #else << "Complex type = ???" << std::endl // no path to this statement... @@ -1081,7 +1083,9 @@ main( int argc, char** argv ) << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#elif defined MGONGPU_CUCXTYPE_CXSMPL +#elif defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL + << "\"CXSIMPLE\"," << std::endl +#elif defined MGONGPU_CUCXTYPE_STDCOMPLEX << "\"STD::COMPLEX\"," << std::endl #else << "\"???\"," << std::endl // no path to this statement... diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp.mk index 117edc1782..df74dfc284 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp.mk @@ -169,7 +169,7 @@ ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). MADGRAPH_CUDA_ARCHITECTURE ?= 70 ###CUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 - ###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 + ###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 comma:=, CUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) CUINC = -I$(CUDA_HOME)/include/ @@ -224,7 +224,7 @@ else ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),) GPUFLAGS += -std=c++17 ###GPUFLAGS+= --maxrregcount 255 # (AV: is this option valid on HIP and meaningful on AMD GPUs?) CUBUILDRULEFLAGS = -fPIC -c - CCBUILDRULEFLAGS = -fPIC -c + CCBUILDRULEFLAGS = -fPIC -c -x hip else ifneq ($(origin REQUIRE_HIP),undefined) @@ -556,6 +556,7 @@ $(BUILDDIR)/.build.$(TAG): @touch $(BUILDDIR)/.build.$(TAG) # Generic target and build rules: objects from CUDA compilation +# NB: CCBUILDRULEFLAGS includes "-x cu" for nvcc and "-x hip" for hipcc (#810) ifneq ($(GPUCC),) $(BUILDDIR)/%.o : %.cu *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi @@ -565,7 +566,6 @@ $(BUILDDIR)/%_cu.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! 
-d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CCBUILDRULEFLAGS) $< -o $@ endif -# -x cu in line above # Generic target and build rules: objects from C++ compilation # (NB do not include CUINC here! add it only for NVTX or curand #679) diff --git a/epochX/cudacpp/gg_ttg.mad/src/cudacpp_src.mk b/epochX/cudacpp/gg_ttg.mad/src/cudacpp_src.mk index 159e19a46d..b2b9da5288 100644 --- a/epochX/cudacpp/gg_ttg.mad/src/cudacpp_src.mk +++ b/epochX/cudacpp/gg_ttg.mad/src/cudacpp_src.mk @@ -92,11 +92,11 @@ endif ###$(info OMPFLAGS=$(OMPFLAGS)) CXXFLAGS += $(OMPFLAGS) -# Add correct -DHIP_LATFORM when compiling for HIP +# Add correct flags for nvcc (-x cu) and hipcc (-x hip) for GPU code (see #810) ifeq ($(findstring nvcc,$(GPUCC)),nvcc) GPUFLAGS += -Xcompiler -fPIC -c -x cu else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) - GPUFLAGS += -fPIC -c + GPUFLAGS += -fPIC -c -x hip endif # Set the build flags appropriate to each AVX choice (example: "make AVX=none") diff --git a/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuConfig.h index 69cee0085b..6bde4466d0 100644 --- a/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuConfig.h @@ -70,16 +70,19 @@ ////#define MGONGPU_HARDCODE_PARAM 1 // Complex type in CUDA: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) +// (NB THIS IS MGONGPU_*CU*CXTYPE_xxx) #ifdef __CUDACC__ #define MGONGPU_CUCXTYPE_THRUST 1 // default (~1.15E9/double, ~3.2E9/float) //#define MGONGPU_CUCXTYPE_CUCOMPLEX 1 // ~10 percent slower (1.03E9/double, ~2.8E9/float) //#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) -// Complex type in HIP: cxsmpl (ONLY ONE OPTION POSSIBLE) +// Complex type in HIP: cxsmpl (ONLY ONE OPTION POSSIBLE? 
#810) +// (NB THIS IS MGONGPU_*HIP*CXTYPE_xxx) #elif defined __HIPCC__ -#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) +#define MGONGPU_HIPCXTYPE_CXSMPL 1 // default for HIP // Complex type in C++: std::complex or cxsmpl (CHOOSE ONLY ONE) +// (NB THIS IS MGONGPU_*CPP*CXTYPE_xxx) #else //#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) #define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) diff --git a/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuCxtypes.h b/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuCxtypes.h index 5532e22fa1..7ede1dbfae 100644 --- a/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuCxtypes.h @@ -19,7 +19,7 @@ #include // Complex type in cuda: thrust or cucomplex or cxsmpl -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ // this must be __CUDAC__ (not MGONGPUCPP_GPUIMPL) #if defined MGONGPU_CUCXTYPE_THRUST #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wtautological-compare" // for icpx2021/clang13 (https://stackoverflow.com/a/15864661) @@ -30,8 +30,13 @@ #elif not defined MGONGPU_CUCXTYPE_CXSMPL #error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL #endif +// Complex type in HIP: cxsmpl +#elif defined __HIPCC__ +#if not defined MGONGPU_HIPCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_HIPCXTYPE_CXSMPL +#endif #else -// Complex type in c++: std::complex or cxsmpl +// Complex type in c++ or HIP: std::complex or cxsmpl #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX #include #elif not defined MGONGPU_CPPCXTYPE_CXSMPL @@ -222,7 +227,7 @@ namespace mg5amcCpu #endif { // --- Type definitions (complex type: cxtype) -#ifdef MGONGPUCPP_GPUIMPL // cuda +#ifdef __CUDACC__ // this must be __CUDAC__ (not MGONGPUCPP_GPUIMPL) #if defined MGONGPU_CUCXTYPE_THRUST typedef thrust::complex cxtype; #elif defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -261,7 +266,7 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { -#if defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL +#if defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL //------------------------------ // CUDA or C++ - using cxsmpl @@ -303,11 +308,11 @@ namespace mg5amcCpu return cxmake( c.real(), c.imag() ); } -#endif // #if defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL +#endif // #if defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL //========================================================================== -#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust +#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust (this must be __CUDAC__ and not MGONGPUCPP_GPUIMPL) //------------------------------ // CUDA - using thrust::complex @@ -343,11 +348,11 @@ namespace mg5amcCpu return c; } -#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST +#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST //========================================================================== -#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex +#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex (this must be __CUDAC__ and not MGONGPUCPP_GPUIMPL) //------------------------------ // CUDA - using cuComplex @@ -562,11 
+567,11 @@ namespace mg5amcCpu return cxmake( c.real(), c.imag() ); } -#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX //========================================================================== -#if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex +#if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++/hip + stdcomplex (this must be __CUDAC__ and not MGONGPUCPP_GPUIMPL) //------------------------------ // C++ - using std::complex @@ -610,7 +615,7 @@ namespace mg5amcCpu } #endif -#endif // #if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX +#endif // #if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX //========================================================================== diff --git a/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt b/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt index 81fcb8c8ed..74be077c7e 100644 --- a/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt +++ b/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005418062210083008  +DEBUG: model prefixing takes 0.005811929702758789  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,7 +155,7 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=3: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g WEIGHTED<=3 @1 INFO: Process has 16 diagrams -1 processes with 16 diagrams generated in 0.022 s +1 processes with 16 diagrams generated in 0.023 s Total: 1 processes with 16 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gg_ttg Load PLUGIN.CUDACPP_OUTPUT @@ -175,7 +175,7 @@ INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TM FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg/./CPPProcess.h FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg/./CPPProcess.cc INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg/. -Generated helas calls for 1 subprocesses (16 diagrams) in 0.038 s +Generated helas calls for 1 subprocesses (16 diagrams) in 0.039 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines @@ -183,7 +183,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 5 routines in 0.335 s +ALOHA: aloha creates 5 routines in 0.346 s VVV1 VVV1 FFV1 @@ -203,7 +203,7 @@ INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/. 
quit -real 0m0.807s -user 0m0.747s -sys 0m0.049s +real 0m0.831s +user 0m0.769s +sys 0m0.052s Code generation completed in 1 seconds diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/check_sa.cc b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/check_sa.cc index aab490dc5b..bde384c69e 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/check_sa.cc +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/check_sa.cc @@ -789,7 +789,7 @@ main( int argc, char** argv ) #else wrkflwtxt += "???+"; // no path to this statement #endif /* clang-format on */ - // -- CUCOMPLEX or THRUST or STD complex numbers? + // -- CUCOMPLEX or THRUST or STD or CXSIMPLE complex numbers? #ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX wrkflwtxt += "CUX:"; @@ -801,7 +801,7 @@ main( int argc, char** argv ) wrkflwtxt += "???:"; // no path to this statement #endif #elif defined __HIPCC__ -#if defined MGONGPU_CUCXTYPE_CXSMPL +#if defined MGONGPU_HIPCXTYPE_CXSMPL wrkflwtxt += "CXS:"; #else wrkflwtxt += "???:"; // no path to this statement @@ -863,7 +863,7 @@ main( int argc, char** argv ) wrkflwtxt += "/sse4"; #endif #else - wrkflwtxt += "/????"; // no path to this statement + wrkflwtxt += "/????"; // no path to this statement #endif // -- Has cxtype_v::operator[] bracket with non-const reference? #if defined MGONGPU_CPPSIMD @@ -940,7 +940,9 @@ main( int argc, char** argv ) << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#elif defined MGONGPU_CUCXTYPE_CXSMPL +#elif defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL + << "Complex type = CXSIMPLE" << std::endl +#elif defined MGONGPU_CPPCXTYPE_STDCOMPLEX << "Complex type = STD::COMPLEX" << std::endl #else << "Complex type = ???" << std::endl // no path to this statement... @@ -1081,7 +1083,9 @@ main( int argc, char** argv ) << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#elif defined MGONGPU_CUCXTYPE_CXSMPL +#elif defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL + << "\"CXSIMPLE\"," << std::endl +#elif defined MGONGPU_CUCXTYPE_STDCOMPLEX << "\"STD::COMPLEX\"," << std::endl #else << "\"???\"," << std::endl // no path to this statement... diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp.mk index 117edc1782..df74dfc284 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp.mk @@ -169,7 +169,7 @@ ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). 
MADGRAPH_CUDA_ARCHITECTURE ?= 70 ###CUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 - ###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 + ###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 comma:=, CUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) CUINC = -I$(CUDA_HOME)/include/ @@ -224,7 +224,7 @@ else ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),) GPUFLAGS += -std=c++17 ###GPUFLAGS+= --maxrregcount 255 # (AV: is this option valid on HIP and meaningful on AMD GPUs?) CUBUILDRULEFLAGS = -fPIC -c - CCBUILDRULEFLAGS = -fPIC -c + CCBUILDRULEFLAGS = -fPIC -c -x hip else ifneq ($(origin REQUIRE_HIP),undefined) @@ -556,6 +556,7 @@ $(BUILDDIR)/.build.$(TAG): @touch $(BUILDDIR)/.build.$(TAG) # Generic target and build rules: objects from CUDA compilation +# NB: CCBUILDRULEFLAGS includes "-x cu" for nvcc and "-x hip" for hipcc (#810) ifneq ($(GPUCC),) $(BUILDDIR)/%.o : %.cu *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi @@ -565,7 +566,6 @@ $(BUILDDIR)/%_cu.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CCBUILDRULEFLAGS) $< -o $@ endif -# -x cu in line above # Generic target and build rules: objects from C++ compilation # (NB do not include CUINC here! 
add it only for NVTX or curand #679) diff --git a/epochX/cudacpp/gg_ttg.sa/src/cudacpp_src.mk b/epochX/cudacpp/gg_ttg.sa/src/cudacpp_src.mk index 159e19a46d..b2b9da5288 100644 --- a/epochX/cudacpp/gg_ttg.sa/src/cudacpp_src.mk +++ b/epochX/cudacpp/gg_ttg.sa/src/cudacpp_src.mk @@ -92,11 +92,11 @@ endif ###$(info OMPFLAGS=$(OMPFLAGS)) CXXFLAGS += $(OMPFLAGS) -# Add correct -DHIP_LATFORM when compiling for HIP +# Add correct flags for nvcc (-x cu) and hipcc (-x hip) for GPU code (see #810) ifeq ($(findstring nvcc,$(GPUCC)),nvcc) GPUFLAGS += -Xcompiler -fPIC -c -x cu else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) - GPUFLAGS += -fPIC -c + GPUFLAGS += -fPIC -c -x hip endif # Set the build flags appropriate to each AVX choice (example: "make AVX=none") diff --git a/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuConfig.h index 06787c1c5e..475749ca7c 100644 --- a/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuConfig.h @@ -70,16 +70,19 @@ ////#define MGONGPU_HARDCODE_PARAM 1 // Complex type in CUDA: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) +// (NB THIS IS MGONGPU_*CU*CXTYPE_xxx) #ifdef __CUDACC__ #define MGONGPU_CUCXTYPE_THRUST 1 // default (~1.15E9/double, ~3.2E9/float) //#define MGONGPU_CUCXTYPE_CUCOMPLEX 1 // ~10 percent slower (1.03E9/double, ~2.8E9/float) //#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) -// Complex type in HIP: cxsmpl (ONLY ONE OPTION POSSIBLE) +// Complex type in HIP: cxsmpl (ONLY ONE OPTION POSSIBLE? #810) +// (NB THIS IS MGONGPU_*HIP*CXTYPE_xxx) #elif defined __HIPCC__ -#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) +#define MGONGPU_HIPCXTYPE_CXSMPL 1 // default for HIP // Complex type in C++: std::complex or cxsmpl (CHOOSE ONLY ONE) +// (NB THIS IS MGONGPU_*CPP*CXTYPE_xxx) #else //#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) #define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) diff --git a/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuCxtypes.h b/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuCxtypes.h index 5532e22fa1..7ede1dbfae 100644 --- a/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuCxtypes.h @@ -19,7 +19,7 @@ #include // Complex type in cuda: thrust or cucomplex or cxsmpl -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ // this must be __CUDAC__ (not MGONGPUCPP_GPUIMPL) #if defined MGONGPU_CUCXTYPE_THRUST #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wtautological-compare" // for icpx2021/clang13 (https://stackoverflow.com/a/15864661) @@ -30,8 +30,13 @@ #elif not defined MGONGPU_CUCXTYPE_CXSMPL #error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL #endif +// Complex type in HIP: cxsmpl +#elif defined __HIPCC__ +#if not defined MGONGPU_HIPCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_HIPCXTYPE_CXSMPL +#endif #else -// Complex type in c++: std::complex or cxsmpl +// Complex type in c++ or HIP: std::complex or cxsmpl #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX #include #elif not defined MGONGPU_CPPCXTYPE_CXSMPL @@ -222,7 +227,7 @@ namespace mg5amcCpu #endif { // --- Type definitions (complex type: cxtype) -#ifdef MGONGPUCPP_GPUIMPL // cuda +#ifdef __CUDACC__ // this must be __CUDAC__ (not MGONGPUCPP_GPUIMPL) #if defined MGONGPU_CUCXTYPE_THRUST typedef thrust::complex cxtype; #elif defined 
MGONGPU_CUCXTYPE_CUCOMPLEX @@ -261,7 +266,7 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { -#if defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL +#if defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL //------------------------------ // CUDA or C++ - using cxsmpl @@ -303,11 +308,11 @@ namespace mg5amcCpu return cxmake( c.real(), c.imag() ); } -#endif // #if defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL +#endif // #if defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL //========================================================================== -#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust +#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust (this must be __CUDAC__ and not MGONGPUCPP_GPUIMPL) //------------------------------ // CUDA - using thrust::complex @@ -343,11 +348,11 @@ namespace mg5amcCpu return c; } -#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST +#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST //========================================================================== -#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex +#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex (this must be __CUDAC__ and not MGONGPUCPP_GPUIMPL) //------------------------------ // CUDA - using cuComplex @@ -562,11 +567,11 @@ namespace mg5amcCpu return cxmake( c.real(), c.imag() ); } -#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX //========================================================================== -#if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex +#if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++/hip + stdcomplex (this must be __CUDAC__ and not MGONGPUCPP_GPUIMPL) //------------------------------ // C++ - using std::complex @@ -610,7 +615,7 @@ namespace mg5amcCpu } #endif -#endif // #if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX +#endif // #if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX //========================================================================== diff --git a/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt b/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt index a9bcc2504b..8574f56894 100644 --- a/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt +++ b/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005433797836303711  +DEBUG: model prefixing takes 0.005876302719116211  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,7 +155,7 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=4: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Process has 123 diagrams -1 processes with 123 diagrams generated in 0.161 s +1 processes with 123 diagrams generated in 0.169 s Total: 1 processes with 123 diagrams output madevent ../TMPOUT/CODEGEN_mad_gg_ttgg --hel_recycling=False --vector_size=32 --me_exporter=standalone_cudacpp Load PLUGIN.CUDACPP_OUTPUT @@ -175,7 +175,7 @@ INFO: Generating Helas calls for process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Processing color information for process: g g > t t~ g g @1 INFO: Creating files in directory P1_gg_ttxgg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -190,15 +190,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxgg -Generated helas calls for 1 subprocesses (123 diagrams) in 0.446 s -Wrote files for 222 helas calls in 0.719 s +Generated helas calls for 1 subprocesses (123 diagrams) in 0.451 s +Wrote files for 222 helas calls in 0.735 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.347 s +ALOHA: aloha creates 5 routines in 0.359 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines @@ -206,7 +206,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 10 routines in 0.327 s +ALOHA: aloha creates 10 routines in 0.338 s VVV1 VVV1 FFV1 @@ -255,9 +255,9 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. quit -real 0m3.429s -user 0m3.139s -sys 0m0.247s +real 0m3.480s +user 0m3.191s +sys 0m0.282s Code generation completed in 3 seconds ************************************************************ * * diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/check_sa.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/check_sa.cc index aab490dc5b..bde384c69e 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/check_sa.cc +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/check_sa.cc @@ -789,7 +789,7 @@ main( int argc, char** argv ) #else wrkflwtxt += "???+"; // no path to this statement #endif /* clang-format on */ - // -- CUCOMPLEX or THRUST or STD complex numbers? + // -- CUCOMPLEX or THRUST or STD or CXSIMPLE complex numbers? 
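// (Orientation note for the cascade below: after this patch each compiler family
// tests its own macro family, MGONGPU_CUCXTYPE_* under __CUDACC__,
// MGONGPU_HIPCXTYPE_CXSMPL under __HIPCC__, and MGONGPU_CPPCXTYPE_* otherwise,
// so the HIP "CXS:" tag no longer keys off the CUDA-flavoured macro name.)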
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX wrkflwtxt += "CUX:"; @@ -801,7 +801,7 @@ main( int argc, char** argv ) wrkflwtxt += "???:"; // no path to this statement #endif #elif defined __HIPCC__ -#if defined MGONGPU_CUCXTYPE_CXSMPL +#if defined MGONGPU_HIPCXTYPE_CXSMPL wrkflwtxt += "CXS:"; #else wrkflwtxt += "???:"; // no path to this statement @@ -863,7 +863,7 @@ main( int argc, char** argv ) wrkflwtxt += "/sse4"; #endif #else - wrkflwtxt += "/????"; // no path to this statement + wrkflwtxt += "/????"; // no path to this statement #endif // -- Has cxtype_v::operator[] bracket with non-const reference? #if defined MGONGPU_CPPSIMD @@ -940,7 +940,9 @@ main( int argc, char** argv ) << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#elif defined MGONGPU_CUCXTYPE_CXSMPL +#elif defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL + << "Complex type = CXSIMPLE" << std::endl +#elif defined MGONGPU_CPPCXTYPE_STDCOMPLEX << "Complex type = STD::COMPLEX" << std::endl #else << "Complex type = ???" << std::endl // no path to this statement... @@ -1081,7 +1083,9 @@ main( int argc, char** argv ) << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#elif defined MGONGPU_CUCXTYPE_CXSMPL +#elif defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL + << "\"CXSIMPLE\"," << std::endl +#elif defined MGONGPU_CUCXTYPE_STDCOMPLEX << "\"STD::COMPLEX\"," << std::endl #else << "\"???\"," << std::endl // no path to this statement... diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp.mk index 117edc1782..df74dfc284 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp.mk @@ -169,7 +169,7 @@ ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). MADGRAPH_CUDA_ARCHITECTURE ?= 70 ###CUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 - ###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 + ###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 comma:=, CUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) CUINC = -I$(CUDA_HOME)/include/ @@ -224,7 +224,7 @@ else ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),) GPUFLAGS += -std=c++17 ###GPUFLAGS+= --maxrregcount 255 # (AV: is this option valid on HIP and meaningful on AMD GPUs?) 
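# A hedged note on the "-x hip" added below (see #810): hipcc, being clang-based,
# infers the input language from the file extension, so a ".cc" source would
# otherwise be compiled as plain host C++; "-x hip" forces HIP compilation,
# mirroring the "-x cu" that nvcc needs for the same ".cc" files.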
CUBUILDRULEFLAGS = -fPIC -c - CCBUILDRULEFLAGS = -fPIC -c + CCBUILDRULEFLAGS = -fPIC -c -x hip else ifneq ($(origin REQUIRE_HIP),undefined) @@ -556,6 +556,7 @@ $(BUILDDIR)/.build.$(TAG): @touch $(BUILDDIR)/.build.$(TAG) # Generic target and build rules: objects from CUDA compilation +# NB: CCBUILDRULEFLAGS includes "-x cu" for nvcc and "-x hip" for hipcc (#810) ifneq ($(GPUCC),) $(BUILDDIR)/%.o : %.cu *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi @@ -565,7 +566,6 @@ $(BUILDDIR)/%_cu.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CCBUILDRULEFLAGS) $< -o $@ endif -# -x cu in line above # Generic target and build rules: objects from C++ compilation # (NB do not include CUINC here! add it only for NVTX or curand #679) diff --git a/epochX/cudacpp/gg_ttgg.mad/src/cudacpp_src.mk b/epochX/cudacpp/gg_ttgg.mad/src/cudacpp_src.mk index 159e19a46d..b2b9da5288 100644 --- a/epochX/cudacpp/gg_ttgg.mad/src/cudacpp_src.mk +++ b/epochX/cudacpp/gg_ttgg.mad/src/cudacpp_src.mk @@ -92,11 +92,11 @@ endif ###$(info OMPFLAGS=$(OMPFLAGS)) CXXFLAGS += $(OMPFLAGS) -# Add correct -DHIP_LATFORM when compiling for HIP +# Add correct flags for nvcc (-x cu) and hipcc (-x hip) for GPU code (see #810) ifeq ($(findstring nvcc,$(GPUCC)),nvcc) GPUFLAGS += -Xcompiler -fPIC -c -x cu else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) - GPUFLAGS += -fPIC -c + GPUFLAGS += -fPIC -c -x hip endif # Set the build flags appropriate to each AVX choice (example: "make AVX=none") diff --git a/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuConfig.h index 69cee0085b..6bde4466d0 100644 --- a/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuConfig.h @@ -70,16 +70,19 @@ ////#define MGONGPU_HARDCODE_PARAM 1 // Complex type in CUDA: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) +// (NB THIS IS MGONGPU_*CU*CXTYPE_xxx) #ifdef __CUDACC__ #define MGONGPU_CUCXTYPE_THRUST 1 // default (~1.15E9/double, ~3.2E9/float) //#define MGONGPU_CUCXTYPE_CUCOMPLEX 1 // ~10 percent slower (1.03E9/double, ~2.8E9/float) //#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) -// Complex type in HIP: cxsmpl (ONLY ONE OPTION POSSIBLE) +// Complex type in HIP: cxsmpl (ONLY ONE OPTION POSSIBLE? 
#810) +// (NB THIS IS MGONGPU_*HIP*CXTYPE_xxx) #elif defined __HIPCC__ -#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) +#define MGONGPU_HIPCXTYPE_CXSMPL 1 // default for HIP // Complex type in C++: std::complex or cxsmpl (CHOOSE ONLY ONE) +// (NB THIS IS MGONGPU_*CPP*CXTYPE_xxx) #else //#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) #define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) diff --git a/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuCxtypes.h b/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuCxtypes.h index 5532e22fa1..7ede1dbfae 100644 --- a/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuCxtypes.h @@ -19,7 +19,7 @@ #include // Complex type in cuda: thrust or cucomplex or cxsmpl -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ // this must be __CUDAC__ (not MGONGPUCPP_GPUIMPL) #if defined MGONGPU_CUCXTYPE_THRUST #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wtautological-compare" // for icpx2021/clang13 (https://stackoverflow.com/a/15864661) @@ -30,8 +30,13 @@ #elif not defined MGONGPU_CUCXTYPE_CXSMPL #error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL #endif +// Complex type in HIP: cxsmpl +#elif defined __HIPCC__ +#if not defined MGONGPU_HIPCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_HIPCXTYPE_CXSMPL +#endif #else -// Complex type in c++: std::complex or cxsmpl +// Complex type in c++ or HIP: std::complex or cxsmpl #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX #include #elif not defined MGONGPU_CPPCXTYPE_CXSMPL @@ -222,7 +227,7 @@ namespace mg5amcCpu #endif { // --- Type definitions (complex type: cxtype) -#ifdef MGONGPUCPP_GPUIMPL // cuda +#ifdef __CUDACC__ // this must be __CUDAC__ (not MGONGPUCPP_GPUIMPL) #if defined MGONGPU_CUCXTYPE_THRUST typedef thrust::complex cxtype; #elif defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -261,7 +266,7 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { -#if defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL +#if defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL //------------------------------ // CUDA or C++ - using cxsmpl @@ -303,11 +308,11 @@ namespace mg5amcCpu return cxmake( c.real(), c.imag() ); } -#endif // #if defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL +#endif // #if defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL //========================================================================== -#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust +#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust (this must be __CUDAC__ and not MGONGPUCPP_GPUIMPL) //------------------------------ // CUDA - using thrust::complex @@ -343,11 +348,11 @@ namespace mg5amcCpu return c; } -#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST +#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST //========================================================================== -#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex +#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex (this must be __CUDAC__ and not MGONGPUCPP_GPUIMPL) //------------------------------ // CUDA - using cuComplex @@ -562,11 
+567,11 @@ namespace mg5amcCpu return cxmake( c.real(), c.imag() ); } -#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX //========================================================================== -#if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex +#if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++/hip + stdcomplex (this must be __CUDAC__ and not MGONGPUCPP_GPUIMPL) //------------------------------ // C++ - using std::complex @@ -610,7 +615,7 @@ namespace mg5amcCpu } #endif -#endif // #if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX +#endif // #if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX //========================================================================== diff --git a/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt b/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt index f908e4a331..dcf971696c 100644 --- a/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt +++ b/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005699872970581055  +DEBUG: model prefixing takes 0.005830526351928711  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,7 +155,7 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=4: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Process has 123 diagrams -1 processes with 123 diagrams generated in 0.160 s +1 processes with 123 diagrams generated in 0.168 s Total: 1 processes with 123 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gg_ttgg Load PLUGIN.CUDACPP_OUTPUT @@ -175,7 +175,7 @@ INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TM FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg/./CPPProcess.h FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg/./CPPProcess.cc INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg/. -Generated helas calls for 1 subprocesses (123 diagrams) in 0.432 s +Generated helas calls for 1 subprocesses (123 diagrams) in 0.452 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines @@ -183,7 +183,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.351 s +ALOHA: aloha creates 5 routines in 0.340 s VVV1 VVV1 FFV1 @@ -206,7 +206,7 @@ INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/. 
quit -real 0m1.496s -user 0m1.397s -sys 0m0.068s +real 0m1.536s +user 0m1.455s +sys 0m0.066s Code generation completed in 2 seconds diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/check_sa.cc b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/check_sa.cc index aab490dc5b..bde384c69e 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/check_sa.cc +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/check_sa.cc @@ -789,7 +789,7 @@ main( int argc, char** argv ) #else wrkflwtxt += "???+"; // no path to this statement #endif /* clang-format on */ - // -- CUCOMPLEX or THRUST or STD complex numbers? + // -- CUCOMPLEX or THRUST or STD or CXSIMPLE complex numbers? #ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX wrkflwtxt += "CUX:"; @@ -801,7 +801,7 @@ main( int argc, char** argv ) wrkflwtxt += "???:"; // no path to this statement #endif #elif defined __HIPCC__ -#if defined MGONGPU_CUCXTYPE_CXSMPL +#if defined MGONGPU_HIPCXTYPE_CXSMPL wrkflwtxt += "CXS:"; #else wrkflwtxt += "???:"; // no path to this statement @@ -863,7 +863,7 @@ main( int argc, char** argv ) wrkflwtxt += "/sse4"; #endif #else - wrkflwtxt += "/????"; // no path to this statement + wrkflwtxt += "/????"; // no path to this statement #endif // -- Has cxtype_v::operator[] bracket with non-const reference? #if defined MGONGPU_CPPSIMD @@ -940,7 +940,9 @@ main( int argc, char** argv ) << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#elif defined MGONGPU_CUCXTYPE_CXSMPL +#elif defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL + << "Complex type = CXSIMPLE" << std::endl +#elif defined MGONGPU_CPPCXTYPE_STDCOMPLEX << "Complex type = STD::COMPLEX" << std::endl #else << "Complex type = ???" << std::endl // no path to this statement... @@ -1081,7 +1083,9 @@ main( int argc, char** argv ) << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#elif defined MGONGPU_CUCXTYPE_CXSMPL +#elif defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL + << "\"CXSIMPLE\"," << std::endl +#elif defined MGONGPU_CUCXTYPE_STDCOMPLEX << "\"STD::COMPLEX\"," << std::endl #else << "\"???\"," << std::endl // no path to this statement... diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp.mk index 117edc1782..df74dfc284 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp.mk @@ -169,7 +169,7 @@ ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). 
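# For orientation, a sketch that assumes the elided %.cu recipe is symmetric to
# the %_cu.o recipe shown further below: after this patch the two GPU object
# rules pair up as
#   $(BUILDDIR)/%.o    : %.cu  built with CUBUILDRULEFLAGS
#   $(BUILDDIR)/%_cu.o : %.cc  built with CCBUILDRULEFLAGS (now carrying -x cu or -x hip)
# which is why the stray trailing comment "# -x cu in line above" is dropped.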
MADGRAPH_CUDA_ARCHITECTURE ?= 70 ###CUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 - ###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 + ###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 comma:=, CUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) CUINC = -I$(CUDA_HOME)/include/ @@ -224,7 +224,7 @@ else ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),) GPUFLAGS += -std=c++17 ###GPUFLAGS+= --maxrregcount 255 # (AV: is this option valid on HIP and meaningful on AMD GPUs?) CUBUILDRULEFLAGS = -fPIC -c - CCBUILDRULEFLAGS = -fPIC -c + CCBUILDRULEFLAGS = -fPIC -c -x hip else ifneq ($(origin REQUIRE_HIP),undefined) @@ -556,6 +556,7 @@ $(BUILDDIR)/.build.$(TAG): @touch $(BUILDDIR)/.build.$(TAG) # Generic target and build rules: objects from CUDA compilation +# NB: CCBUILDRULEFLAGS includes "-x cu" for nvcc and "-x hip" for hipcc (#810) ifneq ($(GPUCC),) $(BUILDDIR)/%.o : %.cu *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi @@ -565,7 +566,6 @@ $(BUILDDIR)/%_cu.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CCBUILDRULEFLAGS) $< -o $@ endif -# -x cu in line above # Generic target and build rules: objects from C++ compilation # (NB do not include CUINC here! 
add it only for NVTX or curand #679) diff --git a/epochX/cudacpp/gg_ttgg.sa/src/cudacpp_src.mk b/epochX/cudacpp/gg_ttgg.sa/src/cudacpp_src.mk index 159e19a46d..b2b9da5288 100644 --- a/epochX/cudacpp/gg_ttgg.sa/src/cudacpp_src.mk +++ b/epochX/cudacpp/gg_ttgg.sa/src/cudacpp_src.mk @@ -92,11 +92,11 @@ endif ###$(info OMPFLAGS=$(OMPFLAGS)) CXXFLAGS += $(OMPFLAGS) -# Add correct -DHIP_LATFORM when compiling for HIP +# Add correct flags for nvcc (-x cu) and hipcc (-x hip) for GPU code (see #810) ifeq ($(findstring nvcc,$(GPUCC)),nvcc) GPUFLAGS += -Xcompiler -fPIC -c -x cu else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) - GPUFLAGS += -fPIC -c + GPUFLAGS += -fPIC -c -x hip endif # Set the build flags appropriate to each AVX choice (example: "make AVX=none") diff --git a/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuConfig.h index 06787c1c5e..475749ca7c 100644 --- a/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuConfig.h @@ -70,16 +70,19 @@ ////#define MGONGPU_HARDCODE_PARAM 1 // Complex type in CUDA: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) +// (NB THIS IS MGONGPU_*CU*CXTYPE_xxx) #ifdef __CUDACC__ #define MGONGPU_CUCXTYPE_THRUST 1 // default (~1.15E9/double, ~3.2E9/float) //#define MGONGPU_CUCXTYPE_CUCOMPLEX 1 // ~10 percent slower (1.03E9/double, ~2.8E9/float) //#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) -// Complex type in HIP: cxsmpl (ONLY ONE OPTION POSSIBLE) +// Complex type in HIP: cxsmpl (ONLY ONE OPTION POSSIBLE? #810) +// (NB THIS IS MGONGPU_*HIP*CXTYPE_xxx) #elif defined __HIPCC__ -#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) +#define MGONGPU_HIPCXTYPE_CXSMPL 1 // default for HIP // Complex type in C++: std::complex or cxsmpl (CHOOSE ONLY ONE) +// (NB THIS IS MGONGPU_*CPP*CXTYPE_xxx) #else //#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) #define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) diff --git a/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuCxtypes.h b/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuCxtypes.h index 5532e22fa1..7ede1dbfae 100644 --- a/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuCxtypes.h @@ -19,7 +19,7 @@ #include // Complex type in cuda: thrust or cucomplex or cxsmpl -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ // this must be __CUDAC__ (not MGONGPUCPP_GPUIMPL) #if defined MGONGPU_CUCXTYPE_THRUST #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wtautological-compare" // for icpx2021/clang13 (https://stackoverflow.com/a/15864661) @@ -30,8 +30,13 @@ #elif not defined MGONGPU_CUCXTYPE_CXSMPL #error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL #endif +// Complex type in HIP: cxsmpl +#elif defined __HIPCC__ +#if not defined MGONGPU_HIPCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_HIPCXTYPE_CXSMPL +#endif #else -// Complex type in c++: std::complex or cxsmpl +// Complex type in c++ or HIP: std::complex or cxsmpl #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX #include #elif not defined MGONGPU_CPPCXTYPE_CXSMPL @@ -222,7 +227,7 @@ namespace mg5amcCpu #endif { // --- Type definitions (complex type: cxtype) -#ifdef MGONGPUCPP_GPUIMPL // cuda +#ifdef __CUDACC__ // this must be __CUDAC__ (not MGONGPUCPP_GPUIMPL) #if defined MGONGPU_CUCXTYPE_THRUST typedef thrust::complex cxtype; #elif 
diff --git a/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuCxtypes.h b/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuCxtypes.h
index 5532e22fa1..7ede1dbfae 100644
--- a/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuCxtypes.h
+++ b/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuCxtypes.h
@@ -19,7 +19,7 @@
 #include 
 
 // Complex type in cuda: thrust or cucomplex or cxsmpl
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL)
 #if defined MGONGPU_CUCXTYPE_THRUST
 #pragma clang diagnostic push
 #pragma clang diagnostic ignored "-Wtautological-compare" // for icpx2021/clang13 (https://stackoverflow.com/a/15864661)
@@ -30,8 +30,13 @@
 #elif not defined MGONGPU_CUCXTYPE_CXSMPL
 #error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL
 #endif
+// Complex type in HIP: cxsmpl
+#elif defined __HIPCC__
+#if not defined MGONGPU_HIPCXTYPE_CXSMPL
+#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_HIPCXTYPE_CXSMPL
+#endif
 #else
-// Complex type in c++: std::complex or cxsmpl
+// Complex type in c++ or HIP: std::complex or cxsmpl
 #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX
 #include 
 #elif not defined MGONGPU_CPPCXTYPE_CXSMPL
@@ -222,7 +227,7 @@ namespace mg5amcCpu
 #endif
 {
   // --- Type definitions (complex type: cxtype)
-#ifdef MGONGPUCPP_GPUIMPL // cuda
+#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL)
 #if defined MGONGPU_CUCXTYPE_THRUST
   typedef thrust::complex cxtype;
 #elif defined MGONGPU_CUCXTYPE_CUCOMPLEX
@@ -261,7 +266,7 @@ namespace mg5amcGpu
 namespace mg5amcCpu
 #endif
 {
-#if defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL
+#if defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL
 
   //------------------------------
   // CUDA or C++ - using cxsmpl
@@ -303,11 +308,11 @@ namespace mg5amcCpu
     return cxmake( c.real(), c.imag() );
   }
 
-#endif // #if defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL
+#endif // #if defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL
 
   //==========================================================================
 
-#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust
+#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust (this must be __CUDACC__ and not MGONGPUCPP_GPUIMPL)
 
   //------------------------------
   // CUDA - using thrust::complex
@@ -343,11 +348,11 @@ namespace mg5amcCpu
     return c;
   }
 
-#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST
+#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST
 
   //==========================================================================
 
-#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex
+#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex (this must be __CUDACC__ and not MGONGPUCPP_GPUIMPL)
 
   //------------------------------
   // CUDA - using cuComplex
@@ -562,11 +567,11 @@ namespace mg5amcCpu
     return cxmake( c.real(), c.imag() );
   }
 
-#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX
+#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX
 
   //==========================================================================
 
-#if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex
+#if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++/hip + stdcomplex (this must be __CUDACC__ and not MGONGPUCPP_GPUIMPL)
 
   //------------------------------
   // C++ - using std::complex
@@ -610,7 +615,7 @@ namespace mg5amcCpu
   }
 #endif
 
-#endif // #if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX
+#endif // #if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX
 
   //==========================================================================
diff --git a/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt b/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt
index cdbfdbd3d7..bc4cb5e760 100644
--- a/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt
+++ b/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt
@@ -62,7 +62,7 @@ generate g g > t t~ g g g
 No model currently active, so we import the Standard Model
 INFO: load particles
 INFO: load vertices
-DEBUG: model prefixing takes 0.005588531494140625
+DEBUG: model prefixing takes 0.005844831466674805
 INFO: Restrict model sm with file models/sm/restrict_default.dat .
 DEBUG: Simplifying conditional expressions
 DEBUG: remove interactions: u s w+ at order: QED=1
@@ -155,7 +155,7 @@ INFO: Please specify coupling orders to bypass this step.
 INFO: Trying coupling order WEIGHTED<=5: WEIGTHED IS QCD+2*QED
 INFO: Trying process: g g > t t~ g g g WEIGHTED<=5 @1
 INFO: Process has 1240 diagrams
-1 processes with 1240 diagrams generated in 1.900 s
+1 processes with 1240 diagrams generated in 2.003 s
 Total: 1 processes with 1240 diagrams
 output madevent ../TMPOUT/CODEGEN_mad_gg_ttggg --hel_recycling=False --vector_size=32 --me_exporter=standalone_cudacpp
 Load PLUGIN.CUDACPP_OUTPUT
@@ -177,7 +177,7 @@ INFO: Creating files in directory P1_gg_ttxggg
 INFO: Computing Color-Flow optimization [15120 term]
 INFO: Color-Flow passed to 1630 term in 8s. Introduce 3030 contraction
 DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]
-DEBUG: process_exporter_cpp = [export_v4.py at line 6262]
+DEBUG: process_exporter_cpp = [export_v4.py at line 6262]
 INFO: Creating files in directory .
 FileWriter for ././CPPProcess.h
 FileWriter for ././CPPProcess.cc
@@ -192,15 +192,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./.
 DEBUG: vector, subproc_group,self.opt['vector_size'] = 32 True 32 [export_v4.py at line 1872]
 INFO: Generating Feynman diagrams for Process: g g > t t~ g g g WEIGHTED<=5 @1
 INFO: Finding symmetric diagrams for subprocess group gg_ttxggg
-Generated helas calls for 1 subprocesses (1240 diagrams) in 6.686 s
-Wrote files for 2281 helas calls in 18.830 s
+Generated helas calls for 1 subprocesses (1240 diagrams) in 7.023 s
+Wrote files for 2281 helas calls in 19.759 s
 ALOHA: aloha starts to compute helicity amplitudes
 ALOHA: aloha creates VVV1 routines
 ALOHA: aloha creates FFV1 routines
 ALOHA: aloha creates VVVV1 routines
 ALOHA: aloha creates VVVV3 routines
 ALOHA: aloha creates VVVV4 routines
-ALOHA: aloha creates 5 routines in 0.329 s
+ALOHA: aloha creates 5 routines in 0.339 s
 DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202]
 ALOHA: aloha starts to compute helicity amplitudes
 ALOHA: aloha creates VVV1 routines
@@ -208,7 +208,7 @@ ALOHA: aloha creates FFV1 routines
 ALOHA: aloha creates VVVV1 routines
 ALOHA: aloha creates VVVV3 routines
 ALOHA: aloha creates VVVV4 routines
-ALOHA: aloha creates 10 routines in 0.323 s
+ALOHA: aloha creates 10 routines in 0.331 s
 VVV1
 VVV1
 FFV1
@@ -257,10 +257,10 @@ Type "launch" to generate events from this process, or see
 Run "open index.html" to see more information about this process.
 quit
 
-real	0m29.715s
-user	0m29.204s
-sys	0m0.406s
-Code generation completed in 30 seconds
+real	0m31.161s
+user	0m30.606s
+sys	0m0.431s
+Code generation completed in 31 seconds
 ************************************************************
 *                                                          *
 *                    W E L C O M E to                      *
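The net effect of the mgOnGpuCxtypes.h changes above is a single cxtype alias whose definition depends on the compiler and on the chosen MGONGPU_*CXTYPE_* macro. A condensed C++ sketch of that dispatch (illustration only, not the verbatim header; the double argument stands in for the configured fptype):

// Condensed sketch of the cxtype dispatch (illustration, assuming the macros above)
#if defined __CUDACC__ && defined MGONGPU_CUCXTYPE_THRUST
#include <thrust/complex.h>
typedef thrust::complex<double> cxtype; // CUDA: thrust::complex
#elif defined __HIPCC__ && defined MGONGPU_HIPCXTYPE_CXSMPL
typedef mgOnGpu::cxsmpl<double> cxtype; // HIP: cxsmpl, the only supported option (#810)
#elif defined MGONGPU_CPPCXTYPE_STDCOMPLEX
#include <complex>
typedef std::complex<double> cxtype; // C++: std::complex
#else
typedef mgOnGpu::cxsmpl<double> cxtype; // C++: cxsmpl (the new default)
#endif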
diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/check_sa.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/check_sa.cc
index aab490dc5b..bde384c69e 100644
--- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/check_sa.cc
+++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/check_sa.cc
@@ -789,7 +789,7 @@ main( int argc, char** argv )
 #else
     wrkflwtxt += "???+"; // no path to this statement
 #endif /* clang-format on */
-    // -- CUCOMPLEX or THRUST or STD complex numbers?
+    // -- CUCOMPLEX or THRUST or STD or CXSIMPLE complex numbers?
 #ifdef __CUDACC__
 #if defined MGONGPU_CUCXTYPE_CUCOMPLEX
     wrkflwtxt += "CUX:";
@@ -801,7 +801,7 @@ main( int argc, char** argv )
     wrkflwtxt += "???:"; // no path to this statement
 #endif
 #elif defined __HIPCC__
-#if defined MGONGPU_CUCXTYPE_CXSMPL
+#if defined MGONGPU_HIPCXTYPE_CXSMPL
     wrkflwtxt += "CXS:";
 #else
     wrkflwtxt += "???:"; // no path to this statement
@@ -863,7 +863,7 @@ main( int argc, char** argv )
     wrkflwtxt += "/sse4";
 #endif
 #else
-    wrkflwtxt += "/????"; // no path to this statement
+    wrkflwtxt += "/????"; // no path to this statement
 #endif
     // -- Has cxtype_v::operator[] bracket with non-const reference?
 #if defined MGONGPU_CPPSIMD
@@ -940,7 +940,9 @@ main( int argc, char** argv )
               << "Complex type = CUCOMPLEX" << std::endl
 #elif defined MGONGPU_CUCXTYPE_THRUST
               << "Complex type = THRUST::COMPLEX" << std::endl
-#elif defined MGONGPU_CUCXTYPE_CXSMPL
+#elif defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL
+              << "Complex type = CXSIMPLE" << std::endl
+#elif defined MGONGPU_CPPCXTYPE_STDCOMPLEX
               << "Complex type = STD::COMPLEX" << std::endl
 #else
               << "Complex type = ???" << std::endl // no path to this statement...
@@ -1081,7 +1083,9 @@ main( int argc, char** argv )
               << "\"CUCOMPLEX\"," << std::endl
 #elif defined MGONGPU_CUCXTYPE_THRUST
               << "\"THRUST::COMPLEX\"," << std::endl
-#elif defined MGONGPU_CUCXTYPE_CXSMPL
+#elif defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL
+              << "\"CXSIMPLE\"," << std::endl
+#elif defined MGONGPU_CPPCXTYPE_STDCOMPLEX
               << "\"STD::COMPLEX\"," << std::endl
 #else
               << "\"???\"," << std::endl // no path to this statement...
diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp.mk
index 117edc1782..df74dfc284 100644
--- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp.mk
+++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp.mk
@@ -169,7 +169,7 @@ ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),)
 # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity).
 MADGRAPH_CUDA_ARCHITECTURE ?= 70
 ###CUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533
-###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 
+###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533
 comma:=,
 CUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch))
 CUINC = -I$(CUDA_HOME)/include/
@@ -224,7 +224,7 @@ else ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),)
   GPUFLAGS += -std=c++17
   ###GPUFLAGS+= --maxrregcount 255 # (AV: is this option valid on HIP and meaningful on AMD GPUs?)
   CUBUILDRULEFLAGS = -fPIC -c
-  CCBUILDRULEFLAGS = -fPIC -c
+  CCBUILDRULEFLAGS = -fPIC -c -x hip
 else
 
 ifneq ($(origin REQUIRE_HIP),undefined)
@@ -556,6 +556,7 @@ $(BUILDDIR)/.build.$(TAG):
 	@touch $(BUILDDIR)/.build.$(TAG)
 
 # Generic target and build rules: objects from CUDA compilation
+# NB: CCBUILDRULEFLAGS includes "-x cu" for nvcc and "-x hip" for hipcc (#810)
 ifneq ($(GPUCC),)
 $(BUILDDIR)/%.o : %.cu *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
 	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
@@ -565,7 +566,6 @@ $(BUILDDIR)/%_cu.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
 	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
 	$(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CCBUILDRULEFLAGS) $< -o $@
 endif
-# -x cu in line above
 
 # Generic target and build rules: objects from C++ compilation
 # (NB do not include CUINC here! add it only for NVTX or curand #679)
diff --git a/epochX/cudacpp/gg_ttggg.mad/src/cudacpp_src.mk b/epochX/cudacpp/gg_ttggg.mad/src/cudacpp_src.mk
index 159e19a46d..b2b9da5288 100644
--- a/epochX/cudacpp/gg_ttggg.mad/src/cudacpp_src.mk
+++ b/epochX/cudacpp/gg_ttggg.mad/src/cudacpp_src.mk
@@ -92,11 +92,11 @@ endif
 ###$(info OMPFLAGS=$(OMPFLAGS))
 CXXFLAGS += $(OMPFLAGS)
 
-# Add correct -DHIP_LATFORM when compiling for HIP
+# Add correct flags for nvcc (-x cu) and hipcc (-x hip) for GPU code (see #810)
 ifeq ($(findstring nvcc,$(GPUCC)),nvcc)
   GPUFLAGS += -Xcompiler -fPIC -c -x cu
 else ifeq ($(findstring hipcc,$(GPUCC)),hipcc)
-  GPUFLAGS += -fPIC -c
+  GPUFLAGS += -fPIC -c -x hip
 endif
 
 # Set the build flags appropriate to each AVX choice (example: "make AVX=none")
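The "-x cu" / "-x hip" flags matter because the GPU sources here are .cc files: without a forced language, nvcc and hipcc would hand them to the host compiler as plain C++ and the device code would never be built. A minimal C++ illustration, not part of the patch (the file name is hypothetical; only the flags come from the makefiles above):

// kernel_in_cc_file.cc - device code in a .cc file; it is compiled as GPU code
// only when the language is forced with "nvcc -x cu" or "hipcc -x hip" (#810)
#include <cstdio>
#if defined __CUDACC__ || defined __HIPCC__
__global__ void hello() {} // device kernel, seen only in a forced GPU compilation pass
#endif
int main()
{
  // In a plain C++ pass of this .cc file, neither __CUDACC__ nor __HIPCC__ is defined
  std::printf( "host-only build of a .cc file\n" );
  return 0;
}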
diff --git a/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuConfig.h
index 69cee0085b..6bde4466d0 100644
--- a/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuConfig.h
+++ b/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuConfig.h
@@ -70,16 +70,19 @@
 ////#define MGONGPU_HARDCODE_PARAM 1
 
 // Complex type in CUDA: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE)
+// (NB THIS IS MGONGPU_*CU*CXTYPE_xxx)
 #ifdef __CUDACC__
 #define MGONGPU_CUCXTYPE_THRUST 1 // default (~1.15E9/double, ~3.2E9/float)
 //#define MGONGPU_CUCXTYPE_CUCOMPLEX 1 // ~10 percent slower (1.03E9/double, ~2.8E9/float)
 //#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float)
 
-// Complex type in HIP: cxsmpl (ONLY ONE OPTION POSSIBLE)
+// Complex type in HIP: cxsmpl (ONLY ONE OPTION POSSIBLE? #810)
+// (NB THIS IS MGONGPU_*HIP*CXTYPE_xxx)
 #elif defined __HIPCC__
-#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float)
+#define MGONGPU_HIPCXTYPE_CXSMPL 1 // default for HIP
 
 // Complex type in C++: std::complex or cxsmpl (CHOOSE ONLY ONE)
+// (NB THIS IS MGONGPU_*CPP*CXTYPE_xxx)
 #else
 //#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float)
 #define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float)
diff --git a/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuCxtypes.h b/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuCxtypes.h
index 5532e22fa1..7ede1dbfae 100644
--- a/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuCxtypes.h
+++ b/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuCxtypes.h
@@ -19,7 +19,7 @@
 #include 
 
 // Complex type in cuda: thrust or cucomplex or cxsmpl
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL)
 #if defined MGONGPU_CUCXTYPE_THRUST
 #pragma clang diagnostic push
 #pragma clang diagnostic ignored "-Wtautological-compare" // for icpx2021/clang13 (https://stackoverflow.com/a/15864661)
@@ -30,8 +30,13 @@
 #elif not defined MGONGPU_CUCXTYPE_CXSMPL
 #error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL
 #endif
+// Complex type in HIP: cxsmpl
+#elif defined __HIPCC__
+#if not defined MGONGPU_HIPCXTYPE_CXSMPL
+#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_HIPCXTYPE_CXSMPL
+#endif
 #else
-// Complex type in c++: std::complex or cxsmpl
+// Complex type in c++ or HIP: std::complex or cxsmpl
 #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX
 #include 
 #elif not defined MGONGPU_CPPCXTYPE_CXSMPL
@@ -222,7 +227,7 @@ namespace mg5amcCpu
 #endif
 {
   // --- Type definitions (complex type: cxtype)
-#ifdef MGONGPUCPP_GPUIMPL // cuda
+#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL)
 #if defined MGONGPU_CUCXTYPE_THRUST
   typedef thrust::complex cxtype;
 #elif defined MGONGPU_CUCXTYPE_CUCOMPLEX
@@ -261,7 +266,7 @@ namespace mg5amcGpu
 namespace mg5amcCpu
 #endif
 {
-#if defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL
+#if defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL
 
   //------------------------------
   // CUDA or C++ - using cxsmpl
@@ -303,11 +308,11 @@ namespace mg5amcCpu
     return cxmake( c.real(), c.imag() );
   }
 
-#endif // #if defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL
+#endif // #if defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL
 
   //==========================================================================
 
-#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust
+#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust (this must be __CUDACC__ and not MGONGPUCPP_GPUIMPL)
 
   //------------------------------
   // CUDA - using thrust::complex
@@ -343,11 +348,11 @@ namespace mg5amcCpu
     return c;
   }
 
-#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST
+#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST
 
   //==========================================================================
 
-#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex
+#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex (this must be __CUDACC__ and not MGONGPUCPP_GPUIMPL)
 
   //------------------------------
   // CUDA - using cuComplex
@@ -562,11 +567,11 @@ namespace mg5amcCpu
     return cxmake( c.real(), c.imag() );
   }
 
-#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX
+#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX
 
   //==========================================================================
 
-#if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex
+#if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++/hip + stdcomplex (this must be __CUDACC__ and not MGONGPUCPP_GPUIMPL)
 
   //------------------------------
   // C++ - using std::complex
@@ -610,7 +615,7 @@ namespace mg5amcCpu
   }
 #endif
 
-#endif // #if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX
+#endif // #if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX
 
   //==========================================================================
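Since cxsmpl is now the one complex backend shared by CUDA, HIP and C++ builds, a quick sanity check of its arithmetic can be written against a known product. This is an illustrative test, not part of the patch; it assumes mgOnGpuCxtypes.h is on the include path and that the cxmake/cxreal/cximag helper names used above behave as shown:

// cxsmpl_sanity.cc - illustration only (assumes a C++ build where cxtype is cxsmpl)
#include "mgOnGpuCxtypes.h"
#include <cassert>
int main()
{
  using namespace mg5amcCpu;
  const cxtype a = cxmake( 1., 2. );
  const cxtype b = cxmake( 3., -1. );
  const cxtype c = a * b; // (1+2i)(3-i) = 5+5i
  assert( cxreal( c ) == 5. && cximag( c ) == 5. );
  return 0;
}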
diff --git a/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt b/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt
index 31cde146a9..0ee2c63a79 100644
--- a/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt
+++ b/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt
@@ -62,7 +62,7 @@ generate g g > t t~ g g g
 No model currently active, so we import the Standard Model
 INFO: load particles
 INFO: load vertices
-DEBUG: model prefixing takes 0.0054972171783447266
+DEBUG: model prefixing takes 0.005843639373779297
 INFO: Restrict model sm with file models/sm/restrict_default.dat .
 DEBUG: Simplifying conditional expressions
 DEBUG: remove interactions: u s w+ at order: QED=1
@@ -155,7 +155,7 @@ INFO: Please specify coupling orders to bypass this step.
 INFO: Trying coupling order WEIGHTED<=5: WEIGTHED IS QCD+2*QED
 INFO: Trying process: g g > t t~ g g g WEIGHTED<=5 @1
 INFO: Process has 1240 diagrams
-1 processes with 1240 diagrams generated in 1.905 s
+1 processes with 1240 diagrams generated in 1.996 s
 Total: 1 processes with 1240 diagrams
 output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gg_ttggg
 Load PLUGIN.CUDACPP_OUTPUT
@@ -175,7 +175,7 @@ INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TM
 FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/./CPPProcess.h
 FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/./CPPProcess.cc
 INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/.
-Generated helas calls for 1 subprocesses (1240 diagrams) in 6.694 s
+Generated helas calls for 1 subprocesses (1240 diagrams) in 7.005 s
 DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202]
 ALOHA: aloha starts to compute helicity amplitudes
 ALOHA: aloha creates VVV1 routines
@@ -183,7 +183,7 @@ ALOHA: aloha creates FFV1 routines
 ALOHA: aloha creates VVVV1 routines
 ALOHA: aloha creates VVVV3 routines
 ALOHA: aloha creates VVVV4 routines
-ALOHA: aloha creates 5 routines in 1.198 s
+ALOHA: aloha creates 5 routines in 0.376 s
 VVV1
 VVV1
 FFV1
@@ -206,7 +206,7 @@ INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory
 INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/.
 quit
 
-real	0m14.110s
-user	0m13.103s
-sys	0m0.119s
+real	0m13.898s
+user	0m13.688s
+sys	0m0.137s
 Code generation completed in 14 seconds
diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/check_sa.cc b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/check_sa.cc
index aab490dc5b..bde384c69e 100644
--- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/check_sa.cc
+++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/check_sa.cc
@@ -789,7 +789,7 @@ main( int argc, char** argv )
 #else
     wrkflwtxt += "???+"; // no path to this statement
 #endif /* clang-format on */
-    // -- CUCOMPLEX or THRUST or STD complex numbers?
+    // -- CUCOMPLEX or THRUST or STD or CXSIMPLE complex numbers?
 #ifdef __CUDACC__
 #if defined MGONGPU_CUCXTYPE_CUCOMPLEX
     wrkflwtxt += "CUX:";
@@ -801,7 +801,7 @@ main( int argc, char** argv )
     wrkflwtxt += "???:"; // no path to this statement
 #endif
 #elif defined __HIPCC__
-#if defined MGONGPU_CUCXTYPE_CXSMPL
+#if defined MGONGPU_HIPCXTYPE_CXSMPL
     wrkflwtxt += "CXS:";
 #else
     wrkflwtxt += "???:"; // no path to this statement
@@ -863,7 +863,7 @@ main( int argc, char** argv )
     wrkflwtxt += "/sse4";
 #endif
 #else
-    wrkflwtxt += "/????"; // no path to this statement
+    wrkflwtxt += "/????"; // no path to this statement
 #endif
     // -- Has cxtype_v::operator[] bracket with non-const reference?
 #if defined MGONGPU_CPPSIMD
@@ -940,7 +940,9 @@ main( int argc, char** argv )
               << "Complex type = CUCOMPLEX" << std::endl
 #elif defined MGONGPU_CUCXTYPE_THRUST
               << "Complex type = THRUST::COMPLEX" << std::endl
-#elif defined MGONGPU_CUCXTYPE_CXSMPL
+#elif defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL
+              << "Complex type = CXSIMPLE" << std::endl
+#elif defined MGONGPU_CPPCXTYPE_STDCOMPLEX
               << "Complex type = STD::COMPLEX" << std::endl
 #else
               << "Complex type = ???" << std::endl // no path to this statement...
@@ -1081,7 +1083,9 @@ main( int argc, char** argv )
               << "\"CUCOMPLEX\"," << std::endl
 #elif defined MGONGPU_CUCXTYPE_THRUST
               << "\"THRUST::COMPLEX\"," << std::endl
-#elif defined MGONGPU_CUCXTYPE_CXSMPL
+#elif defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL
+              << "\"CXSIMPLE\"," << std::endl
+#elif defined MGONGPU_CPPCXTYPE_STDCOMPLEX
               << "\"STD::COMPLEX\"," << std::endl
 #else
               << "\"???\"," << std::endl // no path to this statement...
diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp.mk
index 117edc1782..df74dfc284 100644
--- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp.mk
+++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp.mk
@@ -169,7 +169,7 @@ ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),)
 # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity).
 MADGRAPH_CUDA_ARCHITECTURE ?= 70
 ###CUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533
-###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 
+###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533
 comma:=,
 CUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch))
 CUINC = -I$(CUDA_HOME)/include/
@@ -224,7 +224,7 @@ else ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),)
   GPUFLAGS += -std=c++17
   ###GPUFLAGS+= --maxrregcount 255 # (AV: is this option valid on HIP and meaningful on AMD GPUs?)
   CUBUILDRULEFLAGS = -fPIC -c
-  CCBUILDRULEFLAGS = -fPIC -c
+  CCBUILDRULEFLAGS = -fPIC -c -x hip
 else
 
 ifneq ($(origin REQUIRE_HIP),undefined)
@@ -556,6 +556,7 @@ $(BUILDDIR)/.build.$(TAG):
 	@touch $(BUILDDIR)/.build.$(TAG)
 
 # Generic target and build rules: objects from CUDA compilation
+# NB: CCBUILDRULEFLAGS includes "-x cu" for nvcc and "-x hip" for hipcc (#810)
 ifneq ($(GPUCC),)
 $(BUILDDIR)/%.o : %.cu *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
 	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
@@ -565,7 +566,6 @@ $(BUILDDIR)/%_cu.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
 	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
 	$(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CCBUILDRULEFLAGS) $< -o $@
 endif
-# -x cu in line above
 
 # Generic target and build rules: objects from C++ compilation
 # (NB do not include CUINC here! add it only for NVTX or curand #679)
diff --git a/epochX/cudacpp/gg_ttggg.sa/src/cudacpp_src.mk b/epochX/cudacpp/gg_ttggg.sa/src/cudacpp_src.mk
index 159e19a46d..b2b9da5288 100644
--- a/epochX/cudacpp/gg_ttggg.sa/src/cudacpp_src.mk
+++ b/epochX/cudacpp/gg_ttggg.sa/src/cudacpp_src.mk
@@ -92,11 +92,11 @@ endif
 ###$(info OMPFLAGS=$(OMPFLAGS))
 CXXFLAGS += $(OMPFLAGS)
 
-# Add correct -DHIP_LATFORM when compiling for HIP
+# Add correct flags for nvcc (-x cu) and hipcc (-x hip) for GPU code (see #810)
 ifeq ($(findstring nvcc,$(GPUCC)),nvcc)
   GPUFLAGS += -Xcompiler -fPIC -c -x cu
 else ifeq ($(findstring hipcc,$(GPUCC)),hipcc)
-  GPUFLAGS += -fPIC -c
+  GPUFLAGS += -fPIC -c -x hip
 endif
 
 # Set the build flags appropriate to each AVX choice (example: "make AVX=none")
diff --git a/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuConfig.h
index 06787c1c5e..475749ca7c 100644
--- a/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuConfig.h
+++ b/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuConfig.h
@@ -70,16 +70,19 @@
 ////#define MGONGPU_HARDCODE_PARAM 1
 
 // Complex type in CUDA: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE)
+// (NB THIS IS MGONGPU_*CU*CXTYPE_xxx)
 #ifdef __CUDACC__
 #define MGONGPU_CUCXTYPE_THRUST 1 // default (~1.15E9/double, ~3.2E9/float)
 //#define MGONGPU_CUCXTYPE_CUCOMPLEX 1 // ~10 percent slower (1.03E9/double, ~2.8E9/float)
 //#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float)
 
-// Complex type in HIP: cxsmpl (ONLY ONE OPTION POSSIBLE)
+// Complex type in HIP: cxsmpl (ONLY ONE OPTION POSSIBLE? #810)
+// (NB THIS IS MGONGPU_*HIP*CXTYPE_xxx)
 #elif defined __HIPCC__
-#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float)
+#define MGONGPU_HIPCXTYPE_CXSMPL 1 // default for HIP
 
 // Complex type in C++: std::complex or cxsmpl (CHOOSE ONLY ONE)
+// (NB THIS IS MGONGPU_*CPP*CXTYPE_xxx)
 #else
 //#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float)
 #define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float)
diff --git a/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuCxtypes.h b/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuCxtypes.h
index 5532e22fa1..7ede1dbfae 100644
--- a/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuCxtypes.h
+++ b/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuCxtypes.h
@@ -19,7 +19,7 @@
 #include 
 
 // Complex type in cuda: thrust or cucomplex or cxsmpl
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL)
 #if defined MGONGPU_CUCXTYPE_THRUST
 #pragma clang diagnostic push
 #pragma clang diagnostic ignored "-Wtautological-compare" // for icpx2021/clang13 (https://stackoverflow.com/a/15864661)
@@ -30,8 +30,13 @@
 #elif not defined MGONGPU_CUCXTYPE_CXSMPL
 #error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL
 #endif
+// Complex type in HIP: cxsmpl
+#elif defined __HIPCC__
+#if not defined MGONGPU_HIPCXTYPE_CXSMPL
+#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_HIPCXTYPE_CXSMPL
+#endif
 #else
-// Complex type in c++: std::complex or cxsmpl
+// Complex type in c++ or HIP: std::complex or cxsmpl
 #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX
 #include 
 #elif not defined MGONGPU_CPPCXTYPE_CXSMPL
@@ -222,7 +227,7 @@ namespace mg5amcCpu
 #endif
 {
   // --- Type definitions (complex type: cxtype)
-#ifdef MGONGPUCPP_GPUIMPL // cuda
+#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL)
 #if defined MGONGPU_CUCXTYPE_THRUST
   typedef thrust::complex cxtype;
 #elif defined MGONGPU_CUCXTYPE_CUCOMPLEX
@@ -261,7 +266,7 @@ namespace mg5amcGpu
 namespace mg5amcCpu
 #endif
 {
-#if defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL
+#if defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL
 
   //------------------------------
   // CUDA or C++ - using cxsmpl
@@ -303,11 +308,11 @@ namespace mg5amcCpu
     return cxmake( c.real(), c.imag() );
   }
 
-#endif // #if defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL
+#endif // #if defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL
 
   //==========================================================================
 
-#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust
+#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust (this must be __CUDACC__ and not MGONGPUCPP_GPUIMPL)
 
   //------------------------------
   // CUDA - using thrust::complex
@@ -343,11 +348,11 @@ namespace mg5amcCpu
     return c;
   }
 
-#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST
+#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST
 
   //==========================================================================
 
-#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex
+#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex (this must be __CUDACC__ and not MGONGPUCPP_GPUIMPL)
 
   //------------------------------
   // CUDA - using cuComplex
@@ -562,11 +567,11 @@ namespace mg5amcCpu
     return cxmake( c.real(), c.imag() );
   }
 
-#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX
+#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX
 
   //==========================================================================
 
-#if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex
+#if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++/hip + stdcomplex (this must be __CUDACC__ and not MGONGPUCPP_GPUIMPL)
 
   //------------------------------
   // C++ - using std::complex
@@ -610,7 +615,7 @@ namespace mg5amcCpu
   }
 #endif
 
-#endif // #if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX
+#endif // #if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX
 
   //==========================================================================
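The guard pattern repeated in each copy of mgOnGpuCxtypes.h fails the build early if the configuration and the compiler disagree. Stripped to its skeleton (an illustration of the pattern, not the verbatim header):

// Skeleton of the configuration guard (illustration)
#if defined __CUDACC__
#if !defined MGONGPU_CUCXTYPE_THRUST && !defined MGONGPU_CUCXTYPE_CUCOMPLEX && !defined MGONGPU_CUCXTYPE_CXSMPL
#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL
#endif
#elif defined __HIPCC__
#if !defined MGONGPU_HIPCXTYPE_CXSMPL
#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_HIPCXTYPE_CXSMPL
#endif
#endif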
diff --git a/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt b/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt
index 14f284d218..59ac167900 100644
--- a/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt
+++ b/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt
@@ -61,7 +61,7 @@ set zerowidth_tchannel F
 define q = u c d s u~ c~ d~ s~
 INFO: load particles
 INFO: load vertices
-DEBUG: model prefixing takes 0.005811214447021484
+DEBUG: model prefixing takes 0.00553131103515625
 INFO: Restrict model sm with file models/sm/restrict_default.dat .
 DEBUG: Simplifying conditional expressions
 DEBUG: remove interactions: u s w+ at order: QED=1
@@ -198,7 +198,7 @@ INFO: Combined process g d~ > t t~ d~ WEIGHTED<=3 @1 with process g u~ > t t~ u~
 INFO: Combined process g s~ > t t~ s~ WEIGHTED<=3 @1 with process g u~ > t t~ u~ WEIGHTED<=3 @1
 INFO: Creating files in directory P1_gu_ttxu
 DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]
-DEBUG: process_exporter_cpp = [export_v4.py at line 6262]
+DEBUG: process_exporter_cpp = [export_v4.py at line 6262]
 INFO: Creating files in directory .
 FileWriter for ././CPPProcess.h
 FileWriter for ././CPPProcess.cc
@@ -215,7 +215,7 @@ INFO: Generating Feynman diagrams for Process: g u > t t~ u WEIGHTED<=3 @1
 INFO: Finding symmetric diagrams for subprocess group gu_ttxu
 INFO: Creating files in directory P1_gux_ttxux
 DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]
-DEBUG: process_exporter_cpp = [export_v4.py at line 6262]
+DEBUG: process_exporter_cpp = [export_v4.py at line 6262]
 INFO: Creating files in directory .
 FileWriter for ././CPPProcess.h
 FileWriter for ././CPPProcess.cc
@@ -231,11 +231,11 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./.
 INFO: Generating Feynman diagrams for Process: g u~ > t t~ u~ WEIGHTED<=3 @1
 INFO: Finding symmetric diagrams for subprocess group gux_ttxux
 Generated helas calls for 2 subprocesses (10 diagrams) in 0.033 s
-Wrote files for 32 helas calls in 0.321 s
+Wrote files for 32 helas calls in 0.241 s
 ALOHA: aloha starts to compute helicity amplitudes
 ALOHA: aloha creates FFV1 routines
 ALOHA: aloha creates VVV1 routines
-ALOHA: aloha creates 2 routines in 0.155 s
+ALOHA: aloha creates 2 routines in 0.156 s
 DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202]
 ALOHA: aloha starts to compute helicity amplitudes
 ALOHA: aloha creates FFV1 routines
@@ -294,10 +294,10 @@ Type "launch" to generate events from this process, or see
 Run "open index.html" to see more information about this process.
 quit
 
-real	0m2.865s
-user	0m1.808s
-sys	0m0.282s
-Code generation completed in 3 seconds
+real	0m2.049s
+user	0m1.786s
+sys	0m0.254s
+Code generation completed in 2 seconds
 ************************************************************
 *                                                          *
 *                    W E L C O M E to                      *
diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/check_sa.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/check_sa.cc
index aab490dc5b..bde384c69e 100644
--- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/check_sa.cc
+++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/check_sa.cc
@@ -789,7 +789,7 @@ main( int argc, char** argv )
 #else
     wrkflwtxt += "???+"; // no path to this statement
 #endif /* clang-format on */
-    // -- CUCOMPLEX or THRUST or STD complex numbers?
+    // -- CUCOMPLEX or THRUST or STD or CXSIMPLE complex numbers?
 #ifdef __CUDACC__
 #if defined MGONGPU_CUCXTYPE_CUCOMPLEX
     wrkflwtxt += "CUX:";
@@ -801,7 +801,7 @@ main( int argc, char** argv )
     wrkflwtxt += "???:"; // no path to this statement
 #endif
 #elif defined __HIPCC__
-#if defined MGONGPU_CUCXTYPE_CXSMPL
+#if defined MGONGPU_HIPCXTYPE_CXSMPL
     wrkflwtxt += "CXS:";
 #else
     wrkflwtxt += "???:"; // no path to this statement
@@ -863,7 +863,7 @@ main( int argc, char** argv )
     wrkflwtxt += "/sse4";
 #endif
 #else
-    wrkflwtxt += "/????"; // no path to this statement
+    wrkflwtxt += "/????"; // no path to this statement
 #endif
     // -- Has cxtype_v::operator[] bracket with non-const reference?
 #if defined MGONGPU_CPPSIMD
@@ -940,7 +940,9 @@ main( int argc, char** argv )
               << "Complex type = CUCOMPLEX" << std::endl
 #elif defined MGONGPU_CUCXTYPE_THRUST
               << "Complex type = THRUST::COMPLEX" << std::endl
-#elif defined MGONGPU_CUCXTYPE_CXSMPL
+#elif defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL
+              << "Complex type = CXSIMPLE" << std::endl
+#elif defined MGONGPU_CPPCXTYPE_STDCOMPLEX
               << "Complex type = STD::COMPLEX" << std::endl
 #else
               << "Complex type = ???" << std::endl // no path to this statement...
@@ -1081,7 +1083,9 @@ main( int argc, char** argv )
               << "\"CUCOMPLEX\"," << std::endl
 #elif defined MGONGPU_CUCXTYPE_THRUST
               << "\"THRUST::COMPLEX\"," << std::endl
-#elif defined MGONGPU_CUCXTYPE_CXSMPL
+#elif defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL
+              << "\"CXSIMPLE\"," << std::endl
+#elif defined MGONGPU_CPPCXTYPE_STDCOMPLEX
               << "\"STD::COMPLEX\"," << std::endl
 #else
               << "\"???\"," << std::endl // no path to this statement...
diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/check_sa.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/check_sa.cc
index aab490dc5b..bde384c69e 100644
--- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/check_sa.cc
+++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/check_sa.cc
@@ -789,7 +789,7 @@ main( int argc, char** argv )
 #else
     wrkflwtxt += "???+"; // no path to this statement
 #endif /* clang-format on */
-    // -- CUCOMPLEX or THRUST or STD complex numbers?
+    // -- CUCOMPLEX or THRUST or STD or CXSIMPLE complex numbers?
 #ifdef __CUDACC__
 #if defined MGONGPU_CUCXTYPE_CUCOMPLEX
     wrkflwtxt += "CUX:";
@@ -801,7 +801,7 @@ main( int argc, char** argv )
     wrkflwtxt += "???:"; // no path to this statement
 #endif
 #elif defined __HIPCC__
-#if defined MGONGPU_CUCXTYPE_CXSMPL
+#if defined MGONGPU_HIPCXTYPE_CXSMPL
     wrkflwtxt += "CXS:";
 #else
     wrkflwtxt += "???:"; // no path to this statement
@@ -863,7 +863,7 @@ main( int argc, char** argv )
     wrkflwtxt += "/sse4";
 #endif
 #else
-    wrkflwtxt += "/????"; // no path to this statement
+    wrkflwtxt += "/????"; // no path to this statement
 #endif
     // -- Has cxtype_v::operator[] bracket with non-const reference?
 #if defined MGONGPU_CPPSIMD
@@ -940,7 +940,9 @@ main( int argc, char** argv )
               << "Complex type = CUCOMPLEX" << std::endl
 #elif defined MGONGPU_CUCXTYPE_THRUST
               << "Complex type = THRUST::COMPLEX" << std::endl
-#elif defined MGONGPU_CUCXTYPE_CXSMPL
+#elif defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL
+              << "Complex type = CXSIMPLE" << std::endl
+#elif defined MGONGPU_CPPCXTYPE_STDCOMPLEX
               << "Complex type = STD::COMPLEX" << std::endl
 #else
               << "Complex type = ???" << std::endl // no path to this statement...
@@ -1081,7 +1083,9 @@ main( int argc, char** argv )
               << "\"CUCOMPLEX\"," << std::endl
 #elif defined MGONGPU_CUCXTYPE_THRUST
               << "\"THRUST::COMPLEX\"," << std::endl
-#elif defined MGONGPU_CUCXTYPE_CXSMPL
+#elif defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL
+              << "\"CXSIMPLE\"," << std::endl
+#elif defined MGONGPU_CPPCXTYPE_STDCOMPLEX
               << "\"STD::COMPLEX\"," << std::endl
 #else
               << "\"???\"," << std::endl // no path to this statement...
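In the check_sa.cc reporting chains above, branch order matters: the combined CXSMPL branch must come before the STDCOMPLEX one, and the STDCOMPLEX branch must test MGONGPU_CPPCXTYPE_STDCOMPLEX, the only STDCOMPLEX macro that is ever defined (a MGONGPU_CUCXTYPE_STDCOMPLEX test would be dead code). A self-contained C++ sketch of the chain, for illustration only:

// report_complex_type.cc - sketch of the check_sa.cc reporting chain (illustration)
#include <iostream>
int main()
{
  std::cout
#if defined MGONGPU_CUCXTYPE_CUCOMPLEX
    << "Complex type = CUCOMPLEX" << std::endl
#elif defined MGONGPU_CUCXTYPE_THRUST
    << "Complex type = THRUST::COMPLEX" << std::endl
#elif defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL
    << "Complex type = CXSIMPLE" << std::endl
#elif defined MGONGPU_CPPCXTYPE_STDCOMPLEX
    << "Complex type = STD::COMPLEX" << std::endl
#else
    << "Complex type = ???" << std::endl // no macro defined in a bare compile
#endif
    ;
  return 0;
}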
diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gq_ttq.mad/SubProcesses/cudacpp.mk
index 117edc1782..df74dfc284 100644
--- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/cudacpp.mk
+++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/cudacpp.mk
@@ -169,7 +169,7 @@ ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),)
 # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity).
 MADGRAPH_CUDA_ARCHITECTURE ?= 70
 ###CUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533
-###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 
+###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533
 comma:=,
 CUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch))
 CUINC = -I$(CUDA_HOME)/include/
@@ -224,7 +224,7 @@ else ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),)
   GPUFLAGS += -std=c++17
   ###GPUFLAGS+= --maxrregcount 255 # (AV: is this option valid on HIP and meaningful on AMD GPUs?)
   CUBUILDRULEFLAGS = -fPIC -c
-  CCBUILDRULEFLAGS = -fPIC -c
+  CCBUILDRULEFLAGS = -fPIC -c -x hip
 else
 
 ifneq ($(origin REQUIRE_HIP),undefined)
@@ -556,6 +556,7 @@ $(BUILDDIR)/.build.$(TAG):
 	@touch $(BUILDDIR)/.build.$(TAG)
 
 # Generic target and build rules: objects from CUDA compilation
+# NB: CCBUILDRULEFLAGS includes "-x cu" for nvcc and "-x hip" for hipcc (#810)
 ifneq ($(GPUCC),)
 $(BUILDDIR)/%.o : %.cu *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
 	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
@@ -565,7 +566,6 @@ $(BUILDDIR)/%_cu.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
 	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
 	$(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CCBUILDRULEFLAGS) $< -o $@
 endif
-# -x cu in line above
 
 # Generic target and build rules: objects from C++ compilation
 # (NB do not include CUINC here! add it only for NVTX or curand #679)
diff --git a/epochX/cudacpp/gq_ttq.mad/src/cudacpp_src.mk b/epochX/cudacpp/gq_ttq.mad/src/cudacpp_src.mk
index 159e19a46d..b2b9da5288 100644
--- a/epochX/cudacpp/gq_ttq.mad/src/cudacpp_src.mk
+++ b/epochX/cudacpp/gq_ttq.mad/src/cudacpp_src.mk
@@ -92,11 +92,11 @@ endif
 ###$(info OMPFLAGS=$(OMPFLAGS))
 CXXFLAGS += $(OMPFLAGS)
 
-# Add correct -DHIP_LATFORM when compiling for HIP
+# Add correct flags for nvcc (-x cu) and hipcc (-x hip) for GPU code (see #810)
 ifeq ($(findstring nvcc,$(GPUCC)),nvcc)
   GPUFLAGS += -Xcompiler -fPIC -c -x cu
 else ifeq ($(findstring hipcc,$(GPUCC)),hipcc)
-  GPUFLAGS += -fPIC -c
+  GPUFLAGS += -fPIC -c -x hip
 endif
 
 # Set the build flags appropriate to each AVX choice (example: "make AVX=none")
diff --git a/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuConfig.h
index 69cee0085b..6bde4466d0 100644
--- a/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuConfig.h
+++ b/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuConfig.h
@@ -70,16 +70,19 @@
 ////#define MGONGPU_HARDCODE_PARAM 1
 
 // Complex type in CUDA: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE)
+// (NB THIS IS MGONGPU_*CU*CXTYPE_xxx)
 #ifdef __CUDACC__
 #define MGONGPU_CUCXTYPE_THRUST 1 // default (~1.15E9/double, ~3.2E9/float)
 //#define MGONGPU_CUCXTYPE_CUCOMPLEX 1 // ~10 percent slower (1.03E9/double, ~2.8E9/float)
 //#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float)
 
-// Complex type in HIP: cxsmpl (ONLY ONE OPTION POSSIBLE)
+// Complex type in HIP: cxsmpl (ONLY ONE OPTION POSSIBLE? #810)
+// (NB THIS IS MGONGPU_*HIP*CXTYPE_xxx)
 #elif defined __HIPCC__
-#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float)
+#define MGONGPU_HIPCXTYPE_CXSMPL 1 // default for HIP
 
 // Complex type in C++: std::complex or cxsmpl (CHOOSE ONLY ONE)
+// (NB THIS IS MGONGPU_*CPP*CXTYPE_xxx)
 #else
 //#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float)
 #define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float)
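The recurring "this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL)" comments encode a subtle point: MGONGPUCPP_GPUIMPL marks a GPU build of either flavour, while the complex-type dispatch needs to know which actual compiler is running. A toy C++ illustration (the BACKEND macro is hypothetical, introduced only for this sketch):

// Why the cxtype dispatch keys on the compiler, not on MGONGPUCPP_GPUIMPL (illustration)
#if defined __CUDACC__
#define BACKEND "nvcc: thrust, cucomplex and cxsmpl are all available"
#elif defined __HIPCC__
#define BACKEND "hipcc: only cxsmpl is supported (#810)"
#else
#define BACKEND "host C++: std::complex or cxsmpl"
#endif
// A plain #ifdef MGONGPUCPP_GPUIMPL here could not tell nvcc from hipcc,
// and would wrongly offer thrust::complex to HIP builds.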
diff --git a/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuCxtypes.h b/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuCxtypes.h
index 5532e22fa1..7ede1dbfae 100644
--- a/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuCxtypes.h
+++ b/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuCxtypes.h
@@ -19,7 +19,7 @@
 #include 
 
 // Complex type in cuda: thrust or cucomplex or cxsmpl
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL)
 #if defined MGONGPU_CUCXTYPE_THRUST
 #pragma clang diagnostic push
 #pragma clang diagnostic ignored "-Wtautological-compare" // for icpx2021/clang13 (https://stackoverflow.com/a/15864661)
@@ -30,8 +30,13 @@
 #elif not defined MGONGPU_CUCXTYPE_CXSMPL
 #error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL
 #endif
+// Complex type in HIP: cxsmpl
+#elif defined __HIPCC__
+#if not defined MGONGPU_HIPCXTYPE_CXSMPL
+#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_HIPCXTYPE_CXSMPL
+#endif
 #else
-// Complex type in c++: std::complex or cxsmpl
+// Complex type in c++ or HIP: std::complex or cxsmpl
 #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX
 #include 
 #elif not defined MGONGPU_CPPCXTYPE_CXSMPL
@@ -222,7 +227,7 @@ namespace mg5amcCpu
 #endif
 {
   // --- Type definitions (complex type: cxtype)
-#ifdef MGONGPUCPP_GPUIMPL // cuda
+#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL)
 #if defined MGONGPU_CUCXTYPE_THRUST
   typedef thrust::complex cxtype;
 #elif defined MGONGPU_CUCXTYPE_CUCOMPLEX
@@ -261,7 +266,7 @@ namespace mg5amcGpu
 namespace mg5amcCpu
 #endif
 {
-#if defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL
+#if defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL
 
   //------------------------------
   // CUDA or C++ - using cxsmpl
@@ -303,11 +308,11 @@ namespace mg5amcCpu
     return cxmake( c.real(), c.imag() );
   }
 
-#endif // #if defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL
+#endif // #if defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL
 
   //==========================================================================
 
-#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust
+#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust (this must be __CUDACC__ and not MGONGPUCPP_GPUIMPL)
 
   //------------------------------
   // CUDA - using thrust::complex
@@ -343,11 +348,11 @@ namespace mg5amcCpu
     return c;
  }
 
-#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST
+#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST
 
   //==========================================================================
 
-#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex
+#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex (this must be __CUDACC__ and not MGONGPUCPP_GPUIMPL)
 
   //------------------------------
   // CUDA - using cuComplex
@@ -562,11 +567,11 @@ namespace mg5amcCpu
     return cxmake( c.real(), c.imag() );
   }
 
-#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX
+#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX
 
   //==========================================================================
 
-#if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex
+#if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++/hip + stdcomplex (this must be __CUDACC__ and not MGONGPUCPP_GPUIMPL)
 
   //------------------------------
   // C++ - using std::complex
@@ -610,7 +615,7 @@ namespace mg5amcCpu
   }
 #endif
 
-#endif // #if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX
+#endif // #if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX
 
   //==========================================================================
diff --git a/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt b/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt
index c91b123988..5ad4ee5be1 100644
--- a/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt
+++ b/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt
@@ -61,7 +61,7 @@ set zerowidth_tchannel F
 define q = u c d s u~ c~ d~ s~
 INFO: load particles
 INFO: load vertices
-DEBUG: model prefixing takes 0.005486726760864258
+DEBUG: model prefixing takes 0.00584101676940918
 INFO: Restrict model sm with file models/sm/restrict_default.dat .
 DEBUG: Simplifying conditional expressions
 DEBUG: remove interactions: u s w+ at order: QED=1
@@ -170,7 +170,7 @@ INFO: Crossed process found for g u~ > t t~ u~, reuse diagrams.
 INFO: Crossed process found for g c~ > t t~ c~, reuse diagrams.
 INFO: Crossed process found for g d~ > t t~ d~, reuse diagrams.
 INFO: Crossed process found for g s~ > t t~ s~, reuse diagrams.
-8 processes with 40 diagrams generated in 0.080 s
+8 processes with 40 diagrams generated in 0.083 s
 Total: 8 processes with 40 diagrams
 output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gq_ttq
 Load PLUGIN.CUDACPP_OUTPUT
@@ -206,12 +206,12 @@ INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TM
 FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux/./CPPProcess.h
 FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux/./CPPProcess.cc
 INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux/.
-Generated helas calls for 2 subprocesses (10 diagrams) in 0.030 s
+Generated helas calls for 2 subprocesses (10 diagrams) in 0.032 s
 DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202]
 ALOHA: aloha starts to compute helicity amplitudes
 ALOHA: aloha creates FFV1 routines
 ALOHA: aloha creates VVV1 routines
-ALOHA: aloha creates 2 routines in 0.148 s
+ALOHA: aloha creates 2 routines in 0.157 s
 FFV1
 FFV1
 FFV1
@@ -227,7 +227,7 @@ INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory
 INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/.
 quit
 
-real	0m0.784s
-user	0m0.611s
-sys	0m0.049s
-Code generation completed in 0 seconds
+real	0m0.692s
+user	0m0.624s
+sys	0m0.058s
+Code generation completed in 1 seconds
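check_sa.cc also folds the complex-type choice into a compact workflow tag (e.g. "CUX:" for cuComplex, "CXS:" for cxsmpl, as in the hunks below). A reduced C++ sketch of how that tag is assembled, for illustration only (tags other than CUX/CXS/??? are elided here):

// Reduced sketch of the wrkflwtxt tag assembly in check_sa.cc (illustration)
#include <string>
std::string complexTag()
{
  std::string wrkflwtxt;
#ifdef __CUDACC__
#if defined MGONGPU_CUCXTYPE_CUCOMPLEX
  wrkflwtxt += "CUX:";
#elif defined MGONGPU_CUCXTYPE_CXSMPL
  wrkflwtxt += "CXS:";
#else
  wrkflwtxt += "???:"; // other CUDA tags elided in this sketch
#endif
#elif defined __HIPCC__
#if defined MGONGPU_HIPCXTYPE_CXSMPL
  wrkflwtxt += "CXS:";
#else
  wrkflwtxt += "???:"; // no path to this statement
#endif
#else
  wrkflwtxt += "CXS:"; // C++ builds: cxsmpl is the default
#endif
  return wrkflwtxt;
}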
diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/check_sa.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/check_sa.cc
index aab490dc5b..bde384c69e 100644
--- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/check_sa.cc
+++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/check_sa.cc
@@ -789,7 +789,7 @@ main( int argc, char** argv )
 #else
     wrkflwtxt += "???+"; // no path to this statement
 #endif /* clang-format on */
-    // -- CUCOMPLEX or THRUST or STD complex numbers?
+    // -- CUCOMPLEX or THRUST or STD or CXSIMPLE complex numbers?
 #ifdef __CUDACC__
 #if defined MGONGPU_CUCXTYPE_CUCOMPLEX
     wrkflwtxt += "CUX:";
@@ -801,7 +801,7 @@ main( int argc, char** argv )
     wrkflwtxt += "???:"; // no path to this statement
 #endif
 #elif defined __HIPCC__
-#if defined MGONGPU_CUCXTYPE_CXSMPL
+#if defined MGONGPU_HIPCXTYPE_CXSMPL
     wrkflwtxt += "CXS:";
 #else
     wrkflwtxt += "???:"; // no path to this statement
@@ -863,7 +863,7 @@ main( int argc, char** argv )
     wrkflwtxt += "/sse4";
 #endif
 #else
-    wrkflwtxt += "/????"; // no path to this statement
+    wrkflwtxt += "/????"; // no path to this statement
 #endif
     // -- Has cxtype_v::operator[] bracket with non-const reference?
 #if defined MGONGPU_CPPSIMD
@@ -940,7 +940,9 @@ main( int argc, char** argv )
               << "Complex type = CUCOMPLEX" << std::endl
 #elif defined MGONGPU_CUCXTYPE_THRUST
               << "Complex type = THRUST::COMPLEX" << std::endl
-#elif defined MGONGPU_CUCXTYPE_CXSMPL
+#elif defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL
+              << "Complex type = CXSIMPLE" << std::endl
+#elif defined MGONGPU_CPPCXTYPE_STDCOMPLEX
               << "Complex type = STD::COMPLEX" << std::endl
 #else
               << "Complex type = ???" << std::endl // no path to this statement...
@@ -1081,7 +1083,9 @@ main( int argc, char** argv )
               << "\"CUCOMPLEX\"," << std::endl
 #elif defined MGONGPU_CUCXTYPE_THRUST
               << "\"THRUST::COMPLEX\"," << std::endl
-#elif defined MGONGPU_CUCXTYPE_CXSMPL
+#elif defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL
+              << "\"CXSIMPLE\"," << std::endl
+#elif defined MGONGPU_CPPCXTYPE_STDCOMPLEX
               << "\"STD::COMPLEX\"," << std::endl
 #else
               << "\"???\"," << std::endl // no path to this statement...
diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/check_sa.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/check_sa.cc
index aab490dc5b..bde384c69e 100644
--- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/check_sa.cc
+++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/check_sa.cc
@@ -789,7 +789,7 @@ main( int argc, char** argv )
 #else
     wrkflwtxt += "???+"; // no path to this statement
 #endif /* clang-format on */
-    // -- CUCOMPLEX or THRUST or STD complex numbers?
+    // -- CUCOMPLEX or THRUST or STD or CXSIMPLE complex numbers?
 #ifdef __CUDACC__
 #if defined MGONGPU_CUCXTYPE_CUCOMPLEX
     wrkflwtxt += "CUX:";
@@ -801,7 +801,7 @@ main( int argc, char** argv )
     wrkflwtxt += "???:"; // no path to this statement
 #endif
 #elif defined __HIPCC__
-#if defined MGONGPU_CUCXTYPE_CXSMPL
+#if defined MGONGPU_HIPCXTYPE_CXSMPL
     wrkflwtxt += "CXS:";
 #else
     wrkflwtxt += "???:"; // no path to this statement
@@ -863,7 +863,7 @@ main( int argc, char** argv )
     wrkflwtxt += "/sse4";
 #endif
 #else
-    wrkflwtxt += "/????"; // no path to this statement
+    wrkflwtxt += "/????"; // no path to this statement
 #endif
     // -- Has cxtype_v::operator[] bracket with non-const reference?
 #if defined MGONGPU_CPPSIMD
@@ -940,7 +940,9 @@ main( int argc, char** argv )
               << "Complex type = CUCOMPLEX" << std::endl
 #elif defined MGONGPU_CUCXTYPE_THRUST
               << "Complex type = THRUST::COMPLEX" << std::endl
-#elif defined MGONGPU_CUCXTYPE_CXSMPL
+#elif defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL
+              << "Complex type = CXSIMPLE" << std::endl
+#elif defined MGONGPU_CPPCXTYPE_STDCOMPLEX
               << "Complex type = STD::COMPLEX" << std::endl
 #else
               << "Complex type = ???" << std::endl // no path to this statement...
@@ -1081,7 +1083,9 @@ main( int argc, char** argv )
               << "\"CUCOMPLEX\"," << std::endl
 #elif defined MGONGPU_CUCXTYPE_THRUST
               << "\"THRUST::COMPLEX\"," << std::endl
-#elif defined MGONGPU_CUCXTYPE_CXSMPL
+#elif defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL
+              << "\"CXSIMPLE\"," << std::endl
+#elif defined MGONGPU_CPPCXTYPE_STDCOMPLEX
               << "\"STD::COMPLEX\"," << std::endl
 #else
               << "\"???\"," << std::endl // no path to this statement...
diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gq_ttq.sa/SubProcesses/cudacpp.mk
index 117edc1782..df74dfc284 100644
--- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/cudacpp.mk
+++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/cudacpp.mk
@@ -169,7 +169,7 @@ ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),)
 # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity).
 MADGRAPH_CUDA_ARCHITECTURE ?= 70
 ###CUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533
-###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 
+###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533
 comma:=,
 CUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch))
 CUINC = -I$(CUDA_HOME)/include/
@@ -224,7 +224,7 @@ else ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),)
   GPUFLAGS += -std=c++17
   ###GPUFLAGS+= --maxrregcount 255 # (AV: is this option valid on HIP and meaningful on AMD GPUs?)
   CUBUILDRULEFLAGS = -fPIC -c
-  CCBUILDRULEFLAGS = -fPIC -c
+  CCBUILDRULEFLAGS = -fPIC -c -x hip
 else
 
 ifneq ($(origin REQUIRE_HIP),undefined)
@@ -556,6 +556,7 @@ $(BUILDDIR)/.build.$(TAG):
 	@touch $(BUILDDIR)/.build.$(TAG)
 
 # Generic target and build rules: objects from CUDA compilation
+# NB: CCBUILDRULEFLAGS includes "-x cu" for nvcc and "-x hip" for hipcc (#810)
 ifneq ($(GPUCC),)
 $(BUILDDIR)/%.o : %.cu *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
 	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
@@ -565,7 +566,6 @@ $(BUILDDIR)/%_cu.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG)
 	@if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi
 	$(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CCBUILDRULEFLAGS) $< -o $@
 endif
-# -x cu in line above
 
 # Generic target and build rules: objects from C++ compilation
 # (NB do not include CUINC here! add it only for NVTX or curand #679)
diff --git a/epochX/cudacpp/gq_ttq.sa/src/cudacpp_src.mk b/epochX/cudacpp/gq_ttq.sa/src/cudacpp_src.mk
index 159e19a46d..b2b9da5288 100644
--- a/epochX/cudacpp/gq_ttq.sa/src/cudacpp_src.mk
+++ b/epochX/cudacpp/gq_ttq.sa/src/cudacpp_src.mk
@@ -92,11 +92,11 @@ endif
 ###$(info OMPFLAGS=$(OMPFLAGS))
 CXXFLAGS += $(OMPFLAGS)
 
-# Add correct -DHIP_LATFORM when compiling for HIP
+# Add correct flags for nvcc (-x cu) and hipcc (-x hip) for GPU code (see #810)
 ifeq ($(findstring nvcc,$(GPUCC)),nvcc)
   GPUFLAGS += -Xcompiler -fPIC -c -x cu
 else ifeq ($(findstring hipcc,$(GPUCC)),hipcc)
-  GPUFLAGS += -fPIC -c
+  GPUFLAGS += -fPIC -c -x hip
 endif
 
 # Set the build flags appropriate to each AVX choice (example: "make AVX=none")
diff --git a/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuConfig.h b/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuConfig.h
index 06787c1c5e..475749ca7c 100644
--- a/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuConfig.h
+++ b/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuConfig.h
@@ -70,16 +70,19 @@
 ////#define MGONGPU_HARDCODE_PARAM 1
 
 // Complex type in CUDA: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE)
+// (NB THIS IS MGONGPU_*CU*CXTYPE_xxx)
 #ifdef __CUDACC__
 #define MGONGPU_CUCXTYPE_THRUST 1 // default (~1.15E9/double, ~3.2E9/float)
 //#define MGONGPU_CUCXTYPE_CUCOMPLEX 1 // ~10 percent slower (1.03E9/double, ~2.8E9/float)
 //#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float)
 
-// Complex type in HIP: cxsmpl (ONLY ONE OPTION POSSIBLE)
+// Complex type in HIP: cxsmpl (ONLY ONE OPTION POSSIBLE? #810)
+// (NB THIS IS MGONGPU_*HIP*CXTYPE_xxx)
 #elif defined __HIPCC__
-#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float)
+#define MGONGPU_HIPCXTYPE_CXSMPL 1 // default for HIP
 
 // Complex type in C++: std::complex or cxsmpl (CHOOSE ONLY ONE)
+// (NB THIS IS MGONGPU_*CPP*CXTYPE_xxx)
 #else
 //#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float)
 #define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float)
diff --git a/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuCxtypes.h b/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuCxtypes.h
index 5532e22fa1..7ede1dbfae 100644
--- a/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuCxtypes.h
+++ b/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuCxtypes.h
@@ -19,7 +19,7 @@
 #include 
 
 // Complex type in cuda: thrust or cucomplex or cxsmpl
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL)
 #if defined MGONGPU_CUCXTYPE_THRUST
 #pragma clang diagnostic push
 #pragma clang diagnostic ignored "-Wtautological-compare" // for icpx2021/clang13 (https://stackoverflow.com/a/15864661)
@@ -30,8 +30,13 @@
 #elif not defined MGONGPU_CUCXTYPE_CXSMPL
 #error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL
 #endif
+// Complex type in HIP: cxsmpl
+#elif defined __HIPCC__
+#if not defined MGONGPU_HIPCXTYPE_CXSMPL
+#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_HIPCXTYPE_CXSMPL
+#endif
 #else
-// Complex type in c++: std::complex or cxsmpl
+// Complex type in c++ or HIP: std::complex or cxsmpl
 #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX
 #include 
 #elif not defined MGONGPU_CPPCXTYPE_CXSMPL
@@ -222,7 +227,7 @@ namespace mg5amcCpu
 #endif
 {
   // --- Type definitions (complex type: cxtype)
-#ifdef MGONGPUCPP_GPUIMPL // cuda
+#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL)
 #if defined MGONGPU_CUCXTYPE_THRUST
   typedef thrust::complex cxtype;
 #elif defined MGONGPU_CUCXTYPE_CUCOMPLEX
@@ -261,7 +266,7 @@ namespace mg5amcGpu
 namespace mg5amcCpu
 #endif
 {
-#if defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL
+#if defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL
 
   //------------------------------
   // CUDA or C++ - using cxsmpl
@@ -303,11 +308,11 @@ namespace mg5amcCpu
     return cxmake( c.real(), c.imag() );
   }
 
-#endif // #if defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL
+#endif // #if defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL
 
   //==========================================================================
 
-#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust
+#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust (this must be __CUDACC__ and not MGONGPUCPP_GPUIMPL)
 
   //------------------------------
   // CUDA - using thrust::complex
@@ -343,11 +348,11 @@ namespace mg5amcCpu
     return c;
   }
 
-#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST
+#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST
 
   //==========================================================================
 
-#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex
+#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex (this must be __CUDACC__ and not MGONGPUCPP_GPUIMPL)
 
   //------------------------------
   // CUDA - using cuComplex
@@ -562,11 +567,11 @@ namespace mg5amcCpu
     return cxmake( c.real(), c.imag() );
   }
 
-#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX
+#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX
 
   //==========================================================================
 
-#if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex
+#if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++/hip + stdcomplex (this must be __CUDACC__ and not MGONGPUCPP_GPUIMPL)
 
   //------------------------------
   // C++ - using std::complex
@@ -610,7 +615,7 @@ namespace mg5amcCpu
   }
 #endif
 
-#endif // #if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX
+#endif // #if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX
 
   //==========================================================================
and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_h/src/. quit -real 0m0.441s -user 0m0.375s -sys 0m0.052s +real 0m0.454s +user 0m0.374s +sys 0m0.068s Code generation completed in 0 seconds diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/check_sa.cc b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/check_sa.cc index aab490dc5b..bde384c69e 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/check_sa.cc +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/check_sa.cc @@ -789,7 +789,7 @@ main( int argc, char** argv ) #else wrkflwtxt += "???+"; // no path to this statement #endif /* clang-format on */ - // -- CUCOMPLEX or THRUST or STD complex numbers? + // -- CUCOMPLEX or THRUST or STD or CXSIMPLE complex numbers? #ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX wrkflwtxt += "CUX:"; @@ -801,7 +801,7 @@ main( int argc, char** argv ) wrkflwtxt += "???:"; // no path to this statement #endif #elif defined __HIPCC__ -#if defined MGONGPU_CUCXTYPE_CXSMPL +#if defined MGONGPU_HIPCXTYPE_CXSMPL wrkflwtxt += "CXS:"; #else wrkflwtxt += "???:"; // no path to this statement @@ -863,7 +863,7 @@ main( int argc, char** argv ) wrkflwtxt += "/sse4"; #endif #else - wrkflwtxt += "/????"; // no path to this statement + wrkflwtxt += "/????"; // no path to this statement #endif // -- Has cxtype_v::operator[] bracket with non-const reference? #if defined MGONGPU_CPPSIMD @@ -940,7 +940,9 @@ main( int argc, char** argv ) << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#elif defined MGONGPU_CUCXTYPE_CXSMPL +#elif defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL + << "Complex type = CXSIMPLE" << std::endl +#elif defined MGONGPU_CPPCXTYPE_STDCOMPLEX << "Complex type = STD::COMPLEX" << std::endl #else << "Complex type = ???" << std::endl // no path to this statement... @@ -1081,7 +1083,9 @@ main( int argc, char** argv ) << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#elif defined MGONGPU_CUCXTYPE_CXSMPL +#elif defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL + << "\"CXSIMPLE\"," << std::endl +#elif defined MGONGPU_CUCXTYPE_STDCOMPLEX << "\"STD::COMPLEX\"," << std::endl #else << "\"???\"," << std::endl // no path to this statement... diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/cudacpp.mk index 117edc1782..df74dfc284 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/cudacpp.mk @@ -169,7 +169,7 @@ ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). 
MADGRAPH_CUDA_ARCHITECTURE ?= 70 ###CUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 - ###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 + ###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 comma:=, CUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) CUINC = -I$(CUDA_HOME)/include/ @@ -224,7 +224,7 @@ else ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),) GPUFLAGS += -std=c++17 ###GPUFLAGS+= --maxrregcount 255 # (AV: is this option valid on HIP and meaningful on AMD GPUs?) CUBUILDRULEFLAGS = -fPIC -c - CCBUILDRULEFLAGS = -fPIC -c + CCBUILDRULEFLAGS = -fPIC -c -x hip else ifneq ($(origin REQUIRE_HIP),undefined) @@ -556,6 +556,7 @@ $(BUILDDIR)/.build.$(TAG): @touch $(BUILDDIR)/.build.$(TAG) # Generic target and build rules: objects from CUDA compilation +# NB: CCBUILDRULEFLAGS includes "-x cu" for nvcc and "-x hip" for hipcc (#810) ifneq ($(GPUCC),) $(BUILDDIR)/%.o : %.cu *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi @@ -565,7 +566,6 @@ $(BUILDDIR)/%_cu.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CCBUILDRULEFLAGS) $< -o $@ endif -# -x cu in line above # Generic target and build rules: objects from C++ compilation # (NB do not include CUINC here! 
add it only for NVTX or curand #679) diff --git a/epochX/cudacpp/heft_gg_h.sa/src/cudacpp_src.mk b/epochX/cudacpp/heft_gg_h.sa/src/cudacpp_src.mk index 998d3c84fa..b2b9da5288 100644 --- a/epochX/cudacpp/heft_gg_h.sa/src/cudacpp_src.mk +++ b/epochX/cudacpp/heft_gg_h.sa/src/cudacpp_src.mk @@ -92,11 +92,11 @@ endif ###$(info OMPFLAGS=$(OMPFLAGS)) CXXFLAGS += $(OMPFLAGS) -# Add correct -DHIP_LATFORM when compiling for HIP +# Add correct flags for nvcc (-x cu) and hipcc (-x hip) for GPU code (see #810) ifeq ($(findstring nvcc,$(GPUCC)),nvcc) GPUFLAGS += -Xcompiler -fPIC -c -x cu else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) - GPUFLAGS += -fPIC -c + GPUFLAGS += -fPIC -c -x hip endif # Set the build flags appropriate to each AVX choice (example: "make AVX=none") @@ -264,9 +264,9 @@ $(BUILDDIR)/%_cu.o : %.cc *.h $(BUILDDIR)/.build.$(TAG) #------------------------------------------------------------------------------- -cxx_objects=$(addprefix $(BUILDDIR)/, Parameters_heft.o read_slha.o) +cxx_objects=$(addprefix $(BUILDDIR)/, Parameters_sm.o read_slha.o) ifneq ($(GPUCC),) -cu_objects=$(addprefix $(BUILDDIR)/, Parameters_heft_cu.o) +cu_objects=$(addprefix $(BUILDDIR)/, Parameters_sm_cu.o) endif # Target (and build rules): common (src) library diff --git a/epochX/cudacpp/heft_gg_h.sa/src/mgOnGpuConfig.h b/epochX/cudacpp/heft_gg_h.sa/src/mgOnGpuConfig.h index 06787c1c5e..475749ca7c 100644 --- a/epochX/cudacpp/heft_gg_h.sa/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/heft_gg_h.sa/src/mgOnGpuConfig.h @@ -70,16 +70,19 @@ ////#define MGONGPU_HARDCODE_PARAM 1 // Complex type in CUDA: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) +// (NB THIS IS MGONGPU_*CU*CXTYPE_xxx) #ifdef __CUDACC__ #define MGONGPU_CUCXTYPE_THRUST 1 // default (~1.15E9/double, ~3.2E9/float) //#define MGONGPU_CUCXTYPE_CUCOMPLEX 1 // ~10 percent slower (1.03E9/double, ~2.8E9/float) //#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) -// Complex type in HIP: cxsmpl (ONLY ONE OPTION POSSIBLE) +// Complex type in HIP: cxsmpl (ONLY ONE OPTION POSSIBLE? 
#810)
+// (NB THIS IS MGONGPU_*HIP*CXTYPE_xxx)
 #elif defined __HIPCC__
-#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float)
+#define MGONGPU_HIPCXTYPE_CXSMPL 1 // default for HIP

 // Complex type in C++: std::complex or cxsmpl (CHOOSE ONLY ONE)
+// (NB THIS IS MGONGPU_*CPP*CXTYPE_xxx)
 #else
 //#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float)
 #define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float)
diff --git a/epochX/cudacpp/heft_gg_h.sa/src/mgOnGpuCxtypes.h b/epochX/cudacpp/heft_gg_h.sa/src/mgOnGpuCxtypes.h
index 5532e22fa1..7ede1dbfae 100644
--- a/epochX/cudacpp/heft_gg_h.sa/src/mgOnGpuCxtypes.h
+++ b/epochX/cudacpp/heft_gg_h.sa/src/mgOnGpuCxtypes.h
@@ -19,7 +19,7 @@
 #include

 // Complex type in cuda: thrust or cucomplex or cxsmpl
-#ifdef MGONGPUCPP_GPUIMPL
+#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL)
 #if defined MGONGPU_CUCXTYPE_THRUST
 #pragma clang diagnostic push
 #pragma clang diagnostic ignored "-Wtautological-compare" // for icpx2021/clang13 (https://stackoverflow.com/a/15864661)
@@ -30,8 +30,13 @@
 #elif not defined MGONGPU_CUCXTYPE_CXSMPL
 #error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL
 #endif
+// Complex type in HIP: cxsmpl
+#elif defined __HIPCC__
+#if not defined MGONGPU_HIPCXTYPE_CXSMPL
+#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_HIPCXTYPE_CXSMPL
+#endif
 #else
-// Complex type in c++: std::complex or cxsmpl
+// Complex type in c++ or HIP: std::complex or cxsmpl
 #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX
 #include
 #elif not defined MGONGPU_CPPCXTYPE_CXSMPL
@@ -222,7 +227,7 @@ namespace mg5amcCpu
 #endif
 {
 // --- Type definitions (complex type: cxtype)
-#ifdef MGONGPUCPP_GPUIMPL // cuda
+#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL)
 #if defined MGONGPU_CUCXTYPE_THRUST
 typedef thrust::complex cxtype;
 #elif defined MGONGPU_CUCXTYPE_CUCOMPLEX
@@ -261,7 +266,7 @@ namespace mg5amcGpu
 namespace mg5amcCpu
 #endif
 {
-#if defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL
+#if defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL

 //------------------------------
 // CUDA or C++ - using cxsmpl
@@ -303,11 +308,11 @@ namespace mg5amcCpu
 return cxmake( c.real(), c.imag() );
 }

-#endif // #if defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL
+#endif // #if defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL

 //==========================================================================

-#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust
+#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust (this must be __CUDACC__ and not MGONGPUCPP_GPUIMPL)

 //------------------------------
 // CUDA - using thrust::complex
@@ -343,11 +348,11 @@ namespace mg5amcCpu
 return c;
 }

-#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST
+#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST

 //==========================================================================

-#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex
+#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex (this must be __CUDACC__ and not MGONGPUCPP_GPUIMPL)

 //------------------------------
 // CUDA - using cuComplex
@@ -562,11 +567,11 @@ namespace mg5amcCpu
 return cxmake( c.real(), c.imag() );
 }

-#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX
+#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX

 //==========================================================================

-#if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex
+#if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++/hip + stdcomplex (this must be __CUDACC__ and not MGONGPUCPP_GPUIMPL)

 //------------------------------
 // C++ - using std::complex
@@ -610,7 +615,7 @@ namespace mg5amcCpu
 }
 #endif

-#endif // #if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX
+#endif // #if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX

 //==========================================================================
diff --git a/epochX/cudacpp/pp_tt012j.mad/CODEGEN_mad_pp_tt012j_log.txt b/epochX/cudacpp/pp_tt012j.mad/CODEGEN_mad_pp_tt012j_log.txt
index 15bcd183c5..ca1a8be0ce 100644
--- a/epochX/cudacpp/pp_tt012j.mad/CODEGEN_mad_pp_tt012j_log.txt
+++ b/epochX/cudacpp/pp_tt012j.mad/CODEGEN_mad_pp_tt012j_log.txt
@@ -61,7 +61,7 @@ set zerowidth_tchannel F
 define j = p
 INFO: load particles
 INFO: load vertices
-DEBUG: model prefixing takes 0.005548954010009766
+DEBUG: model prefixing takes 0.005836963653564453
 INFO: Restrict model sm with file models/sm/restrict_default.dat .
 DEBUG: Simplifying conditional expressions
 DEBUG: remove interactions: u s w+ at order: QED=1
@@ -172,7 +172,7 @@ INFO: Process u~ u > t t~ added to mirror process u u~ > t t~
 INFO: Process c~ c > t t~ added to mirror process c c~ > t t~
 INFO: Process d~ d > t t~ added to mirror process d d~ > t t~
 INFO: Process s~ s > t t~ added to mirror process s s~ > t t~
-5 processes with 7 diagrams generated in 0.030 s
+5 processes with 7 diagrams generated in 0.031 s
 Total: 5 processes with 7 diagrams
 add process p p > t t~ j @1
 INFO: Checking for minimal orders which gives processes.
@@ -212,7 +212,7 @@ INFO: Process d~ g > t t~ d~ added to mirror process g d~ > t t~ d~
 INFO: Process d~ d > t t~ g added to mirror process d d~ > t t~ g
 INFO: Process s~ g > t t~ s~ added to mirror process g s~ > t t~ s~
 INFO: Process s~ s > t t~ g added to mirror process s s~ > t t~ g
-13 processes with 76 diagrams generated in 0.140 s
+13 processes with 76 diagrams generated in 0.145 s
 Total: 18 processes with 83 diagrams
 add process p p > t t~ j j @2
 INFO: Checking for minimal orders which gives processes.
@@ -378,7 +378,7 @@ INFO: Process s~ u~ > t t~ u~ s~ added to mirror process u~ s~ > t t~ u~ s~
 INFO: Process s~ c~ > t t~ c~ s~ added to mirror process c~ s~ > t t~ c~ s~
 INFO: Process s~ d~ > t t~ d~ s~ added to mirror process d~ s~ > t t~ d~ s~
 INFO: Crossed process found for s~ s~ > t t~ s~ s~, reuse diagrams.
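A note on the mgOnGpuCxtypes.h hunks above: the complex type is now selected by three disjoint macro families, MGONGPU_CUCXTYPE_* under __CUDACC__, MGONGPU_HIPCXTYPE_* under __HIPCC__, and MGONGPU_CPPCXTYPE_* otherwise. A self-contained sketch of that dispatch (std::complex stands in for the real cxsmpl class so the sketch compiles on its own; labels follow the check_sa.cc printout):

#include <complex>
#include <iostream>
#if defined __CUDACC__ // CUDA: MGONGPU_CUCXTYPE_THRUST is the default
#include <thrust/complex.h>
typedef thrust::complex<double> cxtype;
#define CXLABEL "THRUST::COMPLEX"
#elif defined __HIPCC__ // HIP: cxsmpl is the only option (MGONGPU_HIPCXTYPE_CXSMPL)
typedef std::complex<double> cxtype; // stand-in for the real mgOnGpu::cxsmpl<double>
#define CXLABEL "CXSIMPLE"
#else // C++: MGONGPU_CPPCXTYPE_CXSMPL is the new default (or MGONGPU_CPPCXTYPE_STDCOMPLEX)
typedef std::complex<double> cxtype; // stand-in for the real mgOnGpu::cxsmpl<double>
#define CXLABEL "CXSIMPLE"
#endif
int main()
{
  cxtype c( 1., 2. );
  std::cout << "Complex type = " << CXLABEL
            << ", |c|^2 = " << c.real() * c.real() + c.imag() * c.imag() << std::endl;
  return 0;
}

The rename matters because the old HIP branch defined MGONGPU_CUCXTYPE_CXSMPL, a macro from the CUDA family that the __CUDACC__-only #if/#elif chains also test; giving HIP its own MGONGPU_HIPCXTYPE_CXSMPL keeps the three families from overlapping.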
-65 processes with 1119 diagrams generated in 1.869 s +65 processes with 1119 diagrams generated in 1.949 s Total: 83 processes with 1202 diagrams output madevent ../TMPOUT/CODEGEN_mad_pp_tt012j --hel_recycling=False --vector_size=32 --me_exporter=standalone_cudacpp Load PLUGIN.CUDACPP_OUTPUT @@ -497,7 +497,7 @@ INFO: Combined process d d~ > t t~ WEIGHTED<=2 with process u u~ > t t~ WEIGHTED INFO: Combined process s s~ > t t~ WEIGHTED<=2 with process u u~ > t t~ WEIGHTED<=2 INFO: Creating files in directory P2_gg_ttxgg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -514,7 +514,7 @@ INFO: Generating Feynman diagrams for Process: g g > t t~ g g WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group gg_ttxgg INFO: Creating files in directory P2_gg_ttxuux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -531,7 +531,7 @@ INFO: Generating Feynman diagrams for Process: g g > t t~ u u~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group gg_ttxuux INFO: Creating files in directory P2_gu_ttxgu DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -548,7 +548,7 @@ INFO: Generating Feynman diagrams for Process: g u > t t~ g u WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group gu_ttxgu INFO: Creating files in directory P2_gux_ttxgux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -565,7 +565,7 @@ INFO: Generating Feynman diagrams for Process: g u~ > t t~ g u~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group gux_ttxgux INFO: Creating files in directory P2_uux_ttxgg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -582,7 +582,7 @@ INFO: Generating Feynman diagrams for Process: u u~ > t t~ g g WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uux_ttxgg INFO: Creating files in directory P1_gg_ttxg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . 
FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -599,7 +599,7 @@ INFO: Generating Feynman diagrams for Process: g g > t t~ g WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxg INFO: Creating files in directory P2_uu_ttxuu DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -616,7 +616,7 @@ INFO: Generating Feynman diagrams for Process: u u > t t~ u u WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uu_ttxuu INFO: Creating files in directory P2_uux_ttxuux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -633,7 +633,7 @@ INFO: Generating Feynman diagrams for Process: u u~ > t t~ u u~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uux_ttxuux INFO: Creating files in directory P2_uxux_ttxuxux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -650,7 +650,7 @@ INFO: Generating Feynman diagrams for Process: u~ u~ > t t~ u~ u~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uxux_ttxuxux INFO: Creating files in directory P2_uc_ttxuc DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -667,7 +667,7 @@ INFO: Generating Feynman diagrams for Process: u c > t t~ u c WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uc_ttxuc INFO: Creating files in directory P2_uux_ttxccx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -684,7 +684,7 @@ INFO: Generating Feynman diagrams for Process: u u~ > t t~ c c~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uux_ttxccx INFO: Creating files in directory P2_ucx_ttxucx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -701,7 +701,7 @@ INFO: Generating Feynman diagrams for Process: u c~ > t t~ u c~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group ucx_ttxucx INFO: Creating files in directory P2_uxcx_ttxuxcx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . 
FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -718,7 +718,7 @@ INFO: Generating Feynman diagrams for Process: u~ c~ > t t~ u~ c~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uxcx_ttxuxcx INFO: Creating files in directory P1_gu_ttxu DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -735,7 +735,7 @@ INFO: Generating Feynman diagrams for Process: g u > t t~ u WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gu_ttxu INFO: Creating files in directory P1_gux_ttxux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -752,7 +752,7 @@ INFO: Generating Feynman diagrams for Process: g u~ > t t~ u~ WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gux_ttxux INFO: Creating files in directory P1_uux_ttxg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -769,7 +769,7 @@ INFO: Generating Feynman diagrams for Process: u u~ > t t~ g WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group uux_ttxg INFO: Creating files in directory P0_gg_ttx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -786,7 +786,7 @@ INFO: Generating Feynman diagrams for Process: g g > t t~ WEIGHTED<=2 INFO: Finding symmetric diagrams for subprocess group gg_ttx INFO: Creating files in directory P0_uux_ttx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -801,15 +801,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: u u~ > t t~ WEIGHTED<=2 INFO: Finding symmetric diagrams for subprocess group uux_ttx -Generated helas calls for 18 subprocesses (372 diagrams) in 1.313 s -Wrote files for 810 helas calls in 3.363 s +Generated helas calls for 18 subprocesses (372 diagrams) in 1.379 s +Wrote files for 810 helas calls in 3.511 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.361 s +ALOHA: aloha creates 5 routines in 0.358 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines @@ -817,7 +817,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 10 routines in 0.330 s +ALOHA: aloha creates 10 routines in 0.332 s VVV1 VVV1 FFV1 @@ -1028,10 +1028,10 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. quit -real 0m9.143s -user 0m8.594s -sys 0m0.500s -Code generation completed in 10 seconds +real 0m9.505s +user 0m8.862s +sys 0m0.556s +Code generation completed in 9 seconds ************************************************************ * * * W E L C O M E to * diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/check_sa.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/check_sa.cc index aab490dc5b..bde384c69e 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/check_sa.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/check_sa.cc @@ -789,7 +789,7 @@ main( int argc, char** argv ) #else wrkflwtxt += "???+"; // no path to this statement #endif /* clang-format on */ - // -- CUCOMPLEX or THRUST or STD complex numbers? + // -- CUCOMPLEX or THRUST or STD or CXSIMPLE complex numbers? #ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX wrkflwtxt += "CUX:"; @@ -801,7 +801,7 @@ main( int argc, char** argv ) wrkflwtxt += "???:"; // no path to this statement #endif #elif defined __HIPCC__ -#if defined MGONGPU_CUCXTYPE_CXSMPL +#if defined MGONGPU_HIPCXTYPE_CXSMPL wrkflwtxt += "CXS:"; #else wrkflwtxt += "???:"; // no path to this statement @@ -863,7 +863,7 @@ main( int argc, char** argv ) wrkflwtxt += "/sse4"; #endif #else - wrkflwtxt += "/????"; // no path to this statement + wrkflwtxt += "/????"; // no path to this statement #endif // -- Has cxtype_v::operator[] bracket with non-const reference? #if defined MGONGPU_CPPSIMD @@ -940,7 +940,9 @@ main( int argc, char** argv ) << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#elif defined MGONGPU_CUCXTYPE_CXSMPL +#elif defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL + << "Complex type = CXSIMPLE" << std::endl +#elif defined MGONGPU_CPPCXTYPE_STDCOMPLEX << "Complex type = STD::COMPLEX" << std::endl #else << "Complex type = ???" << std::endl // no path to this statement... 
@@ -1081,7 +1083,9 @@ main( int argc, char** argv ) << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#elif defined MGONGPU_CUCXTYPE_CXSMPL +#elif defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL + << "\"CXSIMPLE\"," << std::endl +#elif defined MGONGPU_CUCXTYPE_STDCOMPLEX << "\"STD::COMPLEX\"," << std::endl #else << "\"???\"," << std::endl // no path to this statement... diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/check_sa.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/check_sa.cc index aab490dc5b..bde384c69e 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/check_sa.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/check_sa.cc @@ -789,7 +789,7 @@ main( int argc, char** argv ) #else wrkflwtxt += "???+"; // no path to this statement #endif /* clang-format on */ - // -- CUCOMPLEX or THRUST or STD complex numbers? + // -- CUCOMPLEX or THRUST or STD or CXSIMPLE complex numbers? #ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX wrkflwtxt += "CUX:"; @@ -801,7 +801,7 @@ main( int argc, char** argv ) wrkflwtxt += "???:"; // no path to this statement #endif #elif defined __HIPCC__ -#if defined MGONGPU_CUCXTYPE_CXSMPL +#if defined MGONGPU_HIPCXTYPE_CXSMPL wrkflwtxt += "CXS:"; #else wrkflwtxt += "???:"; // no path to this statement @@ -863,7 +863,7 @@ main( int argc, char** argv ) wrkflwtxt += "/sse4"; #endif #else - wrkflwtxt += "/????"; // no path to this statement + wrkflwtxt += "/????"; // no path to this statement #endif // -- Has cxtype_v::operator[] bracket with non-const reference? #if defined MGONGPU_CPPSIMD @@ -940,7 +940,9 @@ main( int argc, char** argv ) << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#elif defined MGONGPU_CUCXTYPE_CXSMPL +#elif defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL + << "Complex type = CXSIMPLE" << std::endl +#elif defined MGONGPU_CPPCXTYPE_STDCOMPLEX << "Complex type = STD::COMPLEX" << std::endl #else << "Complex type = ???" << std::endl // no path to this statement... @@ -1081,7 +1083,9 @@ main( int argc, char** argv ) << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#elif defined MGONGPU_CUCXTYPE_CXSMPL +#elif defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL + << "\"CXSIMPLE\"," << std::endl +#elif defined MGONGPU_CUCXTYPE_STDCOMPLEX << "\"STD::COMPLEX\"," << std::endl #else << "\"???\"," << std::endl // no path to this statement... diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/check_sa.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/check_sa.cc index aab490dc5b..bde384c69e 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/check_sa.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/check_sa.cc @@ -789,7 +789,7 @@ main( int argc, char** argv ) #else wrkflwtxt += "???+"; // no path to this statement #endif /* clang-format on */ - // -- CUCOMPLEX or THRUST or STD complex numbers? + // -- CUCOMPLEX or THRUST or STD or CXSIMPLE complex numbers? 
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX wrkflwtxt += "CUX:"; @@ -801,7 +801,7 @@ main( int argc, char** argv ) wrkflwtxt += "???:"; // no path to this statement #endif #elif defined __HIPCC__ -#if defined MGONGPU_CUCXTYPE_CXSMPL +#if defined MGONGPU_HIPCXTYPE_CXSMPL wrkflwtxt += "CXS:"; #else wrkflwtxt += "???:"; // no path to this statement @@ -863,7 +863,7 @@ main( int argc, char** argv ) wrkflwtxt += "/sse4"; #endif #else - wrkflwtxt += "/????"; // no path to this statement + wrkflwtxt += "/????"; // no path to this statement #endif // -- Has cxtype_v::operator[] bracket with non-const reference? #if defined MGONGPU_CPPSIMD @@ -940,7 +940,9 @@ main( int argc, char** argv ) << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#elif defined MGONGPU_CUCXTYPE_CXSMPL +#elif defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL + << "Complex type = CXSIMPLE" << std::endl +#elif defined MGONGPU_CPPCXTYPE_STDCOMPLEX << "Complex type = STD::COMPLEX" << std::endl #else << "Complex type = ???" << std::endl // no path to this statement... @@ -1081,7 +1083,9 @@ main( int argc, char** argv ) << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#elif defined MGONGPU_CUCXTYPE_CXSMPL +#elif defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL + << "\"CXSIMPLE\"," << std::endl +#elif defined MGONGPU_CUCXTYPE_STDCOMPLEX << "\"STD::COMPLEX\"," << std::endl #else << "\"???\"," << std::endl // no path to this statement... diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/check_sa.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/check_sa.cc index aab490dc5b..bde384c69e 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/check_sa.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/check_sa.cc @@ -789,7 +789,7 @@ main( int argc, char** argv ) #else wrkflwtxt += "???+"; // no path to this statement #endif /* clang-format on */ - // -- CUCOMPLEX or THRUST or STD complex numbers? + // -- CUCOMPLEX or THRUST or STD or CXSIMPLE complex numbers? #ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX wrkflwtxt += "CUX:"; @@ -801,7 +801,7 @@ main( int argc, char** argv ) wrkflwtxt += "???:"; // no path to this statement #endif #elif defined __HIPCC__ -#if defined MGONGPU_CUCXTYPE_CXSMPL +#if defined MGONGPU_HIPCXTYPE_CXSMPL wrkflwtxt += "CXS:"; #else wrkflwtxt += "???:"; // no path to this statement @@ -863,7 +863,7 @@ main( int argc, char** argv ) wrkflwtxt += "/sse4"; #endif #else - wrkflwtxt += "/????"; // no path to this statement + wrkflwtxt += "/????"; // no path to this statement #endif // -- Has cxtype_v::operator[] bracket with non-const reference? #if defined MGONGPU_CPPSIMD @@ -940,7 +940,9 @@ main( int argc, char** argv ) << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#elif defined MGONGPU_CUCXTYPE_CXSMPL +#elif defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL + << "Complex type = CXSIMPLE" << std::endl +#elif defined MGONGPU_CPPCXTYPE_STDCOMPLEX << "Complex type = STD::COMPLEX" << std::endl #else << "Complex type = ???" << std::endl // no path to this statement... 
@@ -1081,7 +1083,9 @@ main( int argc, char** argv ) << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#elif defined MGONGPU_CUCXTYPE_CXSMPL +#elif defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL + << "\"CXSIMPLE\"," << std::endl +#elif defined MGONGPU_CUCXTYPE_STDCOMPLEX << "\"STD::COMPLEX\"," << std::endl #else << "\"???\"," << std::endl // no path to this statement... diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/check_sa.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/check_sa.cc index aab490dc5b..bde384c69e 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/check_sa.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/check_sa.cc @@ -789,7 +789,7 @@ main( int argc, char** argv ) #else wrkflwtxt += "???+"; // no path to this statement #endif /* clang-format on */ - // -- CUCOMPLEX or THRUST or STD complex numbers? + // -- CUCOMPLEX or THRUST or STD or CXSIMPLE complex numbers? #ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX wrkflwtxt += "CUX:"; @@ -801,7 +801,7 @@ main( int argc, char** argv ) wrkflwtxt += "???:"; // no path to this statement #endif #elif defined __HIPCC__ -#if defined MGONGPU_CUCXTYPE_CXSMPL +#if defined MGONGPU_HIPCXTYPE_CXSMPL wrkflwtxt += "CXS:"; #else wrkflwtxt += "???:"; // no path to this statement @@ -863,7 +863,7 @@ main( int argc, char** argv ) wrkflwtxt += "/sse4"; #endif #else - wrkflwtxt += "/????"; // no path to this statement + wrkflwtxt += "/????"; // no path to this statement #endif // -- Has cxtype_v::operator[] bracket with non-const reference? #if defined MGONGPU_CPPSIMD @@ -940,7 +940,9 @@ main( int argc, char** argv ) << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#elif defined MGONGPU_CUCXTYPE_CXSMPL +#elif defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL + << "Complex type = CXSIMPLE" << std::endl +#elif defined MGONGPU_CPPCXTYPE_STDCOMPLEX << "Complex type = STD::COMPLEX" << std::endl #else << "Complex type = ???" << std::endl // no path to this statement... @@ -1081,7 +1083,9 @@ main( int argc, char** argv ) << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#elif defined MGONGPU_CUCXTYPE_CXSMPL +#elif defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL + << "\"CXSIMPLE\"," << std::endl +#elif defined MGONGPU_CUCXTYPE_STDCOMPLEX << "\"STD::COMPLEX\"," << std::endl #else << "\"???\"," << std::endl // no path to this statement... diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/check_sa.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/check_sa.cc index aab490dc5b..bde384c69e 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/check_sa.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/check_sa.cc @@ -789,7 +789,7 @@ main( int argc, char** argv ) #else wrkflwtxt += "???+"; // no path to this statement #endif /* clang-format on */ - // -- CUCOMPLEX or THRUST or STD complex numbers? + // -- CUCOMPLEX or THRUST or STD or CXSIMPLE complex numbers? 
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX wrkflwtxt += "CUX:"; @@ -801,7 +801,7 @@ main( int argc, char** argv ) wrkflwtxt += "???:"; // no path to this statement #endif #elif defined __HIPCC__ -#if defined MGONGPU_CUCXTYPE_CXSMPL +#if defined MGONGPU_HIPCXTYPE_CXSMPL wrkflwtxt += "CXS:"; #else wrkflwtxt += "???:"; // no path to this statement @@ -863,7 +863,7 @@ main( int argc, char** argv ) wrkflwtxt += "/sse4"; #endif #else - wrkflwtxt += "/????"; // no path to this statement + wrkflwtxt += "/????"; // no path to this statement #endif // -- Has cxtype_v::operator[] bracket with non-const reference? #if defined MGONGPU_CPPSIMD @@ -940,7 +940,9 @@ main( int argc, char** argv ) << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#elif defined MGONGPU_CUCXTYPE_CXSMPL +#elif defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL + << "Complex type = CXSIMPLE" << std::endl +#elif defined MGONGPU_CPPCXTYPE_STDCOMPLEX << "Complex type = STD::COMPLEX" << std::endl #else << "Complex type = ???" << std::endl // no path to this statement... @@ -1081,7 +1083,9 @@ main( int argc, char** argv ) << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#elif defined MGONGPU_CUCXTYPE_CXSMPL +#elif defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL + << "\"CXSIMPLE\"," << std::endl +#elif defined MGONGPU_CUCXTYPE_STDCOMPLEX << "\"STD::COMPLEX\"," << std::endl #else << "\"???\"," << std::endl // no path to this statement... diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/check_sa.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/check_sa.cc index aab490dc5b..bde384c69e 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/check_sa.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/check_sa.cc @@ -789,7 +789,7 @@ main( int argc, char** argv ) #else wrkflwtxt += "???+"; // no path to this statement #endif /* clang-format on */ - // -- CUCOMPLEX or THRUST or STD complex numbers? + // -- CUCOMPLEX or THRUST or STD or CXSIMPLE complex numbers? #ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX wrkflwtxt += "CUX:"; @@ -801,7 +801,7 @@ main( int argc, char** argv ) wrkflwtxt += "???:"; // no path to this statement #endif #elif defined __HIPCC__ -#if defined MGONGPU_CUCXTYPE_CXSMPL +#if defined MGONGPU_HIPCXTYPE_CXSMPL wrkflwtxt += "CXS:"; #else wrkflwtxt += "???:"; // no path to this statement @@ -863,7 +863,7 @@ main( int argc, char** argv ) wrkflwtxt += "/sse4"; #endif #else - wrkflwtxt += "/????"; // no path to this statement + wrkflwtxt += "/????"; // no path to this statement #endif // -- Has cxtype_v::operator[] bracket with non-const reference? #if defined MGONGPU_CPPSIMD @@ -940,7 +940,9 @@ main( int argc, char** argv ) << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#elif defined MGONGPU_CUCXTYPE_CXSMPL +#elif defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL + << "Complex type = CXSIMPLE" << std::endl +#elif defined MGONGPU_CPPCXTYPE_STDCOMPLEX << "Complex type = STD::COMPLEX" << std::endl #else << "Complex type = ???" << std::endl // no path to this statement... 
@@ -1081,7 +1083,9 @@ main( int argc, char** argv ) << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#elif defined MGONGPU_CUCXTYPE_CXSMPL +#elif defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL + << "\"CXSIMPLE\"," << std::endl +#elif defined MGONGPU_CUCXTYPE_STDCOMPLEX << "\"STD::COMPLEX\"," << std::endl #else << "\"???\"," << std::endl // no path to this statement... diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/check_sa.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/check_sa.cc index aab490dc5b..bde384c69e 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/check_sa.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/check_sa.cc @@ -789,7 +789,7 @@ main( int argc, char** argv ) #else wrkflwtxt += "???+"; // no path to this statement #endif /* clang-format on */ - // -- CUCOMPLEX or THRUST or STD complex numbers? + // -- CUCOMPLEX or THRUST or STD or CXSIMPLE complex numbers? #ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX wrkflwtxt += "CUX:"; @@ -801,7 +801,7 @@ main( int argc, char** argv ) wrkflwtxt += "???:"; // no path to this statement #endif #elif defined __HIPCC__ -#if defined MGONGPU_CUCXTYPE_CXSMPL +#if defined MGONGPU_HIPCXTYPE_CXSMPL wrkflwtxt += "CXS:"; #else wrkflwtxt += "???:"; // no path to this statement @@ -863,7 +863,7 @@ main( int argc, char** argv ) wrkflwtxt += "/sse4"; #endif #else - wrkflwtxt += "/????"; // no path to this statement + wrkflwtxt += "/????"; // no path to this statement #endif // -- Has cxtype_v::operator[] bracket with non-const reference? #if defined MGONGPU_CPPSIMD @@ -940,7 +940,9 @@ main( int argc, char** argv ) << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#elif defined MGONGPU_CUCXTYPE_CXSMPL +#elif defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL + << "Complex type = CXSIMPLE" << std::endl +#elif defined MGONGPU_CPPCXTYPE_STDCOMPLEX << "Complex type = STD::COMPLEX" << std::endl #else << "Complex type = ???" << std::endl // no path to this statement... @@ -1081,7 +1083,9 @@ main( int argc, char** argv ) << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#elif defined MGONGPU_CUCXTYPE_CXSMPL +#elif defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL + << "\"CXSIMPLE\"," << std::endl +#elif defined MGONGPU_CUCXTYPE_STDCOMPLEX << "\"STD::COMPLEX\"," << std::endl #else << "\"???\"," << std::endl // no path to this statement... diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/check_sa.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/check_sa.cc index aab490dc5b..bde384c69e 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/check_sa.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/check_sa.cc @@ -789,7 +789,7 @@ main( int argc, char** argv ) #else wrkflwtxt += "???+"; // no path to this statement #endif /* clang-format on */ - // -- CUCOMPLEX or THRUST or STD complex numbers? + // -- CUCOMPLEX or THRUST or STD or CXSIMPLE complex numbers? 
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX wrkflwtxt += "CUX:"; @@ -801,7 +801,7 @@ main( int argc, char** argv ) wrkflwtxt += "???:"; // no path to this statement #endif #elif defined __HIPCC__ -#if defined MGONGPU_CUCXTYPE_CXSMPL +#if defined MGONGPU_HIPCXTYPE_CXSMPL wrkflwtxt += "CXS:"; #else wrkflwtxt += "???:"; // no path to this statement @@ -863,7 +863,7 @@ main( int argc, char** argv ) wrkflwtxt += "/sse4"; #endif #else - wrkflwtxt += "/????"; // no path to this statement + wrkflwtxt += "/????"; // no path to this statement #endif // -- Has cxtype_v::operator[] bracket with non-const reference? #if defined MGONGPU_CPPSIMD @@ -940,7 +940,9 @@ main( int argc, char** argv ) << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#elif defined MGONGPU_CUCXTYPE_CXSMPL +#elif defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL + << "Complex type = CXSIMPLE" << std::endl +#elif defined MGONGPU_CPPCXTYPE_STDCOMPLEX << "Complex type = STD::COMPLEX" << std::endl #else << "Complex type = ???" << std::endl // no path to this statement... @@ -1081,7 +1083,9 @@ main( int argc, char** argv ) << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#elif defined MGONGPU_CUCXTYPE_CXSMPL +#elif defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL + << "\"CXSIMPLE\"," << std::endl +#elif defined MGONGPU_CUCXTYPE_STDCOMPLEX << "\"STD::COMPLEX\"," << std::endl #else << "\"???\"," << std::endl // no path to this statement... diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/check_sa.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/check_sa.cc index aab490dc5b..bde384c69e 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/check_sa.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/check_sa.cc @@ -789,7 +789,7 @@ main( int argc, char** argv ) #else wrkflwtxt += "???+"; // no path to this statement #endif /* clang-format on */ - // -- CUCOMPLEX or THRUST or STD complex numbers? + // -- CUCOMPLEX or THRUST or STD or CXSIMPLE complex numbers? #ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX wrkflwtxt += "CUX:"; @@ -801,7 +801,7 @@ main( int argc, char** argv ) wrkflwtxt += "???:"; // no path to this statement #endif #elif defined __HIPCC__ -#if defined MGONGPU_CUCXTYPE_CXSMPL +#if defined MGONGPU_HIPCXTYPE_CXSMPL wrkflwtxt += "CXS:"; #else wrkflwtxt += "???:"; // no path to this statement @@ -863,7 +863,7 @@ main( int argc, char** argv ) wrkflwtxt += "/sse4"; #endif #else - wrkflwtxt += "/????"; // no path to this statement + wrkflwtxt += "/????"; // no path to this statement #endif // -- Has cxtype_v::operator[] bracket with non-const reference? #if defined MGONGPU_CPPSIMD @@ -940,7 +940,9 @@ main( int argc, char** argv ) << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#elif defined MGONGPU_CUCXTYPE_CXSMPL +#elif defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL + << "Complex type = CXSIMPLE" << std::endl +#elif defined MGONGPU_CPPCXTYPE_STDCOMPLEX << "Complex type = STD::COMPLEX" << std::endl #else << "Complex type = ???" << std::endl // no path to this statement... 
@@ -1081,7 +1083,9 @@ main( int argc, char** argv ) << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#elif defined MGONGPU_CUCXTYPE_CXSMPL +#elif defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL + << "\"CXSIMPLE\"," << std::endl +#elif defined MGONGPU_CUCXTYPE_STDCOMPLEX << "\"STD::COMPLEX\"," << std::endl #else << "\"???\"," << std::endl // no path to this statement... diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/check_sa.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/check_sa.cc index aab490dc5b..bde384c69e 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/check_sa.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/check_sa.cc @@ -789,7 +789,7 @@ main( int argc, char** argv ) #else wrkflwtxt += "???+"; // no path to this statement #endif /* clang-format on */ - // -- CUCOMPLEX or THRUST or STD complex numbers? + // -- CUCOMPLEX or THRUST or STD or CXSIMPLE complex numbers? #ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX wrkflwtxt += "CUX:"; @@ -801,7 +801,7 @@ main( int argc, char** argv ) wrkflwtxt += "???:"; // no path to this statement #endif #elif defined __HIPCC__ -#if defined MGONGPU_CUCXTYPE_CXSMPL +#if defined MGONGPU_HIPCXTYPE_CXSMPL wrkflwtxt += "CXS:"; #else wrkflwtxt += "???:"; // no path to this statement @@ -863,7 +863,7 @@ main( int argc, char** argv ) wrkflwtxt += "/sse4"; #endif #else - wrkflwtxt += "/????"; // no path to this statement + wrkflwtxt += "/????"; // no path to this statement #endif // -- Has cxtype_v::operator[] bracket with non-const reference? #if defined MGONGPU_CPPSIMD @@ -940,7 +940,9 @@ main( int argc, char** argv ) << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#elif defined MGONGPU_CUCXTYPE_CXSMPL +#elif defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL + << "Complex type = CXSIMPLE" << std::endl +#elif defined MGONGPU_CPPCXTYPE_STDCOMPLEX << "Complex type = STD::COMPLEX" << std::endl #else << "Complex type = ???" << std::endl // no path to this statement... @@ -1081,7 +1083,9 @@ main( int argc, char** argv ) << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#elif defined MGONGPU_CUCXTYPE_CXSMPL +#elif defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL + << "\"CXSIMPLE\"," << std::endl +#elif defined MGONGPU_CUCXTYPE_STDCOMPLEX << "\"STD::COMPLEX\"," << std::endl #else << "\"???\"," << std::endl // no path to this statement... diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/check_sa.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/check_sa.cc index aab490dc5b..bde384c69e 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/check_sa.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/check_sa.cc @@ -789,7 +789,7 @@ main( int argc, char** argv ) #else wrkflwtxt += "???+"; // no path to this statement #endif /* clang-format on */ - // -- CUCOMPLEX or THRUST or STD complex numbers? + // -- CUCOMPLEX or THRUST or STD or CXSIMPLE complex numbers? 
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX wrkflwtxt += "CUX:"; @@ -801,7 +801,7 @@ main( int argc, char** argv ) wrkflwtxt += "???:"; // no path to this statement #endif #elif defined __HIPCC__ -#if defined MGONGPU_CUCXTYPE_CXSMPL +#if defined MGONGPU_HIPCXTYPE_CXSMPL wrkflwtxt += "CXS:"; #else wrkflwtxt += "???:"; // no path to this statement @@ -863,7 +863,7 @@ main( int argc, char** argv ) wrkflwtxt += "/sse4"; #endif #else - wrkflwtxt += "/????"; // no path to this statement + wrkflwtxt += "/????"; // no path to this statement #endif // -- Has cxtype_v::operator[] bracket with non-const reference? #if defined MGONGPU_CPPSIMD @@ -940,7 +940,9 @@ main( int argc, char** argv ) << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#elif defined MGONGPU_CUCXTYPE_CXSMPL +#elif defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL + << "Complex type = CXSIMPLE" << std::endl +#elif defined MGONGPU_CPPCXTYPE_STDCOMPLEX << "Complex type = STD::COMPLEX" << std::endl #else << "Complex type = ???" << std::endl // no path to this statement... @@ -1081,7 +1083,9 @@ main( int argc, char** argv ) << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#elif defined MGONGPU_CUCXTYPE_CXSMPL +#elif defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL + << "\"CXSIMPLE\"," << std::endl +#elif defined MGONGPU_CUCXTYPE_STDCOMPLEX << "\"STD::COMPLEX\"," << std::endl #else << "\"???\"," << std::endl // no path to this statement... diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/check_sa.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/check_sa.cc index aab490dc5b..bde384c69e 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/check_sa.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/check_sa.cc @@ -789,7 +789,7 @@ main( int argc, char** argv ) #else wrkflwtxt += "???+"; // no path to this statement #endif /* clang-format on */ - // -- CUCOMPLEX or THRUST or STD complex numbers? + // -- CUCOMPLEX or THRUST or STD or CXSIMPLE complex numbers? #ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX wrkflwtxt += "CUX:"; @@ -801,7 +801,7 @@ main( int argc, char** argv ) wrkflwtxt += "???:"; // no path to this statement #endif #elif defined __HIPCC__ -#if defined MGONGPU_CUCXTYPE_CXSMPL +#if defined MGONGPU_HIPCXTYPE_CXSMPL wrkflwtxt += "CXS:"; #else wrkflwtxt += "???:"; // no path to this statement @@ -863,7 +863,7 @@ main( int argc, char** argv ) wrkflwtxt += "/sse4"; #endif #else - wrkflwtxt += "/????"; // no path to this statement + wrkflwtxt += "/????"; // no path to this statement #endif // -- Has cxtype_v::operator[] bracket with non-const reference? #if defined MGONGPU_CPPSIMD @@ -940,7 +940,9 @@ main( int argc, char** argv ) << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#elif defined MGONGPU_CUCXTYPE_CXSMPL +#elif defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL + << "Complex type = CXSIMPLE" << std::endl +#elif defined MGONGPU_CPPCXTYPE_STDCOMPLEX << "Complex type = STD::COMPLEX" << std::endl #else << "Complex type = ???" << std::endl // no path to this statement... 
@@ -1081,7 +1083,9 @@ main( int argc, char** argv ) << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#elif defined MGONGPU_CUCXTYPE_CXSMPL +#elif defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL + << "\"CXSIMPLE\"," << std::endl +#elif defined MGONGPU_CUCXTYPE_STDCOMPLEX << "\"STD::COMPLEX\"," << std::endl #else << "\"???\"," << std::endl // no path to this statement... diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/check_sa.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/check_sa.cc index aab490dc5b..bde384c69e 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/check_sa.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/check_sa.cc @@ -789,7 +789,7 @@ main( int argc, char** argv ) #else wrkflwtxt += "???+"; // no path to this statement #endif /* clang-format on */ - // -- CUCOMPLEX or THRUST or STD complex numbers? + // -- CUCOMPLEX or THRUST or STD or CXSIMPLE complex numbers? #ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX wrkflwtxt += "CUX:"; @@ -801,7 +801,7 @@ main( int argc, char** argv ) wrkflwtxt += "???:"; // no path to this statement #endif #elif defined __HIPCC__ -#if defined MGONGPU_CUCXTYPE_CXSMPL +#if defined MGONGPU_HIPCXTYPE_CXSMPL wrkflwtxt += "CXS:"; #else wrkflwtxt += "???:"; // no path to this statement @@ -863,7 +863,7 @@ main( int argc, char** argv ) wrkflwtxt += "/sse4"; #endif #else - wrkflwtxt += "/????"; // no path to this statement + wrkflwtxt += "/????"; // no path to this statement #endif // -- Has cxtype_v::operator[] bracket with non-const reference? #if defined MGONGPU_CPPSIMD @@ -940,7 +940,9 @@ main( int argc, char** argv ) << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#elif defined MGONGPU_CUCXTYPE_CXSMPL +#elif defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL + << "Complex type = CXSIMPLE" << std::endl +#elif defined MGONGPU_CPPCXTYPE_STDCOMPLEX << "Complex type = STD::COMPLEX" << std::endl #else << "Complex type = ???" << std::endl // no path to this statement... @@ -1081,7 +1083,9 @@ main( int argc, char** argv ) << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#elif defined MGONGPU_CUCXTYPE_CXSMPL +#elif defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL + << "\"CXSIMPLE\"," << std::endl +#elif defined MGONGPU_CUCXTYPE_STDCOMPLEX << "\"STD::COMPLEX\"," << std::endl #else << "\"???\"," << std::endl // no path to this statement... diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/check_sa.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/check_sa.cc index aab490dc5b..bde384c69e 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/check_sa.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/check_sa.cc @@ -789,7 +789,7 @@ main( int argc, char** argv ) #else wrkflwtxt += "???+"; // no path to this statement #endif /* clang-format on */ - // -- CUCOMPLEX or THRUST or STD complex numbers? + // -- CUCOMPLEX or THRUST or STD or CXSIMPLE complex numbers? 
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX wrkflwtxt += "CUX:"; @@ -801,7 +801,7 @@ main( int argc, char** argv ) wrkflwtxt += "???:"; // no path to this statement #endif #elif defined __HIPCC__ -#if defined MGONGPU_CUCXTYPE_CXSMPL +#if defined MGONGPU_HIPCXTYPE_CXSMPL wrkflwtxt += "CXS:"; #else wrkflwtxt += "???:"; // no path to this statement @@ -863,7 +863,7 @@ main( int argc, char** argv ) wrkflwtxt += "/sse4"; #endif #else - wrkflwtxt += "/????"; // no path to this statement + wrkflwtxt += "/????"; // no path to this statement #endif // -- Has cxtype_v::operator[] bracket with non-const reference? #if defined MGONGPU_CPPSIMD @@ -940,7 +940,9 @@ main( int argc, char** argv ) << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#elif defined MGONGPU_CUCXTYPE_CXSMPL +#elif defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL + << "Complex type = CXSIMPLE" << std::endl +#elif defined MGONGPU_CPPCXTYPE_STDCOMPLEX << "Complex type = STD::COMPLEX" << std::endl #else << "Complex type = ???" << std::endl // no path to this statement... @@ -1081,7 +1083,9 @@ main( int argc, char** argv ) << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#elif defined MGONGPU_CUCXTYPE_CXSMPL +#elif defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL + << "\"CXSIMPLE\"," << std::endl +#elif defined MGONGPU_CUCXTYPE_STDCOMPLEX << "\"STD::COMPLEX\"," << std::endl #else << "\"???\"," << std::endl // no path to this statement... diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/check_sa.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/check_sa.cc index aab490dc5b..bde384c69e 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/check_sa.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/check_sa.cc @@ -789,7 +789,7 @@ main( int argc, char** argv ) #else wrkflwtxt += "???+"; // no path to this statement #endif /* clang-format on */ - // -- CUCOMPLEX or THRUST or STD complex numbers? + // -- CUCOMPLEX or THRUST or STD or CXSIMPLE complex numbers? #ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX wrkflwtxt += "CUX:"; @@ -801,7 +801,7 @@ main( int argc, char** argv ) wrkflwtxt += "???:"; // no path to this statement #endif #elif defined __HIPCC__ -#if defined MGONGPU_CUCXTYPE_CXSMPL +#if defined MGONGPU_HIPCXTYPE_CXSMPL wrkflwtxt += "CXS:"; #else wrkflwtxt += "???:"; // no path to this statement @@ -863,7 +863,7 @@ main( int argc, char** argv ) wrkflwtxt += "/sse4"; #endif #else - wrkflwtxt += "/????"; // no path to this statement + wrkflwtxt += "/????"; // no path to this statement #endif // -- Has cxtype_v::operator[] bracket with non-const reference? #if defined MGONGPU_CPPSIMD @@ -940,7 +940,9 @@ main( int argc, char** argv ) << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#elif defined MGONGPU_CUCXTYPE_CXSMPL +#elif defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL + << "Complex type = CXSIMPLE" << std::endl +#elif defined MGONGPU_CPPCXTYPE_STDCOMPLEX << "Complex type = STD::COMPLEX" << std::endl #else << "Complex type = ???" << std::endl // no path to this statement... 
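[Note between hunks: the per-backend macro families tested here are defined in the mgOnGpuConfig.h change further down in this patch. A minimal sketch of that layout (illustrative only, assuming the defaults shown there) explains why the __HIPCC__ branch must now test MGONGPU_HIPCXTYPE_CXSMPL instead of MGONGPU_CUCXTYPE_CXSMPL:
    // One macro family per compiler front end (sketch of mgOnGpuConfig.h):
    #ifdef __CUDACC__
    #define MGONGPU_CUCXTYPE_THRUST 1   // or CUCOMPLEX or CXSMPL (choose one)
    #elif defined __HIPCC__
    #define MGONGPU_HIPCXTYPE_CXSMPL 1  // only option on HIP
    #else
    #define MGONGPU_CPPCXTYPE_CXSMPL 1  // or STDCOMPLEX (choose one)
    #endif
After this config change, MGONGPU_CUCXTYPE_CXSMPL is never defined under hipcc, so the old "#if defined MGONGPU_CUCXTYPE_CXSMPL" test in the __HIPCC__ branch would always have fallen through to the "???" case.]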
@@ -1081,7 +1083,9 @@ main( int argc, char** argv ) << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#elif defined MGONGPU_CUCXTYPE_CXSMPL +#elif defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL + << "\"CXSIMPLE\"," << std::endl +#elif defined MGONGPU_CUCXTYPE_STDCOMPLEX << "\"STD::COMPLEX\"," << std::endl #else << "\"???\"," << std::endl // no path to this statement... diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/check_sa.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/check_sa.cc index aab490dc5b..bde384c69e 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/check_sa.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/check_sa.cc @@ -789,7 +789,7 @@ main( int argc, char** argv ) #else wrkflwtxt += "???+"; // no path to this statement #endif /* clang-format on */ - // -- CUCOMPLEX or THRUST or STD complex numbers? + // -- CUCOMPLEX or THRUST or STD or CXSIMPLE complex numbers? #ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX wrkflwtxt += "CUX:"; @@ -801,7 +801,7 @@ main( int argc, char** argv ) wrkflwtxt += "???:"; // no path to this statement #endif #elif defined __HIPCC__ -#if defined MGONGPU_CUCXTYPE_CXSMPL +#if defined MGONGPU_HIPCXTYPE_CXSMPL wrkflwtxt += "CXS:"; #else wrkflwtxt += "???:"; // no path to this statement @@ -863,7 +863,7 @@ main( int argc, char** argv ) wrkflwtxt += "/sse4"; #endif #else - wrkflwtxt += "/????"; // no path to this statement + wrkflwtxt += "/????"; // no path to this statement #endif // -- Has cxtype_v::operator[] bracket with non-const reference? #if defined MGONGPU_CPPSIMD @@ -940,7 +940,9 @@ main( int argc, char** argv ) << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#elif defined MGONGPU_CUCXTYPE_CXSMPL +#elif defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL + << "Complex type = CXSIMPLE" << std::endl +#elif defined MGONGPU_CPPCXTYPE_STDCOMPLEX << "Complex type = STD::COMPLEX" << std::endl #else << "Complex type = ???" << std::endl // no path to this statement... @@ -1081,7 +1083,9 @@ main( int argc, char** argv ) << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#elif defined MGONGPU_CUCXTYPE_CXSMPL +#elif defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL + << "\"CXSIMPLE\"," << std::endl +#elif defined MGONGPU_CUCXTYPE_STDCOMPLEX << "\"STD::COMPLEX\"," << std::endl #else << "\"???\"," << std::endl // no path to this statement... diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/check_sa.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/check_sa.cc index aab490dc5b..bde384c69e 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/check_sa.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/check_sa.cc @@ -789,7 +789,7 @@ main( int argc, char** argv ) #else wrkflwtxt += "???+"; // no path to this statement #endif /* clang-format on */ - // -- CUCOMPLEX or THRUST or STD complex numbers? + // -- CUCOMPLEX or THRUST or STD or CXSIMPLE complex numbers? 
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX wrkflwtxt += "CUX:"; @@ -801,7 +801,7 @@ main( int argc, char** argv ) wrkflwtxt += "???:"; // no path to this statement #endif #elif defined __HIPCC__ -#if defined MGONGPU_CUCXTYPE_CXSMPL +#if defined MGONGPU_HIPCXTYPE_CXSMPL wrkflwtxt += "CXS:"; #else wrkflwtxt += "???:"; // no path to this statement @@ -863,7 +863,7 @@ main( int argc, char** argv ) wrkflwtxt += "/sse4"; #endif #else - wrkflwtxt += "/????"; // no path to this statement + wrkflwtxt += "/????"; // no path to this statement #endif // -- Has cxtype_v::operator[] bracket with non-const reference? #if defined MGONGPU_CPPSIMD @@ -940,7 +940,9 @@ main( int argc, char** argv ) << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#elif defined MGONGPU_CUCXTYPE_CXSMPL +#elif defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL + << "Complex type = CXSIMPLE" << std::endl +#elif defined MGONGPU_CPPCXTYPE_STDCOMPLEX << "Complex type = STD::COMPLEX" << std::endl #else << "Complex type = ???" << std::endl // no path to this statement... @@ -1081,7 +1083,9 @@ main( int argc, char** argv ) << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#elif defined MGONGPU_CUCXTYPE_CXSMPL +#elif defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL + << "\"CXSIMPLE\"," << std::endl +#elif defined MGONGPU_CUCXTYPE_STDCOMPLEX << "\"STD::COMPLEX\"," << std::endl #else << "\"???\"," << std::endl // no path to this statement... diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/cudacpp.mk index 117edc1782..df74dfc284 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/cudacpp.mk @@ -169,7 +169,7 @@ ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). MADGRAPH_CUDA_ARCHITECTURE ?= 70 ###CUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 - ###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 + ###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 comma:=, CUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) CUINC = -I$(CUDA_HOME)/include/ @@ -224,7 +224,7 @@ else ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),) GPUFLAGS += -std=c++17 ###GPUFLAGS+= --maxrregcount 255 # (AV: is this option valid on HIP and meaningful on AMD GPUs?) 
CUBUILDRULEFLAGS = -fPIC -c - CCBUILDRULEFLAGS = -fPIC -c + CCBUILDRULEFLAGS = -fPIC -c -x hip else ifneq ($(origin REQUIRE_HIP),undefined) @@ -556,6 +556,7 @@ $(BUILDDIR)/.build.$(TAG): @touch $(BUILDDIR)/.build.$(TAG) # Generic target and build rules: objects from CUDA compilation +# NB: CCBUILDRULEFLAGS includes "-x cu" for nvcc and "-x hip" for hipcc (#810) ifneq ($(GPUCC),) $(BUILDDIR)/%.o : %.cu *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi @@ -565,7 +566,6 @@ $(BUILDDIR)/%_cu.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CCBUILDRULEFLAGS) $< -o $@ endif -# -x cu in line above # Generic target and build rules: objects from C++ compilation # (NB do not include CUINC here! add it only for NVTX or curand #679) diff --git a/epochX/cudacpp/pp_tt012j.mad/src/cudacpp_src.mk b/epochX/cudacpp/pp_tt012j.mad/src/cudacpp_src.mk index 159e19a46d..b2b9da5288 100644 --- a/epochX/cudacpp/pp_tt012j.mad/src/cudacpp_src.mk +++ b/epochX/cudacpp/pp_tt012j.mad/src/cudacpp_src.mk @@ -92,11 +92,11 @@ endif ###$(info OMPFLAGS=$(OMPFLAGS)) CXXFLAGS += $(OMPFLAGS) -# Add correct -DHIP_LATFORM when compiling for HIP +# Add correct flags for nvcc (-x cu) and hipcc (-x hip) for GPU code (see #810) ifeq ($(findstring nvcc,$(GPUCC)),nvcc) GPUFLAGS += -Xcompiler -fPIC -c -x cu else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) - GPUFLAGS += -fPIC -c + GPUFLAGS += -fPIC -c -x hip endif # Set the build flags appropriate to each AVX choice (example: "make AVX=none") diff --git a/epochX/cudacpp/pp_tt012j.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/pp_tt012j.mad/src/mgOnGpuConfig.h index 69cee0085b..6bde4466d0 100644 --- a/epochX/cudacpp/pp_tt012j.mad/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/pp_tt012j.mad/src/mgOnGpuConfig.h @@ -70,16 +70,19 @@ ////#define MGONGPU_HARDCODE_PARAM 1 // Complex type in CUDA: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) +// (NB THIS IS MGONGPU_*CU*CXTYPE_xxx) #ifdef __CUDACC__ #define MGONGPU_CUCXTYPE_THRUST 1 // default (~1.15E9/double, ~3.2E9/float) //#define MGONGPU_CUCXTYPE_CUCOMPLEX 1 // ~10 percent slower (1.03E9/double, ~2.8E9/float) //#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) -// Complex type in HIP: cxsmpl (ONLY ONE OPTION POSSIBLE) +// Complex type in HIP: cxsmpl (ONLY ONE OPTION POSSIBLE? 
#810) +// (NB THIS IS MGONGPU_*HIP*CXTYPE_xxx) #elif defined __HIPCC__ -#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) +#define MGONGPU_HIPCXTYPE_CXSMPL 1 // default for HIP // Complex type in C++: std::complex or cxsmpl (CHOOSE ONLY ONE) +// (NB THIS IS MGONGPU_*CPP*CXTYPE_xxx) #else //#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) #define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) diff --git a/epochX/cudacpp/pp_tt012j.mad/src/mgOnGpuCxtypes.h b/epochX/cudacpp/pp_tt012j.mad/src/mgOnGpuCxtypes.h index 5532e22fa1..7ede1dbfae 100644 --- a/epochX/cudacpp/pp_tt012j.mad/src/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/pp_tt012j.mad/src/mgOnGpuCxtypes.h @@ -19,7 +19,7 @@ #include // Complex type in cuda: thrust or cucomplex or cxsmpl -#ifdef MGONGPUCPP_GPUIMPL +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) #if defined MGONGPU_CUCXTYPE_THRUST #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wtautological-compare" // for icpx2021/clang13 (https://stackoverflow.com/a/15864661) @@ -30,8 +30,13 @@ #elif not defined MGONGPU_CUCXTYPE_CXSMPL #error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL #endif +// Complex type in HIP: cxsmpl +#elif defined __HIPCC__ +#if not defined MGONGPU_HIPCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_HIPCXTYPE_CXSMPL +#endif #else -// Complex type in c++: std::complex or cxsmpl +// Complex type in c++ or HIP: std::complex or cxsmpl #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX #include #elif not defined MGONGPU_CPPCXTYPE_CXSMPL @@ -222,7 +227,7 @@ namespace mg5amcCpu #endif { // --- Type definitions (complex type: cxtype) -#ifdef MGONGPUCPP_GPUIMPL // cuda +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) #if defined MGONGPU_CUCXTYPE_THRUST typedef thrust::complex cxtype; #elif defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -261,7 +266,7 @@ namespace mg5amcGpu namespace mg5amcCpu #endif { -#if defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL +#if defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL //------------------------------ // CUDA or C++ - using cxsmpl @@ -303,11 +308,11 @@ namespace mg5amcCpu return cxmake( c.real(), c.imag() ); } -#endif // #if defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL +#endif // #if defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL //========================================================================== -#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust +#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust (this must be __CUDACC__ and not MGONGPUCPP_GPUIMPL) //------------------------------ // CUDA - using thrust::complex @@ -343,11 +348,11 @@ namespace mg5amcCpu return c; } -#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_THRUST +#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST //========================================================================== -#if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex +#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex (this must be __CUDACC__ and not MGONGPUCPP_GPUIMPL) //------------------------------ // CUDA - using cuComplex @@
-562,11 +567,11 @@ namespace mg5amcCpu return cxmake( c.real(), c.imag() ); } -#endif // #if defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#endif // #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX //========================================================================== -#if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex +#if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++/hip + stdcomplex (this must be __CUDACC__ and not MGONGPUCPP_GPUIMPL) //------------------------------ // C++ - using std::complex @@ -610,7 +615,7 @@ namespace mg5amcCpu } #endif -#endif // #if not defined MGONGPUCPP_GPUIMPL and defined MGONGPU_CPPCXTYPE_STDCOMPLEX +#endif // #if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX //========================================================================== From 7c4823d9e92a5b0fd7e1b89eeba86bb04cda4948 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Tue, 30 Jan 2024 01:15:05 +0100 Subject: [PATCH 86/96] [jt774] in tput scripts, add -rorhst and -hip flags --- epochX/cudacpp/tput/allTees.sh | 11 ++++++++--- epochX/cudacpp/tput/teeThroughputX.sh | 2 ++ epochX/cudacpp/tput/throughputX.sh | 3 +++ 3 files changed, 13 insertions(+), 3 deletions(-) diff --git a/epochX/cudacpp/tput/allTees.sh b/epochX/cudacpp/tput/allTees.sh index 4d1599e547..9be4f5d4fc 100755 --- a/epochX/cudacpp/tput/allTees.sh +++ b/epochX/cudacpp/tput/allTees.sh @@ -12,6 +12,7 @@ suff=".mad" # Parse command line arguments ggttggg=-ggttggg +rndhst=-curhst while [ "$1" != "" ]; do if [ "$1" == "-short" ]; then # Short (no ggttggg) or long version? @@ -30,8 +31,12 @@ while [ "$1" != "" ]; do # Only build all tests instead of building and running them? opts+=" -makeonly" shift + elif [ "$1" == "-hip" ]; then + # Random numbers use rocrand instead of curand? + rndhst=-rorhst + shift else - echo "Usage: $0 [-short] [-e] [-sa] [-makeonly]" + echo "Usage: $0 [-short] [-e] [-sa] [-makeonly] [-hip]" exit 1 fi done @@ -70,8 +75,8 @@ cmd="./tput/teeThroughputX.sh -eemumu -ggtt -ggttgg -flt -rmbhst ${opts}" $cmd; status=$? ended4="$cmd\nENDED(4) AT $(date) [Status=$status]" -# (72/78) Two extra logs (double/float x hrd0 x inl0 + curhst) only in three of the six processes (no rebuild needed) -cmd="./tput/teeThroughputX.sh -eemumu -ggtt -ggttgg -flt -curhst ${opts}" +# (72/78) Two extra logs (double/float x hrd0 x inl0 + rndhst) only in three of the six processes (no rebuild needed) +cmd="./tput/teeThroughputX.sh -eemumu -ggtt -ggttgg -flt ${rndhst} ${opts}" $cmd; status=$?
ended5="$cmd\nENDED(5) AT $(date) [Status=$status]" diff --git a/epochX/cudacpp/tput/teeThroughputX.sh b/epochX/cudacpp/tput/teeThroughputX.sh index bd478452ac..de0a1e912a 100755 --- a/epochX/cudacpp/tput/teeThroughputX.sh +++ b/epochX/cudacpp/tput/teeThroughputX.sh @@ -93,6 +93,8 @@ for arg in $*; do rndgen=$arg elif [ "$arg" == "-curhst" ]; then rndgen=$arg + elif [ "$arg" == "-rorhst" ]; then + rndgen=$arg elif [ "$arg" == "-rmbhst" ]; then rmbsmp=$arg elif [ "$arg" == "-bridge" ]; then diff --git a/epochX/cudacpp/tput/throughputX.sh b/epochX/cudacpp/tput/throughputX.sh index 1e5b427b1f..503d060237 100755 --- a/epochX/cudacpp/tput/throughputX.sh +++ b/epochX/cudacpp/tput/throughputX.sh @@ -187,6 +187,9 @@ while [ "$1" != "" ]; do elif [ "$1" == "-curhst" ]; then rndgen=" -${1}" shift + elif [ "$1" == "-rorhst" ]; then + rndgen=" -${1}" + shift elif [ "$1" == "-rmbhst" ]; then rmbsmp=" -${1}" shift From 7735bb12e2095b41e12037bcaa66555a672e3cfa Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Tue, 30 Jan 2024 01:24:52 +0100 Subject: [PATCH 87/96] [jt774] in CODEGEN, fix cudacpp_src.mk for non-SM models --- .../madgraph/iolibs/template_files/gpu/cudacpp_src.mk | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp_src.mk b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp_src.mk index 3d7ffb7db5..49ccf0c4e3 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp_src.mk +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp_src.mk @@ -264,9 +264,9 @@ $(BUILDDIR)/%%_cu.o : %%.cc *.h $(BUILDDIR)/.build.$(TAG) #------------------------------------------------------------------------------- -cxx_objects=$(addprefix $(BUILDDIR)/, Parameters_sm.o read_slha.o) +cxx_objects=$(addprefix $(BUILDDIR)/, Parameters_%(model)s.o read_slha.o) ifneq ($(GPUCC),) -cu_objects=$(addprefix $(BUILDDIR)/, Parameters_sm_cu.o) +cu_objects=$(addprefix $(BUILDDIR)/, Parameters_%(model)s_cu.o) endif # Target (and build rules): common (src) library From a5a766134ecaf27b4e2ca7a9b229eab90c1ab7ed Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Tue, 30 Jan 2024 01:25:39 +0100 Subject: [PATCH 88/96] [jt774] regenerate gg_tt.mad, all ok no change --- epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt b/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt index 83e06dd090..f6226e7392 100644 --- a/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt +++ b/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.0058765411376953125  +DEBUG: model prefixing takes 0.005857229232788086  INFO: Restrict model sm with file models/sm/restrict_default.dat . 
DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -175,7 +175,7 @@ INFO: Generating Helas calls for process: g g > t t~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t t~ @1 INFO: Creating files in directory P1_gg_ttx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -237,9 +237,9 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. quit -real 0m1.802s -user 0m1.568s -sys 0m0.224s +real 0m1.820s +user 0m1.561s +sys 0m0.253s Code generation completed in 2 seconds ************************************************************ * * From 31cb663052e37bd1ba6bce58a0428532e1cff369 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Tue, 30 Jan 2024 01:26:18 +0100 Subject: [PATCH 89/96] [jt774] regenerate heft process, ok it changed as expected --- .../heft_gg_h.sa/CODEGEN_cudacpp_heft_gg_h_log.txt | 8 ++++---- epochX/cudacpp/heft_gg_h.sa/src/cudacpp_src.mk | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/epochX/cudacpp/heft_gg_h.sa/CODEGEN_cudacpp_heft_gg_h_log.txt b/epochX/cudacpp/heft_gg_h.sa/CODEGEN_cudacpp_heft_gg_h_log.txt index a6ac85eeef..17b7cd7789 100644 --- a/epochX/cudacpp/heft_gg_h.sa/CODEGEN_cudacpp_heft_gg_h_log.txt +++ b/epochX/cudacpp/heft_gg_h.sa/CODEGEN_cudacpp_heft_gg_h_log.txt @@ -165,7 +165,7 @@ INFO: Created files Parameters_heft.h and Parameters_heft.cc in directory INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_h/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_h/src/. 
quit -real 0m0.454s -user 0m0.374s -sys 0m0.068s -Code generation completed in 0 seconds +real 0m0.448s +user 0m0.390s +sys 0m0.053s +Code generation completed in 1 seconds diff --git a/epochX/cudacpp/heft_gg_h.sa/src/cudacpp_src.mk b/epochX/cudacpp/heft_gg_h.sa/src/cudacpp_src.mk index b2b9da5288..fb8da8830b 100644 --- a/epochX/cudacpp/heft_gg_h.sa/src/cudacpp_src.mk +++ b/epochX/cudacpp/heft_gg_h.sa/src/cudacpp_src.mk @@ -264,9 +264,9 @@ $(BUILDDIR)/%_cu.o : %.cc *.h $(BUILDDIR)/.build.$(TAG) #------------------------------------------------------------------------------- -cxx_objects=$(addprefix $(BUILDDIR)/, Parameters_sm.o read_slha.o) +cxx_objects=$(addprefix $(BUILDDIR)/, Parameters_heft.o read_slha.o) ifneq ($(GPUCC),) -cu_objects=$(addprefix $(BUILDDIR)/, Parameters_sm_cu.o) +cu_objects=$(addprefix $(BUILDDIR)/, Parameters_heft_cu.o) endif # Target (and build rules): common (src) library From 9ed3aaf637941b12762b6d7a0bf51246f7a63cba Mon Sep 17 00:00:00 2001 From: Olivier Mattelaer Date: Wed, 31 Jan 2024 15:56:33 +0100 Subject: [PATCH 90/96] fixing typo in the allowed option --- .../cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/launch_plugin.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/launch_plugin.py b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/launch_plugin.py index 3b09713e12..90af6b7053 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/launch_plugin.py +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/launch_plugin.py @@ -103,7 +103,7 @@ def default_setup(self): fct_mod=(self.reset_makeopts,(),{}), allowed=['auto', 'none', 'sse4', 'avx2','512y','512z']) self.add_param('cudacpp_backend', 'CPP', include=False, hidden=False, - allowed=['Fortan', 'CPP', 'CUDA']) + allowed=['Fortran', 'CPP', 'CUDA']) self['vector_size'] = 16 # already setup in default class (just change value) self['aloha_flag'] = '--fast-math' self['matrix_flag'] = '-O3' From 33322e07bb1a5267a21a5a6c5635ba25f306c686 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Tue, 30 Jan 2024 10:40:02 +0100 Subject: [PATCH 91/96] [jt774] rerun 78 tput tests on itscrd90, all ok [NB using code generated before Olivier's commit] STARTED AT Tue Jan 30 01:27:55 AM CET 2024 ./tput/teeThroughputX.sh -mix -hrd -makej -eemumu -ggtt -ggttg -ggttgg -gqttq -ggttggg -makeclean ENDED(1) AT Tue Jan 30 05:12:08 AM CET 2024 [Status=0] ./tput/teeThroughputX.sh -flt -hrd -makej -eemumu -ggtt -ggttgg -inlonly -makeclean ENDED(2) AT Tue Jan 30 05:41:46 AM CET 2024 [Status=0] ./tput/teeThroughputX.sh -makej -eemumu -ggtt -ggttg -gqttq -ggttgg -ggttggg -flt -bridge -makeclean ENDED(3) AT Tue Jan 30 05:52:05 AM CET 2024 [Status=0] ./tput/teeThroughputX.sh -eemumu -ggtt -ggttgg -flt -rmbhst ENDED(4) AT Tue Jan 30 05:55:35 AM CET 2024 [Status=0] ./tput/teeThroughputX.sh -eemumu -ggtt -ggttgg -flt -curhst ENDED(5) AT Tue Jan 30 05:59:00 AM CET 2024 [Status=0] --- .../log_eemumu_mad_d_inl0_hrd0.txt | 227 +++++++++------ .../log_eemumu_mad_d_inl0_hrd0_bridge.txt | 234 +++++++++------ .../log_eemumu_mad_d_inl0_hrd0_common.txt | 213 ++++++++------ .../log_eemumu_mad_d_inl0_hrd0_curhst.txt | 210 +++++++++----- .../log_eemumu_mad_d_inl0_hrd0_rmbhst.txt | 229 +++++++++------ .../log_eemumu_mad_d_inl0_hrd1.txt | 227 +++++++++------ .../log_eemumu_mad_d_inl1_hrd0.txt | 225 ++++++++------ .../log_eemumu_mad_d_inl1_hrd1.txt | 225 ++++++++------ .../log_eemumu_mad_f_inl0_hrd0.txt | 239 ++++++++------- .../log_eemumu_mad_f_inl0_hrd0_bridge.txt | 246 +++++++++------- 
.../log_eemumu_mad_f_inl0_hrd0_common.txt | 227 +++++++++------ .../log_eemumu_mad_f_inl0_hrd0_curhst.txt | 222 +++++++++----- .../log_eemumu_mad_f_inl0_hrd0_rmbhst.txt | 241 ++++++++------- .../log_eemumu_mad_f_inl0_hrd1.txt | 239 ++++++++------- .../log_eemumu_mad_f_inl1_hrd0.txt | 237 +++++++++------ .../log_eemumu_mad_f_inl1_hrd1.txt | 237 +++++++++------ .../log_eemumu_mad_m_inl0_hrd0.txt | 227 +++++++++------ .../log_eemumu_mad_m_inl0_hrd1.txt | 227 +++++++++------ .../log_ggtt_mad_d_inl0_hrd0.txt | 227 +++++++++------ .../log_ggtt_mad_d_inl0_hrd0_bridge.txt | 234 +++++++++------ .../log_ggtt_mad_d_inl0_hrd0_common.txt | 213 ++++++++------ .../log_ggtt_mad_d_inl0_hrd0_curhst.txt | 210 +++++++++----- .../log_ggtt_mad_d_inl0_hrd0_rmbhst.txt | 229 +++++++++------ .../log_ggtt_mad_d_inl0_hrd1.txt | 227 +++++++++------ .../log_ggtt_mad_d_inl1_hrd0.txt | 225 ++++++++------ .../log_ggtt_mad_d_inl1_hrd1.txt | 225 ++++++++------ .../log_ggtt_mad_f_inl0_hrd0.txt | 245 +++++++++------- .../log_ggtt_mad_f_inl0_hrd0_bridge.txt | 252 +++++++++------- .../log_ggtt_mad_f_inl0_hrd0_common.txt | 239 ++++++++------- .../log_ggtt_mad_f_inl0_hrd0_curhst.txt | 228 ++++++++++----- .../log_ggtt_mad_f_inl0_hrd0_rmbhst.txt | 247 +++++++++------- .../log_ggtt_mad_f_inl0_hrd1.txt | 245 +++++++++------- .../log_ggtt_mad_f_inl1_hrd0.txt | 239 ++++++++------- .../log_ggtt_mad_f_inl1_hrd1.txt | 239 ++++++++------- .../log_ggtt_mad_m_inl0_hrd0.txt | 225 ++++++++------ .../log_ggtt_mad_m_inl0_hrd1.txt | 225 ++++++++------ .../log_ggttg_mad_d_inl0_hrd0.txt | 250 +++++++++------- .../log_ggttg_mad_d_inl0_hrd0_bridge.txt | 258 ++++++++++------- .../log_ggttg_mad_d_inl0_hrd1.txt | 250 +++++++++------- .../log_ggttg_mad_f_inl0_hrd0.txt | 264 ++++++++++------- .../log_ggttg_mad_f_inl0_hrd0_bridge.txt | 272 ++++++++++------- .../log_ggttg_mad_f_inl0_hrd1.txt | 264 ++++++++++------- .../log_ggttg_mad_m_inl0_hrd0.txt | 250 +++++++++------- .../log_ggttg_mad_m_inl0_hrd1.txt | 250 +++++++++------- .../log_ggttgg_mad_d_inl0_hrd0.txt | 250 +++++++++------- .../log_ggttgg_mad_d_inl0_hrd0_bridge.txt | 258 ++++++++++------- .../log_ggttgg_mad_d_inl0_hrd0_common.txt | 234 +++++++++------ .../log_ggttgg_mad_d_inl0_hrd0_curhst.txt | 228 ++++++++++----- .../log_ggttgg_mad_d_inl0_hrd0_rmbhst.txt | 253 +++++++++------- .../log_ggttgg_mad_d_inl0_hrd1.txt | 250 +++++++++------- .../log_ggttgg_mad_d_inl1_hrd0.txt | 252 +++++++++------- .../log_ggttgg_mad_d_inl1_hrd1.txt | 252 +++++++++------- .../log_ggttgg_mad_f_inl0_hrd0.txt | 266 ++++++++++------- .../log_ggttgg_mad_f_inl0_hrd0_bridge.txt | 274 +++++++++++------- .../log_ggttgg_mad_f_inl0_hrd0_common.txt | 258 ++++++++++------- .../log_ggttgg_mad_f_inl0_hrd0_curhst.txt | 244 ++++++++++------ .../log_ggttgg_mad_f_inl0_hrd0_rmbhst.txt | 269 ++++++++++------- .../log_ggttgg_mad_f_inl0_hrd1.txt | 266 ++++++++++------- .../log_ggttgg_mad_f_inl1_hrd0.txt | 270 +++++++++-------- .../log_ggttgg_mad_f_inl1_hrd1.txt | 270 +++++++++-------- .../log_ggttgg_mad_m_inl0_hrd0.txt | 246 +++++++++------- .../log_ggttgg_mad_m_inl0_hrd1.txt | 246 +++++++++------- .../log_ggttggg_mad_d_inl0_hrd0.txt | 250 +++++++++------- .../log_ggttggg_mad_d_inl0_hrd0_bridge.txt | 258 ++++++++++------- .../log_ggttggg_mad_d_inl0_hrd1.txt | 250 +++++++++------- .../log_ggttggg_mad_f_inl0_hrd0.txt | 266 ++++++++++------- .../log_ggttggg_mad_f_inl0_hrd0_bridge.txt | 274 +++++++++++------- .../log_ggttggg_mad_f_inl0_hrd1.txt | 266 ++++++++++------- .../log_ggttggg_mad_m_inl0_hrd0.txt | 250 +++++++++------- 
.../log_ggttggg_mad_m_inl0_hrd1.txt | 250 +++++++++------- .../log_gqttq_mad_d_inl0_hrd0.txt | 255 +++++++++++----- .../log_gqttq_mad_d_inl0_hrd0_bridge.txt | 265 ++++++++++++----- .../log_gqttq_mad_d_inl0_hrd1.txt | 255 +++++++++++----- .../log_gqttq_mad_f_inl0_hrd0.txt | 255 +++++++++++----- .../log_gqttq_mad_f_inl0_hrd0_bridge.txt | 265 ++++++++++++----- .../log_gqttq_mad_f_inl0_hrd1.txt | 255 +++++++++++----- .../log_gqttq_mad_m_inl0_hrd0.txt | 255 +++++++++++----- .../log_gqttq_mad_m_inl0_hrd1.txt | 255 +++++++++++----- 78 files changed, 11668 insertions(+), 7406 deletions(-) diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt index c73ffa26a2..15dbd5f8d1 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt @@ -1,164 +1,209 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasNoCurand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) +RNDGEN=hasCurand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-01-28_13:07:36 +DATE: 2024-01-30_04:51:46 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.187163e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.113051e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.341239e+07 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 5.085142 sec - 15,401,484,650 cycles:u # 2.956 GHz (75.00%) - 53,460,298 stalled-cycles-frontend:u # 0.35% frontend cycles idle (75.25%) - 6,922,628,832 stalled-cycles-backend:u # 44.95% backend cycles idle (75.11%) - 11,641,711,170 instructions:u # 0.76 insn per cycle - # 0.59 stalled cycles per insn (74.84%) - 5.625918161 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 5.572573e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.281942e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.116391e+08 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 0.839714 sec + 2,719,217,340 cycles # 2.832 GHz + 4,277,615,433 instructions # 1.57 insn per cycle + 1.175143775 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 166 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.282804e-02 -Avg ME (F77/CUDA) = 1.2828039868165208E-002 -Relative difference = 1.0277079981222336e-08 +Avg ME (F77/CUDA) = 1.2828039868165201E-002 +Relative difference = 1.0277080522138477e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.250244e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.427953e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.427953e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 5.755044 sec - 19,517,650,978 cycles:u # 3.375 GHz (74.97%) - 52,736,956 stalled-cycles-frontend:u # 0.27% frontend cycles idle (74.91%) - 64,319,790 stalled-cycles-backend:u # 0.33% backend cycles idle (74.91%) - 47,058,402,875 instructions:u # 2.41 insn per cycle - # 0.00 stalled cycles per insn (75.00%) - 5.786528246 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 471) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 9.879157e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.147243e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.147243e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 6.790847 sec + 19,539,640,504 cycles # 2.876 GHz + 46,935,351,432 instructions # 2.40 insn per cycle + 6.804517518 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 472) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.935680e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.431114e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.431114e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 3.970217 sec - 13,235,454,941 cycles:u # 3.310 GHz (74.99%) - 49,338,655 stalled-cycles-frontend:u # 0.37% frontend cycles idle (74.99%) - 973,459,800 stalled-cycles-backend:u # 7.35% backend cycles idle (74.99%) - 31,182,239,136 instructions:u # 2.36 insn per cycle - # 0.03 stalled cycles per insn (75.01%) - 4.002630879 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.545376e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.021398e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.021398e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 4.488904 sec + 12,869,370,410 cycles # 2.864 GHz + 31,186,180,279 instructions # 2.42 insn per cycle + 4.505888529 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1626) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.652416e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.531228e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.531228e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 3.091469 sec - 10,182,202,026 cycles:u # 3.264 GHz (74.89%) - 48,463,224 stalled-cycles-frontend:u # 0.48% frontend cycles idle (74.87%) - 450,467,756 stalled-cycles-backend:u # 4.42% backend cycles idle (75.00%) - 19,351,447,734 instructions:u # 1.90 insn per cycle - # 0.02 stalled cycles per insn (75.13%) - 3.123604900 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1946) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.955981e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.735873e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.735873e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.633222 sec + 10,032,348,170 cycles # 2.758 GHz + 19,481,701,848 instructions # 1.94 insn per cycle + 3.651370321 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1964) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165090E-002 Relative difference = 1.0277089176796747e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe -p 2048 256 12 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.070263e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.978600e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.978600e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.453661 sec + 9,572,367,477 cycles # 2.767 GHz + 18,943,715,958 instructions # 1.98 insn per cycle + 3.473553059 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1655) (512y: 161) (512z: 0) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.282804e-02 +Avg ME (F77/C++) = 1.2828039868165090E-002 +Relative difference = 1.0277089176796747e-08 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe -p 2048 256 12 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.819162e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.469996e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.469996e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.879359 sec + 8,193,098,191 cycles # 2.110 GHz + 15,513,331,501 instructions # 1.89 insn per cycle + 3.898953032 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 920) (512y: 59) (512z: 1220) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.282804e-02 +Avg ME (F77/C++) = 1.2828039868165090E-002 +Relative difference = 1.0277089176796747e-08 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_bridge.txt index 2ece6f60cd..f78ea7251e 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_bridge.txt @@ -1,170 +1,222 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasNoCurand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) +RNDGEN=hasCurand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-01-28_13:45:03 +DATE: 2024-01-30_05:45:26 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 12 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 12 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.489385e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.351558e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.351558e+07 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 5.536230 sec - 18,358,209,500 cycles:u # 3.296 GHz (74.96%) - 121,997,790 stalled-cycles-frontend:u # 0.66% frontend cycles idle (74.90%) - 6,997,087,110 stalled-cycles-backend:u # 38.11% backend cycles idle (74.96%) - 17,155,260,780 instructions:u # 0.93 insn per cycle - # 0.41 stalled cycles per insn (75.00%) - 5.598707861 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 4.460171e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.485962e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.485962e+07 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 2.319187 sec + 7,341,770,811 cycles # 2.857 GHz + 13,101,723,847 instructions # 1.78 insn per cycle + 2.628471382 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) +WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) +==PROF== Profiling "sigmaKin": launch__registers_per_thread 166 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.282804e-02 -Avg ME (F77/CUDA) = 1.2828039868165208E-002 -Relative difference = 1.0277079981222336e-08 +Avg ME (F77/CUDA) = 1.2828039868165201E-002 +Relative difference = 1.0277080522138477e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.233749e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.406395e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.406395e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 5.930356 sec - 19,949,452,995 cycles:u # 3.342 GHz (74.94%) - 52,759,407 stalled-cycles-frontend:u # 0.26% frontend cycles idle (74.94%) - 115,219,441 stalled-cycles-backend:u # 0.58% backend cycles idle (74.96%) - 47,211,806,753 instructions:u # 2.37 insn per cycle - # 0.00 stalled cycles per insn (75.03%) - 5.971481804 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 471) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 9.576223e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.107198e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.107198e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 7.190455 sec + 20,703,597,440 cycles # 2.877 GHz + 47,160,901,733 instructions # 2.28 insn per cycle + 7.198222207 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 472) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.865640e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.331378e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.331378e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 4.232961 sec - 13,973,241,123 cycles:u # 3.271 GHz (74.91%) - 50,351,129 stalled-cycles-frontend:u # 0.36% frontend cycles idle (74.91%) - 1,042,859,941 stalled-cycles-backend:u # 7.46% backend cycles idle (74.94%) - 31,893,025,253 instructions:u # 2.28 insn per cycle - # 0.03 stalled cycles per insn (75.03%) - 4.276043039 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.473769e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.897978e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.897978e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 4.898106 sec + 14,084,591,919 cycles # 2.873 GHz + 32,028,151,491 instructions # 2.27 insn per cycle + 4.906157596 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1626) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.549092e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.345903e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.345903e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 3.336339 sec - 10,839,641,195 cycles:u # 3.211 GHz (74.91%) - 49,204,874 stalled-cycles-frontend:u # 0.45% frontend cycles idle (74.88%) - 476,585,531 stalled-cycles-backend:u # 4.40% backend cycles idle (74.97%) - 20,620,320,024 instructions:u # 1.90 insn per cycle - # 0.02 stalled cycles per insn (75.09%) - 3.379348572 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1946) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.834615e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.502061e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.502061e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 4.065584 sec + 11,264,443,170 cycles # 2.767 GHz + 20,844,723,129 instructions # 1.85 insn per cycle + 4.073296839 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1964) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.282804e-02 +Avg ME (F77/C++) = 1.2828039868165090E-002 +Relative difference = 1.0277089176796747e-08 +OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! 
Instantiate host Bridge (nevt=524288) +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.930005e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.695920e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.695920e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.900573 sec + 10,821,072,419 cycles # 2.771 GHz + 20,305,054,668 instructions # 1.88 insn per cycle + 3.908355042 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1655) (512y: 161) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165090E-002 Relative difference = 1.0277089176796747e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! Instantiate host Bridge (nevt=524288) +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.707724e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.274502e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.274502e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 4.333313 sec + 9,497,951,325 cycles # 2.189 GHz + 16,666,820,850 instructions # 1.75 insn per cycle + 4.341233179 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 920) (512y: 59) (512z: 1220) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.282804e-02 +Avg ME (F77/C++) = 1.2828039868165090E-002 +Relative difference = 1.0277089176796747e-08 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_common.txt index baa89d7a03..f072467bfa 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_common.txt @@ -1,164 +1,209 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasNoCurand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) +RNDGEN=hasCurand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-01-28_13:55:08 +DATE: 2024-01-30_05:59:18 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 12 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 12 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.146697e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.104851e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.333756e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.483909e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.562012e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.071690e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 4.653795 sec - 15,325,067,781 cycles:u # 3.273 GHz (75.06%) - 53,542,980 stalled-cycles-frontend:u # 0.35% frontend cycles idle (75.08%) - 6,940,523,786 stalled-cycles-backend:u # 45.29% backend cycles idle (75.08%) - 11,587,453,443 instructions:u # 0.76 insn per cycle - # 0.60 stalled cycles per insn (75.05%) - 4.706698959 seconds time elapsed +TOTAL : 1.371489 sec + 4,620,404,364 cycles # 2.861 GHz + 7,153,271,516 instructions # 1.55 insn per cycle + 1.672602435 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --common +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 166 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.282804e-02 -Avg ME (F77/CUDA) = 1.2828039868165208E-002 -Relative difference = 1.0277079981222336e-08 +Avg ME (F77/CUDA) = 1.2828039868165201E-002 +Relative difference = 1.0277080522138477e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.250607e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.427612e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.427612e+06 ) sec^-1 +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 9.952512e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.155636e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.155636e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 5.756745 sec - 19,505,733,856 cycles:u # 3.372 GHz (74.98%) - 52,621,983 stalled-cycles-frontend:u # 0.27% frontend cycles idle (74.97%) - 64,317,116 stalled-cycles-backend:u # 0.33% backend cycles idle (74.98%) - 47,050,652,034 instructions:u # 2.41 insn per cycle - # 0.00 stalled cycles per insn (74.99%) - 5.788198860 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 471) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 7.107803 sec + 20,592,800,911 cycles # 2.895 GHz + 47,037,031,319 instructions # 2.28 insn per cycle + 7.114495241 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 472) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.919218e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.430888e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.430888e+06 ) sec^-1 +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.558277e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.038534e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.038534e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 4.000634 sec - 13,345,658,667 cycles:u # 3.312 GHz (74.99%) - 50,131,805 stalled-cycles-frontend:u # 0.38% frontend cycles idle (74.99%) - 998,870,131 stalled-cycles-backend:u # 7.48% backend cycles idle (74.99%) - 31,165,536,709 instructions:u # 2.34 insn per cycle - # 0.03 stalled cycles per insn (75.00%) - 4.032008178 seconds time elapsed +TOTAL : 4.822482 sec + 13,870,774,877 cycles # 2.874 GHz + 31,186,249,487 instructions # 2.25 insn per cycle + 4.828845646 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1626) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.655792e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.530860e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.530860e+06 ) sec^-1 +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.951724e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.730389e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.730389e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 3.092974 sec - 10,173,625,147 cycles:u # 3.259 GHz (74.78%) - 47,880,835 stalled-cycles-frontend:u # 0.47% frontend cycles idle (74.78%) - 452,668,828 stalled-cycles-backend:u # 4.45% backend cycles idle (74.93%) - 19,370,864,981 instructions:u # 1.90 insn per cycle - # 0.02 stalled cycles per insn (75.06%) - 3.124544479 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1946) (512y: 0) (512z: 0) +TOTAL : 4.015384 sec + 11,119,337,735 cycles # 2.766 GHz + 19,381,852,554 instructions # 1.74 insn per cycle + 4.022009475 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1964) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165090E-002 Relative difference = 1.0277089176796747e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.063314e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.951443e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.951443e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 +TOTAL : 3.845408 sec + 10,662,597,452 cycles # 2.769 GHz + 18,643,141,459 instructions # 1.75 insn per cycle + 3.852109381 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1655) (512y: 161) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.282804e-02 +Avg ME (F77/C++) = 1.2828039868165090E-002 +Relative difference = 1.0277089176796747e-08 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.811483e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.460421e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.460421e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 +TOTAL : 4.272939 sec + 9,279,488,955 cycles # 2.169 GHz + 15,212,537,826 instructions # 1.64 insn per cycle + 4.279485071 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 920) (512y: 59) (512z: 1220) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.282804e-02 +Avg ME (F77/C++) = 1.2828039868165090E-002 +Relative difference = 1.0277089176796747e-08 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_curhst.txt index 536dc86c3a..a6db5de426 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_curhst.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_curhst.txt @@ -1,133 +1,209 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasNoCurand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) +RNDGEN=hasCurand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-01-28_13:53:53 +DATE: 2024-01-30_05:55:54 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 12 --curhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 12 --curhst OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 6.492089e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.565509e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.085712e+08 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 0.999229 sec + 3,503,665,967 cycles # 2.851 GHz + 7,040,796,455 instructions # 2.01 insn per cycle + 1.289089254 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --curhst WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe: Aborted - 46,495,171 cycles:u # 0.560 GHz (71.08%) - 42,178 stalled-cycles-frontend:u # 0.09% frontend cycles idle (71.08%) - 540,867 stalled-cycles-backend:u # 1.16% backend cycles idle (71.78%) - 37,772,172 instructions:u # 0.81 insn per cycle - # 0.01 stalled cycles per insn (72.86%) - 0.084519732 seconds time elapsed +==PROF== Profiling "sigmaKin": launch__registers_per_thread 166 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.282804e-02 -Avg ME (F77/CUDA) = 1.2828039868165208E-002 -Relative difference = 1.0277079981222336e-08 +Avg ME (F77/CUDA) = 1.2828039868165201E-002 +Relative difference = 1.0277080522138477e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe: Aborted - 41,203,264 cycles:u # 2.003 GHz (61.15%) - 56,042 stalled-cycles-frontend:u # 0.14% frontend cycles idle (61.15%) - 362,269 stalled-cycles-backend:u # 0.88% backend cycles idle (61.52%) - 48,313,050 instructions:u # 1.17 insn per cycle - # 0.01 stalled cycles per insn (74.89%) - 0.021587347 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 471) (avx2: 0) (512y: 0) (512z: 0) +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 9.897604e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.152411e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.152411e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 6.777699 sec + 19,525,012,140 cycles # 2.879 GHz + 46,935,602,227 instructions # 2.40 insn per cycle + 6.784496054 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 472) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe: Aborted - 53,509,383 cycles:u # 2.609 GHz (61.03%) - 45,277 stalled-cycles-frontend:u # 0.08% frontend cycles idle (61.03%) - 627,896 stalled-cycles-backend:u # 1.17% backend cycles idle (61.03%) - 44,491,939 instructions:u # 0.83 insn per cycle - # 0.01 stalled cycles per insn (61.50%) - 0.021612075 seconds time elapsed +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.565929e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.046315e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.046315e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 4.431371 sec + 12,844,580,525 cycles # 2.895 GHz + 31,183,505,413 instructions # 2.43 insn per cycle + 4.438022505 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1626) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe: Aborted - 51,566,973 cycles:u # 2.530 GHz (60.78%) - 45,600 stalled-cycles-frontend:u # 0.09% frontend cycles idle (60.78%) - 606,684 stalled-cycles-backend:u # 1.18% backend cycles idle (60.78%) - 45,251,251 instructions:u # 0.88 insn per cycle - # 0.01 stalled cycles per insn (63.00%) - 0.021400674 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1946) (512y: 0) (512z: 0) +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.956069e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.738681e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.738681e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.632695 sec + 10,040,197,478 cycles # 2.761 GHz + 19,480,754,402 instructions # 1.94 insn per cycle + 3.639336589 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1964) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165090E-002 Relative difference = 1.0277089176796747e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.068909e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.973543e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.973543e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.456026 sec + 9,583,252,780 cycles # 2.770 GHz + 18,943,299,087 instructions # 1.98 insn per cycle + 3.462550493 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1655) (512y: 161) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.282804e-02 +Avg ME (F77/C++) = 1.2828039868165090E-002 +Relative difference = 1.0277089176796747e-08 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.820163e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.473451e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.473451e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.874228 sec + 8,184,248,497 cycles # 2.110 GHz + 15,512,168,002 instructions # 1.90 insn per cycle + 3.880483923 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 920) (512y: 59) (512z: 1220) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.282804e-02 +Avg ME (F77/C++) = 1.2828039868165090E-002 +Relative difference = 1.0277089176796747e-08 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_rmbhst.txt index da2e035b05..4dded3e862 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_rmbhst.txt @@ -1,164 +1,211 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasNoCurand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) +RNDGEN=hasCurand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-01-28_13:51:13 +DATE: 2024-01-30_05:52:26 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 12 --rmbhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 12 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+MESDEV/none+NAVBRK +WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.528826e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.088503e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.316519e+07 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 5.364210 sec - 17,868,547,478 cycles:u # 3.311 GHz (74.97%) - 119,610,671 stalled-cycles-frontend:u # 0.67% frontend cycles idle (74.97%) - 6,902,829,832 stalled-cycles-backend:u # 38.63% backend cycles idle (74.94%) - 16,786,814,157 instructions:u # 0.94 insn per cycle - # 0.41 stalled cycles per insn (74.96%) - 5.413225567 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 5.831383e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.529080e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.990768e+08 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 1.936415 sec + 6,196,996,673 cycles # 2.858 GHz + 11,355,646,527 instructions # 1.83 insn per cycle + 2.226164304 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --rmbhst +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +==PROF== Profiling "sigmaKin": launch__registers_per_thread 166 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.282804e-02 -Avg ME (F77/CUDA) = 1.2828039868165208E-002 -Relative difference = 1.0277079981222336e-08 +Avg ME (F77/CUDA) = 1.2828039868165201E-002 +Relative difference = 1.0277080522138477e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.249977e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.427403e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.427403e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 5.760731 sec - 19,510,622,559 cycles:u # 3.370 GHz (74.99%) - 53,071,491 stalled-cycles-frontend:u # 0.27% frontend cycles idle (74.99%) - 61,693,741 stalled-cycles-backend:u # 0.32% backend cycles idle (74.99%) - 47,039,619,399 instructions:u # 2.41 insn per cycle - # 0.00 stalled cycles per insn (75.00%) - 5.792220139 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 471) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 9.923680e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.152570e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.152570e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 6.755774 sec + 19,508,468,124 cycles # 2.886 GHz + 46,934,079,079 instructions # 2.41 insn per cycle + 6.762162730 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 472) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.917858e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.433331e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.433331e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 4.002580 sec - 13,351,914,529 cycles:u # 3.312 GHz (75.00%) - 46,277,153 stalled-cycles-frontend:u # 0.35% frontend cycles idle (75.00%) - 1,061,232,155 stalled-cycles-backend:u # 7.95% backend cycles idle (75.00%) - 31,154,424,323 instructions:u # 2.33 insn per cycle - # 0.03 stalled cycles per insn (75.01%) - 4.033810883 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.560350e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.041132e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.041132e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 4.445978 sec + 12,824,682,223 cycles # 2.881 GHz + 31,183,984,467 instructions # 2.43 insn per cycle + 4.452647644 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1626) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.653997e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.532085e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.532085e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 3.094943 sec - 10,158,909,804 cycles:u # 3.252 GHz (74.91%) - 47,904,950 stalled-cycles-frontend:u # 0.47% frontend cycles idle (74.93%) - 453,479,389 stalled-cycles-backend:u # 4.46% backend cycles idle (74.93%) - 19,410,239,671 instructions:u # 1.91 insn per cycle - # 0.02 stalled cycles per insn (74.92%) - 3.126353323 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1946) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.945035e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.719021e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.719021e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.651562 sec + 10,054,417,482 cycles # 2.750 GHz + 19,480,651,159 instructions # 1.94 insn per cycle + 3.658175830 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1964) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165090E-002 Relative difference = 1.0277089176796747e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.065244e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.964476e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.964476e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.463334 sec + 9,575,609,591 cycles # 2.761 GHz + 18,944,249,093 instructions # 1.98 insn per cycle + 3.469928809 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1655) (512y: 161) (512z: 0) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.282804e-02 +Avg ME (F77/C++) = 1.2828039868165090E-002 +Relative difference = 1.0277089176796747e-08 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.819790e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.476564e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.476564e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.875473 sec + 8,194,000,405 cycles # 2.112 GHz + 15,512,267,676 instructions # 1.89 insn per cycle + 3.882168596 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 920) (512y: 59) (512z: 1220) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.282804e-02 +Avg ME (F77/C++) = 1.2828039868165090E-002 +Relative difference = 1.0277089176796747e-08 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd1.txt index 441b1bb6f4..9238de7bbb 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd1.txt @@ -1,164 +1,209 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasNoCurand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) +RNDGEN=hasCurand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd1' +CUDACPP_BUILDDIR='build.512y_d_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-01-28_13:08:02 +DATE: 2024-01-30_04:52:22 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/gcheck.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/gcheck.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.750096e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.589937e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.910057e+07 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 4.645918 sec - 15,351,943,429 cycles:u # 3.290 GHz (75.00%) - 53,595,143 stalled-cycles-frontend:u # 0.35% frontend cycles idle (75.02%) - 6,958,307,311 stalled-cycles-backend:u # 45.33% backend cycles idle (74.92%) - 11,532,854,200 instructions:u # 0.75 insn per cycle - # 0.60 stalled cycles per insn (74.87%) - 4.697581630 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 5.433269e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.304294e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.211626e+08 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 0.708580 sec + 2,678,035,833 cycles # 2.828 GHz + 4,219,258,618 instructions # 1.58 insn per cycle + 1.025396427 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/gcheck.exe -p 2048 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 154 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.282804e-02 -Avg ME (F77/CUDA) = 1.2828039868165216E-002 -Relative difference = 1.0277079305077159e-08 +Avg ME (F77/CUDA) = 1.2828039868165206E-002 +Relative difference = 1.027708011645137e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.322984e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.523214e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.523214e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 5.479199 sec - 18,560,763,128 cycles:u # 3.371 GHz (74.95%) - 52,081,225 stalled-cycles-frontend:u # 0.28% frontend cycles idle (75.02%) - 62,524,949 stalled-cycles-backend:u # 0.34% backend cycles idle (75.02%) - 44,776,264,781 instructions:u # 2.41 insn per cycle - # 0.00 stalled cycles per insn (75.02%) - 5.509932315 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 485) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.057712e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.240764e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.240764e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 6.363915 sec + 18,420,155,453 cycles # 2.892 GHz + 44,716,833,361 instructions # 2.43 insn per cycle + 6.376789264 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 486) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164921E-002 Relative difference = 1.0277102294013186e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd1/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd1/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.012355e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.562911e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.562911e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 3.845256 sec - 12,804,475,737 cycles:u # 3.306 GHz (75.01%) - 50,272,921 stalled-cycles-frontend:u # 0.39% frontend cycles idle (75.01%) - 87,328,770 stalled-cycles-backend:u # 0.68% backend cycles idle (75.01%) - 30,087,742,697 instructions:u # 2.35 insn per cycle - # 0.00 stalled cycles per insn (75.03%) - 3.877788096 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.624136e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.147437e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.147437e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 4.286124 sec + 12,429,118,549 cycles # 2.897 GHz + 30,107,231,858 instructions # 2.42 insn per cycle + 4.302706533 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1569) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164921E-002 Relative difference = 1.0277102294013186e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd1/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd1/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.598596e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.426861e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.426861e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 3.141732 sec - 10,294,444,223 cycles:u # 3.247 GHz (75.02%) - 44,149,693 stalled-cycles-frontend:u # 0.43% frontend cycles idle (75.02%) - 284,584,719 stalled-cycles-backend:u # 2.76% backend cycles idle (75.02%) - 19,044,099,895 instructions:u # 1.85 insn per cycle - # 0.01 stalled cycles per insn (74.91%) - 3.173982174 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1884) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.942189e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.705004e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.705004e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.656079 sec + 10,127,428,804 cycles # 2.766 GHz + 19,115,519,637 instructions # 1.89 insn per cycle + 3.673885868 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1902) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165093E-002 Relative difference = 1.0277088906338675e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd1/check.exe -p 2048 256 12 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.094903e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.039710e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.039710e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.417483 sec + 9,477,381,758 cycles # 2.768 GHz + 18,489,351,216 instructions # 1.95 insn per cycle + 3.434681568 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1576) (512y: 159) (512z: 0) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.282804e-02 +Avg ME (F77/C++) = 1.2828039868165093E-002 +Relative difference = 1.0277088906338675e-08 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd1/check.exe -p 2048 256 12 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.183418e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.193735e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.193735e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.298580 sec + 7,210,521,695 cycles # 2.182 GHz + 13,864,693,183 instructions # 1.92 insn per cycle + 3.315590461 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 818) (512y: 57) (512z: 898) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.282804e-02 +Avg ME (F77/C++) = 1.2828039868165093E-002 +Relative difference = 1.0277088906338675e-08 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd0.txt index d60d821d9f..09e3552971 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd0.txt @@ -1,164 +1,209 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasNoCurand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) +RNDGEN=hasCurand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.avx2_d_inl1_hrd0' +CUDACPP_BUILDDIR='build.512y_d_inl1_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-01-28_13:31:07 +DATE: 2024-01-30_05:33:56 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/gcheck.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/gcheck.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.321527e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.099297e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.328708e+07 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 4.668893 sec - 15,347,165,163 cycles:u # 3.281 GHz (75.06%) - 53,650,593 stalled-cycles-frontend:u # 0.35% frontend cycles idle (75.11%) - 6,940,894,773 stalled-cycles-backend:u # 45.23% backend cycles idle (75.00%) - 11,573,190,238 instructions:u # 0.75 insn per cycle - # 0.60 stalled cycles per insn (74.94%) - 4.722818283 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 6.454720e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.590982e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.126095e+08 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 0.682889 sec + 2,611,559,388 cycles # 2.831 GHz + 3,986,840,129 instructions # 1.53 insn per cycle + 0.986209294 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/gcheck.exe -p 2048 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 166 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.282804e-02 -Avg ME (F77/CUDA) = 1.2828039868165208E-002 -Relative difference = 1.0277079981222336e-08 +Avg ME (F77/CUDA) = 1.2828039868165201E-002 +Relative difference = 1.0277080522138477e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=0]
-Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0]
+Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
-EvtsPerSec[Rmb+ME] (23) = ( 1.781161e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.162763e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.162763e+06 ) sec^-1
-MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0
-TOTAL : 4.256323 sec
- 14,257,338,770 cycles:u # 3.327 GHz (74.99%)
- 54,041,283 stalled-cycles-frontend:u # 0.38% frontend cycles idle (74.99%)
- 501,869,670 stalled-cycles-backend:u # 3.52% backend cycles idle (74.99%)
- 36,747,633,350 instructions:u # 2.58 insn per cycle
- # 0.01 stalled cycles per insn (75.00%)
- 4.288355678 seconds time elapsed
+OMP threads / `nproc --all` = 1 / 4
+EvtsPerSec[Rmb+ME] (23) = ( 1.350945e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.669369e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.669369e+06 ) sec^-1
+MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
+TOTAL : 5.075739 sec
+ 14,632,134,397 cycles # 2.880 GHz
+ 36,697,212,873 instructions # 2.51 insn per cycle
+ 5.082665504 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 707) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/runTest.exe
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/runTest.exe
 [ PASSED ] 6 tests.
 -------------------------------------------------------------------------
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/check.exe --common -p 2 64 2
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/fcheck.exe 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/check.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/fcheck.exe 2 64 2
 Avg ME (C++/C++) = 1.282804e-02
 Avg ME (F77/C++) = 1.2828039868164916E-002
 Relative difference = 1.0277102699700292e-08
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd0/check.exe -p 2048 256 12 OMP=
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd0/check.exe -p 2048 256 12 OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=0]
-Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0]
+Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 2.395527e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.236314e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.236314e+06 ) sec^-1
-MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0
-TOTAL : 3.348575 sec
- 11,075,223,056 cycles:u # 3.279 GHz (74.94%)
- 49,913,517 stalled-cycles-frontend:u # 0.45% frontend cycles idle (74.92%)
- 64,007,549 stalled-cycles-backend:u # 0.58% backend cycles idle (74.92%)
- 24,757,662,826 instructions:u # 2.24 insn per cycle
- # 0.00 stalled cycles per insn (74.90%)
- 3.381886159 seconds time elapsed
+OMP threads / `nproc --all` = 1 / 4
+EvtsPerSec[Rmb+ME] (23) = ( 1.975416e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.812212e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.812212e+06 ) sec^-1
+MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
+TOTAL : 3.599579 sec
+ 10,391,716,980 cycles # 2.883 GHz
+ 24,753,509,930 instructions # 2.38 insn per cycle
+ 3.606361950 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 2334) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd0/runTest.exe
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd0/runTest.exe
 [ PASSED ] 6 tests.
 -------------------------------------------------------------------------
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd0/check.exe --common -p 2 64 2
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd0/fcheck.exe 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd0/check.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd0/fcheck.exe 2 64 2
 Avg ME (C++/C++) = 1.282804e-02
 Avg ME (F77/C++) = 1.2828039868164916E-002
 Relative difference = 1.0277102699700292e-08
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/check.exe -p 2048 256 12 OMP=
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/check.exe -p 2048 256 12 OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=0]
-Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0]
+Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 3.003345e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.172522e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.172522e+06 ) sec^-1
-MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0
-TOTAL : 2.815790 sec
- 9,191,114,084 cycles:u # 3.230 GHz (74.98%)
- 48,061,079 stalled-cycles-frontend:u # 0.52% frontend cycles idle (74.98%)
- 531,733,278 stalled-cycles-backend:u # 5.79% backend cycles idle (74.86%)
- 16,897,381,469 instructions:u # 1.84 insn per cycle
- # 0.03 stalled cycles per insn (74.86%)
- 2.849085348 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1586) (512y: 0) (512z: 0)
+OMP threads / `nproc --all` = 1 / 4
+EvtsPerSec[Rmb+ME] (23) = ( 2.206864e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.274609e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.274609e+06 ) sec^-1
+MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
+TOTAL : 3.273737 sec
+ 8,884,033,270 cycles # 2.722 GHz
+ 16,960,441,009 instructions # 1.91 insn per cycle
+ 3.280558312 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1604) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/runTest.exe
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/runTest.exe
 [ PASSED ] 6 tests.
 -------------------------------------------------------------------------
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/check.exe --common -p 2 64 2
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/fcheck.exe 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/check.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/fcheck.exe 2 64 2
 Avg ME (C++/C++) = 1.282804e-02
 Avg ME (F77/C++) = 1.2828039868165090E-002
 Relative difference = 1.0277089176796747e-08
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
-/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo)
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd0/check.exe -p 2048 256 12 OMP=
+WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0]
+Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
+FP precision = DOUBLE (NaN/abnormal=0, zero=0)
+Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
+OMP threads / `nproc --all` = 1 / 4
+EvtsPerSec[Rmb+ME] (23) = ( 2.436675e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.780065e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.780065e+06 ) sec^-1
+MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
+TOTAL : 2.997375 sec
+ 8,315,936,313 cycles # 2.769 GHz
+ 16,298,181,743 instructions # 1.96 insn per cycle
+ 3.004046425 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2403) (512y: 292) (512z: 0)
 -------------------------------------------------------------------------
-/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo)
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd0/runTest.exe
+[ PASSED ] 6 tests.
+-------------------------------------------------------------------------
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd0/check.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd0/fcheck.exe 2 64 2
+Avg ME (C++/C++) = 1.282804e-02
+Avg ME (F77/C++) = 1.2828039868165090E-002
+Relative difference = 1.0277089176796747e-08
+OK (relative difference <= 5E-3)
+-------------------------------------------------------------------------
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd0/check.exe -p 2048 256 12 OMP=
+WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0]
+Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
+FP precision = DOUBLE (NaN/abnormal=0, zero=0)
+Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
+OMP threads / `nproc --all` = 1 / 4
+EvtsPerSec[Rmb+ME] (23) = ( 1.987391e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.794180e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.794180e+06 ) sec^-1
+MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
+TOTAL : 3.583817 sec
+ 7,670,874,044 cycles # 2.137 GHz
+ 14,352,448,248 instructions # 1.87 insn per cycle
+ 3.590538974 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 892) (512y: 63) (512z: 975)
+-------------------------------------------------------------------------
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd0/runTest.exe
+[ PASSED ] 6 tests.
+-------------------------------------------------------------------------
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd0/check.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd0/fcheck.exe 2 64 2
+Avg ME (C++/C++) = 1.282804e-02
+Avg ME (F77/C++) = 1.2828039868165090E-002
+Relative difference = 1.0277089176796747e-08
+OK (relative difference <= 5E-3)
 =========================================================================
 TEST COMPLETED
diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd1.txt
index ec20f0a107..508008a0c5 100644
--- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd1.txt
+++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd1.txt
@@ -1,164 +1,209 @@
 export CUDACPP_RUNTIME_ENABLEFPE=on
-Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum
-OMPFLAGS=
-AVX=avx2
+Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum
+OMPFLAGS=-fopenmp
+AVX=512y
 FPTYPE=d
 HELINL=0
 HRDCOD=0
-RNDGEN=hasNoCurand
-Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1)
+RNDGEN=hasCurand
+Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1)
 make: Nothing to be done for 'gtestlibs'.
-CUDACPP_BUILDDIR='build.avx2_d_inl1_hrd1'
+CUDACPP_BUILDDIR='build.512y_d_inl1_hrd1'
 make USEBUILDDIR=1 AVX=none
-make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 CUDACPP_BUILDDIR='build.none_d_inl1_hrd1'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 make USEBUILDDIR=1 AVX=sse4
-make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 CUDACPP_BUILDDIR='build.sse4_d_inl1_hrd1'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 make USEBUILDDIR=1 AVX=avx2
-make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 CUDACPP_BUILDDIR='build.avx2_d_inl1_hrd1'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 make USEBUILDDIR=1 AVX=512y
-make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 CUDACPP_BUILDDIR='build.512y_d_inl1_hrd1'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 make USEBUILDDIR=1 AVX=512z
-make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 CUDACPP_BUILDDIR='build.512z_d_inl1_hrd1'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
-DATE: 2024-01-28_13:31:30
+DATE: 2024-01-30_05:34:28
-On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]:
+On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/gcheck.exe -p 2048 256 12 OMP=
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/gcheck.exe -p 2048 256 12 OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=1] [hardcodePARAM=1]
-Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK
+Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=1] [hardcodePARAM=1]
+Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 5.908895e+07 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 9.595304e+07 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 9.915035e+07 ) sec^-1
-MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0
-TOTAL : 4.637721 sec
- 15,337,436,005 cycles:u # 3.285 GHz (75.00%)
- 53,919,682 stalled-cycles-frontend:u # 0.35% frontend cycles idle (74.91%)
- 6,941,948,091 stalled-cycles-backend:u # 45.26% backend cycles idle (75.05%)
- 11,535,932,480 instructions:u # 0.75 insn per cycle
- # 0.60 stalled cycles per insn (75.04%)
- 4.692843363 seconds time elapsed
+EvtsPerSec[Rmb+ME] (23) = ( 6.464301e+07 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.594213e+08 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.177261e+08 ) sec^-1
+MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
+TOTAL : 0.680513 sec
+ 2,594,214,158 cycles # 2.833 GHz
+ 3,992,420,158 instructions # 1.54 insn per cycle
+ 0.978034885 seconds time elapsed
+runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/gcheck.exe -p 2048 256 1
+WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
+==PROF== Profiling "sigmaKin": launch__registers_per_thread 154
+==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
 -------------------------------------------------------------------------
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/gcheck.exe --common -p 2 64 2
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/fgcheck.exe 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/gcheck.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/fgcheck.exe 2 64 2
 Avg ME (C++/CUDA) = 1.282804e-02
-Avg ME (F77/CUDA) = 1.2828039868165216E-002
-Relative difference = 1.0277079305077159e-08
+Avg ME (F77/CUDA) = 1.2828039868165206E-002
+Relative difference = 1.027708011645137e-08
 OK (relative difference <= 5E-3)
 =========================================================================
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/check.exe -p 2048 256 12 OMP=
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/check.exe -p 2048 256 12 OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=1]
-Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1]
+Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
-EvtsPerSec[Rmb+ME] (23) = ( 2.437871e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.219329e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.219329e+06 ) sec^-1
-MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0
-TOTAL : 3.306988 sec
- 10,920,128,927 cycles:u # 3.273 GHz (74.97%)
- 51,678,826 stalled-cycles-frontend:u # 0.47% frontend cycles idle (75.06%)
- 49,333,079 stalled-cycles-backend:u # 0.45% backend cycles idle (75.07%)
- 28,405,049,430 instructions:u # 2.60 insn per cycle
- # 0.00 stalled cycles per insn (75.07%)
- 3.339028113 seconds time elapsed
+OMP threads / `nproc --all` = 1 / 4
+EvtsPerSec[Rmb+ME] (23) = ( 1.895468e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.581482e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.581482e+06 ) sec^-1
+MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
+TOTAL : 3.738704 sec
+ 10,794,188,443 cycles # 2.885 GHz
+ 28,356,720,092 instructions # 2.63 insn per cycle
+ 3.745371478 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 600) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/runTest.exe
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/runTest.exe
 [ PASSED ] 6 tests.
 -------------------------------------------------------------------------
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/check.exe --common -p 2 64 2
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/fcheck.exe 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/check.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/fcheck.exe 2 64 2
 Avg ME (C++/C++) = 1.282804e-02
 Avg ME (F77/C++) = 1.2828039868164921E-002
 Relative difference = 1.0277102294013186e-08
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd1/check.exe -p 2048 256 12 OMP=
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd1/check.exe -p 2048 256 12 OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=1]
-Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1]
+Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 2.603844e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.647404e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.647404e+06 ) sec^-1
-MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0
-TOTAL : 3.136509 sec
- 10,320,360,832 cycles:u # 3.260 GHz (75.01%)
- 49,989,115 stalled-cycles-frontend:u # 0.48% frontend cycles idle (74.99%)
- 70,471,619 stalled-cycles-backend:u # 0.68% backend cycles idle (74.99%)
- 21,503,541,481 instructions:u # 2.08 insn per cycle
- # 0.00 stalled cycles per insn (74.88%)
- 3.169925065 seconds time elapsed
+OMP threads / `nproc --all` = 1 / 4
+EvtsPerSec[Rmb+ME] (23) = ( 2.231818e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.360148e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.360148e+06 ) sec^-1
+MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
+TOTAL : 3.232648 sec
+ 9,331,358,518 cycles # 2.882 GHz
+ 21,587,159,141 instructions # 2.31 insn per cycle
+ 3.239331570 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 2117) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd1/runTest.exe
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd1/runTest.exe
 [ PASSED ] 6 tests.
 -------------------------------------------------------------------------
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd1/check.exe --common -p 2 64 2
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd1/fcheck.exe 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd1/check.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd1/fcheck.exe 2 64 2
 Avg ME (C++/C++) = 1.282804e-02
 Avg ME (F77/C++) = 1.2828039868164921E-002
 Relative difference = 1.0277102294013186e-08
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd1/check.exe -p 2048 256 12 OMP=
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd1/check.exe -p 2048 256 12 OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=1]
-Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1]
+Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 3.294610e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.771701e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.771701e+06 ) sec^-1
-MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0
-TOTAL : 2.631363 sec
- 8,555,088,028 cycles:u # 3.215 GHz (74.96%)
- 48,783,917 stalled-cycles-frontend:u # 0.57% frontend cycles idle (75.06%)
- 150,017,587 stalled-cycles-backend:u # 1.75% backend cycles idle (75.06%)
- 15,831,128,730 instructions:u # 1.85 insn per cycle
- # 0.01 stalled cycles per insn (75.05%)
- 2.664837791 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1479) (512y: 0) (512z: 0)
+OMP threads / `nproc --all` = 1 / 4
+EvtsPerSec[Rmb+ME] (23) = ( 2.406271e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.696326e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.696326e+06 ) sec^-1
+MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
+TOTAL : 3.030114 sec
+ 8,381,289,955 cycles # 2.761 GHz
+ 15,943,872,727 instructions # 1.90 insn per cycle
+ 3.036686774 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1497) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd1/runTest.exe
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd1/runTest.exe
 [ PASSED ] 6 tests.
 -------------------------------------------------------------------------
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd1/check.exe --common -p 2 64 2
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd1/fcheck.exe 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd1/check.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd1/fcheck.exe 2 64 2
 Avg ME (C++/C++) = 1.282804e-02
 Avg ME (F77/C++) = 1.2828039868165093E-002
 Relative difference = 1.0277088906338675e-08
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
-/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo)
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd1/check.exe -p 2048 256 12 OMP=
+WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1]
+Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
+FP precision = DOUBLE (NaN/abnormal=0, zero=0)
+Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
+OMP threads / `nproc --all` = 1 / 4
+EvtsPerSec[Rmb+ME] (23) = ( 2.611770e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.211566e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.211566e+06 ) sec^-1
+MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
+TOTAL : 2.823652 sec
+ 7,834,743,570 cycles # 2.770 GHz
+ 15,370,444,400 instructions # 1.96 insn per cycle
+ 2.830226684 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2179) (512y: 307) (512z: 0)
 -------------------------------------------------------------------------
-/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo)
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd1/runTest.exe
+[ PASSED ] 6 tests.
+-------------------------------------------------------------------------
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd1/check.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd1/fcheck.exe 2 64 2
+Avg ME (C++/C++) = 1.282804e-02
+Avg ME (F77/C++) = 1.2828039868165093E-002
+Relative difference = 1.0277088906338675e-08
+OK (relative difference <= 5E-3)
+-------------------------------------------------------------------------
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd1/check.exe -p 2048 256 12 OMP=
+WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1]
+Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
+FP precision = DOUBLE (NaN/abnormal=0, zero=0)
+Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
+OMP threads / `nproc --all` = 1 / 4
+EvtsPerSec[Rmb+ME] (23) = ( 2.110110e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.044152e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.044152e+06 ) sec^-1
+MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
+TOTAL : 3.399029 sec
+ 7,342,854,469 cycles # 2.157 GHz
+ 13,880,932,107 instructions # 1.89 insn per cycle
+ 3.405583219 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 853) (512y: 69) (512z: 905)
+-------------------------------------------------------------------------
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd1/runTest.exe
+[ PASSED ] 6 tests.
+-------------------------------------------------------------------------
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd1/check.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd1/fcheck.exe 2 64 2
+Avg ME (C++/C++) = 1.282804e-02
+Avg ME (F77/C++) = 1.2828039868165093E-002
+Relative difference = 1.0277088906338675e-08
+OK (relative difference <= 5E-3)
 =========================================================================
 TEST COMPLETED
diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt
index 7990668dd5..30054d0a8f 100644
--- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt
+++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt
@@ -1,164 +1,209 @@
 export CUDACPP_RUNTIME_ENABLEFPE=on
-Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum
-OMPFLAGS=
-AVX=avx2
+Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum
+OMPFLAGS=-fopenmp
+AVX=512y
 FPTYPE=d
 HELINL=0
 HRDCOD=0
-RNDGEN=hasNoCurand
-Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1)
+RNDGEN=hasCurand
+Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1)
 make: Nothing to be done for 'gtestlibs'.
-CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0'
+CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0'
 make USEBUILDDIR=1 AVX=none
-make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 CUDACPP_BUILDDIR='build.none_f_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 make USEBUILDDIR=1 AVX=sse4
-make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 make USEBUILDDIR=1 AVX=avx2
-make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 make USEBUILDDIR=1 AVX=512y
-make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 make USEBUILDDIR=1 AVX=512z
-make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
-DATE: 2024-01-28_13:08:27
+DATE: 2024-01-30_04:52:57
-On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]:
+On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 12 OMP=
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 12 OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK
+Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 1.857196e+08 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.216060e+08 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.974564e+08 ) sec^-1
-MeanMatrixElemValue = ( 1.371895e-02 +- 3.272985e-06 ) GeV^0
-TOTAL : 4.521435 sec
- 14,977,880,374 cycles:u # 3.295 GHz (75.03%)
- 53,787,695 stalled-cycles-frontend:u # 0.36% frontend cycles idle (75.04%)
- 6,894,589,003 stalled-cycles-backend:u # 46.03% backend cycles idle (75.03%)
- 11,454,419,619 instructions:u # 0.76 insn per cycle
- # 0.60 stalled cycles per insn (75.05%)
- 4.570327482 seconds time elapsed
+EvtsPerSec[Rmb+ME] (23) = ( 1.089125e+08 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.083340e+09 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.291553e+09 ) sec^-1
+MeanMatrixElemValue = ( 1.371687e-02 +- 3.270220e-06 ) GeV^0
+TOTAL : 0.592260 sec
+ 2,336,196,912 cycles # 2.833 GHz
+ 3,633,132,034 instructions # 1.56 insn per cycle
+ 0.902800684 seconds time elapsed
+runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1
+WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
+==PROF== Profiling "sigmaKin": launch__registers_per_thread 117
+==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
 -------------------------------------------------------------------------
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2
 Avg ME (C++/CUDA) = 1.282802e-02
-Avg ME (F77/CUDA) = 1.2828036033170065E-002
-Relative difference = 1.2498553996774023e-06
+Avg ME (F77/CUDA) = 1.2828112125134794E-002
+Relative difference = 7.1815552823662555e-06
 OK (relative difference <= 5E-3)
 =========================================================================
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe -p 2048 256 12 OMP=
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe -p 2048 256 12 OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
-EvtsPerSec[Rmb+ME] (23) = ( 1.418580e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.645597e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.645597e+06 ) sec^-1
-MeanMatrixElemValue = ( 1.371887e-02 +- 3.270267e-06 ) GeV^0
-TOTAL : 5.103130 sec
- 17,318,337,356 cycles:u # 3.377 GHz (74.95%)
- 39,375,779 stalled-cycles-frontend:u # 0.23% frontend cycles idle (75.02%)
- 37,405,246 stalled-cycles-backend:u # 0.22% backend cycles idle (75.05%)
- 47,178,870,494 instructions:u # 2.72 insn per cycle
- # 0.00 stalled cycles per insn (75.05%)
- 5.130781235 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 541) (avx2: 0) (512y: 0) (512z: 0)
+OMP threads / `nproc --all` = 1 / 4
+EvtsPerSec[Rmb+ME] (23) = ( 1.035118e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.220346e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.220346e+06 ) sec^-1
+MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0
+TOTAL : 6.458158 sec
+ 18,623,778,658 cycles # 2.882 GHz
+ 47,047,597,520 instructions # 2.53 insn per cycle
+ 6.468376899 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 542) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest.exe
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest.exe
 [ PASSED ] 6 tests.
 -------------------------------------------------------------------------
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck.exe 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck.exe 2 64 2
 Avg ME (C++/C++) = 1.282804e-02
-Avg ME (F77/C++) = 1.2828039569285465E-002
-Relative difference = 3.357602059382168e-08
+Avg ME (F77/C++) = 1.2828039441956207E-002
+Relative difference = 4.35018750695023e-08
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 12 OMP=
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 12 OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 2.949128e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.190153e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.190153e+06 ) sec^-1
-MeanMatrixElemValue = ( 1.371887e-02 +- 3.270266e-06 ) GeV^0
-TOTAL : 2.797502 sec
- 9,212,831,935 cycles:u # 3.264 GHz (75.06%)
- 41,259,262 stalled-cycles-frontend:u # 0.45% frontend cycles idle (75.06%)
- 630,902,387 stalled-cycles-backend:u # 6.85% backend cycles idle (75.06%)
- 22,090,019,438 instructions:u # 2.40 insn per cycle
- # 0.03 stalled cycles per insn (75.06%)
- 2.827553242 seconds time elapsed
+OMP threads / `nproc --all` = 1 / 4
+EvtsPerSec[Rmb+ME] (23) = ( 2.220597e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.402817e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.402817e+06 ) sec^-1
+MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0
+TOTAL : 3.207438 sec
+ 9,259,856,985 cycles # 2.882 GHz
+ 22,093,069,841 instructions # 2.39 insn per cycle
+ 3.223491423 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 1883) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest.exe
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest.exe
 [ PASSED ] 6 tests.
 -------------------------------------------------------------------------
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2
 Avg ME (C++/C++) = 1.282804e-02
-Avg ME (F77/C++) = 1.2828039385567536E-002
-Relative difference = 4.7897610623017996e-08
+Avg ME (F77/C++) = 1.2828039280066150E-002
+Relative difference = 5.612189004572479e-08
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 12 OMP=
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 12 OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 3.419503e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.013338e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 5.013338e+06 ) sec^-1
-MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0
-TOTAL : 2.504134 sec
- 8,196,207,955 cycles:u # 3.241 GHz (74.99%)
- 39,538,728 stalled-cycles-frontend:u # 0.48% frontend cycles idle (75.02%)
- 1,438,596,268 stalled-cycles-backend:u # 17.55% backend cycles idle (75.02%)
- 15,521,964,298 instructions:u # 1.89 insn per cycle
- # 0.09 stalled cycles per insn (75.02%)
- 2.533039511 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2601) (512y: 0) (512z: 0)
+OMP threads / `nproc --all` = 1 / 4
+EvtsPerSec[Rmb+ME] (23) = ( 2.440699e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.781387e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.781387e+06 ) sec^-1
+MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0
+TOTAL : 2.957121 sec
+ 8,193,990,799 cycles # 2.766 GHz
+ 15,625,791,555 instructions # 1.91 insn per cycle
+ 2.973833384 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2619) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest.exe
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest.exe
 [ PASSED ] 6 tests.
 -------------------------------------------------------------------------
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2
 Avg ME (C++/C++) = 1.282805e-02
-Avg ME (F77/C++) = 1.2828053369958070E-002
-Relative difference = 2.627022867500074e-07
+Avg ME (F77/C++) = 1.2828053255361738E-002
+Relative difference = 2.5376902468575066e-07
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
-/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo)
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe -p 2048 256 12 OMP=
+WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
+FP precision = FLOAT (NaN/abnormal=0, zero=0)
+Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
+OMP threads / `nproc --all` = 1 / 4
+EvtsPerSec[Rmb+ME] (23) = ( 2.532783e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.026282e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.026282e+06 ) sec^-1
+MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0
+TOTAL : 2.864857 sec
+ 7,877,312,491 cycles # 2.746 GHz
+ 15,298,553,606 instructions # 1.94 insn per cycle
+ 2.880238416 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2414) (512y: 13) (512z: 0)
 -------------------------------------------------------------------------
-/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo)
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest.exe
+[ PASSED ] 6 tests.
+-------------------------------------------------------------------------
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2
+Avg ME (C++/C++) = 1.282805e-02
+Avg ME (F77/C++) = 1.2828053255361738E-002
+Relative difference = 2.5376902468575066e-07
+OK (relative difference <= 5E-3)
+-------------------------------------------------------------------------
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe -p 2048 256 12 OMP=
+WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
+FP precision = FLOAT (NaN/abnormal=0, zero=0)
+Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
+OMP threads / `nproc --all` = 1 / 4
+EvtsPerSec[Rmb+ME] (23) = ( 2.515538e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.925634e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.925634e+06 ) sec^-1
+MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0
+TOTAL : 2.878384 sec
+ 6,411,016,127 cycles # 2.223 GHz
+ 12,624,518,195 instructions # 1.97 insn per cycle
+ 2.897065980 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1615) (512y: 12) (512z: 1404)
+-------------------------------------------------------------------------
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest.exe
+[ PASSED ] 6 tests.
+-------------------------------------------------------------------------
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2
+Avg ME (C++/C++) = 1.282805e-02
+Avg ME (F77/C++) = 1.2828052589611616E-002
+Relative difference = 2.0187102602673518e-07
+OK (relative difference <= 5E-3)
 =========================================================================
 TEST COMPLETED
diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_bridge.txt
index 673b97ee3f..cb0960cef7 100644
--- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_bridge.txt
+++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_bridge.txt
@@ -1,170 +1,222 @@
 export CUDACPP_RUNTIME_ENABLEFPE=on
-Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum
-OMPFLAGS=
-AVX=avx2
+Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum
+OMPFLAGS=-fopenmp
+AVX=512y
 FPTYPE=d
 HELINL=0
 HRDCOD=0
-RNDGEN=hasNoCurand
-Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1)
+RNDGEN=hasCurand
+Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1)
 make: Nothing to be done for 'gtestlibs'.
-CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0'
+CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0'
 make USEBUILDDIR=1 AVX=none
-make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 CUDACPP_BUILDDIR='build.none_f_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 make USEBUILDDIR=1 AVX=sse4
-make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 make USEBUILDDIR=1 AVX=avx2
-make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 make USEBUILDDIR=1 AVX=512y
-make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 make USEBUILDDIR=1 AVX=512z
-make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
-DATE: 2024-01-28_13:45:30
+DATE: 2024-01-30_05:46:05
-On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]:
+On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 12 --bridge OMP=
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 12 --bridge OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost
+WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
 WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288)
 WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288)
-Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK
+Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 7.594584e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.312331e+08 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.312331e+08 ) sec^-1
-MeanMatrixElemValue = ( 1.371886e-02 +- 3.270260e-06 ) GeV^0
-TOTAL : 5.342740 sec
- 17,767,754,357 cycles:u # 3.310 GHz (74.97%)
- 118,735,343 stalled-cycles-frontend:u # 0.67% frontend cycles idle (74.97%)
- 6,951,532,407 stalled-cycles-backend:u # 39.12% backend cycles idle (74.97%)
- 17,078,729,274 instructions:u # 0.96 insn per cycle
- # 0.41 stalled cycles per insn (74.94%)
- 5.393944782 seconds time elapsed
+EvtsPerSec[Rmb+ME] (23) = ( 6.896245e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 6.389243e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.389243e+07 ) sec^-1
+MeanMatrixElemValue = ( 1.371710e-02 +- 3.270389e-06 ) GeV^0
+TOTAL : 1.734031 sec
+ 5,668,072,364 cycles # 2.868 GHz
+ 10,146,395,921 instructions # 1.79 insn per cycle
+ 2.033339529 seconds time elapsed
+runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge
+WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
+WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost
+WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
+WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288)
+WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288)
+==PROF== Profiling "sigmaKin": launch__registers_per_thread 117
+==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
 -------------------------------------------------------------------------
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2
 Avg ME (C++/CUDA) = 1.282802e-02
-Avg ME (F77/CUDA) = 1.2828036033170065E-002
-Relative difference = 1.2498553996774023e-06
+Avg ME (F77/CUDA) = 1.2828112125134794E-002
+Relative difference = 7.1815552823662555e-06
 OK (relative difference <= 5E-3)
 =========================================================================
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP=
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 WARNING! Instantiate host Bridge (nevt=524288)
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
-EvtsPerSec[Rmb+ME] (23) = ( 1.408640e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.632906e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.632906e+06 ) sec^-1
-MeanMatrixElemValue = ( 1.371887e-02 +- 3.270267e-06 ) GeV^0
-TOTAL : 5.190782 sec
- 17,512,984,466 cycles:u # 3.354 GHz (74.97%)
- 39,606,603 stalled-cycles-frontend:u # 0.23% frontend cycles idle (75.03%)
- 88,796,249 stalled-cycles-backend:u # 0.51% backend cycles idle (75.03%)
- 47,394,033,970 instructions:u # 2.71 insn per cycle
- # 0.00 stalled cycles per insn (75.03%)
- 5.223039115 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 541) (avx2: 0) (512y: 0) (512z: 0)
+OMP threads / `nproc --all` = 1 / 4
+EvtsPerSec[Rmb+ME] (23) = ( 1.023723e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.199962e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.199962e+06 ) sec^-1
+MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0
+TOTAL : 6.631109 sec
+ 19,198,970,802 cycles # 2.893 GHz
+ 47,195,604,267 instructions # 2.46 insn per cycle
+ 6.638520301 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 542) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest.exe
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest.exe
 [ PASSED ] 6 tests.
 -------------------------------------------------------------------------
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck.exe 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck.exe 2 64 2
 Avg ME (C++/C++) = 1.282804e-02
-Avg ME (F77/C++) = 1.2828039569285465E-002
-Relative difference = 3.357602059382168e-08
+Avg ME (F77/C++) = 1.2828039441956207E-002
+Relative difference = 4.35018750695023e-08
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP=
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 WARNING! Instantiate host Bridge (nevt=524288)
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 2.871178e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.029124e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.029124e+06 ) sec^-1
-MeanMatrixElemValue = ( 1.371887e-02 +- 3.270266e-06 ) GeV^0
-TOTAL : 2.943122 sec
- 9,626,438,902 cycles:u # 3.237 GHz (75.01%)
- 41,048,676 stalled-cycles-frontend:u # 0.43% frontend cycles idle (74.98%)
- 690,269,864 stalled-cycles-backend:u # 7.17% backend cycles idle (74.98%)
- 23,392,634,777 instructions:u # 2.43 insn per cycle
- # 0.03 stalled cycles per insn (75.01%)
- 2.976979266 seconds time elapsed
+OMP threads / `nproc --all` = 1 / 4
+EvtsPerSec[Rmb+ME] (23) = ( 2.130711e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.183569e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.183569e+06 ) sec^-1
+MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0
+TOTAL : 3.452422 sec
+ 9,989,387,225 cycles # 2.889 GHz
+ 23,431,077,272 instructions # 2.35 insn per cycle
+ 3.459894158 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 1883) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest.exe
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest.exe
 [ PASSED ] 6 tests.
 -------------------------------------------------------------------------
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2
 Avg ME (C++/C++) = 1.282804e-02
-Avg ME (F77/C++) = 1.2828039385567536E-002
-Relative difference = 4.7897610623017996e-08
+Avg ME (F77/C++) = 1.2828039280066150E-002
+Relative difference = 5.612189004572479e-08
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP=
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.317484e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.804012e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.804012e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 -TOTAL : 2.641383 sec - 8,570,286,537 cycles:u # 3.208 GHz (74.87%) - 39,777,448 stalled-cycles-frontend:u # 0.46% frontend cycles idle (74.85%) - 1,467,952,662 stalled-cycles-backend:u # 17.13% backend cycles idle (74.90%) - 16,609,873,977 instructions:u # 1.94 insn per cycle - # 0.09 stalled cycles per insn (75.05%) - 2.675246600 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2601) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.341081e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.547294e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.547294e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 +TOTAL : 3.196012 sec + 8,906,176,925 cycles # 2.782 GHz + 16,751,991,837 instructions # 1.88 insn per cycle + 3.203321936 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2619) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.282805e-02 +Avg ME (F77/C++) = 1.2828053255361738E-002 +Relative difference = 2.5376902468575066e-07 +OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! 
Instantiate host Bridge (nevt=524288) +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.434021e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.786427e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.786427e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 +TOTAL : 3.093664 sec + 8,635,370,178 cycles # 2.786 GHz + 16,424,138,356 instructions # 1.90 insn per cycle + 3.101132741 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2414) (512y: 13) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828053369958070E-002 -Relative difference = 2.627022867500074e-07 +Avg ME (F77/C++) = 1.2828053255361738E-002 +Relative difference = 2.5376902468575066e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! Instantiate host Bridge (nevt=524288) +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.383314e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.611676e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.611676e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 +TOTAL : 3.145258 sec + 7,151,980,153 cycles # 2.270 GHz + 13,850,467,115 instructions # 1.94 insn per cycle + 3.152590479 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1615) (512y: 12) (512z: 1404) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.282805e-02 +Avg ME (F77/C++) = 1.2828052589611616E-002 +Relative difference = 2.0187102602673518e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_common.txt index b4a4449cb7..26c818590d 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_common.txt @@ -1,164 +1,209 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasNoCurand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) +RNDGEN=hasCurand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-01-28_13:55:33 +DATE: 2024-01-30_05:59:57 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 12 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 12 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.696384e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.226897e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.984829e+08 ) sec^-1 -MeanMatrixElemValue = ( 1.371895e-02 +- 3.272985e-06 ) GeV^0 -TOTAL : 4.529425 sec - 14,979,431,077 cycles:u # 3.289 GHz (75.05%) - 54,269,019 stalled-cycles-frontend:u # 0.36% frontend cycles idle (75.05%) - 7,021,630,686 stalled-cycles-backend:u # 46.88% backend cycles idle (74.98%) - 11,114,469,756 instructions:u # 0.74 insn per cycle - # 0.63 stalled cycles per insn (74.97%) - 4.577625816 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 1.303596e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.175288e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.243996e+09 ) sec^-1 +MeanMatrixElemValue = ( 1.371863e-02 +- 3.269951e-06 ) GeV^0 +TOTAL : 1.207260 sec + 4,082,591,214 cycles # 2.858 GHz + 6,515,356,659 instructions # 1.60 insn per cycle + 1.486873600 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --common +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 117 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.282802e-02 -Avg ME (F77/CUDA) = 1.2828036033170065E-002 -Relative difference = 1.2498553996774023e-06 +Avg ME (F77/CUDA) = 1.2828112125134794E-002 +Relative difference = 7.1815552823662555e-06 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.419098e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.646169e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.646169e+06 ) sec^-1 +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.039099e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.222240e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.222240e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371887e-02 +- 3.270267e-06 ) GeV^0 -TOTAL : 5.101861 sec - 17,329,919,651 cycles:u # 3.380 GHz (74.94%) - 39,371,838 stalled-cycles-frontend:u # 0.23% frontend cycles idle (75.01%) - 34,710,609 stalled-cycles-backend:u # 0.20% backend cycles idle (75.04%) - 47,183,984,338 instructions:u # 2.72 insn per cycle - # 0.00 stalled cycles per insn (75.04%) - 5.130521932 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 541) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.777770 sec + 19,569,392,860 cycles # 2.885 GHz + 47,229,099,277 instructions # 2.41 insn per cycle + 6.784024049 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 542) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039569285465E-002 -Relative difference = 3.357602059382168e-08 +Avg ME (F77/C++) = 1.2828039441956207E-002 +Relative difference = 4.35018750695023e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.927016e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.134988e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.134988e+06 ) sec^-1 +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.224011e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.394362e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.394362e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371887e-02 +- 3.270266e-06 ) GeV^0 -TOTAL : 2.817773 sec - 9,282,007,688 cycles:u # 3.265 GHz (74.96%) - 40,680,415 stalled-cycles-frontend:u # 0.44% frontend cycles idle (74.96%) - 649,301,144 stalled-cycles-backend:u # 7.00% backend cycles idle (74.96%) - 22,192,724,454 instructions:u # 2.39 insn per cycle - # 0.03 stalled cycles per insn (74.84%) - 2.845062092 seconds time elapsed +TOTAL : 3.543713 sec + 10,250,573,649 cycles # 2.890 GHz + 22,173,775,935 instructions # 2.16 insn per cycle + 3.550219999 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1883) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039385567536E-002 -Relative difference = 4.7897610623017996e-08 +Avg ME (F77/C++) = 1.2828039280066150E-002 +Relative difference = 5.612189004572479e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.418424e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.011922e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.011922e+06 ) sec^-1 +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.458663e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.813529e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.813529e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 -TOTAL : 2.504992 sec - 8,200,675,323 cycles:u # 3.241 GHz (74.96%) - 39,490,772 stalled-cycles-frontend:u # 0.48% frontend cycles idle (75.02%) - 1,444,236,897 stalled-cycles-backend:u # 17.61% backend cycles idle (75.02%) - 15,520,221,160 instructions:u # 1.89 insn per cycle - # 0.09 stalled cycles per insn (75.03%) - 2.532599396 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2601) (512y: 0) (512z: 0) +TOTAL : 3.280080 sec + 9,161,776,432 cycles # 2.789 GHz + 15,536,168,479 instructions # 1.70 insn per cycle + 3.286291256 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2619) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828053369958070E-002 -Relative difference = 2.627022867500074e-07 +Avg ME (F77/C++) = 1.2828053255361738E-002 +Relative difference = 2.5376902468575066e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.554649e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.077981e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.077981e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 +TOTAL : 3.189150 sec + 8,891,496,493 cycles # 2.784 GHz + 15,006,164,122 instructions # 1.69 insn per cycle + 3.195486341 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2414) (512y: 13) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.282805e-02 +Avg ME (F77/C++) = 1.2828053255361738E-002 +Relative difference = 2.5376902468575066e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.516232e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.934012e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.934012e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 +TOTAL : 3.229540 sec + 7,432,998,054 cycles # 2.298 GHz + 12,333,053,960 instructions # 1.66 insn per cycle + 3.235962697 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1615) (512y: 12) (512z: 1404) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.282805e-02 +Avg ME (F77/C++) = 1.2828052589611616E-002 +Relative difference = 2.0187102602673518e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_curhst.txt index 6e6cd5e5eb..90d7f62db4 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_curhst.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_curhst.txt @@ -1,133 +1,209 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasNoCurand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) +RNDGEN=hasCurand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-01-28_13:54:01 +DATE: 2024-01-30_05:56:29 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 12 --curhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 12 --curhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe: Aborted - 54,856,998 cycles:u # 2.647 GHz (61.43%) - 47,618 stalled-cycles-frontend:u # 0.09% frontend cycles idle (61.44%) - 635,574 stalled-cycles-backend:u # 1.16% backend cycles idle (61.43%) - 43,586,624 instructions:u # 0.79 insn per cycle - # 0.01 stalled cycles per insn (61.91%) - 0.021324806 seconds time elapsed +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 1.305141e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.181296e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.274552e+09 ) sec^-1 +MeanMatrixElemValue = ( 1.371687e-02 +- 3.270220e-06 ) GeV^0 +TOTAL : 0.867173 sec + 3,085,877,327 cycles # 2.830 GHz + 6,333,420,740 instructions # 2.05 insn per cycle + 1.147827940 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --curhst +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 117 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.282802e-02 -Avg ME (F77/CUDA) = 1.2828036033170065E-002 -Relative difference = 1.2498553996774023e-06 +Avg ME (F77/CUDA) = 1.2828112125134794E-002 +Relative difference = 7.1815552823662555e-06 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe: Aborted - 42,059,852 cycles:u # 2.045 GHz (61.13%) - 62,739 stalled-cycles-frontend:u # 0.15% frontend cycles idle (61.13%) - 384,873 stalled-cycles-backend:u # 0.92% backend cycles idle (61.30%) - 47,983,719 instructions:u # 1.14 insn per cycle - # 0.01 stalled cycles per insn (74.16%) - 0.021642865 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 541) (avx2: 0) (512y: 0) (512z: 0) +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.039832e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.222763e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.222763e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 +TOTAL : 6.429819 sec + 18,561,263,651 cycles # 2.885 GHz + 47,048,334,209 instructions # 2.53 insn per cycle + 6.436326918 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 542) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039569285465E-002 -Relative difference = 3.357602059382168e-08 +Avg ME (F77/C++) = 1.2828039441956207E-002 +Relative difference = 4.35018750695023e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe: Aborted - 55,264,236 cycles:u # 2.696 GHz (61.00%) - 45,621 stalled-cycles-frontend:u # 0.08% frontend cycles idle (61.01%) - 619,143 stalled-cycles-backend:u # 1.12% backend cycles idle (61.01%) - 42,869,609 instructions:u # 0.78 insn per cycle - # 0.01 stalled cycles per insn (61.51%) - 0.021507280 seconds time elapsed +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.222730e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.393980e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.393980e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 +TOTAL : 3.203768 sec + 9,238,443,218 cycles # 2.879 GHz + 22,092,244,938 instructions # 2.39 insn per cycle + 3.210105048 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1883) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039385567536E-002 -Relative difference = 4.7897610623017996e-08 +Avg ME (F77/C++) = 1.2828039280066150E-002 +Relative difference = 5.612189004572479e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe: Aborted - 51,252,361 cycles:u # 2.482 GHz (61.29%) - 43,034 stalled-cycles-frontend:u # 0.08% frontend cycles idle (61.30%) - 555,172 stalled-cycles-backend:u # 1.08% backend cycles idle (61.30%) - 45,320,101 instructions:u # 0.88 insn per cycle - # 0.01 stalled cycles per insn (63.05%) - 0.021723512 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2601) (512y: 0) (512z: 0) +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.418509e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.733909e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.733909e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 +TOTAL : 2.982066 sec + 8,185,679,734 cycles # 2.740 GHz + 15,625,107,028 instructions # 1.91 insn per cycle + 2.988278371 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2619) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828053369958070E-002 -Relative difference = 2.627022867500074e-07 +Avg ME (F77/C++) = 1.2828053255361738E-002 +Relative difference = 2.5376902468575066e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.558846e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.085053e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.085053e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 +TOTAL : 2.831486 sec + 7,894,514,850 cycles # 2.783 GHz + 15,296,644,493 instructions # 1.94 insn per cycle + 2.837958999 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2414) (512y: 13) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.282805e-02 +Avg ME (F77/C++) = 1.2828053255361738E-002 +Relative difference = 2.5376902468575066e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.525394e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.942507e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.942507e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 +TOTAL : 2.867904 sec + 6,407,267,092 cycles # 2.230 GHz + 12,623,570,741 instructions # 1.97 insn per cycle + 2.874115235 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1615) (512y: 12) (512z: 1404) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.282805e-02 +Avg ME (F77/C++) = 1.2828052589611616E-002 +Relative difference = 2.0187102602673518e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_rmbhst.txt index c52d2f3b6b..91671fa84d 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_rmbhst.txt @@ -1,164 +1,211 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasNoCurand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) +RNDGEN=hasCurand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-01-28_13:51:38 +DATE: 2024-01-30_05:53:04 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 12 --rmbhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 12 --rmbhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+MESDEV/none+NAVBRK +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.367357e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.035509e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.758783e+08 ) sec^-1 -MeanMatrixElemValue = ( 1.371886e-02 +- 3.270260e-06 ) GeV^0 -TOTAL : 5.244736 sec - 17,536,732,896 cycles:u # 3.326 GHz (74.91%) - 119,175,031 stalled-cycles-frontend:u # 0.68% frontend cycles idle (74.98%) - 6,903,153,396 stalled-cycles-backend:u # 39.36% backend cycles idle (74.96%) - 16,743,149,372 instructions:u # 0.95 insn per cycle - # 0.41 stalled cycles per insn (74.97%) - 5.289589090 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 8.674927e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.142204e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.126513e+09 ) sec^-1 +MeanMatrixElemValue = ( 1.371710e-02 +- 3.270389e-06 ) GeV^0 +TOTAL : 1.522177 sec + 5,014,296,377 cycles # 2.858 GHz + 9,135,258,914 instructions # 1.82 insn per cycle + 1.813578794 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --rmbhst +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +==PROF== Profiling "sigmaKin": launch__registers_per_thread 117 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.282802e-02 -Avg ME (F77/CUDA) = 1.2828036033170065E-002 -Relative difference = 1.2498553996774023e-06 +Avg ME (F77/CUDA) = 1.2828112125134794E-002 +Relative difference = 7.1815552823662555e-06 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.418632e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.645414e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.645414e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371887e-02 +- 3.270267e-06 ) GeV^0 -TOTAL : 5.102122 sec - 17,315,339,907 cycles:u # 3.377 GHz (74.95%) - 39,374,566 stalled-cycles-frontend:u # 0.23% frontend cycles idle (75.03%) - 39,011,627 stalled-cycles-backend:u # 0.23% backend cycles idle (75.04%) - 47,172,305,604 instructions:u # 2.72 insn per cycle - # 0.00 stalled cycles per insn (75.04%) - 5.129517275 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 541) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.043183e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.226572e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.226572e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 +TOTAL : 6.408092 sec + 18,567,709,150 cycles # 2.896 GHz + 47,047,255,730 instructions # 2.53 insn per cycle + 6.414419955 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 542) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039569285465E-002 -Relative difference = 3.357602059382168e-08 +Avg ME (F77/C++) = 1.2828039441956207E-002 +Relative difference = 4.35018750695023e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.945940e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.187792e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.187792e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371887e-02 +- 3.270266e-06 ) GeV^0 -TOTAL : 2.802417 sec - 9,259,359,761 cycles:u # 3.275 GHz (74.84%) - 41,158,501 stalled-cycles-frontend:u # 0.44% frontend cycles idle (74.87%) - 639,991,478 stalled-cycles-backend:u # 6.91% backend cycles idle (75.00%) - 22,117,141,816 instructions:u # 2.39 insn per cycle - # 0.03 stalled cycles per insn (75.11%) - 2.829728232 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.231919e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.414648e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.414648e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 +TOTAL : 3.191678 sec + 9,246,166,536 cycles # 2.894 GHz + 22,093,449,321 instructions # 2.39 insn per cycle + 3.197919261 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1883) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039385567536E-002 -Relative difference = 4.7897610623017996e-08 +Avg ME (F77/C++) = 1.2828039280066150E-002 +Relative difference = 5.612189004572479e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.418158e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.012006e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.012006e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 -TOTAL : 2.505742 sec - 8,205,932,650 cycles:u # 3.243 GHz (74.94%) - 39,626,552 stalled-cycles-frontend:u # 0.48% frontend cycles idle (75.03%) - 1,450,339,071 stalled-cycles-backend:u # 17.67% backend cycles idle (75.03%) - 15,521,271,212 instructions:u # 1.89 insn per cycle - # 0.09 stalled cycles per insn (75.03%) - 2.533203757 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2601) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.455778e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.806689e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.806689e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 +TOTAL : 2.938294 sec + 8,179,243,825 cycles # 2.779 GHz + 15,624,915,954 instructions # 1.91 insn per cycle + 2.944456642 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2619) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828053369958070E-002 -Relative difference = 2.627022867500074e-07 +Avg ME (F77/C++) = 1.2828053255361738E-002 +Relative difference = 2.5376902468575066e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.562111e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.082808e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.082808e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 +TOTAL : 2.828979 sec + 7,880,998,863 cycles # 2.781 GHz + 15,296,291,599 instructions # 1.94 insn per cycle + 2.835269816 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2414) (512y: 13) (512z: 0) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.282805e-02 +Avg ME (F77/C++) = 1.2828053255361738E-002 +Relative difference = 2.5376902468575066e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.528595e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.951135e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.951135e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 +TOTAL : 2.864434 sec + 6,402,503,393 cycles # 2.232 GHz + 12,623,594,501 instructions # 1.97 insn per cycle + 2.870718249 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1615) (512y: 12) (512z: 1404) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.282805e-02 +Avg ME (F77/C++) = 1.2828052589611616E-002 +Relative difference = 2.0187102602673518e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd1.txt index 46a5acafae..cc5700bb60 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd1.txt @@ -1,164 +1,209 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasNoCurand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) +RNDGEN=hasCurand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd1' +CUDACPP_BUILDDIR='build.512y_f_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-01-28_13:08:49 +DATE: 2024-01-30_04:53:28 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/gcheck.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/gcheck.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.900843e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.260720e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.038102e+08 ) sec^-1 -MeanMatrixElemValue = ( 1.371895e-02 +- 3.272985e-06 ) GeV^0 -TOTAL : 4.521444 sec - 14,976,634,085 cycles:u # 3.293 GHz (75.04%) - 53,985,383 stalled-cycles-frontend:u # 0.36% frontend cycles idle (75.05%) - 6,901,037,652 stalled-cycles-backend:u # 46.08% backend cycles idle (75.04%) - 11,488,651,226 instructions:u # 0.77 insn per cycle - # 0.60 stalled cycles per insn (75.06%) - 4.571984011 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 1.091291e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.093645e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.338052e+09 ) sec^-1 +MeanMatrixElemValue = ( 1.371687e-02 +- 3.270220e-06 ) GeV^0 +TOTAL : 0.585723 sec + 2,310,991,948 cycles # 2.835 GHz + 3,567,792,024 instructions # 1.54 insn per cycle + 0.889438316 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/gcheck.exe -p 2048 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 95 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.282802e-02 -Avg ME (F77/CUDA) = 1.2828036033170065E-002 -Relative difference = 1.2498553996774023e-06 +Avg ME (F77/CUDA) = 1.2828112125134794E-002 +Relative difference = 7.1815552823662555e-06 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.542902e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.814867e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.814867e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371887e-02 +- 3.270267e-06 ) GeV^0 -TOTAL : 4.744903 sec - 16,072,266,413 cycles:u # 3.370 GHz (74.96%) - 39,424,962 stalled-cycles-frontend:u # 0.25% frontend cycles idle (75.01%) - 35,814,307 stalled-cycles-backend:u # 0.22% backend cycles idle (75.01%) - 44,034,562,321 instructions:u # 2.74 insn per cycle - # 0.00 stalled cycles per insn (75.01%) - 4.772487503 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 466) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.092050e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.295990e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.295990e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 +TOTAL : 6.138113 sec + 17,749,278,373 cycles # 2.890 GHz + 43,890,075,557 instructions # 2.47 insn per cycle + 6.149965364 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 467) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039569285465E-002 -Relative difference = 3.357602059382168e-08 +Avg ME (F77/C++) = 1.2828039441956207E-002 +Relative difference = 4.35018750695023e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd1/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd1/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.020745e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.326154e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.326154e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371887e-02 +- 3.270266e-06 ) GeV^0 -TOTAL : 2.748753 sec - 9,050,825,753 cycles:u # 3.263 GHz (74.92%) - 41,840,049 stalled-cycles-frontend:u # 0.46% frontend cycles idle (74.91%) - 116,810,300 stalled-cycles-backend:u # 1.29% backend cycles idle (74.93%) - 21,679,103,378 instructions:u # 2.40 insn per cycle - # 0.01 stalled cycles per insn (74.94%) - 2.777807373 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.281832e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.528866e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.528866e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 +TOTAL : 3.131918 sec + 9,063,997,030 cycles # 2.890 GHz + 21,583,444,087 instructions # 2.38 insn per cycle + 3.172631085 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1827) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039385567536E-002 -Relative difference = 4.7897610623017996e-08 +Avg ME (F77/C++) = 1.2828039280066150E-002 +Relative difference = 5.612189004572479e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd1/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd1/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.467576e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.119998e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.119998e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 -TOTAL : 2.476518 sec - 8,099,155,876 cycles:u # 3.238 GHz (74.96%) - 39,649,327 stalled-cycles-frontend:u # 0.49% frontend cycles idle (75.06%) - 1,776,371,700 stalled-cycles-backend:u # 21.93% backend cycles idle (75.06%) - 15,292,619,997 instructions:u # 1.89 insn per cycle - # 0.12 stalled cycles per insn (75.06%) - 2.505693559 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2524) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.471429e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.850830e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.850830e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 +TOTAL : 2.922404 sec + 8,130,490,307 cycles # 2.776 GHz + 15,429,884,484 instructions # 1.90 insn per cycle + 2.941222784 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2542) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828053369958070E-002 -Relative difference = 2.627022867500074e-07 +Avg ME (F77/C++) = 1.2828053255361738E-002 +Relative difference = 2.5376902468575066e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd1/check.exe -p 2048 256 12 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.565898e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.093653e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.093653e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 +TOTAL : 2.826189 sec + 7,861,694,964 cycles # 2.776 GHz + 15,087,354,653 instructions # 1.92 insn per cycle + 2.844638276 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2323) (512y: 15) (512z: 0) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.282805e-02 +Avg ME (F77/C++) = 1.2828053255361738E-002 +Relative difference = 2.5376902468575066e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd1/check.exe -p 2048 256 12 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.637184e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.244046e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.244046e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 +TOTAL : 2.766988 sec + 6,178,543,208 cycles # 2.228 GHz + 12,245,131,195 instructions # 1.98 insn per cycle + 2.787936795 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1538) (512y: 8) (512z: 1258) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.282805e-02 +Avg ME (F77/C++) = 1.2828052431359538E-002 +Relative difference = 1.895346165094282e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd0.txt index daac20482b..df038945e7 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd0.txt @@ -1,164 +1,209 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasNoCurand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) +RNDGEN=hasCurand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.avx2_f_inl1_hrd0' +CUDACPP_BUILDDIR='build.512y_f_inl1_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-01-28_13:31:51 +DATE: 2024-01-30_05:34:57 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/gcheck.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/gcheck.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.898761e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.220331e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.976189e+08 ) sec^-1 -MeanMatrixElemValue = ( 1.371895e-02 +- 3.272985e-06 ) GeV^0 -TOTAL : 4.520492 sec - 15,059,249,887 cycles:u # 3.311 GHz (74.86%) - 54,631,651 stalled-cycles-frontend:u # 0.36% frontend cycles idle (75.01%) - 6,985,929,963 stalled-cycles-backend:u # 46.39% backend cycles idle (75.05%) - 11,318,993,683 instructions:u # 0.75 insn per cycle - # 0.62 stalled cycles per insn (75.03%) - 4.572850398 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 1.293279e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.189438e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.292426e+09 ) sec^-1 +MeanMatrixElemValue = ( 1.371687e-02 +- 3.270220e-06 ) GeV^0 +TOTAL : 0.574531 sec + 2,278,103,742 cycles # 2.838 GHz + 3,559,192,155 instructions # 1.56 insn per cycle + 0.862169679 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/gcheck.exe -p 2048 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 117 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.282802e-02 -Avg ME (F77/CUDA) = 1.2828036033170065E-002 -Relative difference = 1.2498553996774023e-06 +Avg ME (F77/CUDA) = 1.2828112125134794E-002 +Relative difference = 7.1815552823662555e-06 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.929607e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.375061e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.375061e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371887e-02 +- 3.270267e-06 ) GeV^0 -TOTAL : 3.928807 sec - 13,191,423,416 cycles:u # 3.336 GHz (74.92%) - 40,090,845 stalled-cycles-frontend:u # 0.30% frontend cycles idle (74.93%) - 1,171,298,475 stalled-cycles-backend:u # 8.88% backend cycles idle (74.93%) - 38,076,177,040 instructions:u # 2.89 insn per cycle - # 0.03 stalled cycles per insn (74.94%) - 3.957305605 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.401205e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.755017e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.755017e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 +TOTAL : 4.863284 sec + 13,757,936,316 cycles # 2.826 GHz + 37,850,126,745 instructions # 2.75 insn per cycle + 4.870249581 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 833) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039543819614E-002 -Relative difference = 3.5561191488957804e-08 +Avg ME (F77/C++) = 1.2828039414671366E-002 +Relative difference = 4.562884388571957e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.477612e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.337347e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.337347e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371887e-02 +- 3.270266e-06 ) GeV^0 -TOTAL : 2.477386 sec - 8,106,573,579 cycles:u # 3.238 GHz (74.86%) - 40,966,446 stalled-cycles-frontend:u # 0.51% frontend cycles idle (75.02%) - 252,074,197 stalled-cycles-backend:u # 3.11% backend cycles idle (75.08%) - 18,635,497,586 instructions:u # 2.30 insn per cycle - # 0.01 stalled cycles per insn (75.08%) - 2.507314949 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.651233e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.514070e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.514070e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 +TOTAL : 2.748031 sec + 7,929,384,882 cycles # 2.881 GHz + 18,604,713,730 instructions # 2.35 insn per cycle + 2.754502860 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2808) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039385567536E-002 -Relative difference = 4.7897610623017996e-08 +Avg ME (F77/C++) = 1.2828039280066150E-002 +Relative difference = 5.612189004572479e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.859236e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.023346e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.023346e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 -TOTAL : 2.293893 sec - 7,466,220,256 cycles:u # 3.218 GHz (74.69%) - 40,524,921 stalled-cycles-frontend:u # 0.54% frontend cycles idle (74.69%) - 1,088,040,683 stalled-cycles-backend:u # 14.57% backend cycles idle (75.01%) - 14,239,691,356 instructions:u # 1.91 insn per cycle - # 0.08 stalled cycles per insn (75.18%) - 2.323651015 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2233) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.730630e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.541231e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.541231e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 +TOTAL : 2.679636 sec + 7,420,774,430 cycles # 2.764 GHz + 14,339,383,869 instructions # 1.93 insn per cycle + 2.686088553 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2251) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828053337216261E-002 -Relative difference = 2.601499261602198e-07 +Avg ME (F77/C++) = 1.2828053246266791E-002 +Relative difference = 2.5306003563303186e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd0/check.exe -p 2048 256 12 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.796396e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.739468e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.739468e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 +TOTAL : 2.625810 sec + 7,304,334,176 cycles # 2.778 GHz + 13,955,275,285 instructions # 1.91 insn per cycle + 2.632447793 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3875) (512y: 9) (512z: 0) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd0/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.282805e-02 +Avg ME (F77/C++) = 1.2828053277189611E-002 +Relative difference = 2.5547059841227576e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd0/check.exe -p 2048 256 12 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.601296e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.146430e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.146430e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 +TOTAL : 2.796781 sec + 6,273,154,150 cycles # 2.239 GHz + 13,210,323,797 instructions # 2.11 insn per cycle + 2.803318258 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1734) (512y: 3) (512z: 1266) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd0/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.282805e-02 +Avg ME (F77/C++) = 1.2828052540498902E-002 +Relative difference = 1.980424851420537e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd1.txt index 3b7030832c..784101060d 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd1.txt @@ -1,164 +1,209 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasNoCurand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) +RNDGEN=hasCurand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.avx2_f_inl1_hrd1' +CUDACPP_BUILDDIR='build.512y_f_inl1_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-01-28_13:32:12 +DATE: 2024-01-30_05:35:26 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/gcheck.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/gcheck.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.721468e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.255914e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.027322e+08 ) sec^-1 -MeanMatrixElemValue = ( 1.371895e-02 +- 3.272985e-06 ) GeV^0 -TOTAL : 4.530061 sec - 14,994,454,794 cycles:u # 3.292 GHz (75.04%) - 54,303,127 stalled-cycles-frontend:u # 0.36% frontend cycles idle (75.07%) - 6,983,878,696 stalled-cycles-backend:u # 46.58% backend cycles idle (74.99%) - 11,268,174,387 instructions:u # 0.75 insn per cycle - # 0.62 stalled cycles per insn (75.01%) - 4.578127450 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 1.300997e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.192378e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.323768e+09 ) sec^-1 +MeanMatrixElemValue = ( 1.371687e-02 +- 3.270220e-06 ) GeV^0 +TOTAL : 0.574497 sec + 2,274,789,999 cycles # 2.831 GHz + 3,565,149,005 instructions # 1.57 insn per cycle + 0.863293975 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/gcheck.exe -p 2048 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 95 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.282802e-02 -Avg ME (F77/CUDA) = 1.2828036033170065E-002 -Relative difference = 1.2498553996774023e-06 +Avg ME (F77/CUDA) = 1.2828112125134794E-002 +Relative difference = 7.1815552823662555e-06 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.676795e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.617824e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.617824e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371887e-02 +- 3.270267e-06 ) GeV^0 -TOTAL : 3.020069 sec - 10,020,679,446 cycles:u # 3.290 GHz (74.84%) - 38,651,870 stalled-cycles-frontend:u # 0.39% frontend cycles idle (74.97%) - 29,763,435 stalled-cycles-backend:u # 0.30% backend cycles idle (75.05%) - 28,572,799,722 instructions:u # 2.85 insn per cycle - # 0.00 stalled cycles per insn (75.05%) - 3.048326361 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.974769e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.758467e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.758467e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 +TOTAL : 3.560083 sec + 10,128,258,424 cycles # 2.841 GHz + 28,399,859,483 instructions # 2.80 insn per cycle + 3.566485849 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 632) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039569285465E-002 -Relative difference = 3.357602059382168e-08 +Avg ME (F77/C++) = 1.2828039441956207E-002 +Relative difference = 4.35018750695023e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd1/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd1/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.864511e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.307467e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.307467e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371887e-02 +- 3.270266e-06 ) GeV^0 -TOTAL : 2.296267 sec - 7,446,477,849 cycles:u # 3.207 GHz (74.71%) - 40,130,343 stalled-cycles-frontend:u # 0.54% frontend cycles idle (74.75%) - 35,437,360 stalled-cycles-backend:u # 0.48% backend cycles idle (75.07%) - 16,867,197,741 instructions:u # 2.27 insn per cycle - # 0.00 stalled cycles per insn (75.20%) - 2.326184137 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.921662e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.360866e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.360866e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 +TOTAL : 2.529327 sec + 7,292,501,410 cycles # 2.880 GHz + 16,787,289,445 instructions # 2.30 insn per cycle + 2.535811154 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2463) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039385567536E-002 -Relative difference = 4.7897610623017996e-08 +Avg ME (F77/C++) = 1.2828039280066150E-002 +Relative difference = 5.612189004572479e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.057425e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.514930e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.514930e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 -TOTAL : 2.217339 sec - 7,173,513,532 cycles:u # 3.198 GHz (75.01%) - 39,854,558 stalled-cycles-frontend:u # 0.56% frontend cycles idle (75.04%) - 383,455,538 stalled-cycles-backend:u # 5.35% backend cycles idle (75.04%) - 13,648,938,720 instructions:u # 1.90 insn per cycle - # 0.03 stalled cycles per insn (75.04%) - 2.247204928 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2064) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.902980e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.008268e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.008268e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 +TOTAL : 2.546290 sec + 7,099,294,688 cycles # 2.783 GHz + 13,729,465,706 instructions # 1.93 insn per cycle + 2.552602290 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2082) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828053331759293E-002 -Relative difference = 2.597245327285885e-07 +Avg ME (F77/C++) = 1.2828053198973066E-002 +Relative difference = 2.4937329255889414e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd1/check.exe -p 2048 256 12 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.894124e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.023412e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.023412e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 +TOTAL : 2.549860 sec + 7,037,352,059 cycles # 2.755 GHz + 13,462,222,302 instructions # 1.91 insn per cycle + 2.556338558 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3649) (512y: 12) (512z: 0) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd1/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.282805e-02 +Avg ME (F77/C++) = 1.2828053198973066E-002 +Relative difference = 2.4937329255889414e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd1/check.exe -p 2048 256 12 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.741921e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.505340e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.505340e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 +TOTAL : 2.671598 sec + 6,046,764,080 cycles # 2.259 GHz + 12,911,501,907 instructions # 2.14 insn per cycle + 2.677952936 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1671) (512y: 3) (512z: 1155) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd1/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.282805e-02 +Avg ME (F77/C++) = 1.2828052431359538E-002 +Relative difference = 1.895346165094282e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt index 82cbbddff3..7a09642823 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt @@ -1,164 +1,209 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasNoCurand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) +RNDGEN=hasCurand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-01-28_13:09:11 +DATE: 2024-01-30_04:54:00 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/gcheck.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/gcheck.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.320880e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.106815e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.335809e+07 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 4.650032 sec - 15,393,538,908 cycles:u # 3.291 GHz (74.99%) - 53,951,381 stalled-cycles-frontend:u # 0.35% frontend cycles idle (74.87%) - 6,953,241,438 stalled-cycles-backend:u # 45.17% backend cycles idle (74.87%) - 11,556,335,890 instructions:u # 0.75 insn per cycle - # 0.60 stalled cycles per insn (75.02%) - 4.701473801 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 5.434258e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.281519e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.171049e+08 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 0.704261 sec + 2,701,570,097 cycles # 2.831 GHz + 4,244,340,283 instructions # 1.57 insn per cycle + 1.033944641 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/gcheck.exe -p 2048 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 166 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.282804e-02 -Avg ME (F77/CUDA) = 1.2828039901590281E-002 -Relative difference = 7.67145406542181e-09 +Avg ME (F77/CUDA) = 1.2828039901590279E-002 +Relative difference = 7.671454200650844e-09 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.243334e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.416802e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.416802e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 5.782818 sec - 19,603,752,476 cycles:u # 3.374 GHz (74.95%) - 52,047,296 stalled-cycles-frontend:u # 0.27% frontend cycles idle (74.96%) - 155,941,389 stalled-cycles-backend:u # 0.80% backend cycles idle (74.95%) - 47,111,954,713 instructions:u # 2.40 insn per cycle - # 0.00 stalled cycles per insn (74.95%) - 5.813875962 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 473) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 9.829628e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.139787e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.139787e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 6.819159 sec + 19,690,827,956 cycles # 2.885 GHz + 46,971,779,576 instructions # 2.39 insn per cycle + 6.832663552 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 474) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039952548879E-002 Relative difference = 3.6990156841838714e-09 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.992307e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.534851e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.534851e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 3.878462 sec - 12,930,844,192 cycles:u # 3.310 GHz (75.02%) - 50,945,470 stalled-cycles-frontend:u # 0.39% frontend cycles idle (75.02%) - 2,179,183,100 stalled-cycles-backend:u # 16.85% backend cycles idle (75.02%) - 30,864,577,310 instructions:u # 2.39 insn per cycle - # 0.07 stalled cycles per insn (75.02%) - 3.910479605 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.605344e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.116934e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.116934e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 4.334479 sec + 12,518,471,325 cycles # 2.884 GHz + 30,922,888,427 instructions # 2.47 insn per cycle + 4.354467708 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1667) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039952548879E-002 Relative difference = 3.6990156841838714e-09 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.583056e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.412060e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.412060e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 3.156866 sec - 10,403,980,296 cycles:u # 3.267 GHz (74.90%) - 49,718,589 stalled-cycles-frontend:u # 0.48% frontend cycles idle (74.89%) - 897,671,639 stalled-cycles-backend:u # 8.63% backend cycles idle (74.97%) - 19,404,180,648 instructions:u # 1.87 insn per cycle - # 0.05 stalled cycles per insn (75.09%) - 3.188892056 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2101) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.917239e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.660472e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.660472e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.702387 sec + 10,174,876,030 cycles # 2.745 GHz + 19,548,406,942 instructions # 1.92 insn per cycle + 3.720275920 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2119) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039951670679E-002 Relative difference = 3.767475112924841e-09 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd0/check.exe -p 2048 256 12 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.029293e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.888276e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.888276e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.515786 sec + 9,723,051,646 cycles # 2.761 GHz + 18,859,468,530 instructions # 1.94 insn per cycle + 3.531121351 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1850) (512y: 174) (512z: 0) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.282804e-02 +Avg ME (F77/C++) = 1.2828039951670679E-002 +Relative difference = 3.767475112924841e-09 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd0/check.exe -p 2048 256 12 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.839848e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.512898e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.512898e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.838759 sec + 8,110,381,366 cycles # 2.110 GHz + 14,814,382,883 instructions # 1.83 insn per cycle + 3.856049832 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1023) (512y: 64) (512z: 1327) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.282804e-02 +Avg ME (F77/C++) = 1.2828039951670679E-002 +Relative difference = 3.767475112924841e-09 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd1.txt index 62e74bce4b..385e9ed225 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd1.txt @@ -1,164 +1,209 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasNoCurand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) +RNDGEN=hasCurand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd1' +CUDACPP_BUILDDIR='build.512y_m_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-01-28_13:09:36 +DATE: 2024-01-30_04:54:36 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/gcheck.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/gcheck.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = HIP:MIX+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.914249e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.588634e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.909522e+07 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 4.630922 sec - 15,311,777,524 cycles:u # 3.287 GHz (74.97%) - 53,797,529 stalled-cycles-frontend:u # 0.35% frontend cycles idle (74.97%) - 6,956,915,971 stalled-cycles-backend:u # 45.44% backend cycles idle (74.99%) - 11,493,289,788 instructions:u # 0.75 insn per cycle - # 0.61 stalled cycles per insn (75.02%) - 4.682003984 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 5.428632e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.291557e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.197877e+08 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 0.704173 sec + 2,700,513,236 cycles # 2.833 GHz + 4,160,757,344 instructions # 1.54 insn per cycle + 1.040080983 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/gcheck.exe -p 2048 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 154 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.282804e-02 -Avg ME (F77/CUDA) = 1.2828039901590284E-002 -Relative difference = 7.67145379496374e-09 +Avg ME (F77/CUDA) = 1.2828039901590279E-002 +Relative difference = 7.671454200650844e-09 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.314424e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.512274e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.512274e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 5.509220 sec - 18,660,709,287 cycles:u # 3.370 GHz (74.98%) - 52,180,839 stalled-cycles-frontend:u # 0.28% frontend cycles idle (75.01%) - 66,543,074 stalled-cycles-backend:u # 0.36% backend cycles idle (75.01%) - 44,638,453,582 instructions:u # 2.39 insn per cycle - # 0.00 stalled cycles per insn (75.01%) - 5.539870253 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 497) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.048898e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.230601e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.230601e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 6.414952 sec + 18,538,807,361 cycles # 2.888 GHz + 44,591,647,960 instructions # 2.41 insn per cycle + 6.426389730 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 498) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039952548879E-002 Relative difference = 3.6990156841838714e-09 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd1/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd1/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.008406e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.555937e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.555937e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 3.851213 sec - 12,866,795,762 cycles:u # 3.317 GHz (74.86%) - 55,538,083 stalled-cycles-frontend:u # 0.43% frontend cycles idle (74.97%) - 1,919,872,856 stalled-cycles-backend:u # 14.92% backend cycles idle (75.05%) - 30,173,135,111 instructions:u # 2.35 insn per cycle - # 0.06 stalled cycles per insn (75.05%) - 3.883388788 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.655305e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.204388e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.204388e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 4.214890 sec + 12,207,966,974 cycles # 2.892 GHz + 30,217,340,923 instructions # 2.48 insn per cycle + 4.236133486 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1650) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039952548879E-002 Relative difference = 3.6990156841838714e-09 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd1/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd1/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.614465e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.455102e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.455102e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 3.126652 sec - 10,226,352,605 cycles:u # 3.241 GHz (74.91%) - 44,861,835 stalled-cycles-frontend:u # 0.44% frontend cycles idle (74.92%) - 260,075,731 stalled-cycles-backend:u # 2.54% backend cycles idle (74.91%) - 19,023,510,957 instructions:u # 1.86 insn per cycle - # 0.01 stalled cycles per insn (74.96%) - 3.158892568 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2054) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.899712e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.627205e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.627205e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.730288 sec + 10,158,219,608 cycles # 2.719 GHz + 19,037,132,874 instructions # 1.87 insn per cycle + 3.746558078 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2072) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039951670679E-002 Relative difference = 3.767475112924841e-09 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd1/check.exe -p 2048 256 12 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.048047e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.931283e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.931283e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.492411 sec + 9,571,391,969 cycles # 2.738 GHz + 18,453,150,608 instructions # 1.93 insn per cycle + 3.509341045 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1775) (512y: 174) (512z: 0) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.282804e-02 +Avg ME (F77/C++) = 1.2828039951670679E-002 +Relative difference = 3.767475112924841e-09 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd1/check.exe -p 2048 256 12 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.170414e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.170487e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.170487e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.317793 sec + 7,240,072,684 cycles # 2.179 GHz + 13,244,781,040 instructions # 1.83 insn per cycle + 3.341198784 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 911) (512y: 56) (512z: 993) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.282804e-02 +Avg ME (F77/C++) = 1.2828039951670679E-002 +Relative difference = 3.767475112924841e-09 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt index b39b2317cf..2453732bed 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt @@ -1,164 +1,209 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasNoCurand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) +RNDGEN=hasCurand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-01-28_13:10:01 +DATE: 2024-01-30_04:55:10 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.775687e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.960414e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.014344e+07 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 1.057707 sec - 3,244,067,809 cycles:u # 2.994 GHz (74.80%) - 10,927,029 stalled-cycles-frontend:u # 0.34% frontend cycles idle (74.60%) - 1,144,622,647 stalled-cycles-backend:u # 35.28% backend cycles idle (75.00%) - 2,968,035,234 instructions:u # 0.91 insn per cycle - # 0.39 stalled cycles per insn (75.25%) - 1.109400147 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 4.010275e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.133419e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.272295e+08 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 0.538830 sec + 2,187,358,219 cycles # 2.824 GHz + 3,139,905,445 instructions # 1.44 insn per cycle + 0.856073288 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 2.028807e+00 -Avg ME (F77/CUDA) = 2.0288063388516817 -Relative difference = 3.258803416564443e-07 +Avg ME (F77/CUDA) = 2.0288063388516822 +Relative difference = 3.2588034143755247e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.518752e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.584131e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.584131e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 4.339121 sec - 14,991,957,756 cycles:u # 3.432 GHz (74.93%) - 8,916,309 stalled-cycles-frontend:u # 0.06% frontend cycles idle (74.93%) - 680,019,338 stalled-cycles-backend:u # 4.54% backend cycles idle (74.91%) - 38,746,350,584 instructions:u # 2.58 insn per cycle - # 0.02 stalled cycles per insn (74.99%) - 4.371226147 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.073581e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.135755e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.135755e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 5.168677 sec + 14,980,961,047 cycles # 2.896 GHz + 38,724,485,120 instructions # 2.58 insn per cycle + 5.178651966 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 719) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515649 Relative difference = 3.258803992249869e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.488177e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.712575e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.712575e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 2.511930 sec - 8,602,900,108 cycles:u # 3.386 GHz (74.90%) - 9,613,119 stalled-cycles-frontend:u # 0.11% frontend cycles idle (74.84%) - 200,536,797 stalled-cycles-backend:u # 2.33% backend cycles idle (74.84%) - 24,409,281,538 instructions:u # 2.84 insn per cycle - # 0.01 stalled cycles per insn (75.00%) - 2.544912038 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2071) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.523460e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.721558e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.721558e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 3.090012 sec + 8,952,192,290 cycles # 2.893 GHz + 24,430,503,496 instructions # 2.73 insn per cycle + 3.108451490 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 2067) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515654 Relative difference = 3.2588039900609506e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.685094e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.280006e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.280006e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 1.540578 sec - 5,164,515,981 cycles:u # 3.289 GHz (75.04%) - 7,867,047 stalled-cycles-frontend:u # 0.15% frontend cycles idle (75.04%) - 1,067,968,522 stalled-cycles-backend:u # 20.68% backend cycles idle (75.04%) - 11,505,183,362 instructions:u # 2.23 insn per cycle - # 0.09 stalled cycles per insn (75.04%) - 1.573597661 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2383) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 5.390626e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.850527e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.850527e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 2.056967 sec + 5,535,228,908 cycles # 2.683 GHz + 11,562,552,185 instructions # 2.09 insn per cycle + 2.068379535 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2396) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe -p 2048 256 2 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 6.323214e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.965355e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.965355e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 1.769440 sec + 4,825,692,035 cycles # 2.719 GHz + 10,341,008,591 instructions # 2.14 insn per cycle + 1.786949030 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1972) (512y: 131) (512z: 0) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028807e+00 +Avg ME (F77/C++) = 2.0288063388516204 +Relative difference = 3.2588037186351226e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe -p 2048 256 2 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 4.039053e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.289363e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.289363e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 2.707049 sec + 4,944,236,176 cycles # 1.822 GHz + 7,554,838,116 instructions # 1.53 insn per cycle + 2.726854934 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1212) (512y: 65) (512z: 1543) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028807e+00 +Avg ME (F77/C++) = 2.0288063388516204 +Relative difference = 3.2588037186351226e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_bridge.txt index 76ed1fb7b1..adcfa48462 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_bridge.txt @@ -1,170 +1,222 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasNoCurand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) +RNDGEN=hasCurand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-01-28_13:45:54 +DATE: 2024-01-30_05:46:39 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 2 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! 
Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.985499e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.802862e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.802862e+07 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 1.228595 sec - 3,735,435,804 cycles:u # 2.952 GHz (74.95%) - 22,377,197 stalled-cycles-frontend:u # 0.60% frontend cycles idle (74.72%) - 1,174,440,900 stalled-cycles-backend:u # 31.44% backend cycles idle (74.73%) - 3,886,989,918 instructions:u # 1.04 insn per cycle - # 0.30 stalled cycles per insn (75.04%) - 1.290048457 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 4.344134e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.848581e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.848581e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 0.830257 sec + 3,050,711,174 cycles # 2.837 GHz + 4,744,287,151 instructions # 1.56 insn per cycle + 1.134543078 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) +WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) +==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 2.028807e+00 -Avg ME (F77/CUDA) = 2.0288063388516817 -Relative difference = 3.258803416564443e-07 +Avg ME (F77/CUDA) = 2.0288063388516822 +Relative difference = 3.2588034143755247e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.513057e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.577274e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.577274e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 4.424072 sec - 15,124,386,423 cycles:u # 3.389 GHz (74.91%) - 10,131,846 stalled-cycles-frontend:u # 0.07% frontend cycles idle (74.91%) - 795,869,225 stalled-cycles-backend:u # 5.26% backend cycles idle (74.91%) - 38,688,842,712 instructions:u # 2.56 insn per cycle - # 0.02 stalled cycles per insn (75.00%) - 4.465717630 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.051768e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.112380e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.112380e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 5.299158 sec + 15,311,911,023 cycles # 2.886 GHz + 38,783,796,929 instructions # 2.53 insn per cycle + 5.307164517 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 719) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515649 Relative difference = 3.258803992249869e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.456146e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.677718e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.677718e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 2.614372 sec - 8,741,909,741 cycles:u # 3.294 GHz (74.86%) - 9,776,807 stalled-cycles-frontend:u # 0.11% frontend cycles idle (74.86%) - 212,474,586 stalled-cycles-backend:u # 2.43% backend cycles idle (74.99%) - 24,606,463,326 instructions:u # 2.81 insn per cycle - # 0.01 stalled cycles per insn (74.99%) - 2.658072308 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2071) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.466739e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.657869e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.657869e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 3.216841 sec + 9,297,524,138 cycles # 2.885 GHz + 24,613,723,387 instructions # 2.65 insn per cycle + 3.224967553 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 2067) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515654 Relative difference = 3.2588039900609506e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.584321e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.161048e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.161048e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 1.643539 sec - 5,359,056,860 cycles:u # 3.183 GHz (74.86%) - 8,448,236 stalled-cycles-frontend:u # 0.16% frontend cycles idle (74.82%) - 1,100,765,531 stalled-cycles-backend:u # 20.54% backend cycles idle (74.80%) - 11,831,761,142 instructions:u # 2.21 insn per cycle - # 0.09 stalled cycles per insn (74.82%) - 1.687195550 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2383) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 5.363369e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.815752e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.815752e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 2.149413 sec + 5,860,102,645 cycles # 2.720 GHz + 11,849,599,468 instructions # 2.02 insn per cycle + 2.157292568 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2396) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028807e+00 +Avg ME (F77/C++) = 2.0288063388516204 +Relative difference = 3.2588037186351226e-07 +OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! 
Instantiate host Bridge (nevt=524288) +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 6.162124e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.773170e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.773170e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 1.893501 sec + 5,161,881,245 cycles # 2.717 GHz + 10,626,023,875 instructions # 2.06 insn per cycle + 1.901369932 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1972) (512y: 131) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! Instantiate host Bridge (nevt=524288) +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.945106e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.186618e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.186618e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 2.853446 sec + 5,298,812,686 cycles # 1.853 GHz + 7,800,536,018 instructions # 1.47 insn per cycle + 2.861501356 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1212) (512y: 65) (512z: 1543) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028807e+00 +Avg ME (F77/C++) = 2.0288063388516204 +Relative difference = 3.2588037186351226e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_common.txt index 523a7bca51..b23b4b948e 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_common.txt @@ -1,164 +1,209 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasNoCurand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) +RNDGEN=hasCurand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-01-28_13:55:56 +DATE: 2024-01-30_06:00:31 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 2 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 2 --common OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.818186e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.966197e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.020465e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.565155e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.155605e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.269580e+08 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 1.054122 sec - 3,218,591,678 cycles:u # 2.968 GHz (74.69%) - 10,707,638 stalled-cycles-frontend:u # 0.33% frontend cycles idle (74.92%) - 1,102,035,877 stalled-cycles-backend:u # 34.24% backend cycles idle (74.89%) - 3,000,124,436 instructions:u # 0.93 insn per cycle - # 0.37 stalled cycles per insn (74.87%) - 1.106279619 seconds time elapsed +TOTAL : 0.625620 sec + 2,433,553,514 cycles # 2.839 GHz + 3,531,317,325 instructions # 1.45 insn per cycle + 0.914840106 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --common +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 2.028807e+00 -Avg ME (F77/CUDA) = 2.0288063388516817 -Relative difference = 3.258803416564443e-07 +Avg ME (F77/CUDA) = 2.0288063388516822 +Relative difference = 3.2588034143755247e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.514969e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.580100e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.580100e+05 ) sec^-1 +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.073060e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.134707e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.134707e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 4.346990 sec - 15,001,586,306 cycles:u # 3.426 GHz (74.97%) - 9,215,025 stalled-cycles-frontend:u # 0.06% frontend cycles idle (74.97%) - 839,349,990 stalled-cycles-backend:u # 5.60% backend cycles idle (74.98%) - 38,768,715,929 instructions:u # 2.58 insn per cycle - # 0.02 stalled cycles per insn (74.98%) - 4.380757819 seconds time elapsed +TOTAL : 5.229961 sec + 15,157,435,498 cycles # 2.896 GHz + 38,739,723,091 instructions # 2.56 insn per cycle + 5.236486145 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 719) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515649 Relative difference = 3.258803992249869e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.493557e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.718867e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.718867e+05 ) sec^-1 +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.526696e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.723798e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.723798e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 2.510137 sec - 8,597,251,783 cycles:u # 3.385 GHz (74.81%) - 9,316,093 stalled-cycles-frontend:u # 0.11% frontend cycles idle (74.90%) - 191,978,085 stalled-cycles-backend:u # 2.23% backend cycles idle (75.06%) - 24,324,544,763 instructions:u # 2.83 insn per cycle - # 0.01 stalled cycles per insn (75.12%) - 2.542379153 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2071) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.143207 sec + 9,122,833,846 cycles # 2.898 GHz + 24,428,638,513 instructions # 2.68 insn per cycle + 3.149727451 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 2067) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515654 Relative difference = 3.2588039900609506e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.691867e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.285163e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.285163e+05 ) sec^-1 +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 5.453278e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.923487e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.923487e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 1.540808 sec - 5,173,463,246 cycles:u # 3.293 GHz (75.05%) - 8,639,028 stalled-cycles-frontend:u # 0.17% frontend cycles idle (75.05%) - 1,060,923,299 stalled-cycles-backend:u # 20.51% backend cycles idle (75.05%) - 11,469,896,836 instructions:u # 2.22 insn per cycle - # 0.09 stalled cycles per insn (75.06%) - 1.573272946 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2383) (512y: 0) (512z: 0) +TOTAL : 2.094012 sec + 5,713,399,327 cycles # 2.721 GHz + 11,544,398,198 instructions # 2.02 insn per cycle + 2.100575275 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2396) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 6.340982e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.000324e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.000324e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 1.826424 sec + 5,007,819,577 cycles # 2.734 GHz + 10,288,512,439 instructions # 2.05 insn per cycle + 1.833139039 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1972) (512y: 131) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028807e+00 +Avg ME (F77/C++) = 2.0288063388516204 +Relative difference = 3.2588037186351226e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 4.024689e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.274198e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.274198e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 2.778758 sec + 5,115,298,192 cycles # 1.837 GHz + 7,503,411,062 instructions # 1.47 insn per cycle + 2.785395708 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1212) (512y: 65) (512z: 1543) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028807e+00 +Avg ME (F77/C++) = 2.0288063388516204 +Relative difference = 3.2588037186351226e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_curhst.txt index b10f7871e6..66a621d02a 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_curhst.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_curhst.txt @@ -1,133 +1,209 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasNoCurand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) +RNDGEN=hasCurand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-01-28_13:54:08 +DATE: 2024-01-30_05:57:01 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 2 --curhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 2 --curhst OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 4.578143e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.159887e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.277521e+08 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 0.567703 sec + 2,256,406,335 cycles # 2.832 GHz + 3,552,290,336 instructions # 1.57 insn per cycle + 0.856591173 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --curhst WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe: Aborted - 55,592,144 cycles:u # 2.674 GHz (61.55%) - 45,049 stalled-cycles-frontend:u # 0.08% frontend cycles idle (61.55%) - 659,446 stalled-cycles-backend:u # 1.19% backend cycles idle (61.55%) - 41,558,480 instructions:u # 0.75 insn per cycle - # 0.02 stalled cycles per insn (63.66%) - 0.021719403 seconds time elapsed +==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 2.028807e+00 -Avg ME (F77/CUDA) = 2.0288063388516817 -Relative difference = 3.258803416564443e-07 +Avg ME (F77/CUDA) = 2.0288063388516822 +Relative difference = 3.2588034143755247e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe: Aborted - 41,534,797 cycles:u # 2.016 GHz (61.20%) - 59,946 stalled-cycles-frontend:u # 0.14% frontend cycles idle (61.20%) - 347,247 stalled-cycles-backend:u # 0.84% backend cycles idle (61.43%) - 48,334,762 instructions:u # 1.16 insn per cycle - # 0.01 stalled cycles per insn (74.40%) - 0.021856960 seconds time elapsed +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.061569e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.123242e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.123242e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 5.196625 sec + 14,980,592,489 cycles # 2.880 GHz + 38,723,298,937 instructions # 2.58 insn per cycle + 5.203366404 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 719) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515649 Relative difference = 3.258803992249869e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe: Aborted - 41,633,925 cycles:u # 2.004 GHz (61.54%) - 54,713 stalled-cycles-frontend:u # 0.13% frontend cycles idle (61.54%) - 364,326 stalled-cycles-backend:u # 0.88% backend cycles idle (54.46%) - 48,568,253 instructions:u # 1.17 insn per cycle - # 0.01 stalled cycles per insn (73.69%) - 0.021960561 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2071) (avx2: 0) (512y: 0) (512z: 0) +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.518700e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.715553e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.715553e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 3.090850 sec + 8,946,489,145 cycles # 2.890 GHz + 24,429,263,818 instructions # 2.73 insn per cycle + 3.097198356 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 2067) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515654 Relative difference = 3.2588039900609506e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe: Aborted - 55,539,954 cycles:u # 2.689 GHz (61.30%) - 44,952 stalled-cycles-frontend:u # 0.08% frontend cycles idle (61.30%) - 601,127 stalled-cycles-backend:u # 1.08% backend cycles idle (61.30%) - 41,345,033 instructions:u # 0.74 insn per cycle - # 0.01 stalled cycles per insn (63.10%) - 0.021938930 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2383) (512y: 0) (512z: 0) +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 5.476437e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.948509e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.948509e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 2.025257 sec + 5,523,468,825 cycles # 2.720 GHz + 11,561,737,650 instructions # 2.09 insn per cycle + 2.031752517 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2396) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 6.358069e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.007551e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.007551e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 1.759129 sec + 4,801,841,802 cycles # 2.722 GHz + 10,338,992,386 instructions # 2.15 insn per cycle + 1.765685267 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1972) (512y: 131) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028807e+00 +Avg ME (F77/C++) = 2.0288063388516204 +Relative difference = 3.2588037186351226e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 4.036811e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.287808e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.287808e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 2.707972 sec + 4,942,835,417 cycles # 1.822 GHz + 7,554,452,946 instructions # 1.53 insn per cycle + 2.714536601 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1212) (512y: 65) (512z: 1543) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028807e+00 +Avg ME (F77/C++) = 2.0288063388516204 +Relative difference = 3.2588037186351226e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_rmbhst.txt index 69cc2c26c0..defb46a739 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_rmbhst.txt @@ -1,164 +1,211 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasNoCurand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) +RNDGEN=hasCurand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-01-28_13:52:02 +DATE: 2024-01-30_05:53:36 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 2 --rmbhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 2 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+MESDEV/none+NAVBRK +WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.820548e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.967686e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.021920e+07 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 1.171354 sec - 3,639,696,725 cycles:u # 3.021 GHz (74.98%) - 21,809,161 stalled-cycles-frontend:u # 0.60% frontend cycles idle (74.77%) - 1,134,646,968 stalled-cycles-backend:u # 31.17% backend cycles idle (74.47%) - 3,841,517,977 instructions:u # 1.06 insn per cycle - # 0.30 stalled cycles per insn (74.69%) - 1.224070359 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 5.688012e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.154108e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.269539e+08 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 0.720498 sec + 2,707,766,521 cycles # 2.848 GHz + 4,278,662,893 instructions # 1.58 insn per cycle + 1.009865256 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --rmbhst +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 2.028807e+00 -Avg ME (F77/CUDA) = 2.0288063388516817 -Relative difference = 3.258803416564443e-07 +Avg ME (F77/CUDA) = 2.0288063388516822 +Relative difference = 3.2588034143755247e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.516404e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.581636e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.581636e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 4.343161 sec - 14,989,971,532 cycles:u # 3.428 GHz (74.94%) - 8,919,078 stalled-cycles-frontend:u # 0.06% frontend cycles idle (74.95%) - 760,504,082 stalled-cycles-backend:u # 5.07% backend cycles idle (74.95%) - 38,772,327,618 instructions:u # 2.59 insn per cycle - # 0.02 stalled cycles per insn (74.94%) - 4.377763204 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.066193e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.127901e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.127901e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 5.184582 sec + 14,984,631,159 cycles # 2.888 GHz + 38,723,388,390 instructions # 2.58 insn per cycle + 5.191155299 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 719) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515649 Relative difference = 3.258803992249869e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.495611e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.720881e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.720881e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 2.509925 sec - 8,593,440,049 cycles:u # 3.382 GHz (74.82%) - 9,282,790 stalled-cycles-frontend:u # 0.11% frontend cycles idle (74.86%) - 199,808,787 stalled-cycles-backend:u # 2.33% backend cycles idle (75.02%) - 24,320,330,564 instructions:u # 2.83 insn per cycle - # 0.01 stalled cycles per insn (75.14%) - 2.543748452 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2071) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.511860e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.708377e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.708377e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 3.095761 sec + 8,950,231,816 cycles # 2.886 GHz + 24,430,052,071 instructions # 2.73 insn per cycle + 3.102564983 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 2067) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515654 Relative difference = 3.2588039900609506e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.687059e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.280702e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.280702e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 1.541096 sec - 5,172,320,908 cycles:u # 3.292 GHz (74.96%) - 8,064,746 stalled-cycles-frontend:u # 0.16% frontend cycles idle (75.05%) - 1,064,659,158 stalled-cycles-backend:u # 20.58% backend cycles idle (75.05%) - 11,490,392,245 instructions:u # 2.22 insn per cycle - # 0.09 stalled cycles per insn (75.06%) - 1.573435516 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2383) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 5.454029e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.925499e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.925499e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 2.033615 sec + 5,531,582,240 cycles # 2.713 GHz + 11,562,288,179 instructions # 2.09 insn per cycle + 2.040383969 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2396) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 6.327959e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.977069e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.977069e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 1.768254 sec + 4,816,907,251 cycles # 2.716 GHz + 10,339,308,595 instructions # 2.15 insn per cycle + 1.774968996 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1972) (512y: 131) (512z: 0) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028807e+00 +Avg ME (F77/C++) = 2.0288063388516204 +Relative difference = 3.2588037186351226e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.992436e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.241387e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.241387e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 2.737995 sec + 4,943,973,305 cycles # 1.803 GHz + 7,555,690,658 instructions # 1.53 insn per cycle + 2.744582139 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1212) (512y: 65) (512z: 1543) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028807e+00 +Avg ME (F77/C++) = 2.0288063388516204 +Relative difference = 3.2588037186351226e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd1.txt index 0f0d7b7fde..fe6f195aa6 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd1.txt @@ -1,164 +1,209 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasNoCurand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) +RNDGEN=hasCurand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd1' +CUDACPP_BUILDDIR='build.512y_d_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-01-28_13:10:18 +DATE: 2024-01-30_04:55:39 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/gcheck.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/gcheck.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.753738e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.923588e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.976812e+07 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 1.054772 sec - 3,211,784,029 cycles:u # 2.968 GHz (74.94%) - 10,780,601 stalled-cycles-frontend:u # 0.34% frontend cycles idle (74.90%) - 1,168,643,143 stalled-cycles-backend:u # 36.39% backend cycles idle (75.33%) - 2,939,310,363 instructions:u # 0.92 insn per cycle - # 0.40 stalled cycles per insn (75.31%) - 1.104771986 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 4.125481e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.158117e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.273663e+08 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 0.534485 sec + 2,191,778,134 cycles # 2.834 GHz + 3,140,951,827 instructions # 1.43 insn per cycle + 0.850685752 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/gcheck.exe -p 2048 256 1 +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 208 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 2.028807e+00 -Avg ME (F77/CUDA) = 2.0288063388516817 -Relative difference = 3.258803416564443e-07 +Avg ME (F77/CUDA) = 2.0288063388516822 +Relative difference = 3.2588034143755247e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.435774e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.497108e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.497108e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 4.479308 sec - 15,482,569,040 cycles:u # 3.434 GHz (74.98%) - 8,881,462 stalled-cycles-frontend:u # 0.06% frontend cycles idle (74.98%) - 22,813,560 stalled-cycles-backend:u # 0.15% backend cycles idle (74.98%) - 39,561,506,752 instructions:u # 2.56 insn per cycle - # 0.00 stalled cycles per insn (75.00%) - 4.510830156 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.109309e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.173415e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.173415e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 5.081724 sec + 14,685,294,357 cycles # 2.887 GHz + 39,544,026,748 instructions # 2.69 insn per cycle + 5.093038112 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 596) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515649 Relative difference = 3.258803992249869e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.395625e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.613119e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.613119e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 2.560908 sec - 8,744,328,727 cycles:u # 3.376 GHz (74.98%) - 9,294,975 stalled-cycles-frontend:u # 0.11% frontend cycles idle (74.98%) - 1,209,598,590 stalled-cycles-backend:u # 13.83% backend cycles idle (74.98%) - 23,579,297,009 instructions:u # 2.70 insn per cycle - # 0.05 stalled cycles per insn (75.01%) - 2.593662543 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1952) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.661768e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.875473e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.875473e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 2.973090 sec + 8,600,238,365 cycles # 2.886 GHz + 23,576,508,735 instructions # 2.74 insn per cycle + 2.991032269 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 1948) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515654 Relative difference = 3.2588039900609506e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.912061e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.387522e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.387522e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 1.691589 sec - 5,689,436,332 cycles:u # 3.305 GHz (74.91%) - 9,009,653 stalled-cycles-frontend:u # 0.16% frontend cycles idle (74.94%) - 1,010,811,881 stalled-cycles-backend:u # 17.77% backend cycles idle (74.94%) - 13,208,068,159 instructions:u # 2.32 insn per cycle - # 0.08 stalled cycles per insn (74.91%) - 1.724686172 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2547) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 4.966204e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.352181e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.352181e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 2.222703 sec + 5,964,350,122 cycles # 2.676 GHz + 13,193,903,385 instructions # 2.21 insn per cycle + 2.290428549 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2560) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/check.exe -p 2048 256 2 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 5.425705e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.897406e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.897406e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 2.043603 sec + 5,539,021,528 cycles # 2.702 GHz + 12,103,311,893 instructions # 2.19 insn per cycle + 2.060365335 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2030) (512y: 278) (512z: 0) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028807e+00 +Avg ME (F77/C++) = 2.0288063388516204 +Relative difference = 3.2588037186351226e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/check.exe -p 2048 256 2 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.662802e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.870728e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.870728e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 2.974749 sec + 5,366,303,915 cycles # 1.800 GHz + 9,381,926,109 instructions # 1.75 insn per cycle + 2.994553633 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1350) (512y: 88) (512z: 1989) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028807e+00 +Avg ME (F77/C++) = 2.0288063388516204 +Relative difference = 3.2588037186351226e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd0.txt index 2f14e3d02f..8cd37966a9 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd0.txt @@ -1,164 +1,209 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasNoCurand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) +RNDGEN=hasCurand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.avx2_d_inl1_hrd0' +CUDACPP_BUILDDIR='build.512y_d_inl1_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-01-28_13:32:31 +DATE: 2024-01-30_05:35:53 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/gcheck.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/gcheck.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.619762e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.965495e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.019770e+07 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 1.058889 sec - 3,239,800,555 cycles:u # 2.982 GHz (74.98%) - 10,954,603 stalled-cycles-frontend:u # 0.34% frontend cycles idle (75.21%) - 1,165,027,980 stalled-cycles-backend:u # 35.96% backend cycles idle (75.14%) - 3,002,700,856 instructions:u # 0.93 insn per cycle - # 0.39 stalled cycles per insn (74.90%) - 1.110182078 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 4.561376e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.154966e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.270589e+08 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 0.529335 sec + 2,159,762,911 cycles # 2.829 GHz + 3,107,803,545 instructions # 1.44 insn per cycle + 0.822533200 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/gcheck.exe -p 2048 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 2.028807e+00 -Avg ME (F77/CUDA) = 2.0288063388516817 -Relative difference = 3.258803416564443e-07 +Avg ME (F77/CUDA) = 2.0288063388516822 +Relative difference = 3.2588034143755247e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.854206e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.937254e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.937254e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 3.850677 sec - 13,292,864,444 cycles:u # 3.425 GHz (74.88%) - 9,458,667 stalled-cycles-frontend:u # 0.07% frontend cycles idle (74.98%) - 556,825,469 stalled-cycles-backend:u # 4.19% backend cycles idle (75.06%) - 35,759,647,817 instructions:u # 2.69 insn per cycle - # 0.02 stalled cycles per insn (75.07%) - 3.883732167 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.227004e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.298943e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.298943e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 4.818572 sec + 13,907,927,893 cycles # 2.883 GHz + 35,849,684,316 instructions # 2.58 insn per cycle + 4.825096940 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1078) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515649 Relative difference = 3.258803992249869e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd0/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd0/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.425692e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.644208e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.644208e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 2.546004 sec - 8,703,098,620 cycles:u # 3.378 GHz (74.87%) - 9,150,233 stalled-cycles-frontend:u # 0.11% frontend cycles idle (74.85%) - 2,356,849,986 stalled-cycles-backend:u # 27.08% backend cycles idle (74.90%) - 21,880,814,968 instructions:u # 2.51 insn per cycle - # 0.11 stalled cycles per insn (75.06%) - 2.580733666 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.848483e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.087109e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.087109e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 2.835129 sec + 8,213,185,511 cycles # 2.892 GHz + 21,908,282,308 instructions # 2.67 insn per cycle + 2.841971377 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2334) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515654 Relative difference = 3.2588039900609506e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.667814e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.109083e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.109083e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 1.749675 sec - 5,907,885,632 cycles:u # 3.319 GHz (74.91%) - 9,115,682 stalled-cycles-frontend:u # 0.15% frontend cycles idle (74.87%) - 2,250,445,505 stalled-cycles-backend:u # 38.09% backend cycles idle (74.87%) - 12,110,680,805 instructions:u # 2.05 insn per cycle - # 0.19 stalled cycles per insn (74.84%) - 1.784067388 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3046) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 5.473983e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.948336e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.948336e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 2.025782 sec + 5,530,364,572 cycles # 2.723 GHz + 12,076,349,288 instructions # 2.18 insn per cycle + 2.032542267 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3062) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd0/check.exe -p 2048 256 2 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 5.936500e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.499652e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.499652e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 1.876359 sec + 5,112,015,535 cycles # 2.716 GHz + 11,141,551,976 instructions # 2.18 insn per cycle + 1.883163972 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2527) (512y: 224) (512z: 0) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd0/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028807e+00 +Avg ME (F77/C++) = 2.0288063388516204 +Relative difference = 3.2588037186351226e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd0/check.exe -p 2048 256 2 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 4.149105e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.416003e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.416003e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 2.637284 sec + 4,829,728,502 cycles # 1.827 GHz + 8,842,382,666 instructions # 1.83 insn per cycle + 2.644418009 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1821) (512y: 97) (512z: 2034) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd0/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028807e+00 +Avg ME (F77/C++) = 2.0288063388516204 +Relative difference = 3.2588037186351226e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd1.txt index 0f4e353ce0..8eec31c0d3 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd1.txt @@ -1,164 +1,209 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasNoCurand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) +RNDGEN=hasCurand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.avx2_d_inl1_hrd1' +CUDACPP_BUILDDIR='build.512y_d_inl1_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-01-28_13:32:48 +DATE: 2024-01-30_05:36:21 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/gcheck.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/gcheck.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.791937e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.927379e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.980472e+07 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 1.054617 sec - 3,224,715,439 cycles:u # 2.971 GHz (74.96%) - 10,815,466 stalled-cycles-frontend:u # 0.34% frontend cycles idle (74.96%) - 1,178,327,260 stalled-cycles-backend:u # 36.54% backend cycles idle (74.90%) - 2,958,776,019 instructions:u # 0.92 insn per cycle - # 0.40 stalled cycles per insn (74.87%) - 1.109739959 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 4.565410e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.157958e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.274503e+08 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 0.528794 sec + 2,178,979,969 cycles # 2.840 GHz + 3,111,172,536 instructions # 1.43 insn per cycle + 0.825662442 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/gcheck.exe -p 2048 256 1 +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 208 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 2.028807e+00 -Avg ME (F77/CUDA) = 2.0288063388516817 -Relative difference = 3.258803416564443e-07 +Avg ME (F77/CUDA) = 2.0288063388516822 +Relative difference = 3.2588034143755247e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 3.213590e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.319097e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.319097e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 3.439517 sec - 11,845,463,703 cycles:u # 3.413 GHz (74.88%) - 8,526,830 stalled-cycles-frontend:u # 0.07% frontend cycles idle (74.98%) - 10,258,082 stalled-cycles-backend:u # 0.09% backend cycles idle (75.09%) - 35,658,260,629 instructions:u # 3.01 insn per cycle - # 0.00 stalled cycles per insn (75.11%) - 3.472955154 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.483554e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.573797e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.573797e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 4.331634 sec + 12,513,147,299 cycles # 2.885 GHz + 35,729,824,625 instructions # 2.86 insn per cycle + 4.338115382 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 469) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515649 Relative difference = 3.258803992249869e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd1/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd1/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.809965e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.068802e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.068802e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 2.356282 sec - 8,029,495,156 cycles:u # 3.365 GHz (74.89%) - 9,114,097 stalled-cycles-frontend:u # 0.11% frontend cycles idle (74.89%) - 1,760,515,427 stalled-cycles-backend:u # 21.93% backend cycles idle (74.86%) - 21,259,836,160 instructions:u # 2.65 insn per cycle - # 0.08 stalled cycles per insn (75.00%) - 2.390346582 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.944859e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.193242e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.193242e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 2.766913 sec + 8,026,265,535 cycles # 2.895 GHz + 21,260,291,484 instructions # 2.65 insn per cycle + 2.773559046 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2088) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515654 Relative difference = 3.2588039900609506e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd1/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd1/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.910662e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.540729e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.540729e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 1.502023 sec - 5,018,790,621 cycles:u # 3.276 GHz (74.94%) - 9,171,634 stalled-cycles-frontend:u # 0.18% frontend cycles idle (74.98%) - 305,812,820 stalled-cycles-backend:u # 6.09% backend cycles idle (74.72%) - 11,448,661,799 instructions:u # 2.28 insn per cycle - # 0.03 stalled cycles per insn (74.69%) - 1.535931380 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2354) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 5.719292e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.240372e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.240372e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 1.943097 sec + 5,300,809,350 cycles # 2.722 GHz + 11,405,959,044 instructions # 2.15 insn per cycle + 1.950186269 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2370) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd1/check.exe -p 2048 256 2 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 6.116224e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.720108e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.720108e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 1.828206 sec + 4,977,318,735 cycles # 2.718 GHz + 10,599,506,112 instructions # 2.13 insn per cycle + 1.834822870 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1970) (512y: 162) (512z: 0) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd1/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028807e+00 +Avg ME (F77/C++) = 2.0288063388516204 +Relative difference = 3.2588037186351226e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd1/check.exe -p 2048 256 2 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 4.275159e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.556705e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.556705e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 2.563497 sec + 4,703,376,134 cycles # 1.831 GHz + 8,567,908,292 instructions # 1.82 insn per cycle + 2.570320519 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1392) (512y: 70) (512z: 1630) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd1/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028807e+00 +Avg ME (F77/C++) = 2.0288063388516204 +Relative difference = 3.2588037186351226e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt index e868ff1e3b..03334a40e8 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt @@ -1,164 +1,209 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasNoCurand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) +RNDGEN=hasCurand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-01-28_13:10:35 +DATE: 2024-01-30_04:56:08 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.867870e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.949186e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.114682e+08 ) sec^-1 -MeanMatrixElemValue = ( 2.080169e+00 +- 3.463853e-03 ) GeV^0 -TOTAL : 1.002212 sec - 3,146,193,514 cycles:u # 3.063 GHz (74.09%) - 10,775,596 stalled-cycles-frontend:u # 0.34% frontend cycles idle (74.96%) - 1,162,765,320 stalled-cycles-backend:u # 36.96% backend cycles idle (75.10%) - 2,821,273,239 instructions:u # 0.90 insn per cycle - # 0.41 stalled cycles per insn (75.15%) - 1.053695884 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 8.266078e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.583524e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.962786e+08 ) sec^-1 +MeanMatrixElemValue = ( 2.086718e+00 +- 3.413389e-03 ) GeV^0 +TOTAL : 0.486964 sec + 2,022,378,491 cycles # 2.826 GHz + 2,872,554,108 instructions # 1.42 insn per cycle + 0.794836465 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 128 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 2.028815e+00 -Avg ME (F77/CUDA) = 2.0288173652952537 -Relative difference = 1.1658506339321586e-06 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 2.028811e+00 +Avg ME (F77/CUDA) = 2.0288499749731272 +Relative difference = 1.9210746159747678e-05 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.983975e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.073711e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.073711e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079573e+00 +- 3.404712e-03 ) GeV^0 -TOTAL : 3.653959 sec - 12,674,685,102 cycles:u # 3.445 GHz (75.00%) - 6,976,259 stalled-cycles-frontend:u # 0.06% frontend cycles idle (75.00%) - 10,395,979 stalled-cycles-backend:u # 0.08% backend cycles idle (75.00%) - 37,069,583,951 instructions:u # 2.92 insn per cycle - # 0.00 stalled cycles per insn (75.00%) - 3.681563485 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.220233e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.293728e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.293728e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 +TOTAL : 4.812282 sec + 13,901,639,181 cycles # 2.885 GHz + 37,078,732,469 instructions # 2.67 insn per cycle + 4.824222975 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 578) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028820e+00 -Avg ME (F77/C++) = 2.0288198367925361 -Relative difference = 8.044452636897417e-08 +Avg ME (F77/C++) = 2.0288197983754799 +Relative difference = 9.938019153537065e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.084246e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.484646e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.484646e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079573e+00 +- 3.404713e-03 ) GeV^0 -TOTAL : 1.863981 sec - 6,409,143,938 cycles:u # 3.392 GHz (74.92%) - 6,864,416 stalled-cycles-frontend:u # 0.11% frontend cycles idle (75.03%) - 2,243,634,732 stalled-cycles-backend:u # 35.01% backend cycles idle (75.03%) - 15,212,303,834 instructions:u # 2.37 insn per cycle - # 0.15 stalled cycles per insn (75.03%) - 1.893254518 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2463) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 5.150516e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.595808e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.595808e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 +TOTAL : 2.124737 sec + 6,168,101,005 cycles # 2.895 GHz + 15,212,489,109 instructions # 2.47 insn per cycle + 2.142108549 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 2459) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028820e+00 -Avg ME (F77/C++) = 2.0288198773050681 -Relative difference = 6.047600673895608e-08 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028819e+00 +Avg ME (F77/C++) = 2.0288191968575120 +Relative difference = 9.703059369476286e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.220620e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.377299e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.377299e+06 ) sec^-1 -MeanMatrixElemValue = ( 2.079551e+00 +- 3.404208e-03 ) GeV^0 -TOTAL : 0.999080 sec - 3,381,583,250 cycles:u # 3.300 GHz (74.93%) - 6,760,789 stalled-cycles-frontend:u # 0.20% frontend cycles idle (75.02%) - 924,609,270 stalled-cycles-backend:u # 27.34% backend cycles idle (75.02%) - 7,662,389,776 instructions:u # 2.27 insn per cycle - # 0.12 stalled cycles per insn (75.03%) - 1.028114549 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3055) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 8.954385e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.029179e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.029179e+06 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414230e-03 ) GeV^0 +TOTAL : 1.259990 sec + 3,437,290,204 cycles # 2.715 GHz + 7,715,643,345 instructions # 2.24 insn per cycle + 1.287994689 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3071) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028819e+00 -Avg ME (F77/C++) = 2.0288186294492334 -Relative difference = 1.826435805832187e-07 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028818e+00 +Avg ME (F77/C++) = 2.0288179996423423 +Relative difference = 1.7628858734720142e-10 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe -p 2048 256 2 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 9.805420e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.144112e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.144112e+06 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414230e-03 ) GeV^0 +TOTAL : 1.162105 sec + 3,179,163,625 cycles # 2.727 GHz + 7,109,925,739 instructions # 2.24 insn per cycle + 1.178171652 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2733) (512y: 13) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028818e+00 +Avg ME (F77/C++) = 2.0288179996423423 +Relative difference = 1.7628858734720142e-10 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe -p 2048 256 2 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.071814e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.862424e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.862424e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 +TOTAL : 1.572617 sec + 2,980,157,633 cycles # 1.888 GHz + 5,763,820,562 instructions # 1.93 insn per cycle + 1.590552097 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2088) (512y: 20) (512z: 1914) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028818e+00 +Avg ME (F77/C++) = 2.0288183195516467 +Relative difference = 1.5750631496822894e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_bridge.txt index 63d5e71b58..3a80a864ae 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_bridge.txt @@ -1,170 +1,222 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasNoCurand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) +RNDGEN=hasCurand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-01-28_13:46:11 +DATE: 2024-01-30_05:47:08 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 2 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.468362e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.052225e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.052225e+08 ) sec^-1 -MeanMatrixElemValue = ( 2.079682e+00 +- 3.408341e-03 ) GeV^0 -TOTAL : 1.151346 sec - 3,578,976,244 cycles:u # 3.030 GHz (74.85%) - 21,174,126 stalled-cycles-frontend:u # 0.59% frontend cycles idle (74.99%) - 1,154,032,876 stalled-cycles-backend:u # 32.24% backend cycles idle (75.02%) - 3,915,654,276 instructions:u # 1.09 insn per cycle - # 0.29 stalled cycles per insn (74.86%) - 1.202695970 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 6.753522e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.358863e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.358863e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.086805e+00 +- 3.414078e-03 ) GeV^0 +TOTAL : 0.684539 sec + 2,591,525,623 cycles # 2.839 GHz + 3,989,244,311 instructions # 1.54 insn per cycle + 0.972564077 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) +WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) +==PROF== Profiling "sigmaKin": launch__registers_per_thread 128 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 2.028815e+00 -Avg ME (F77/CUDA) = 2.0288173652952537 -Relative difference = 1.1658506339321586e-06 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 2.028811e+00 +Avg ME (F77/CUDA) = 2.0288499749731272 +Relative difference = 1.9210746159747678e-05 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.973829e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.065181e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.065181e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079573e+00 +- 3.404712e-03 ) GeV^0 -TOTAL : 3.706344 sec - 12,727,386,308 cycles:u # 3.407 GHz (74.96%) - 6,881,823 stalled-cycles-frontend:u # 0.05% frontend cycles idle (74.97%) - 22,282,019 stalled-cycles-backend:u # 0.18% backend cycles idle (74.97%) - 37,158,211,527 instructions:u # 2.92 insn per cycle - # 0.00 stalled cycles per insn (74.95%) - 3.739232598 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.212668e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.285744e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.285744e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 +TOTAL : 4.870798 sec + 14,070,285,227 cycles # 2.885 GHz + 37,122,197,019 instructions # 2.64 insn per cycle + 4.878379515 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 578) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028820e+00 -Avg ME (F77/C++) = 2.0288198367925361 -Relative difference = 8.044452636897417e-08 +Avg ME (F77/C++) = 2.0288197983754799 +Relative difference = 9.938019153537065e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.050687e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.438614e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.438614e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079573e+00 +- 3.404713e-03 ) GeV^0 -TOTAL : 1.919778 sec - 6,485,492,313 cycles:u # 3.325 GHz (74.99%) - 7,563,911 stalled-cycles-frontend:u # 0.12% frontend cycles idle (74.99%) - 2,203,791,666 stalled-cycles-backend:u # 33.98% backend cycles idle (74.99%) - 15,490,081,510 instructions:u # 2.39 insn per cycle - # 0.14 stalled cycles per insn (75.02%) - 1.954573380 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2463) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 5.080420e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.515170e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.515170e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 +TOTAL : 2.198766 sec + 6,358,773,769 cycles # 2.884 GHz + 15,492,113,204 instructions # 2.44 insn per cycle + 2.206392318 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 2459) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028820e+00 -Avg ME (F77/C++) = 2.0288198773050681 -Relative difference = 6.047600673895608e-08 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028819e+00 +Avg ME (F77/C++) = 2.0288191968575120 +Relative difference = 9.703059369476286e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.208801e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.362492e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.362492e+06 ) sec^-1 -MeanMatrixElemValue = ( 2.079551e+00 +- 3.404208e-03 ) GeV^0 -TOTAL : 1.052643 sec - 3,440,201,075 cycles:u # 3.174 GHz (75.04%) - 6,842,051 stalled-cycles-frontend:u # 0.20% frontend cycles idle (74.91%) - 935,859,186 stalled-cycles-backend:u # 27.20% backend cycles idle (74.91%) - 7,927,062,593 instructions:u # 2.30 insn per cycle - # 0.12 stalled cycles per insn (74.96%) - 1.087632583 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3055) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 8.787706e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.007873e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.007873e+06 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414230e-03 ) GeV^0 +TOTAL : 1.328722 sec + 3,633,771,509 cycles # 2.722 GHz + 7,954,097,743 instructions # 2.19 insn per cycle + 1.336366634 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3071) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028819e+00 -Avg ME (F77/C++) = 2.0288186294492334 -Relative difference = 1.826435805832187e-07 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028818e+00 +Avg ME (F77/C++) = 2.0288179996423423 +Relative difference = 1.7628858734720142e-10 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! 
Instantiate host Bridge (nevt=524288) +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 9.612179e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.118037e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.118037e+06 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414230e-03 ) GeV^0 +TOTAL : 1.225517 sec + 3,366,927,421 cycles # 2.733 GHz + 7,347,508,752 instructions # 2.18 insn per cycle + 1.232992993 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2733) (512y: 13) (512z: 0) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028818e+00 +Avg ME (F77/C++) = 2.0288179996423423 +Relative difference = 1.7628858734720142e-10 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! Instantiate host Bridge (nevt=524288) +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 6.960005e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.722467e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.722467e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 +TOTAL : 1.642570 sec + 3,181,631,608 cycles # 1.930 GHz + 6,021,725,956 instructions # 1.89 insn per cycle + 1.650041277 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2088) (512y: 20) (512z: 1914) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028818e+00 +Avg ME (F77/C++) = 2.0288183195516467 +Relative difference = 1.5750631496822894e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_common.txt index 45b49ea418..38a7216065 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_common.txt @@ -1,164 +1,209 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasNoCurand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) +RNDGEN=hasCurand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-01-28_13:56:13 +DATE: 2024-01-30_06:00:59 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 2 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 2 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.259857e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.954473e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.117418e+08 ) sec^-1 -MeanMatrixElemValue = ( 2.080169e+00 +- 3.463853e-03 ) GeV^0 -TOTAL : 1.005687 sec - 3,069,314,095 cycles:u # 2.975 GHz (75.12%) - 10,688,770 stalled-cycles-frontend:u # 0.35% frontend cycles idle (75.21%) - 1,145,250,134 stalled-cycles-backend:u # 37.31% backend cycles idle (75.26%) - 2,907,743,689 instructions:u # 0.95 insn per cycle - # 0.39 stalled cycles per insn (75.15%) - 1.053740664 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 9.412461e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.631522e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.951868e+08 ) sec^-1 +MeanMatrixElemValue = ( 2.079446e+00 +- 3.403306e-03 ) GeV^0 +TOTAL : 0.573986 sec + 2,244,851,144 cycles # 2.822 GHz + 3,300,445,554 instructions # 1.47 insn per cycle + 0.853607464 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --common +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 128 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 2.028815e+00 -Avg ME (F77/CUDA) = 2.0288173652952537 -Relative difference = 1.1658506339321586e-06 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 2.028811e+00 +Avg ME (F77/CUDA) = 2.0288499749731272 +Relative difference = 1.9210746159747678e-05 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.984371e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.073750e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.073750e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079573e+00 +- 3.404712e-03 ) GeV^0 -TOTAL : 3.654664 sec - 12,668,038,640 cycles:u # 3.442 GHz (75.00%) - 6,895,601 stalled-cycles-frontend:u # 0.05% frontend cycles idle (75.00%) - 10,315,591 stalled-cycles-backend:u # 0.08% backend cycles idle (75.01%) - 37,055,294,095 instructions:u # 2.93 insn per cycle - # 0.00 stalled cycles per insn (75.01%) - 3.682293734 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.218192e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.291861e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.291861e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079572e+00 +- 3.404712e-03 ) GeV^0 +TOTAL : 4.876695 sec + 14,064,697,494 cycles # 2.884 GHz + 37,110,369,611 instructions # 2.64 insn per cycle + 4.882981134 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 578) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028820e+00 -Avg ME (F77/C++) = 2.0288198367925361 -Relative difference = 8.044452636897417e-08 +Avg ME (F77/C++) = 2.0288197983754799 +Relative difference = 9.938019153537065e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.085800e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.485279e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.485279e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079573e+00 +- 3.404713e-03 ) GeV^0 -TOTAL : 1.863877 sec - 6,385,234,513 cycles:u # 3.379 GHz (75.02%) - 6,627,854 stalled-cycles-frontend:u # 0.10% frontend cycles idle (75.02%) - 2,211,355,000 stalled-cycles-backend:u # 34.63% backend cycles idle (75.02%) - 15,199,909,839 instructions:u # 2.38 insn per cycle - # 0.15 stalled cycles per insn (75.03%) - 1.891750968 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2463) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 5.131220e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.575839e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.575839e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079572e+00 +- 3.404711e-03 ) GeV^0 +TOTAL : 2.187823 sec + 6,322,431,284 cycles # 2.883 GHz + 15,223,876,723 instructions # 2.41 insn per cycle + 2.194184928 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 2459) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028820e+00 -Avg ME (F77/C++) = 2.0288198773050681 -Relative difference = 6.047600673895608e-08 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028819e+00 +Avg ME (F77/C++) = 2.0288191968575120 +Relative difference = 9.703059369476286e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.222307e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.379467e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.379467e+06 ) sec^-1 -MeanMatrixElemValue = ( 2.079551e+00 +- 3.404208e-03 ) GeV^0 -TOTAL : 0.999380 sec - 3,374,682,598 cycles:u # 3.292 GHz (74.90%) - 6,664,818 stalled-cycles-frontend:u # 0.20% frontend cycles idle (75.03%) - 923,960,136 stalled-cycles-backend:u # 27.38% backend cycles idle (75.03%) - 7,662,101,331 instructions:u # 2.27 insn per cycle - # 0.12 stalled cycles per insn (75.04%) - 1.027286724 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3055) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 8.948892e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.027773e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.027773e+06 ) sec^-1 +MeanMatrixElemValue = ( 2.079550e+00 +- 3.404207e-03 ) GeV^0 +TOTAL : 1.320835 sec + 3,601,071,923 cycles # 2.719 GHz + 7,699,828,133 instructions # 2.14 insn per cycle + 1.327138068 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3071) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028819e+00 -Avg ME (F77/C++) = 2.0288186294492334 -Relative difference = 1.826435805832187e-07 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028818e+00 +Avg ME (F77/C++) = 2.0288179996423423 +Relative difference = 1.7628858734720142e-10 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 9.790537e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.142626e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.142626e+06 ) sec^-1 +MeanMatrixElemValue = ( 2.079550e+00 +- 3.404207e-03 ) GeV^0 +TOTAL : 1.218690 sec + 3,342,798,362 cycles # 2.731 GHz + 7,059,572,278 instructions # 2.11 insn per cycle + 1.225217680 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2733) (512y: 13) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028818e+00 +Avg ME (F77/C++) = 2.0288179996423423 +Relative difference = 1.7628858734720142e-10 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.022088e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.806836e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.806836e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079550e+00 +- 3.404208e-03 ) GeV^0 +TOTAL : 1.641075 sec + 3,147,503,652 cycles # 1.912 GHz + 5,713,849,148 instructions # 1.82 insn per cycle + 1.647331874 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2088) (512y: 20) (512z: 1914) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028818e+00 +Avg ME (F77/C++) = 2.0288183195516467 +Relative difference = 1.5750631496822894e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_curhst.txt index 269f84a482..cb54d3236b 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_curhst.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_curhst.txt @@ -1,133 +1,209 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasNoCurand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) +RNDGEN=hasCurand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 make USEBUILDDIR=1 AVX=512y
-make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 make USEBUILDDIR=1 AVX=512z
-make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
-DATE: 2024-01-28_13:54:16
+DATE: 2024-01-30_05:57:29
-On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]:
+On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 2 --curhst OMP=
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 2 --curhst OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe: Aborted
- 50,492,933 cycles:u # 2.431 GHz (61.51%)
- 48,157 stalled-cycles-frontend:u # 0.10% frontend cycles idle (61.51%)
- 587,577 stalled-cycles-backend:u # 1.16% backend cycles idle (61.51%)
- 46,402,407 instructions:u # 0.92 insn per cycle
- # 0.01 stalled cycles per insn (63.40%)
- 0.021670098 seconds time elapsed
+Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CUD:FLT+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK
+FP precision = FLOAT (NaN/abnormal=0, zero=0)
+EvtsPerSec[Rmb+ME] (23) = ( 9.414196e+07 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.655173e+08 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.981062e+08 ) sec^-1
+MeanMatrixElemValue = ( 2.086718e+00 +- 3.413389e-03 ) GeV^0
+TOTAL : 0.514856 sec
+ 2,089,539,478 cycles # 2.840 GHz
+ 3,296,506,746 instructions # 1.58 insn per cycle
+ 0.794530995 seconds time elapsed
+runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --curhst
+WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
+==PROF== Profiling "sigmaKin": launch__registers_per_thread 128
+==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
 -------------------------------------------------------------------------
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2
-Avg ME (C++/CUDA) = 2.028815e+00
-Avg ME (F77/CUDA) = 2.0288173652952537
-Relative difference = 1.1658506339321586e-06
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2
+Avg ME (C++/CUDA) = 2.028811e+00
+Avg ME (F77/CUDA) = 2.0288499749731272
+Relative difference = 1.9210746159747678e-05
 OK (relative difference <= 5E-3)
 =========================================================================
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP=
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe: Aborted
- 39,894,533 cycles:u # 1.933 GHz (61.27%)
- 104,234 stalled-cycles-frontend:u # 0.26% frontend cycles idle (61.27%)
- 371,226 stalled-cycles-backend:u # 0.93% backend cycles idle (56.66%)
- 48,727,655 instructions:u # 1.22 insn per cycle
- # 0.01 stalled cycles per insn (76.02%)
- 0.021936465 seconds time elapsed
+Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
+FP precision = FLOAT (NaN/abnormal=0, zero=0)
+Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
+OMP threads / `nproc --all` = 1 / 4
+EvtsPerSec[Rmb+ME] (23) = ( 2.227183e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.300870e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.300870e+05 ) sec^-1
+MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0
+TOTAL : 4.796202 sec
+ 13,896,514,461 cycles # 2.894 GHz
+ 37,078,595,071 instructions # 2.67 insn per cycle
+ 4.803618427 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 578) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest.exe
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest.exe
 [ PASSED ] 6 tests.
 -------------------------------------------------------------------------
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck.exe 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck.exe 2 64 2
 Avg ME (C++/C++) = 2.028820e+00
-Avg ME (F77/C++) = 2.0288198367925361
-Relative difference = 8.044452636897417e-08
+Avg ME (F77/C++) = 2.0288197983754799
+Relative difference = 9.938019153537065e-08
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP=
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe: Aborted
- 55,699,986 cycles:u # 2.705 GHz (61.18%)
- 44,852 stalled-cycles-frontend:u # 0.08% frontend cycles idle (61.19%)
- 642,472 stalled-cycles-backend:u # 1.15% backend cycles idle (61.19%)
- 41,318,724 instructions:u # 0.74 insn per cycle
- # 0.02 stalled cycles per insn (63.02%)
- 0.021877861 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 2463) (avx2: 0) (512y: 0) (512z: 0)
+Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
+FP precision = FLOAT (NaN/abnormal=0, zero=0)
+Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
+OMP threads / `nproc --all` = 1 / 4
+EvtsPerSec[Rmb+ME] (23) = ( 5.077190e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.527451e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.527451e+05 ) sec^-1
+MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0
+TOTAL : 2.154367 sec
+ 6,177,704,022 cycles # 2.870 GHz
+ 15,215,532,210 instructions # 2.46 insn per cycle
+ 2.160620609 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 2459) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest.exe
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest.exe
 [ PASSED ] 6 tests.
 -------------------------------------------------------------------------
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 2.028820e+00
-Avg ME (F77/C++) = 2.0288198773050681
-Relative difference = 6.047600673895608e-08
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2
+Avg ME (C++/C++) = 2.028819e+00
+Avg ME (F77/C++) = 2.0288191968575120
+Relative difference = 9.703059369476286e-08
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP=
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe: Aborted
- 55,921,394 cycles:u # 2.729 GHz (60.99%)
- 45,186 stalled-cycles-frontend:u # 0.08% frontend cycles idle (61.00%)
- 616,775 stalled-cycles-backend:u # 1.10% backend cycles idle (61.00%)
- 41,283,104 instructions:u # 0.74 insn per cycle
- # 0.01 stalled cycles per insn (62.83%)
- 0.021782505 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3055) (512y: 0) (512z: 0)
+Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
+FP precision = FLOAT (NaN/abnormal=0, zero=0)
+Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
+OMP threads / `nproc --all` = 1 / 4
+EvtsPerSec[Rmb+ME] (23) = ( 8.911398e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.023141e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.023141e+06 ) sec^-1
+MeanMatrixElemValue = ( 2.086810e+00 +- 3.414230e-03 ) GeV^0
+TOTAL : 1.265022 sec
+ 3,447,761,650 cycles # 2.714 GHz
+ 7,715,058,636 instructions # 2.24 insn per cycle
+ 1.271511064 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3071) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest.exe
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest.exe
 [ PASSED ] 6 tests.
 -------------------------------------------------------------------------
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 2.028819e+00
-Avg ME (F77/C++) = 2.0288186294492334
-Relative difference = 1.826435805832187e-07
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2
+Avg ME (C++/C++) = 2.028818e+00
+Avg ME (F77/C++) = 2.0288179996423423
+Relative difference = 1.7628858734720142e-10
+OK (relative difference <= 5E-3)
+-------------------------------------------------------------------------
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP=
+WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
+Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
+FP precision = FLOAT (NaN/abnormal=0, zero=0)
+Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
+OMP threads / `nproc --all` = 1 / 4
+EvtsPerSec[Rmb+ME] (23) = ( 9.829060e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.147412e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.147412e+06 ) sec^-1
+MeanMatrixElemValue = ( 2.086810e+00 +- 3.414230e-03 ) GeV^0
+TOTAL : 1.155347 sec
+ 3,170,001,813 cycles # 2.731 GHz
+ 7,109,524,161 instructions # 2.24 insn per cycle
+ 1.161808340 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2733) (512y: 13) (512z: 0)
+-------------------------------------------------------------------------
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest.exe
+[ PASSED ] 6 tests.
+-------------------------------------------------------------------------
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2
+Avg ME (C++/C++) = 2.028818e+00
+Avg ME (F77/C++) = 2.0288179996423423
+Relative difference = 1.7628858734720142e-10
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
-/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo)
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP=
+WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
+Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
+FP precision = FLOAT (NaN/abnormal=0, zero=0)
+Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
+OMP threads / `nproc --all` = 1 / 4
+EvtsPerSec[Rmb+ME] (23) = ( 6.999480e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.774350e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.774350e+05 ) sec^-1
+MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0
+TOTAL : 1.586909 sec
+ 2,978,718,352 cycles # 1.871 GHz
+ 5,762,941,941 instructions # 1.93 insn per cycle
+ 1.593095591 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2088) (512y: 20) (512z: 1914)
+-------------------------------------------------------------------------
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest.exe
+[ PASSED ] 6 tests.
 -------------------------------------------------------------------------
-/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo)
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2
+Avg ME (C++/C++) = 2.028818e+00
+Avg ME (F77/C++) = 2.0288183195516467
+Relative difference = 1.5750631496822894e-07
+OK (relative difference <= 5E-3)
 =========================================================================
 TEST COMPLETED
diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_rmbhst.txt
index 802f24068e..5939268227 100644
--- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_rmbhst.txt
+++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_rmbhst.txt
@@ -1,164 +1,211 @@
 export CUDACPP_RUNTIME_ENABLEFPE=on
-Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx
-OMPFLAGS=
-AVX=avx2
+Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx
+OMPFLAGS=-fopenmp
+AVX=512y
 FPTYPE=d
 HELINL=0
 HRDCOD=0
-RNDGEN=hasNoCurand
-Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1)
+RNDGEN=hasCurand
+Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1)
 make: Nothing to be done for 'gtestlibs'.
-CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0'
+CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0'
 make USEBUILDDIR=1 AVX=none
-make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 CUDACPP_BUILDDIR='build.none_f_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 make USEBUILDDIR=1 AVX=sse4
-make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 make USEBUILDDIR=1 AVX=avx2
-make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 make USEBUILDDIR=1 AVX=512y
-make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 make USEBUILDDIR=1 AVX=512z
-make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
-DATE: 2024-01-28_13:52:19
+DATE: 2024-01-30_05:54:04
-On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]:
+On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 2 --rmbhst OMP=
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 2 --rmbhst OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+MESDEV/none+NAVBRK
+WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
+Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CUD:FLT+THX:CURHST+RMBHST+MESDEV/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 8.199018e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.934663e+08 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.097479e+08 ) sec^-1
-MeanMatrixElemValue = ( 2.079682e+00 +- 3.408341e-03 ) GeV^0
-TOTAL : 1.125978 sec
- 3,517,472,024 cycles:u # 3.046 GHz (75.08%)
- 21,795,711 stalled-cycles-frontend:u # 0.62% frontend cycles idle (75.13%)
- 1,145,098,111 stalled-cycles-backend:u # 32.55% backend cycles idle (75.10%)
- 3,784,048,433 instructions:u # 1.08 insn per cycle
- # 0.30 stalled cycles per insn (75.00%)
- 1.174336253 seconds time elapsed
+EvtsPerSec[Rmb+ME] (23) = ( 8.468280e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.632924e+08 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.955888e+08 ) sec^-1
+MeanMatrixElemValue = ( 2.086805e+00 +- 3.414078e-03 ) GeV^0
+TOTAL : 0.625640 sec
+ 2,402,533,839 cycles # 2.841 GHz
+ 3,758,306,095 instructions # 1.56 insn per cycle
+ 0.905223049 seconds time elapsed
+runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --rmbhst
+WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
+WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
+==PROF== Profiling "sigmaKin": launch__registers_per_thread 128
+==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
 -------------------------------------------------------------------------
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2
-Avg ME (C++/CUDA) = 2.028815e+00
-Avg ME (F77/CUDA) = 2.0288173652952537
-Relative difference = 1.1658506339321586e-06
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2
+Avg ME (C++/CUDA) = 2.028811e+00
+Avg ME (F77/CUDA) = 2.0288499749731272
+Relative difference = 1.9210746159747678e-05
 OK (relative difference <= 5E-3)
 =========================================================================
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP=
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK
+Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
-EvtsPerSec[Rmb+ME] (23) = ( 2.981458e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.070741e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.070741e+05 ) sec^-1
-MeanMatrixElemValue = ( 2.079573e+00 +- 3.404712e-03 ) GeV^0
-TOTAL : 3.658653 sec
- 12,683,469,123 cycles:u # 3.443 GHz (75.00%)
- 6,818,041 stalled-cycles-frontend:u # 0.05% frontend cycles idle (75.04%)
- 10,612,246 stalled-cycles-backend:u # 0.08% backend cycles idle (75.03%)
- 37,048,878,080 instructions:u # 2.92 insn per cycle
- # 0.00 stalled cycles per insn (75.03%)
- 3.686174764 seconds time elapsed
+OMP threads / `nproc --all` = 1 / 4
+EvtsPerSec[Rmb+ME] (23) = ( 2.221197e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.294941e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.294941e+05 ) sec^-1
+MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0
+TOTAL : 4.809027 sec
+ 13,889,421,482 cycles # 2.885 GHz
+ 37,078,742,557 instructions # 2.67 insn per cycle
+ 4.815296717 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 578) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest.exe
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest.exe
 [ PASSED ] 6 tests.
 -------------------------------------------------------------------------
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck.exe 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck.exe 2 64 2
 Avg ME (C++/C++) = 2.028820e+00
-Avg ME (F77/C++) = 2.0288198367925361
-Relative difference = 8.044452636897417e-08
+Avg ME (F77/C++) = 2.0288197983754799
+Relative difference = 9.938019153537065e-08
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP=
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK
+Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 6.083752e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.480232e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.480232e+05 ) sec^-1
-MeanMatrixElemValue = ( 2.079573e+00 +- 3.404713e-03 ) GeV^0
-TOTAL : 1.864743 sec
- 6,386,352,954 cycles:u # 3.378 GHz (75.03%)
- 6,620,904 stalled-cycles-frontend:u # 0.10% frontend cycles idle (75.03%)
- 2,207,737,284 stalled-cycles-backend:u # 34.57% backend cycles idle (75.04%)
- 15,202,737,268 instructions:u # 2.38 insn per cycle
- # 0.15 stalled cycles per insn (75.04%)
- 1.892656081 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 2463) (avx2: 0) (512y: 0) (512z: 0)
+OMP threads / `nproc --all` = 1 / 4
+EvtsPerSec[Rmb+ME] (23) = ( 5.146065e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.592205e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.592205e+05 ) sec^-1
+MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0
+TOTAL : 2.125330 sec
+ 6,161,438,553 cycles # 2.892 GHz
+ 15,211,397,983 instructions # 2.47 insn per cycle
+ 2.131726868 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 2459) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest.exe
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest.exe
 [ PASSED ] 6 tests.
 -------------------------------------------------------------------------
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 2.028820e+00
-Avg ME (F77/C++) = 2.0288198773050681
-Relative difference = 6.047600673895608e-08
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2
+Avg ME (C++/C++) = 2.028819e+00
+Avg ME (F77/C++) = 2.0288191968575120
+Relative difference = 9.703059369476286e-08
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP=
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK
+Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 1.221796e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.378717e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.378717e+06 ) sec^-1
-MeanMatrixElemValue = ( 2.079551e+00 +- 3.404208e-03 ) GeV^0
-TOTAL : 1.000149 sec
- 3,377,486,531 cycles:u # 3.292 GHz (74.84%)
- 6,684,712 stalled-cycles-frontend:u # 0.20% frontend cycles idle (75.05%)
- 924,062,070 stalled-cycles-backend:u # 27.36% backend cycles idle (75.05%)
- 7,659,777,132 instructions:u # 2.27 insn per cycle
- # 0.12 stalled cycles per insn (75.06%)
- 1.027697884 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3055) (512y: 0) (512z: 0)
+OMP threads / `nproc --all` = 1 / 4
+EvtsPerSec[Rmb+ME] (23) = ( 8.991330e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.034099e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.034099e+06 ) sec^-1
+MeanMatrixElemValue = ( 2.086810e+00 +- 3.414230e-03 ) GeV^0
+TOTAL : 1.255088 sec
+ 3,440,029,043 cycles # 2.730 GHz
+ 7,714,775,848 instructions # 2.24 insn per cycle
+ 1.261283713 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3071) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest.exe
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest.exe
 [ PASSED ] 6 tests.
 -------------------------------------------------------------------------
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 2.028819e+00
-Avg ME (F77/C++) = 2.0288186294492334
-Relative difference = 1.826435805832187e-07
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2
+Avg ME (C++/C++) = 2.028818e+00
+Avg ME (F77/C++) = 2.0288179996423423
+Relative difference = 1.7628858734720142e-10
+OK (relative difference <= 5E-3)
+-------------------------------------------------------------------------
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP=
+WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
+Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
+FP precision = FLOAT (NaN/abnormal=0, zero=0)
+Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
+OMP threads / `nproc --all` = 1 / 4
+EvtsPerSec[Rmb+ME] (23) = ( 9.843583e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.149362e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.149362e+06 ) sec^-1
+MeanMatrixElemValue = ( 2.086810e+00 +- 3.414230e-03 ) GeV^0
+TOTAL : 1.153935 sec
+ 3,172,826,861 cycles # 2.738 GHz
+ 7,109,210,779 instructions # 2.24 insn per cycle
+ 1.160268530 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2733) (512y: 13) (512z: 0)
+-------------------------------------------------------------------------
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest.exe
+[ PASSED ] 6 tests.
+-------------------------------------------------------------------------
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2
+Avg ME (C++/C++) = 2.028818e+00
+Avg ME (F77/C++) = 2.0288179996423423
+Relative difference = 1.7628858734720142e-10
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
-/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo)
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP=
+WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
+Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
+FP precision = FLOAT (NaN/abnormal=0, zero=0)
+Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
+OMP threads / `nproc --all` = 1 / 4
+EvtsPerSec[Rmb+ME] (23) = ( 7.077925e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.872855e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.872855e+05 ) sec^-1
+MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0
+TOTAL : 1.570762 sec
+ 2,979,903,068 cycles # 1.891 GHz
+ 5,762,829,882 instructions # 1.93 insn per cycle
+ 1.577195857 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2088) (512y: 20) (512z: 1914)
+-------------------------------------------------------------------------
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest.exe
+[ PASSED ] 6 tests.
 -------------------------------------------------------------------------
-/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo)
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2
+Avg ME (C++/C++) = 2.028818e+00
+Avg ME (F77/C++) = 2.0288183195516467
+Relative difference = 1.5750631496822894e-07
+OK (relative difference <= 5E-3)
 =========================================================================
 TEST COMPLETED
diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd1.txt
index 042e3de501..c96a0bb3db 100644
--- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd1.txt
+++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd1.txt
@@ -1,164 +1,209 @@
 export CUDACPP_RUNTIME_ENABLEFPE=on
-Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx
-OMPFLAGS=
-AVX=avx2
+Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx
+OMPFLAGS=-fopenmp
+AVX=512y
 FPTYPE=d
 HELINL=0
 HRDCOD=0
-RNDGEN=hasNoCurand
-Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1)
+RNDGEN=hasCurand
+Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1)
 make: Nothing to be done for 'gtestlibs'.
-CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd1'
+CUDACPP_BUILDDIR='build.512y_f_inl0_hrd1'
 make USEBUILDDIR=1 AVX=none
-make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 CUDACPP_BUILDDIR='build.none_f_inl0_hrd1'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 make USEBUILDDIR=1 AVX=sse4
-make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd1'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 make USEBUILDDIR=1 AVX=avx2
-make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd1'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 make USEBUILDDIR=1 AVX=512y
-make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 CUDACPP_BUILDDIR='build.512y_f_inl0_hrd1'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 make USEBUILDDIR=1 AVX=512z
-make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 CUDACPP_BUILDDIR='build.512z_f_inl0_hrd1'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
-DATE: 2024-01-28_13:10:50
+DATE: 2024-01-30_04:56:32
-On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]:
+On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/gcheck.exe -p 2048 256 2 OMP=
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/gcheck.exe -p 2048 256 2 OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1]
-Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK
+Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1]
+Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 9.278205e+07 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.122561e+08 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.307625e+08 ) sec^-1
-MeanMatrixElemValue = ( 2.080169e+00 +- 3.463853e-03 ) GeV^0
-TOTAL : 1.000858 sec
- 3,085,382,162 cycles:u # 3.007 GHz (75.37%)
- 10,612,277 stalled-cycles-frontend:u # 0.34% frontend cycles idle (75.18%)
- 1,158,398,012 stalled-cycles-backend:u # 37.54% backend cycles idle (75.07%)
- 2,777,468,294 instructions:u # 0.90 insn per cycle
- # 0.42 stalled cycles per insn (75.12%)
- 1.053344191 seconds time elapsed
+EvtsPerSec[Rmb+ME] (23) = ( 8.421312e+07 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.704045e+08 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.041754e+08 ) sec^-1
+MeanMatrixElemValue = ( 2.086718e+00 +- 3.413389e-03 ) GeV^0
+TOTAL : 0.486618 sec
+ 2,018,521,842 cycles # 2.827 GHz
+ 2,837,894,141 instructions # 1.41 insn per cycle
+ 0.795623791 seconds time elapsed
+runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/gcheck.exe -p 2048 256 1
+WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
+==PROF== Profiling "sigmaKin": launch__registers_per_thread 127
+==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
 -------------------------------------------------------------------------
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/gcheck.exe --common -p 2 64 2
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/fgcheck.exe 2 64 2
-Avg ME (C++/CUDA) = 2.028815e+00
-Avg ME (F77/CUDA) = 2.0288173652952537
-Relative difference = 1.1658506339321586e-06
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/gcheck.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/fgcheck.exe 2 64 2
+Avg ME (C++/CUDA) = 2.028811e+00
+Avg ME (F77/CUDA) = 2.0288499749731272
+Relative difference = 1.9210746159747678e-05
 OK (relative difference <= 5E-3)
 =========================================================================
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/check.exe -p 2048 256 2 OMP=
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/check.exe -p 2048 256 2 OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1]
-Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK
+Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1]
+Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
-EvtsPerSec[Rmb+ME] (23) = ( 2.960614e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.049279e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.049279e+05 ) sec^-1
-MeanMatrixElemValue = ( 2.079573e+00 +- 3.404712e-03 ) GeV^0
-TOTAL : 3.682401 sec
- 12,779,950,462 cycles:u # 3.447 GHz (74.97%)
- 7,147,335 stalled-cycles-frontend:u # 0.06% frontend cycles idle (74.97%)
- 13,342,030 stalled-cycles-backend:u # 0.10% backend cycles idle (74.97%)
- 37,437,661,838 instructions:u # 2.93 insn per cycle
- # 0.00 stalled cycles per insn (74.98%)
- 3.710552831 seconds time elapsed
+OMP threads / `nproc --all` = 1 / 4
+EvtsPerSec[Rmb+ME] (23) = ( 2.245629e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.320181e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.320181e+05 ) sec^-1
+MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0
+TOTAL : 4.758439 sec
+ 13,805,800,630 cycles # 2.898 GHz
+ 37,480,161,839 instructions # 2.71 insn per cycle
+ 4.770650257 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 503) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/runTest.exe
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/runTest.exe
 [ PASSED ] 6 tests.
 -------------------------------------------------------------------------
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/check.exe --common -p 2 64 2
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/fcheck.exe 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/check.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/fcheck.exe 2 64 2
 Avg ME (C++/C++) = 2.028820e+00
-Avg ME (F77/C++) = 2.0288198367925361
-Relative difference = 8.044452636897417e-08
+Avg ME (F77/C++) = 2.0288197983754799
+Relative difference = 9.938019153537065e-08
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/check.exe -p 2048 256 2 OMP=
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/check.exe -p 2048 256 2 OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1]
-Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK
+Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1]
+Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 7.316599e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.898743e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.898743e+05 ) sec^-1
-MeanMatrixElemValue = ( 2.079573e+00 +- 3.404713e-03 ) GeV^0
-TOTAL : 1.573407 sec
- 5,370,146,557 cycles:u # 3.358 GHz (74.99%)
- 7,049,522 stalled-cycles-frontend:u # 0.13% frontend cycles idle (74.99%)
- 1,296,967,325 stalled-cycles-backend:u # 24.15% backend cycles idle (74.99%)
- 15,246,844,666 instructions:u # 2.84 insn per cycle
- # 0.09 stalled cycles per insn (74.77%)
- 1.602465611 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 2334) (avx2: 0) (512y: 0) (512z: 0)
+OMP threads / `nproc --all` = 1 / 4
+EvtsPerSec[Rmb+ME] (23) = ( 5.821274e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 6.398672e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.398672e+05 ) sec^-1
+MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0
+TOTAL : 1.889404 sec
+ 5,475,292,589 cycles # 2.889 GHz
+ 15,244,893,114 instructions # 2.78 insn per cycle
+ 1.908184587 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 2330) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/runTest.exe
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/runTest.exe
 [ PASSED ] 6 tests.
 -------------------------------------------------------------------------
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/check.exe --common -p 2 64 2
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 2.028820e+00
-Avg ME (F77/C++) = 2.0288198773050681
-Relative difference = 6.047600673895608e-08
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/check.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/fcheck.exe 2 64 2
+Avg ME (C++/C++) = 2.028819e+00
+Avg ME (F77/C++) = 2.0288191968575120
+Relative difference = 9.703059369476286e-08
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/check.exe -p 2048 256 2 OMP=
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/check.exe -p 2048 256 2 OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1]
-Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK
+Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1]
+Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 8.871601e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 9.683658e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 9.683658e+05 ) sec^-1
-MeanMatrixElemValue = ( 2.079551e+00 +- 3.404208e-03 ) GeV^0
-TOTAL : 1.322244 sec
- 4,487,840,490 cycles:u # 3.330 GHz (75.03%)
- 6,293,982 stalled-cycles-frontend:u # 0.14% frontend cycles idle (75.07%)
- 1,662,737,954 stalled-cycles-backend:u # 37.05% backend cycles idle (75.08%)
- 9,797,942,949 instructions:u # 2.18 insn per cycle
- # 0.17 stalled cycles per insn (75.10%)
- 1.351229383 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3734) (512y: 0) (512z: 0)
+OMP threads / `nproc --all` = 1 / 4
+EvtsPerSec[Rmb+ME] (23) = ( 6.385813e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.037637e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.037637e+05 ) sec^-1
+MeanMatrixElemValue = ( 2.086810e+00 +- 3.414230e-03 ) GeV^0
+TOTAL : 1.731302 sec
+ 4,719,001,422 cycles # 2.717 GHz
+ 9,850,811,081 instructions # 2.09 insn per cycle
+ 1.750777348 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3750) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/runTest.exe
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/runTest.exe
 [ PASSED ] 6 tests.
 -------------------------------------------------------------------------
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/check.exe --common -p 2 64 2
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 2.028819e+00
-Avg ME (F77/C++) = 2.0288186428369954
-Relative difference = 1.7604478492421832e-07
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/check.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/fcheck.exe 2 64 2
+Avg ME (C++/C++) = 2.028818e+00
+Avg ME (F77/C++) = 2.0288180243223906
+Relative difference = 1.1988453753912676e-08
+OK (relative difference <= 5E-3)
+-------------------------------------------------------------------------
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/check.exe -p 2048 256 2 OMP=
+WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
+Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1]
+Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
+FP precision = FLOAT (NaN/abnormal=0, zero=0)
+Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
+OMP threads / `nproc --all` = 1 / 4
+EvtsPerSec[Rmb+ME] (23) = ( 6.683577e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.409489e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.409489e+05 ) sec^-1
+MeanMatrixElemValue = ( 2.086810e+00 +- 3.414230e-03 ) GeV^0
+TOTAL : 1.659075 sec
+ 4,492,699,411 cycles # 2.699 GHz
+ 9,202,452,349 instructions # 2.05 insn per cycle
+ 1.671352513 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3384) (512y: 0) (512z: 0)
+-------------------------------------------------------------------------
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/runTest.exe
+[ PASSED ] 6 tests.
+-------------------------------------------------------------------------
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/check.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/fcheck.exe 2 64 2
+Avg ME (C++/C++) = 2.028818e+00
+Avg ME (F77/C++) = 2.0288180243223906
+Relative difference = 1.1988453753912676e-08
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
-/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo)
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/check.exe -p 2048 256 2 OMP=
+WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
+Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1]
+Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
+FP precision = FLOAT (NaN/abnormal=0, zero=0)
+Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
+OMP threads / `nproc --all` = 1 / 4
+EvtsPerSec[Rmb+ME] (23) = ( 5.938211e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 6.486110e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.486110e+05 ) sec^-1
+MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0
+TOTAL : 1.854854 sec
+ 3,463,720,216 cycles # 1.861 GHz
+ 6,875,040,962 instructions # 1.98 insn per cycle
+ 1.876340349 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2257) (512y: 8) (512z: 2261)
+-------------------------------------------------------------------------
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/runTest.exe
+[ PASSED ] 6 tests.
 -------------------------------------------------------------------------
-/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo)
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/check.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/fcheck.exe 2 64 2
+Avg ME (C++/C++) = 2.028818e+00
+Avg ME (F77/C++) = 2.0288183217635378
+Relative difference = 1.5859655131013432e-07
+OK (relative difference <= 5E-3)
 =========================================================================
 TEST COMPLETED
diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd0.txt
index bf507682ad..993f4107d6 100644
--- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd0.txt
+++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd0.txt
@@ -1,164 +1,209 @@
 export CUDACPP_RUNTIME_ENABLEFPE=on
-Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx
-OMPFLAGS=
-AVX=avx2
+Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx
+OMPFLAGS=-fopenmp
+AVX=512y
 FPTYPE=d
 HELINL=0
 HRDCOD=0
-RNDGEN=hasNoCurand
-Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1)
+RNDGEN=hasCurand
+Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1)
 make: Nothing to be done for 'gtestlibs'.
-CUDACPP_BUILDDIR='build.avx2_f_inl1_hrd0'
+CUDACPP_BUILDDIR='build.512y_f_inl1_hrd0'
 make USEBUILDDIR=1 AVX=none
-make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 CUDACPP_BUILDDIR='build.none_f_inl1_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 make USEBUILDDIR=1 AVX=sse4
-make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 CUDACPP_BUILDDIR='build.sse4_f_inl1_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 make USEBUILDDIR=1 AVX=avx2
-make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 CUDACPP_BUILDDIR='build.avx2_f_inl1_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 make USEBUILDDIR=1 AVX=512y
-make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 CUDACPP_BUILDDIR='build.512y_f_inl1_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 make USEBUILDDIR=1 AVX=512z
-make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 CUDACPP_BUILDDIR='build.512z_f_inl1_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
-DATE: 2024-01-28_13:33:04
+DATE: 2024-01-30_05:36:48
-On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]:
+On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/gcheck.exe -p 2048 256 2 OMP=
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/gcheck.exe -p 2048 256 2 OMP=
 WARNING!
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.113239e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.954105e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.119229e+08 ) sec^-1 -MeanMatrixElemValue = ( 2.080169e+00 +- 3.463853e-03 ) GeV^0 -TOTAL : 1.004793 sec - 3,157,964,934 cycles:u # 3.065 GHz (74.05%) - 10,832,495 stalled-cycles-frontend:u # 0.34% frontend cycles idle (74.62%) - 1,159,454,099 stalled-cycles-backend:u # 36.72% backend cycles idle (74.91%) - 2,839,370,366 instructions:u # 0.90 insn per cycle - # 0.41 stalled cycles per insn (75.16%) - 1.054666781 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 9.377362e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.649325e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.974675e+08 ) sec^-1 +MeanMatrixElemValue = ( 2.086718e+00 +- 3.413389e-03 ) GeV^0 +TOTAL : 0.484150 sec + 2,005,186,574 cycles # 2.831 GHz + 2,872,226,914 instructions # 1.43 insn per cycle + 0.768013554 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/gcheck.exe -p 2048 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 128 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 2.028815e+00 -Avg ME (F77/CUDA) = 2.0288173652952537 -Relative difference = 1.1658506339321586e-06 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 2.028811e+00 +Avg ME (F77/CUDA) = 2.0288499749731272 +Relative difference = 1.9210746159747678e-05 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 3.217981e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.322339e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.322339e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079573e+00 +- 3.404712e-03 ) GeV^0 -TOTAL : 3.400341 sec - 11,783,062,686 cycles:u # 3.439 GHz (74.97%) - 6,747,931 stalled-cycles-frontend:u # 0.06% frontend cycles idle (75.02%) - 1,691,071,194 stalled-cycles-backend:u # 14.35% backend cycles idle (75.02%) - 34,217,886,896 instructions:u # 2.90 insn per cycle - # 0.05 stalled cycles per insn (75.02%) - 3.428596526 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.479081e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.570421e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.570421e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 +TOTAL : 4.318821 sec + 12,411,469,267 cycles # 2.871 GHz + 34,216,954,204 instructions # 2.76 insn per cycle + 4.325006925 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 768) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028820e+00 Avg ME (F77/C++) = 2.0288199088536203 Relative difference = 4.4925808981097166e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.202361e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.763493e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.763493e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079573e+00 +- 3.404713e-03 ) GeV^0 -TOTAL : 1.597461 sec - 5,462,019,287 cycles:u # 3.364 GHz (74.97%) - 7,524,632 stalled-cycles-frontend:u # 0.14% frontend cycles idle (74.88%) - 2,037,874,394 stalled-cycles-backend:u # 37.31% backend cycles idle (74.88%) - 14,603,291,454 instructions:u # 2.67 insn per cycle - # 0.14 stalled cycles per insn (74.92%) - 1.627778417 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 5.935196e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.540988e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.540988e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 +TOTAL : 1.856505 sec + 5,363,525,325 cycles # 2.881 GHz + 14,587,825,944 instructions # 2.72 insn per cycle + 1.863141926 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2947) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028820e+00 -Avg ME (F77/C++) = 2.0288198769558221 -Relative difference = 6.06481491495597e-08 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028819e+00 +Avg ME (F77/C++) = 2.0288192580919713 +Relative difference = 1.2721291123071246e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 9.440783e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.035053e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.035053e+06 ) sec^-1 -MeanMatrixElemValue = ( 2.079551e+00 +- 3.404208e-03 ) GeV^0 -TOTAL : 1.252142 sec - 4,260,468,182 cycles:u # 3.332 GHz (75.02%) - 6,952,166 stalled-cycles-frontend:u # 0.16% frontend cycles idle (74.98%) - 1,642,503,426 stalled-cycles-backend:u # 38.55% backend cycles idle (74.98%) - 9,034,279,045 instructions:u # 2.12 insn per cycle - # 0.18 stalled cycles per insn (74.98%) - 1.281904591 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4485) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.475828e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.385170e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.385170e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414230e-03 ) GeV^0 +TOTAL : 1.492139 sec + 4,058,079,431 cycles # 2.710 GHz + 9,088,895,483 instructions # 2.24 insn per cycle + 1.498802038 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4501) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028819e+00 -Avg ME (F77/C++) = 2.0288186752004549 -Relative difference = 1.6009291367898262e-07 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028818e+00 +Avg ME (F77/C++) = 2.0288180499337614 +Relative difference = 2.4612242975974814e-08 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd0/check.exe -p 2048 256 2 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 8.052179e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.125609e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.125609e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414230e-03 ) GeV^0 +TOTAL : 1.390912 sec + 3,795,132,868 cycles # 2.718 GHz + 8,440,638,214 instructions # 2.22 insn per cycle + 1.397579629 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4043) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd0/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028818e+00 +Avg ME (F77/C++) = 2.0288180499337614 +Relative difference = 2.4612242975974814e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd0/check.exe -p 2048 256 2 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 5.426211e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.889827e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.889827e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 +TOTAL : 2.021883 sec + 3,727,709,927 cycles # 1.839 GHz + 7,572,021,248 instructions # 2.03 insn per cycle + 2.028341317 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3646) (512y: 1) (512z: 2853) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd0/runTest.exe +[ PASSED ] 6 tests. 
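
(Aside: the EvtsPerSec lines above are throughput figures for "-p <blocks> <threads> <iterations>", i.e. 2048*256*2 = 1048576 events divided by the time spent in the corresponding phase. A minimal C++ sketch of how such a figure can be derived, with a stand-in workload; this is a simplification under assumed names, not the actual timer code.)

// Illustrative sketch only: events-per-second from "-p 2048 256 2" and a timer.
#include <chrono>
#include <cstdio>
int main()
{
  const int gpublocks = 2048, gputhreads = 256, niter = 2; // "-p 2048 256 2"
  const double nevt = double( gpublocks ) * gputhreads * niter; // 1048576 events
  const auto t0 = std::chrono::high_resolution_clock::now();
  volatile double dummy = 0; // stand-in for the rambo + matrix-element work
  for( long i = 0; i < (long)nevt; i++ ) dummy = dummy + 1e-9;
  const auto t1 = std::chrono::high_resolution_clock::now();
  const double secs = std::chrono::duration<double>( t1 - t0 ).count();
  printf( "EvtsPerSec = ( %e ) sec^-1\n", nevt / secs );
  return 0;
}
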
------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028818e+00 +Avg ME (F77/C++) = 2.0288183350348845 +Relative difference = 1.6513796936156652e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd1.txt index 13812e3523..2891f046ff 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd1.txt @@ -1,164 +1,209 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasNoCurand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) +RNDGEN=hasCurand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.avx2_f_inl1_hrd1' +CUDACPP_BUILDDIR='build.512y_f_inl1_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-01-28_13:33:19 +DATE: 2024-01-30_05:37:12 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/gcheck.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/gcheck.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.130970e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.126031e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.311298e+08 ) sec^-1 -MeanMatrixElemValue = ( 2.080169e+00 +- 3.463853e-03 ) GeV^0 -TOTAL : 1.003536 sec - 3,039,083,138 cycles:u # 2.955 GHz (75.12%) - 10,598,068 stalled-cycles-frontend:u # 0.35% frontend cycles idle (75.18%) - 1,141,275,534 stalled-cycles-backend:u # 37.55% backend cycles idle (75.14%) - 2,935,198,243 instructions:u # 0.97 insn per cycle - # 0.39 stalled cycles per insn (75.10%) - 1.053839204 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 9.485748e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.689974e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.027356e+08 ) sec^-1 +MeanMatrixElemValue = ( 2.086718e+00 +- 3.413389e-03 ) GeV^0 +TOTAL : 0.482102 sec + 1,996,662,355 cycles # 2.812 GHz + 2,850,200,230 instructions # 1.43 insn per cycle + 0.768087139 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/gcheck.exe -p 2048 256 1 +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 127 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 2.028815e+00 -Avg ME (F77/CUDA) = 2.0288173652952537 -Relative difference = 1.1658506339321586e-06 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 2.028811e+00 +Avg ME (F77/CUDA) = 2.0288499749731272 +Relative difference = 1.9210746159747678e-05 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 3.441941e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.561536e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.561536e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079573e+00 +- 3.404712e-03 ) GeV^0 -TOTAL : 3.188009 sec - 11,063,045,419 cycles:u # 3.441 GHz (74.77%) - 7,294,655 stalled-cycles-frontend:u # 0.07% frontend cycles idle (74.95%) - 249,077,227 stalled-cycles-backend:u # 2.25% backend cycles idle (75.08%) - 35,391,614,020 instructions:u # 3.20 insn per cycle - # 0.01 stalled cycles per insn (75.13%) - 3.217485797 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.596095e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.696763e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.696763e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 +TOTAL : 4.127183 sec + 11,946,394,247 cycles # 2.891 GHz + 35,407,075,530 instructions # 2.96 insn per cycle + 4.133301161 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 469) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. 
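
(Aside: the recurring "WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions" means the runtime arms hardware FP traps when that environment variable is set. A minimal C++ sketch of that mechanism on glibc; the exact exception mask is an assumption, not necessarily the one the plugin uses.)

// Illustrative sketch only: enable FP exceptions if CUDACPP_RUNTIME_ENABLEFPE is set.
// feenableexcept is a glibc extension declared in <fenv.h>.
#include <cstdlib>
#include <cstdio>
#include <fenv.h>
int main()
{
  if( std::getenv( "CUDACPP_RUNTIME_ENABLEFPE" ) )
  {
    printf( "WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions\n" );
    feenableexcept( FE_INVALID | FE_DIVBYZERO | FE_OVERFLOW ); // raise SIGFPE on these
  }
  // ... rest of the program runs with FP traps armed ...
  return 0;
}
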
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028820e+00 Avg ME (F77/C++) = 2.0288199088536203 Relative difference = 4.4925808981097166e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.813144e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.487452e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.487452e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079573e+00 +- 3.404713e-03 ) GeV^0 -TOTAL : 1.483390 sec - 5,089,681,195 cycles:u # 3.371 GHz (74.69%) - 7,194,060 stalled-cycles-frontend:u # 0.14% frontend cycles idle (74.89%) - 1,337,761,295 stalled-cycles-backend:u # 26.28% backend cycles idle (75.10%) - 14,066,053,324 instructions:u # 2.76 insn per cycle - # 0.10 stalled cycles per insn (75.10%) - 1.513530765 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 6.250434e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.927787e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.927787e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 +TOTAL : 1.766919 sec + 5,069,845,731 cycles # 2.861 GHz + 14,044,971,447 instructions # 2.77 insn per cycle + 1.773365949 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2487) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. 
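
(Aside: the "Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit)" and "VECTOR[8] ('avx2': AVX2, 256bit)" lines describe compiler vector types: N floats packed into one SIMD register. A minimal C++ sketch using GCC/Clang vector extensions; the names neppV and fptype_v are assumptions for illustration.)

// Illustrative sketch only: 4 floats in one 128-bit lane, as in VECTOR[4] ('sse4').
#include <cstdio>
typedef float fptype;
typedef fptype fptype_v __attribute__( ( vector_size( 16 ) ) ); // 4 x 4 bytes = 128 bits
int main()
{
  const int neppV = 4;             // number of events per vector
  fptype_v a = { 0, 1, 2, 3 };
  fptype_v b = a + a;              // one packed SIMD add over all 4 lanes
  for( int i = 0; i < neppV; i++ ) printf( "%g ", (double)b[i] );
  printf( "\n" );
  return 0;
}
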
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028820e+00 -Avg ME (F77/C++) = 2.0288198892958462 -Relative difference = 5.4565783974899003e-08 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028819e+00 +Avg ME (F77/C++) = 2.0288192554144189 +Relative difference = 1.2589315209891237e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.018818e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.125746e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.125746e+06 ) sec^-1 -MeanMatrixElemValue = ( 2.079551e+00 +- 3.404208e-03 ) GeV^0 -TOTAL : 1.170469 sec - 3,966,855,118 cycles:u # 3.314 GHz (74.73%) - 6,278,145 stalled-cycles-frontend:u # 0.16% frontend cycles idle (74.66%) - 1,453,930,293 stalled-cycles-backend:u # 36.65% backend cycles idle (74.78%) - 8,622,552,846 instructions:u # 2.17 insn per cycle - # 0.17 stalled cycles per insn (75.11%) - 1.200791899 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3406) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.559784e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.492213e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.492213e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414230e-03 ) GeV^0 +TOTAL : 1.476405 sec + 3,988,953,115 cycles # 2.692 GHz + 8,629,569,798 instructions # 2.16 insn per cycle + 1.482936821 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3422) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028819e+00 -Avg ME (F77/C++) = 2.0288186836987734 -Relative difference = 1.559041129563128e-07 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028818e+00 +Avg ME (F77/C++) = 2.0288180815987289 +Relative difference = 4.021983692325164e-08 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd1/check.exe -p 2048 256 2 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 8.210818e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.331985e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.331985e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414230e-03 ) GeV^0 +TOTAL : 1.366166 sec + 3,694,176,022 cycles # 2.694 GHz + 8,100,845,822 instructions # 2.19 insn per cycle + 1.372646371 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3105) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd1/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028818e+00 +Avg ME (F77/C++) = 2.0288180815987289 +Relative difference = 4.021983692325164e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd1/check.exe -p 2048 256 2 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 5.670710e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.170464e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.170464e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 +TOTAL : 1.938240 sec + 3,580,879,514 cycles # 1.843 GHz + 7,373,942,234 instructions # 2.06 insn per cycle + 1.944698982 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2803) (512y: 1) (512z: 2230) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd1/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028818e+00 +Avg ME (F77/C++) = 2.0288183569209650 +Relative difference = 1.7592557106041962e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt index 0930c22334..26cb412a69 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt @@ -1,164 +1,209 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasNoCurand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) +RNDGEN=hasCurand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-01-28_13:11:05 +DATE: 2024-01-30_04:56:57 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/gcheck.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/gcheck.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.814549e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.027041e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.083024e+07 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 1.055290 sec - 3,235,603,215 cycles:u # 2.990 GHz (74.57%) - 10,735,758 stalled-cycles-frontend:u # 0.33% frontend cycles idle (75.04%) - 1,166,700,197 stalled-cycles-backend:u # 36.06% backend cycles idle (75.03%) - 2,957,826,988 instructions:u # 0.91 insn per cycle - # 0.39 stalled cycles per insn (74.81%) - 1.108487534 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 4.567190e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.153367e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.271156e+08 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 0.531785 sec + 2,166,596,506 cycles # 2.818 GHz + 3,096,992,570 instructions # 1.43 insn per cycle + 0.839064322 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/gcheck.exe -p 2048 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 2.028807e+00 -Avg ME (F77/CUDA) = 2.0288063423243869 -Relative difference = 3.241686434838304e-07 +Avg ME (F77/CUDA) = 2.0288063423243874 +Relative difference = 3.241686432649386e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.475956e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.539042e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.539042e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 4.410605 sec - 15,235,239,464 cycles:u # 3.432 GHz (74.95%) - 8,950,019 stalled-cycles-frontend:u # 0.06% frontend cycles idle (74.95%) - 122,481,296 stalled-cycles-backend:u # 0.80% backend cycles idle (74.97%) - 39,339,585,883 instructions:u # 2.58 insn per cycle - # 0.00 stalled cycles per insn (74.97%) - 4.441724700 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.035137e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.096372e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.096372e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 5.263186 sec + 15,248,441,904 cycles # 2.894 GHz + 39,293,765,746 instructions # 2.58 insn per cycle + 5.273287972 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 740) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063903750300 Relative difference = 3.0048445715164216e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.568089e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.801322e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.801322e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 2.470309 sec - 8,452,701,924 cycles:u # 3.382 GHz (74.89%) - 8,675,090 stalled-cycles-frontend:u # 0.10% frontend cycles idle (75.01%) - 888,546,333 stalled-cycles-backend:u # 10.51% backend cycles idle (75.03%) - 24,040,285,320 instructions:u # 2.84 insn per cycle - # 0.04 stalled cycles per insn (75.04%) - 2.502796253 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.565129e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.766484e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.766484e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 3.050997 sec + 8,847,131,595 cycles # 2.894 GHz + 24,093,216,326 instructions # 2.72 insn per cycle + 3.069927720 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2102) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063903750300 Relative difference = 3.0048445715164216e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.862178e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.488513e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.488513e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 1.509496 sec - 5,043,612,791 cycles:u # 3.278 GHz (75.04%) - 8,437,371 stalled-cycles-frontend:u # 0.17% frontend cycles idle (75.04%) - 464,861,661 stalled-cycles-backend:u # 9.22% backend cycles idle (75.05%) - 11,420,957,693 instructions:u # 2.26 insn per cycle - # 0.04 stalled cycles per insn (75.07%) - 1.542209096 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2451) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 5.446912e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.914435e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.914435e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 2.035886 sec + 5,501,574,982 cycles # 2.694 GHz + 11,449,152,902 instructions # 2.08 insn per cycle + 2.052044507 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2467) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063930599014 Relative difference = 2.9916108265801754e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/check.exe -p 2048 256 2 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 6.398707e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.055840e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.055840e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 1.750074 sec + 4,773,598,492 cycles # 2.718 GHz + 10,317,257,525 instructions # 2.16 insn per cycle + 1.763056572 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2076) (512y: 133) (512z: 0) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028807e+00 +Avg ME (F77/C++) = 2.0288063930599014 +Relative difference = 2.9916108265801754e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/check.exe -p 2048 256 2 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 4.115786e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.377584e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.377584e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 2.659793 sec + 4,851,599,101 cycles # 1.820 GHz + 7,367,812,046 instructions # 1.52 insn per cycle + 2.678537528 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1366) (512y: 69) (512z: 1611) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028807e+00 +Avg ME (F77/C++) = 2.0288063930599014 +Relative difference = 2.9916108265801754e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd1.txt index a8013babf3..3aadf8f9be 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd1.txt @@ -1,164 +1,209 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasNoCurand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) +RNDGEN=hasCurand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd1' +CUDACPP_BUILDDIR='build.512y_m_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-01-28_13:11:22 +DATE: 2024-01-30_04:57:25 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/gcheck.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/gcheck.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = HIP:MIX+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.791588e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.919510e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.972449e+07 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 1.055156 sec - 3,238,130,925 cycles:u # 2.989 GHz (75.02%) - 10,839,877 stalled-cycles-frontend:u # 0.33% frontend cycles idle (74.96%) - 1,162,562,236 stalled-cycles-backend:u # 35.90% backend cycles idle (74.84%) - 3,024,381,388 instructions:u # 0.93 insn per cycle - # 0.38 stalled cycles per insn (74.94%) - 1.105185059 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 4.571537e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.158030e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.273800e+08 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 0.527677 sec + 2,187,527,722 cycles # 2.838 GHz + 3,113,906,107 instructions # 1.42 insn per cycle + 0.843196902 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/gcheck.exe -p 2048 256 1 +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 208 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 2.028807e+00 -Avg ME (F77/CUDA) = 2.0288063423243869 -Relative difference = 3.241686434838304e-07 +Avg ME (F77/CUDA) = 2.0288063423243874 +Relative difference = 3.241686432649386e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.428646e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.488494e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.488494e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 4.492644 sec - 15,561,820,164 cycles:u # 3.440 GHz (74.89%) - 9,743,080 stalled-cycles-frontend:u # 0.06% frontend cycles idle (74.90%) - 22,856,560 stalled-cycles-backend:u # 0.15% backend cycles idle (74.99%) - 40,034,760,935 instructions:u # 2.57 insn per cycle - # 0.00 stalled cycles per insn (75.07%) - 4.526018788 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.053597e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.114429e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.114429e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 5.216554 sec + 15,076,935,035 cycles # 2.887 GHz + 40,115,062,840 instructions # 2.66 insn per cycle + 5.225437216 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 630) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063903750300 Relative difference = 3.0048445715164216e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.507612e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.734439e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.734439e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 2.501251 sec - 8,526,186,614 cycles:u # 3.370 GHz (75.02%) - 9,200,816 stalled-cycles-frontend:u # 0.11% frontend cycles idle (75.02%) - 688,799,308 stalled-cycles-backend:u # 8.08% backend cycles idle (75.03%) - 23,521,583,317 instructions:u # 2.76 insn per cycle - # 0.03 stalled cycles per insn (74.89%) - 2.534137739 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.498695e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.695294e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.695294e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 3.107625 sec + 8,698,982,275 cycles # 2.794 GHz + 23,534,504,437 instructions # 2.71 insn per cycle + 3.124975720 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1993) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063903750300 Relative difference = 3.0048445715164216e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.849652e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.322321e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.322321e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 1.706554 sec - 5,746,389,955 cycles:u # 3.311 GHz (74.90%) - 8,537,946 stalled-cycles-frontend:u # 0.15% frontend cycles idle (75.11%) - 832,559,663 stalled-cycles-backend:u # 14.49% backend cycles idle (75.12%) - 13,045,445,119 instructions:u # 2.27 insn per cycle - # 0.06 stalled cycles per insn (75.12%) - 1.739374039 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2695) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 4.826638e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.191418e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.191418e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 2.282934 sec + 6,198,059,216 cycles # 2.708 GHz + 13,103,377,766 instructions # 2.11 insn per cycle + 2.300648997 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2711) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063930599014 Relative difference = 2.9916108265801754e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/check.exe -p 2048 256 2 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 5.224417e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.653642e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.653642e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 2.117622 sec + 5,754,647,700 cycles # 2.709 GHz + 12,210,180,073 instructions # 2.12 insn per cycle + 2.133681313 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2201) (512y: 282) (512z: 0) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028807e+00 +Avg ME (F77/C++) = 2.0288063930599014 +Relative difference = 2.9916108265801754e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/check.exe -p 2048 256 2 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.752218e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.971190e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.971190e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 2.905718 sec + 5,261,261,771 cycles # 1.807 GHz + 8,449,535,603 instructions # 1.61 insn per cycle + 2.918034623 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1324) (512y: 84) (512z: 1919) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028807e+00 +Avg ME (F77/C++) = 2.0288063930599014 +Relative difference = 2.9916108265801754e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt index 03468ae98d..93e04f110e 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt @@ -1,181 +1,223 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasNoCurand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) +RNDGEN=hasCurand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2024-01-28_13:11:39 +DATE: 2024-01-30_04:57:55 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.909982e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.079054e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.084818e+06 ) sec^-1 -MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 -TOTAL : 0.529315 sec - 1,559,559,865 cycles:u # 2.820 GHz (76.06%) - 8,418,204 stalled-cycles-frontend:u # 0.54% frontend cycles idle (75.56%) - 291,021,945 stalled-cycles-backend:u # 18.66% backend cycles idle (75.65%) - 1,867,270,999 instructions:u # 1.20 insn per cycle - # 0.16 stalled cycles per insn (75.83%) - 0.572500015 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 8.751466e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.044991e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.059567e+07 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 0.471853 sec + 1,938,631,197 cycles # 2.818 GHz + 2,775,429,754 instructions # 1.43 insn per cycle + 0.768838740 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.605181e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.840624e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.846217e+06 ) sec^-1 -MeanMatrixElemValue = ( 2.948724e+03 +- 1.840727e+03 ) GeV^-2 -TOTAL : 1.133892 sec - 3,465,794,559 cycles:u # 2.981 GHz (75.14%) - 21,055,335 stalled-cycles-frontend:u # 0.61% frontend cycles idle (75.22%) - 852,829,869 stalled-cycles-backend:u # 24.61% backend cycles idle (75.30%) - 3,181,983,608 instructions:u # 0.92 insn per cycle - # 0.27 stalled cycles per insn (75.22%) - 1.184714650 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 1.083310e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.323559e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.337755e+07 ) sec^-1 +MeanMatrixElemValue = ( 6.734461e+02 +- 4.775415e+02 ) GeV^-2 +TOTAL : 0.612248 sec + 2,402,912,694 cycles # 2.815 GHz + 3,669,599,520 instructions # 1.53 insn per cycle + 0.914185147 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.413122e+00 -Avg ME (F77/CUDA) = 1.4131213684418642 -Relative difference = 4.4692399933517674e-07 +Avg ME (F77/CUDA) = 1.4131213684418649 +Relative difference = 4.469239988637851e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.953719e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.966067e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.966067e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 -TOTAL : 5.570588 sec - 19,578,003,721 cycles:u # 3.501 GHz (74.96%) - 2,464,679 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.96%) - 3,426,051,752 stalled-cycles-backend:u # 17.50% backend cycles idle (74.96%) - 57,962,345,246 instructions:u # 2.96 insn per cycle - # 0.06 stalled cycles per insn (74.92%) - 5.594468757 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.436781e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.449292e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.449292e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 6.748293 sec + 19,527,368,133 cycles # 2.892 GHz + 57,921,410,950 instructions # 2.97 insn per cycle + 6.756473501 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1134) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213684432431 Relative difference = 4.4692302355460254e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.034955e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.085881e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.085881e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 -TOTAL : 2.738430 sec - 9,645,335,215 cycles:u # 3.494 GHz (74.88%) - 2,693,840 stalled-cycles-frontend:u # 0.03% frontend cycles idle (75.00%) - 2,377,670,357 stalled-cycles-backend:u # 24.65% backend cycles idle (75.08%) - 29,965,191,092 instructions:u # 3.11 insn per cycle - # 0.08 stalled cycles per insn (75.08%) - 2.769354637 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 4.689715e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.736371e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.736371e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 3.517761 sec + 10,204,769,485 cycles # 2.897 GHz + 29,944,325,485 instructions # 2.93 insn per cycle + 3.533017528 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 4742) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213684432433 Relative difference = 4.46923023397472e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.230860e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.251937e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.251937e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 -TOTAL : 1.354434 sec - 4,773,116,973 cycles:u # 3.469 GHz (75.00%) - 2,265,734 stalled-cycles-frontend:u # 0.05% frontend cycles idle (75.00%) - 1,485,397,434 stalled-cycles-backend:u # 31.12% backend cycles idle (75.01%) - 11,220,229,221 instructions:u # 2.35 insn per cycle - # 0.13 stalled cycles per insn (75.01%) - 1.379111913 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4378) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 9.110539e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.290286e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.290286e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 1.822880 sec + 4,929,256,319 cycles # 2.697 GHz + 11,212,094,634 instructions # 2.27 insn per cycle + 1.842452367 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4396) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213684416484 Relative difference = 4.469241520660492e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/check.exe -p 64 256 10 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.045459e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.068242e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.068242e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 1.591153 sec + 4,310,771,194 cycles # 2.701 GHz + 10,188,135,001 instructions # 2.36 insn per cycle + 1.604477930 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3895) (512y: 81) (512z: 0) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.413122e+00 +Avg ME (F77/C++) = 1.4131213684416484 +Relative difference = 4.469241520660492e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/check.exe -p 64 256 10 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.350984e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.465337e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.465337e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 2.255127 sec + 3,913,955,092 cycles # 1.732 GHz + 5,709,470,043 instructions # 1.46 insn per cycle + 2.269083887 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1258) (512y: 74) (512z: 3396) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.413122e+00 +Avg ME (F77/C++) = 1.4131213684416484 +Relative difference = 4.469241520660492e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0_bridge.txt index ccdc768c3e..ec4707eb36 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0_bridge.txt @@ -1,190 +1,240 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasNoCurand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) +RNDGEN=hasCurand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2024-01-28_13:46:27 +DATE: 2024-01-30_05:47:33 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 10 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) WARNING! 
Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.491045e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.013681e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.013681e+06 ) sec^-1 -MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 -TOTAL : 0.559439 sec - 1,647,632,431 cycles:u # 2.822 GHz (75.24%) - 9,693,930 stalled-cycles-frontend:u # 0.59% frontend cycles idle (75.34%) - 268,139,323 stalled-cycles-backend:u # 16.27% backend cycles idle (75.81%) - 2,067,531,007 instructions:u # 1.25 insn per cycle - # 0.13 stalled cycles per insn (75.35%) - 0.606453683 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 2.528893e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.736864e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.736864e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 0.499721 sec + 2,019,319,467 cycles # 2.834 GHz + 3,049,308,251 instructions # 1.51 insn per cycle + 0.770515897 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) +WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.195881e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.673576e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.673576e+06 ) sec^-1 -MeanMatrixElemValue = ( 2.948724e+03 +- 1.840727e+03 ) GeV^-2 -TOTAL : 1.261679 sec - 3,823,358,268 cycles:u # 2.940 GHz (74.73%) - 29,374,830 stalled-cycles-frontend:u # 0.77% frontend cycles idle (74.85%) - 865,565,211 stalled-cycles-backend:u # 22.64% backend cycles idle (75.10%) - 3,922,189,768 instructions:u # 1.03 insn per cycle - # 0.22 stalled cycles per insn (75.36%) - 1.320853639 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 2.631733e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.469522e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.469522e+06 ) sec^-1 +MeanMatrixElemValue = ( 6.734461e+02 +- 4.775415e+02 ) GeV^-2 +TOTAL : 0.838079 sec + 3,105,645,423 cycles # 2.841 GHz + 4,885,001,867 instructions # 1.57 insn per cycle + 1.151319170 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.413122e+00 -Avg ME (F77/CUDA) = 1.4131213684418642 -Relative difference = 4.4692399933517674e-07 +Avg ME (F77/CUDA) = 1.4131213684418649 +Relative difference = 4.469239988637851e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.938839e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.951076e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.951076e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 -TOTAL : 5.602477 sec - 19,673,576,110 cycles:u # 3.498 GHz (74.97%) - 2,098,950 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.97%) - 3,431,015,726 stalled-cycles-backend:u # 17.44% backend cycles idle (74.97%) - 57,895,900,388 instructions:u # 2.94 insn per cycle - # 0.06 stalled cycles per insn (74.98%) - 5.626300208 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.430928e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.443345e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.443345e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 6.772792 sec + 19,550,332,735 cycles # 2.885 GHz + 57,928,238,854 instructions # 2.96 insn per cycle + 6.778111068 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1134) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213684432431 Relative difference = 4.4692302355460254e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.032220e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.083502e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.083502e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 -TOTAL : 2.743979 sec - 9,644,824,281 cycles:u # 3.487 GHz (74.86%) - 2,560,504 stalled-cycles-frontend:u # 0.03% frontend cycles idle (74.92%) - 2,342,809,521 stalled-cycles-backend:u # 24.29% backend cycles idle (75.05%) - 29,999,847,851 instructions:u # 3.11 insn per cycle - # 0.08 stalled cycles per insn (75.13%) - 2.769342677 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 4.642090e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.688492e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.688492e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 3.562259 sec + 10,259,962,003 cycles # 2.883 GHz + 29,997,071,393 instructions # 2.92 insn per cycle + 3.567805037 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 4742) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213684432433 Relative difference = 4.46923023397472e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.237326e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.258636e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.258636e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 -TOTAL : 1.351956 sec - 4,773,967,453 cycles:u # 3.475 GHz (75.05%) - 2,106,133 stalled-cycles-frontend:u # 0.04% frontend cycles idle (74.97%) - 1,427,762,785 stalled-cycles-backend:u # 29.91% backend cycles idle (74.97%) - 11,253,219,332 instructions:u # 2.36 insn per cycle - # 0.13 stalled cycles per insn (74.97%) - 1.377175051 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4378) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 9.060333e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.240360e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.240360e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 1.842606 sec + 4,975,429,359 cycles # 2.695 GHz + 11,262,132,806 instructions # 2.26 insn per cycle + 1.848498494 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4396) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.413122e+00 +Avg ME (F77/C++) = 1.4131213684416484 +Relative difference = 4.469241520660492e-07 +OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! 
Instantiate host Bridge (nevt=16384) +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.041344e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.064837e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.064837e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 1.605579 sec + 4,356,497,896 cycles # 2.706 GHz + 10,236,092,665 instructions # 2.35 insn per cycle + 1.611218031 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3895) (512y: 81) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213684416484 Relative difference = 4.469241520660492e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! Instantiate host Bridge (nevt=16384) +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.341333e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.457820e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.457820e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 2.270029 sec + 3,960,771,261 cycles # 1.743 GHz + 5,748,864,563 instructions # 1.45 insn per cycle + 2.275659808 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1258) (512y: 74) (512z: 3396) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.413122e+00 +Avg ME (F77/C++) = 1.4131213684416484 +Relative difference = 4.469241520660492e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd1.txt index 817e65b1a1..e0fcb209a0 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd1.txt @@ -1,181 +1,223 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasNoCurand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) +RNDGEN=hasCurand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd1' +CUDACPP_BUILDDIR='build.512y_d_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.none_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2024-01-28_13:11:59 +DATE: 2024-01-30_04:58:25 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/gcheck.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/gcheck.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.918580e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.072340e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.077515e+06 ) sec^-1 -MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 -TOTAL : 0.526241 sec - 1,548,968,841 cycles:u # 2.807 GHz (75.22%) - 8,147,567 stalled-cycles-frontend:u # 0.53% frontend cycles idle (75.22%) - 291,697,457 stalled-cycles-backend:u # 18.83% backend cycles idle (75.50%) - 1,830,421,358 instructions:u # 1.18 insn per cycle - # 0.16 stalled cycles per insn (75.63%) - 0.571689310 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 8.715814e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.042075e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.056833e+07 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 0.470883 sec + 1,939,912,503 cycles # 2.822 GHz + 2,790,884,564 instructions # 1.44 insn per cycle + 0.765236939 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/gcheck.exe -p 64 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.536043e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.811270e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.816262e+06 ) sec^-1 -MeanMatrixElemValue = ( 2.948724e+03 +- 1.840727e+03 ) GeV^-2 -TOTAL : 1.126313 sec - 3,458,121,082 cycles:u # 2.996 GHz (74.98%) - 21,160,316 stalled-cycles-frontend:u # 0.61% frontend cycles idle (75.19%) - 854,292,971 stalled-cycles-backend:u # 24.70% backend cycles idle (75.11%) - 3,174,894,361 instructions:u # 0.92 insn per cycle - # 0.27 stalled cycles per insn (75.06%) - 1.176797069 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 1.074401e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.309128e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.323134e+07 ) sec^-1 +MeanMatrixElemValue = ( 6.734461e+02 +- 4.775415e+02 ) GeV^-2 +TOTAL : 0.606508 sec + 2,399,848,951 cycles # 2.837 GHz + 3,558,977,452 instructions # 1.48 insn per cycle + 0.907497861 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.413122e+00 -Avg ME (F77/CUDA) = 1.4131213684418642 -Relative difference = 4.4692399933517674e-07 +Avg ME (F77/CUDA) = 1.4131213684418649 +Relative difference = 4.469239988637851e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.924382e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.936482e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.936482e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 -TOTAL : 5.625968 sec - 19,768,241,213 cycles:u # 3.501 GHz (74.93%) - 2,655,591 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.94%) - 3,029,751,102 stalled-cycles-backend:u # 15.33% backend cycles idle (74.96%) - 57,778,170,108 instructions:u # 2.92 insn per cycle - # 0.05 stalled cycles per insn (75.03%) - 5.649495914 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.442527e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.455052e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.455052e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 6.732470 sec + 19,518,863,765 cycles # 2.898 GHz + 57,747,544,085 instructions # 2.96 insn per cycle + 6.739693684 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1087) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213684432431 Relative difference = 4.4692302355460254e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.953726e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.003371e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.003371e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 -TOTAL : 2.775191 sec - 9,776,032,539 cycles:u # 3.496 GHz (74.88%) - 2,389,136 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.84%) - 2,294,856,841 stalled-cycles-backend:u # 23.47% backend cycles idle (74.89%) - 30,399,307,141 instructions:u # 3.11 insn per cycle - # 0.08 stalled cycles per insn (75.02%) - 2.800182646 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 4.661123e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.707073e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.707073e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 3.538386 sec + 10,268,038,737 cycles # 2.898 GHz + 30,334,584,369 instructions # 2.95 insn per cycle + 3.554140482 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 4806) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213684432433 Relative difference = 4.46923023397472e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.198423e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.218357e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.218357e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 -TOTAL : 1.390357 sec - 4,931,396,987 cycles:u # 3.490 GHz (74.79%) - 2,134,488 stalled-cycles-frontend:u # 0.04% frontend cycles idle (75.01%) - 1,682,834,199 stalled-cycles-backend:u # 34.12% backend cycles idle (75.09%) - 11,671,002,802 instructions:u # 2.37 insn per cycle - # 0.14 stalled cycles per insn (75.09%) - 1.426396900 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4471) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 8.842618e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.012045e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.012045e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 1.876874 sec + 5,068,616,518 cycles # 2.693 GHz + 11,664,707,542 instructions # 2.30 insn per cycle + 1.896780245 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4489) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213684416484 Relative difference = 4.469241520660492e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd1/check.exe -p 64 256 10 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 9.766097e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.969139e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.969139e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 1.701579 sec + 4,623,474,911 cycles # 2.710 GHz + 10,806,178,257 instructions # 2.34 insn per cycle + 1.712732749 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3988) (512y: 237) (512z: 0) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.413122e+00 +Avg ME (F77/C++) = 1.4131213684416484 +Relative difference = 4.469241520660492e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd1/check.exe -p 64 256 10 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.261988e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.377447e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.377447e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 2.282726 sec + 3,962,643,032 cycles # 1.733 GHz + 5,999,265,657 instructions # 1.51 insn per cycle + 2.297742409 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1241) (512y: 81) (512z: 3500) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.413122e+00 +Avg ME (F77/C++) = 1.4131213684416484 +Relative difference = 4.469241520660492e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt index abb9800c5e..809c0d4a45 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt @@ -1,181 +1,223 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasNoCurand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) +RNDGEN=hasCurand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2024-01-28_13:12:18 +DATE: 2024-01-30_04:58:56 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.363151e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.537668e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.622058e+06 ) sec^-1 -MeanMatrixElemValue = ( 5.334114e+02 +- 3.089427e+02 ) GeV^-2 -TOTAL : 0.462168 sec - 1,368,322,800 cycles:u # 2.802 GHz (73.26%) - 8,194,242 stalled-cycles-frontend:u # 0.60% frontend cycles idle (74.37%) - 276,189,769 stalled-cycles-backend:u # 20.18% backend cycles idle (76.22%) - 1,676,657,905 instructions:u # 1.23 insn per cycle - # 0.16 stalled cycles per insn (75.66%) - 0.509375607 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 2.450759e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.307242e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.403943e+07 ) sec^-1 +MeanMatrixElemValue = ( 1.008472e+02 +- 5.002447e+01 ) GeV^-2 +TOTAL : 0.453655 sec + 1,885,130,441 cycles # 2.809 GHz + 2,653,723,410 instructions # 1.41 insn per cycle + 0.747134110 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 254 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.327882e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.632793e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.638488e+07 ) sec^-1 -MeanMatrixElemValue = ( 2.954952e+03 +- 1.880090e+03 ) GeV^-2 -TOTAL : 0.950410 sec - 2,876,862,724 cycles:u # 2.948 GHz (75.32%) - 21,132,083 stalled-cycles-frontend:u # 0.73% frontend cycles idle (75.41%) - 855,313,895 stalled-cycles-backend:u # 29.73% backend cycles idle (75.01%) - 2,764,167,270 instructions:u # 0.96 insn per cycle - # 0.31 stalled cycles per insn (75.09%) - 0.998013250 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 3.211065e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.390139e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.474767e+07 ) sec^-1 +MeanMatrixElemValue = ( 6.630099e+02 +- 4.770719e+02 ) GeV^-2 +TOTAL : 0.497953 sec + 2,053,184,300 cycles # 2.823 GHz + 2,862,941,904 instructions # 1.39 insn per cycle + 0.785494017 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 1.412404e+00 -Avg ME (F77/CUDA) = 1.4131669530965212 -Relative difference = 0.0005401804983001964 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 1.412608e+00 +Avg ME (F77/CUDA) = 1.4132214346515752 +Relative difference = 0.00043425681546129636 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 3.253792e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.269267e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.269267e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.724764e+02 +- 2.665343e+02 ) GeV^-2 -TOTAL : 5.056922 sec - 17,775,574,452 cycles:u # 3.501 GHz (74.95%) - 2,522,383 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.95%) - 3,672,872,120 stalled-cycles-backend:u # 20.66% backend cycles idle (74.95%) - 55,277,772,068 instructions:u # 3.11 insn per cycle - # 0.07 stalled cycles per insn (74.97%) - 5.080122152 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.619709e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.634289e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.634289e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2 +TOTAL : 6.276631 sec + 18,176,411,104 cycles # 2.894 GHz + 55,238,700,170 instructions # 3.04 insn per cycle + 6.284146623 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1229) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.412998e+00 -Avg ME (F77/C++) = 1.4129978146120550 -Relative difference = 1.3120184529301602e-07 +Avg ME (F77/C++) = 1.4129977771372637 +Relative difference = 1.5772332039074602e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.086183e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.103464e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.103464e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.724763e+02 +- 2.665342e+02 ) GeV^-2 -TOTAL : 1.529604 sec - 5,398,274,120 cycles:u # 3.481 GHz (74.73%) - 2,212,906 stalled-cycles-frontend:u # 0.04% frontend cycles idle (74.76%) - 1,663,514,226 stalled-cycles-backend:u # 30.82% backend cycles idle (75.00%) - 16,168,806,091 instructions:u # 3.00 insn per cycle - # 0.10 stalled cycles per insn (75.23%) - 1.554011097 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 8.447433e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.602543e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.602543e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2 +TOTAL : 1.961648 sec + 5,691,843,956 cycles # 2.895 GHz + 16,128,541,176 instructions # 2.83 insn per cycle + 1.980848485 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 5205) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.412986e+00 -Avg ME (F77/C++) = 1.4129857118325333 -Relative difference = 2.039421953066926e-07 +Avg ME (F77/C++) = 1.4129864902818952 +Relative difference = 3.469828399449743e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.367753e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.448576e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.448576e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.743733e+02 +- 2.676611e+02 ) GeV^-2 -TOTAL : 0.713114 sec - 2,528,265,935 cycles:u # 3.443 GHz (74.95%) - 1,705,410 stalled-cycles-frontend:u # 0.07% frontend cycles idle (74.95%) - 824,805,734 stalled-cycles-backend:u # 32.62% backend cycles idle (74.96%) - 6,091,644,676 instructions:u # 2.41 insn per cycle - # 0.14 stalled cycles per insn (74.96%) - 0.737720577 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4860) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.757867e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.823085e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.823085e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.008855e+02 +- 5.002467e+01 ) GeV^-2 +TOTAL : 0.954501 sec + 2,591,810,421 cycles # 2.702 GHz + 6,085,915,267 instructions # 2.35 insn per cycle + 0.966912682 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4878) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413316e+00 -Avg ME (F77/C++) = 1.4133162680784324 -Relative difference = 1.896804623606238e-07 +Avg ME (F77/C++) = 1.4133158486847037 +Relative difference = 1.0706402269051248e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/check.exe -p 64 256 10 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.986474e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.069956e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.069956e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.008855e+02 +- 5.002467e+01 ) GeV^-2 +TOTAL : 0.846832 sec + 2,295,114,840 cycles # 2.696 GHz + 5,552,751,365 instructions # 2.42 insn per cycle + 0.861502194 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4415) (512y: 30) (512z: 0) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.413316e+00 +Avg ME (F77/C++) = 1.4133158486847037 +Relative difference = 1.0706402269051248e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/check.exe -p 64 256 10 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.460942e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.506292e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.506292e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.008856e+02 +- 5.002468e+01 ) GeV^-2 +TOTAL : 1.145570 sec + 2,022,184,795 cycles # 1.758 GHz + 3,286,748,929 instructions # 1.63 insn per cycle + 1.163999883 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1905) (512y: 28) (512z: 3597) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.413316e+00 +Avg ME (F77/C++) = 1.4133164031689205 +Relative difference = 2.852645271622733e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0_bridge.txt index 05cbe02364..8f1e29c773 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0_bridge.txt @@ -1,190 +1,240 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasNoCurand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) +RNDGEN=hasCurand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2024-01-28_13:46:46 +DATE: 2024-01-30_05:48:04 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 10 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) WARNING! 
Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.264091e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.629857e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.629857e+06 ) sec^-1 -MeanMatrixElemValue = ( 4.755516e+02 +- 2.671055e+02 ) GeV^-2 -TOTAL : 0.493262 sec - 1,427,000,492 cycles:u # 2.751 GHz (74.50%) - 11,062,734 stalled-cycles-frontend:u # 0.78% frontend cycles idle (74.75%) - 288,331,422 stalled-cycles-backend:u # 20.21% backend cycles idle (75.47%) - 1,941,450,911 instructions:u # 1.36 insn per cycle - # 0.15 stalled cycles per insn (75.42%) - 0.537422932 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 4.794241e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.099961e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.099961e+07 ) sec^-1 +MeanMatrixElemValue = ( 1.009071e+02 +- 5.002295e+01 ) GeV^-2 +TOTAL : 0.464854 sec + 1,913,128,801 cycles # 2.831 GHz + 2,814,269,280 instructions # 1.47 insn per cycle + 0.735191571 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) +WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) +==PROF== Profiling "sigmaKin": launch__registers_per_thread 254 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.123568e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.467201e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.467201e+07 ) sec^-1 -MeanMatrixElemValue = ( 2.855934e+03 +- 1.791981e+03 ) GeV^-2 -TOTAL : 1.059350 sec - 3,243,958,406 cycles:u # 2.970 GHz (74.92%) - 29,201,045 stalled-cycles-frontend:u # 0.90% frontend cycles idle (75.16%) - 860,374,227 stalled-cycles-backend:u # 26.52% backend cycles idle (75.17%) - 3,453,925,859 instructions:u # 1.06 insn per cycle - # 0.25 stalled cycles per insn (75.42%) - 1.113397605 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 4.563056e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.567773e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.567773e+07 ) sec^-1 +MeanMatrixElemValue = ( 6.737500e+02 +- 4.776370e+02 ) GeV^-2 +TOTAL : 0.649992 sec + 2,514,728,119 cycles # 2.840 GHz + 3,857,856,675 instructions # 1.53 insn per cycle + 0.945286461 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 1.412404e+00 -Avg ME (F77/CUDA) = 1.4131669530965212 -Relative difference = 0.0005401804983001964 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 1.412608e+00 +Avg ME (F77/CUDA) = 1.4132214346515752 +Relative difference = 0.00043425681546129636 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 3.235486e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.250776e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.250776e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.724764e+02 +- 2.665343e+02 ) GeV^-2 -TOTAL : 5.087470 sec - 17,871,733,886 cycles:u # 3.498 GHz (74.95%) - 2,549,807 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.95%) - 3,700,745,689 stalled-cycles-backend:u # 20.71% backend cycles idle (74.96%) - 55,318,123,763 instructions:u # 3.10 insn per cycle - # 0.07 stalled cycles per insn (74.98%) - 5.111017597 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.612332e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.626818e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.626818e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2 +TOTAL : 6.299552 sec + 18,207,767,275 cycles # 2.889 GHz + 55,242,943,760 instructions # 3.03 insn per cycle + 6.304483382 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1229) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.412998e+00 -Avg ME (F77/C++) = 1.4129978146120550 -Relative difference = 1.3120184529301602e-07 +Avg ME (F77/C++) = 1.4129977771372637 +Relative difference = 1.5772332039074602e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.079389e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.096484e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.096484e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.724763e+02 +- 2.665342e+02 ) GeV^-2 -TOTAL : 1.542157 sec - 5,440,398,597 cycles:u # 3.480 GHz (74.93%) - 2,113,561 stalled-cycles-frontend:u # 0.04% frontend cycles idle (74.93%) - 1,661,597,394 stalled-cycles-backend:u # 30.54% backend cycles idle (74.93%) - 16,214,117,343 instructions:u # 2.98 insn per cycle - # 0.10 stalled cycles per insn (74.94%) - 1.566969925 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 8.365917e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.522444e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.522444e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2 +TOTAL : 1.986287 sec + 5,717,011,577 cycles # 2.873 GHz + 16,175,954,346 instructions # 2.83 insn per cycle + 1.991587162 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 5205) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.412986e+00 -Avg ME (F77/C++) = 1.4129857118325333 -Relative difference = 2.039421953066926e-07 +Avg ME (F77/C++) = 1.4129864902818952 +Relative difference = 3.469828399449743e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.359009e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.439177e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.439177e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.743733e+02 +- 2.676611e+02 ) GeV^-2 -TOTAL : 0.718329 sec - 2,573,153,270 cycles:u # 3.478 GHz (74.59%) - 1,760,279 stalled-cycles-frontend:u # 0.07% frontend cycles idle (74.85%) - 822,041,230 stalled-cycles-backend:u # 31.95% backend cycles idle (75.14%) - 6,131,508,895 instructions:u # 2.38 insn per cycle - # 0.13 stalled cycles per insn (75.14%) - 0.743296450 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4860) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.741687e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.807547e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.807547e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.008855e+02 +- 5.002467e+01 ) GeV^-2 +TOTAL : 0.968315 sec + 2,618,792,433 cycles # 2.693 GHz + 6,122,206,815 instructions # 2.34 insn per cycle + 0.973667021 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4878) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.413316e+00 +Avg ME (F77/C++) = 1.4133158486847037 +Relative difference = 1.0706402269051248e-07 +OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! 
Instantiate host Bridge (nevt=16384) +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.976057e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.060749e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.060749e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.008855e+02 +- 5.002467e+01 ) GeV^-2 +TOTAL : 0.855993 sec + 2,321,654,642 cycles # 2.699 GHz + 5,589,002,861 instructions # 2.41 insn per cycle + 0.861171520 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4415) (512y: 30) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413316e+00 -Avg ME (F77/C++) = 1.4133162680784324 -Relative difference = 1.896804623606238e-07 +Avg ME (F77/C++) = 1.4133158486847037 +Relative difference = 1.0706402269051248e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! Instantiate host Bridge (nevt=16384) +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.455132e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.500155e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.500155e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.008856e+02 +- 5.002468e+01 ) GeV^-2 +TOTAL : 1.154878 sec + 2,044,999,339 cycles # 1.765 GHz + 3,327,504,110 instructions # 1.63 insn per cycle + 1.160035358 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1905) (512y: 28) (512z: 3597) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.413316e+00 +Avg ME (F77/C++) = 1.4133164031689205 +Relative difference = 2.852645271622733e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd1.txt index 8d47b19a57..71f99cc0f9 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd1.txt @@ -1,181 +1,223 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasNoCurand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) +RNDGEN=hasCurand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd1' +CUDACPP_BUILDDIR='build.512y_f_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.none_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2024-01-28_13:12:35 +DATE: 2024-01-30_04:59:21 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/gcheck.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/gcheck.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.530693e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.520372e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.597614e+06 ) sec^-1 -MeanMatrixElemValue = ( 5.334114e+02 +- 3.089427e+02 ) GeV^-2 -TOTAL : 0.461335 sec - 1,326,167,778 cycles:u # 2.729 GHz (74.59%) - 8,151,718 stalled-cycles-frontend:u # 0.61% frontend cycles idle (75.18%) - 278,720,879 stalled-cycles-backend:u # 21.02% backend cycles idle (75.49%) - 1,659,800,184 instructions:u # 1.25 insn per cycle - # 0.17 stalled cycles per insn (75.76%) - 0.503880984 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 2.454028e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.326749e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.426065e+07 ) sec^-1 +MeanMatrixElemValue = ( 1.008472e+02 +- 5.002447e+01 ) GeV^-2 +TOTAL : 0.451891 sec + 1,884,922,304 cycles # 2.826 GHz + 2,675,153,942 instructions # 1.42 insn per cycle + 0.742708600 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/gcheck.exe -p 64 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 248 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.393837e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.690545e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.695908e+07 ) sec^-1 -MeanMatrixElemValue = ( 2.954952e+03 +- 1.880090e+03 ) GeV^-2 -TOTAL : 0.949511 sec - 2,908,553,453 cycles:u # 2.982 GHz (74.54%) - 20,937,653 stalled-cycles-frontend:u # 0.72% frontend cycles idle (75.32%) - 852,141,150 stalled-cycles-backend:u # 29.30% backend cycles idle (75.32%) - 2,753,093,077 instructions:u # 0.95 insn per cycle - # 0.31 stalled cycles per insn (75.36%) - 0.993929305 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 3.211971e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.383449e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.465566e+07 ) sec^-1 +MeanMatrixElemValue = ( 6.630099e+02 +- 4.770719e+02 ) GeV^-2 +TOTAL : 0.498447 sec + 2,066,378,383 cycles # 2.841 GHz + 2,912,189,828 instructions # 1.41 insn per cycle + 0.785425719 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 1.412404e+00 -Avg ME (F77/CUDA) = 1.4131669531526541 -Relative difference = 0.0005401805380429868 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 1.412608e+00 +Avg ME (F77/CUDA) = 1.4132214346515752 +Relative difference = 0.00043425681546129636 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 3.237369e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.252608e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.252608e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.724764e+02 +- 2.665343e+02 ) GeV^-2 -TOTAL : 5.082280 sec - 17,865,629,686 cycles:u # 3.501 GHz (74.92%) - 2,498,978 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.86%) - 2,991,351,501 stalled-cycles-backend:u # 16.74% backend cycles idle (74.94%) - 55,034,375,096 instructions:u # 3.08 insn per cycle - # 0.05 stalled cycles per insn (75.08%) - 5.105994221 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.621420e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.635929e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.635929e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2 +TOTAL : 6.272536 sec + 18,133,908,438 cycles # 2.889 GHz + 54,991,536,969 instructions # 3.03 insn per cycle + 6.280002049 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1171) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.412998e+00 -Avg ME (F77/C++) = 1.4129978146120550 -Relative difference = 1.3120184529301602e-07 +Avg ME (F77/C++) = 1.4129977771372637 +Relative difference = 1.5772332039074602e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.113934e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.132151e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.132151e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.724763e+02 +- 2.665342e+02 ) GeV^-2 -TOTAL : 1.491759 sec - 5,272,618,876 cycles:u # 3.485 GHz (74.88%) - 1,965,606 stalled-cycles-frontend:u # 0.04% frontend cycles idle (75.11%) - 1,542,389,812 stalled-cycles-backend:u # 29.25% backend cycles idle (75.15%) - 16,244,718,537 instructions:u # 3.08 insn per cycle - # 0.09 stalled cycles per insn (75.15%) - 1.515955960 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 8.675526e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.845155e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.845155e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2 +TOTAL : 1.910252 sec + 5,541,476,355 cycles # 2.894 GHz + 16,222,950,904 instructions # 2.93 insn per cycle + 1.926546393 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 5136) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.412986e+00 -Avg ME (F77/C++) = 1.4129857712652836 -Relative difference = 1.618803841657786e-07 +Avg ME (F77/C++) = 1.4129863487235070 +Relative difference = 2.4679898241023883e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.099762e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.163094e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.163094e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.743733e+02 +- 2.676611e+02 ) GeV^-2 -TOTAL : 0.801224 sec - 2,855,360,913 cycles:u # 3.472 GHz (74.79%) - 1,399,697 stalled-cycles-frontend:u # 0.05% frontend cycles idle (74.72%) - 808,361,645 stalled-cycles-backend:u # 28.31% backend cycles idle (74.72%) - 6,739,199,559 instructions:u # 2.36 insn per cycle - # 0.12 stalled cycles per insn (74.42%) - 0.825544357 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 5412) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.524928e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.573974e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.573974e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.008855e+02 +- 5.002467e+01 ) GeV^-2 +TOTAL : 1.096795 sec + 2,981,881,341 cycles # 2.708 GHz + 6,708,240,605 instructions # 2.25 insn per cycle + 1.109848469 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 5430) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413316e+00 -Avg ME (F77/C++) = 1.4133162680784324 -Relative difference = 1.896804623606238e-07 +Avg ME (F77/C++) = 1.4133158486847037 +Relative difference = 1.0706402269051248e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd1/check.exe -p 64 256 10 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.679205e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.738776e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.738776e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.008855e+02 +- 5.002467e+01 ) GeV^-2 +TOTAL : 0.997879 sec + 2,711,169,290 cycles # 2.704 GHz + 6,222,713,478 instructions # 2.30 insn per cycle + 1.012945753 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 5056) (512y: 24) (512z: 0) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.413316e+00 +Avg ME (F77/C++) = 1.4133158486847037 +Relative difference = 1.0706402269051248e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd1/check.exe -p 64 256 10 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.374736e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.414577e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.414577e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.008856e+02 +- 5.002468e+01 ) GeV^-2 +TOTAL : 1.216016 sec + 2,159,440,418 cycles # 1.769 GHz + 3,642,249,109 instructions # 1.69 insn per cycle + 1.228978695 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2070) (512y: 21) (512z: 3922) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.413316e+00 +Avg ME (F77/C++) = 1.4133164031689205 +Relative difference = 2.852645271622733e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt index 6bc3c8b3b5..c3bf1d184f 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt @@ -1,181 +1,223 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasNoCurand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) +RNDGEN=hasCurand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2024-01-28_13:12:51 +DATE: 2024-01-30_04:59:47 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/gcheck.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/gcheck.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.930611e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.079940e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.085510e+06 ) sec^-1 -MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 -TOTAL : 0.525099 sec - 1,547,001,642 cycles:u # 2.809 GHz (74.40%) - 8,231,293 stalled-cycles-frontend:u # 0.53% frontend cycles idle (74.40%) - 291,649,341 stalled-cycles-backend:u # 18.85% backend cycles idle (75.29%) - 1,834,262,540 instructions:u # 1.19 insn per cycle - # 0.16 stalled cycles per insn (75.20%) - 0.569315068 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 8.711100e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.041363e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.056144e+07 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 0.470768 sec + 1,937,905,575 cycles # 2.825 GHz + 2,769,085,725 instructions # 1.43 insn per cycle + 0.764309757 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/gcheck.exe -p 64 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.622428e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.843086e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.848194e+06 ) sec^-1 -MeanMatrixElemValue = ( 2.948724e+03 +- 1.840727e+03 ) GeV^-2 -TOTAL : 1.126457 sec - 3,453,955,010 cycles:u # 2.984 GHz (74.93%) - 21,196,879 stalled-cycles-frontend:u # 0.61% frontend cycles idle (75.18%) - 853,666,812 stalled-cycles-backend:u # 24.72% backend cycles idle (75.18%) - 3,203,854,630 instructions:u # 0.93 insn per cycle - # 0.27 stalled cycles per insn (75.02%) - 1.177341147 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 1.077034e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.312199e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.326400e+07 ) sec^-1 +MeanMatrixElemValue = ( 6.734461e+02 +- 4.775415e+02 ) GeV^-2 +TOTAL : 0.614129 sec + 2,415,403,751 cycles # 2.830 GHz + 3,662,132,699 instructions # 1.52 insn per cycle + 0.915037914 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.413122e+00 -Avg ME (F77/CUDA) = 1.4131213755569483 -Relative difference = 4.4188898885662695e-07 +Avg ME (F77/CUDA) = 1.4131213755569487 +Relative difference = 4.418889885423659e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.871634e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.883278e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.883278e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 -TOTAL : 5.729194 sec - 20,128,898,569 cycles:u # 3.501 GHz (74.96%) - 2,785,588 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.96%) - 3,920,733,211 stalled-cycles-backend:u # 19.48% backend cycles idle (74.96%) - 59,182,030,526 instructions:u # 2.94 insn per cycle - # 0.07 stalled cycles per insn (74.97%) - 5.752521724 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.370924e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.382846e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.382846e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 6.936117 sec + 19,978,394,912 cycles # 2.879 GHz + 59,162,561,873 instructions # 2.96 insn per cycle + 6.944191465 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1149) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213859069593 Relative difference = 4.345647726386255e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.102145e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.154461e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.154461e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 -TOTAL : 2.708500 sec - 9,541,976,491 cycles:u # 3.495 GHz (74.85%) - 2,438,086 stalled-cycles-frontend:u # 0.03% frontend cycles idle (74.84%) - 2,419,701,974 stalled-cycles-backend:u # 25.36% backend cycles idle (74.98%) - 29,758,024,267 instructions:u # 3.12 insn per cycle - # 0.08 stalled cycles per insn (75.09%) - 2.733342510 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 4.694585e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.741387e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.741387e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 3.513998 sec + 10,104,341,088 cycles # 2.872 GHz + 29,763,867,436 instructions # 2.95 insn per cycle + 3.532062820 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 4873) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213792564823 Relative difference = 4.392710025734405e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.245529e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.267081e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.267081e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 -TOTAL : 1.338833 sec - 4,724,288,846 cycles:u # 3.473 GHz (74.72%) - 2,391,653 stalled-cycles-frontend:u # 0.05% frontend cycles idle (74.76%) - 1,570,899,905 stalled-cycles-backend:u # 33.25% backend cycles idle (74.89%) - 11,230,417,054 instructions:u # 2.38 insn per cycle - # 0.14 stalled cycles per insn (75.15%) - 1.363499703 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4563) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 9.157849e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.336120e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.336120e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 1.813275 sec + 4,888,809,789 cycles # 2.689 GHz + 11,200,775,616 instructions # 2.29 insn per cycle + 1.831194346 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4581) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213600217192 Relative difference = 4.5288254008796884e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd0/check.exe -p 64 256 10 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.059295e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.083013e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.083013e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 1.571072 sec + 4,240,948,322 cycles # 2.691 GHz + 10,146,075,765 instructions # 2.39 insn per cycle + 1.585395140 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4064) (512y: 73) (512z: 0) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.413122e+00 +Avg ME (F77/C++) = 1.4131213600217192 +Relative difference = 4.5288254008796884e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd0/check.exe -p 64 256 10 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.157625e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.268151e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.268151e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 2.315387 sec + 4,011,221,101 cycles # 1.729 GHz + 5,838,969,816 instructions # 1.46 insn per cycle + 2.328222904 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1778) (512y: 97) (512z: 3502) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.413122e+00 +Avg ME (F77/C++) = 1.4131213600217192 +Relative difference = 4.5288254008796884e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd1.txt index ca039e8e9e..0465a21327 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd1.txt @@ -1,181 +1,223 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasNoCurand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) +RNDGEN=hasCurand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd1' +CUDACPP_BUILDDIR='build.512y_m_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.none_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2024-01-28_13:13:11 +DATE: 2024-01-30_05:00:18 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/gcheck.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/gcheck.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = HIP:MIX+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.923635e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.080806e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.086473e+06 ) sec^-1 -MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 -TOTAL : 0.526606 sec - 1,546,779,017 cycles:u # 2.806 GHz (73.78%) - 8,361,783 stalled-cycles-frontend:u # 0.54% frontend cycles idle (73.85%) - 297,190,296 stalled-cycles-backend:u # 19.21% backend cycles idle (75.51%) - 1,818,934,608 instructions:u # 1.18 insn per cycle - # 0.16 stalled cycles per insn (75.28%) - 0.568562997 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 8.666023e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.032901e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.046936e+07 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 0.468226 sec + 1,937,873,508 cycles # 2.824 GHz + 2,759,069,461 instructions # 1.42 insn per cycle + 0.754370762 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/gcheck.exe -p 64 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = HIP:MIX+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.600430e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.833337e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.838870e+06 ) sec^-1 -MeanMatrixElemValue = ( 2.948724e+03 +- 1.840727e+03 ) GeV^-2 -TOTAL : 1.122231 sec - 3,467,237,376 cycles:u # 3.012 GHz (74.99%) - 21,359,651 stalled-cycles-frontend:u # 0.62% frontend cycles idle (75.00%) - 853,272,050 stalled-cycles-backend:u # 24.61% backend cycles idle (75.23%) - 3,214,568,864 instructions:u # 0.93 insn per cycle - # 0.27 stalled cycles per insn (75.19%) - 1.172263402 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 1.070939e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.304690e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.318717e+07 ) sec^-1 +MeanMatrixElemValue = ( 6.734461e+02 +- 4.775415e+02 ) GeV^-2 +TOTAL : 0.607643 sec + 2,403,195,178 cycles # 2.827 GHz + 3,555,740,714 instructions # 1.48 insn per cycle + 0.909921855 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.413122e+00 -Avg ME (F77/CUDA) = 1.4131213755569483 -Relative difference = 4.4188898885662695e-07 +Avg ME (F77/CUDA) = 1.4131213755569487 +Relative difference = 4.418889885423659e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.896449e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.908309e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.908309e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 -TOTAL : 5.680023 sec - 19,974,916,799 cycles:u # 3.503 GHz (74.94%) - 2,710,663 stalled-cycles-frontend:u # 0.01% frontend cycles idle (75.00%) - 3,510,471,735 stalled-cycles-backend:u # 17.57% backend cycles idle (75.03%) - 58,726,429,808 instructions:u # 2.94 insn per cycle - # 0.06 stalled cycles per insn (75.03%) - 5.704403658 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.404971e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.417275e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.417275e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 6.840299 sec + 19,736,673,501 cycles # 2.886 GHz + 58,709,690,472 instructions # 2.97 insn per cycle + 6.847451518 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1026) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213859069593 Relative difference = 4.345647726386255e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.162282e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.215492e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.215492e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 -TOTAL : 2.682251 sec - 9,442,855,192 cycles:u # 3.492 GHz (74.85%) - 2,428,870 stalled-cycles-frontend:u # 0.03% frontend cycles idle (74.88%) - 2,135,949,636 stalled-cycles-backend:u # 22.62% backend cycles idle (74.94%) - 30,202,697,241 instructions:u # 3.20 insn per cycle - # 0.07 stalled cycles per insn (75.07%) - 2.707217238 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 4.708829e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.755468e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.755468e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 3.503073 sec + 10,118,973,746 cycles # 2.885 GHz + 30,158,905,101 instructions # 2.98 insn per cycle + 3.519090284 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 4944) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213792564823 Relative difference = 4.392710025734405e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.219654e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.240294e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.240294e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 -TOTAL : 1.366512 sec - 4,827,563,399 cycles:u # 3.479 GHz (74.66%) - 2,406,118 stalled-cycles-frontend:u # 0.05% frontend cycles idle (74.76%) - 1,568,708,705 stalled-cycles-backend:u # 32.49% backend cycles idle (75.02%) - 11,675,178,552 instructions:u # 2.42 insn per cycle - # 0.13 stalled cycles per insn (75.22%) - 1.390974056 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4667) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 8.784663e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.950747e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.950747e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 1.889159 sec + 5,039,949,395 cycles # 2.661 GHz + 11,663,409,755 instructions # 2.31 insn per cycle + 1.981495827 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4685) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213600217192 Relative difference = 4.5288254008796884e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd1/check.exe -p 64 256 10 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 9.838137e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.004758e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.004758e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 1.689521 sec + 4,555,347,979 cycles # 2.689 GHz + 10,787,640,248 instructions # 2.37 insn per cycle + 1.702819632 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4159) (512y: 233) (512z: 0) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.413122e+00 +Avg ME (F77/C++) = 1.4131213600217192 +Relative difference = 4.5288254008796884e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd1/check.exe -p 64 256 10 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.077813e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.181685e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.181685e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 2.340568 sec + 4,064,413,524 cycles # 1.733 GHz + 6,073,601,897 instructions # 1.49 insn per cycle + 2.356439472 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1725) (512y: 104) (512z: 3609) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.413122e+00 +Avg ME (F77/C++) = 1.4131213600217192 +Relative difference = 4.5288254008796884e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt index 4072135c25..53bd28a5bd 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt @@ -1,181 +1,223 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasNoCurand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) +RNDGEN=hasCurand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-01-28_13:13:30 +DATE: 2024-01-30_05:00:48 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.463763e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.620026e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.621227e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 0.644635 sec - 1,987,376,841 cycles:u # 2.988 GHz (74.74%) - 2,457,671 stalled-cycles-frontend:u # 0.12% frontend cycles idle (74.57%) - 48,993,592 stalled-cycles-backend:u # 2.47% backend cycles idle (74.73%) - 2,144,827,242 instructions:u # 1.08 insn per cycle - # 0.02 stalled cycles per insn (75.31%) - 0.688992738 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 3.507010e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.536029e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.538733e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 0.531942 sec + 2,193,462,671 cycles # 2.834 GHz + 3,356,973,773 instructions # 1.53 insn per cycle + 0.849346656 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.244104e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.246928e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.246989e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.252232e+02 +- 1.234346e+02 ) GeV^-4 -TOTAL : 8.378258 sec - 28,854,966,045 cycles:u # 3.432 GHz (74.98%) - 11,706,323 stalled-cycles-frontend:u # 0.04% frontend cycles idle (75.00%) - 1,119,668,748 stalled-cycles-backend:u # 3.88% backend cycles idle (75.00%) - 22,680,994,886 instructions:u # 0.79 insn per cycle - # 0.05 stalled cycles per insn (74.98%) - 8.429615859 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 4.126743e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.160620e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.162100e+05 ) sec^-1 +MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 +TOTAL : 3.043092 sec + 9,489,937,514 cycles # 2.875 GHz + 19,463,317,431 instructions # 2.05 insn per cycle + 3.359518634 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 6.626675e-04 -Avg ME (F77/CUDA) = 6.6266731198158101E-004 -Relative difference = 2.837296517127185e-07 +Avg ME (F77/CUDA) = 6.6266731198158133E-004 +Relative difference = 2.837296512218831e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.214110e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.214999e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.214999e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 7.414695 sec - 26,053,581,658 cycles:u # 3.504 GHz (74.97%) - 11,686,455 stalled-cycles-frontend:u # 0.04% frontend cycles idle (75.02%) - 3,872,816,291 stalled-cycles-backend:u # 14.86% backend cycles idle (74.99%) - 81,778,201,866 instructions:u # 3.14 insn per cycle - # 0.05 stalled cycles per insn (74.99%) - 7.438325461 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.787937e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.788754e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.788754e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 9.184792 sec + 26,445,376,310 cycles # 2.879 GHz + 81,759,262,253 instructions # 3.09 insn per cycle + 9.200099621 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 6614) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141133E-004 Relative difference = 2.8372990776517314e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.016879e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.021423e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.021423e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 3.276622 sec - 11,519,938,608 cycles:u # 3.493 GHz (75.01%) - 1,061,411 stalled-cycles-frontend:u # 0.01% frontend cycles idle (75.03%) - 1,601,322,638 stalled-cycles-backend:u # 13.90% backend cycles idle (75.03%) - 39,253,937,285 instructions:u # 3.41 insn per cycle - # 0.04 stalled cycles per insn (75.02%) - 3.301260420 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.595033e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.598347e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.598347e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 4.572055 sec + 12,894,491,420 cycles # 2.818 GHz + 39,242,650,330 instructions # 3.04 insn per cycle + 4.588188651 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:12814) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141122E-004 Relative difference = 2.837299079287849e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.199974e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.202544e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.202544e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 1.374389 sec - 4,859,374,228 cycles:u # 3.481 GHz (74.89%) - 725,368 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.79%) - 611,853,703 stalled-cycles-backend:u # 12.59% backend cycles idle (74.79%) - 13,816,024,978 instructions:u # 2.84 insn per cycle - # 0.04 stalled cycles per insn (74.83%) - 1.399253157 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11041) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.988905e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.005063e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.005063e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 2.062926 sec + 5,559,157,847 cycles # 2.689 GHz + 13,789,744,695 instructions # 2.48 insn per cycle + 2.079268197 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11059) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198157309E-004 Relative difference = 2.837296636563793e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe -p 64 256 1 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 9.113130e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.134504e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.134504e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 1.809806 sec + 4,899,980,729 cycles # 2.701 GHz + 12,319,200,932 instructions # 2.51 insn per cycle + 1.824526773 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 9762) (512y: 94) (512z: 0) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266731198157309E-004 +Relative difference = 2.837296636563793e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe -p 64 256 1 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 6.926484e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.938620e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.938620e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 2.377893 sec + 4,078,713,187 cycles # 1.712 GHz + 6,287,612,851 instructions # 1.54 insn per cycle + 2.391138362 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1516) (512y: 94) (512z: 9019) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266731198157309E-004 +Relative difference = 2.837296636563793e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_bridge.txt index 49977cc58b..ba45d149aa 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_bridge.txt @@ -1,190 +1,240 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasNoCurand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) +RNDGEN=hasCurand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-01-28_13:47:22 +DATE: 2024-01-30_05:49:05 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) WARNING! 
Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.390201e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.527612e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.527612e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 0.654035 sec - 1,960,717,262 cycles:u # 2.905 GHz (75.00%) - 2,777,141 stalled-cycles-frontend:u # 0.14% frontend cycles idle (75.21%) - 33,974,103 stalled-cycles-backend:u # 1.73% backend cycles idle (75.21%) - 2,149,307,410 instructions:u # 1.10 insn per cycle - # 0.02 stalled cycles per insn (75.22%) - 0.698063329 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 3.099881e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.447326e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.447326e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 0.520843 sec + 2,128,361,154 cycles # 2.833 GHz + 3,379,769,914 instructions # 1.59 insn per cycle + 0.811374229 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) +WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.211104e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.245812e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.245812e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.252232e+02 +- 1.234346e+02 ) GeV^-4 -TOTAL : 8.532378 sec - 29,230,109,981 cycles:u # 3.409 GHz (75.01%) - 22,642,081 stalled-cycles-frontend:u # 0.08% frontend cycles idle (75.00%) - 1,132,164,723 stalled-cycles-backend:u # 3.87% backend cycles idle (75.00%) - 23,523,574,924 instructions:u # 0.80 insn per cycle - # 0.05 stalled cycles per insn (74.96%) - 8.593019725 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 3.602295e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.096469e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.096469e+05 ) sec^-1 +MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 +TOTAL : 3.329754 sec + 10,358,756,104 cycles # 2.872 GHz + 22,944,085,739 instructions # 2.21 insn per cycle + 3.663397648 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 6.626675e-04 -Avg ME (F77/CUDA) = 6.6266731198158101E-004 -Relative difference = 2.837296517127185e-07 +Avg ME (F77/CUDA) = 6.6266731198158133E-004 +Relative difference = 2.837296512218831e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.203965e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.204858e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.204858e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 7.452449 sec - 26,155,459,895 cycles:u # 3.500 GHz (74.95%) - 19,996,051 stalled-cycles-frontend:u # 0.08% frontend cycles idle (74.91%) - 3,848,748,935 stalled-cycles-backend:u # 14.71% backend cycles idle (74.97%) - 81,737,604,098 instructions:u # 3.13 insn per cycle - # 0.05 stalled cycles per insn (75.06%) - 7.476303530 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.794771e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.795632e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.795632e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 9.153536 sec + 26,441,951,782 cycles # 2.888 GHz + 81,759,972,796 instructions # 3.09 insn per cycle + 9.158879879 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 6614) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141133E-004 Relative difference = 2.8372990776517314e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.014996e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.019514e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.019514e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 3.281547 sec - 11,541,920,268 cycles:u # 3.494 GHz (74.90%) - 1,107,312 stalled-cycles-frontend:u # 0.01% frontend cycles idle (75.00%) - 1,695,248,873 stalled-cycles-backend:u # 14.69% backend cycles idle (75.06%) - 39,247,079,084 instructions:u # 3.40 insn per cycle - # 0.04 stalled cycles per insn (75.06%) - 3.306741027 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.577595e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.580993e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.580993e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 4.598491 sec + 12,916,287,273 cycles # 2.806 GHz + 39,254,753,938 instructions # 3.04 insn per cycle + 4.603937867 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:12814) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141122E-004 Relative difference = 2.837299079287849e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.194708e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.197264e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.197264e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 1.383983 sec - 4,882,147,839 cycles:u # 3.472 GHz (75.05%) - 3,172,657 stalled-cycles-frontend:u # 0.06% frontend cycles idle (74.97%) - 581,410,592 stalled-cycles-backend:u # 11.91% backend cycles idle (74.97%) - 13,796,051,404 instructions:u # 2.83 insn per cycle - # 0.04 stalled cycles per insn (74.97%) - 1.409166602 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11041) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.852795e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.869019e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.869019e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 2.103011 sec + 5,568,678,671 cycles # 2.642 GHz + 13,799,771,926 instructions # 2.48 insn per cycle + 2.108561686 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11059) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266731198157309E-004 +Relative difference = 2.837296636563793e-07 +OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! 
Instantiate host Bridge (nevt=16384) +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 9.035305e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.056800e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.056800e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 1.829572 sec + 4,921,598,332 cycles # 2.684 GHz + 12,328,469,851 instructions # 2.50 insn per cycle + 1.835230648 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 9762) (512y: 94) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198157309E-004 Relative difference = 2.837296636563793e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! Instantiate host Bridge (nevt=16384) +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 6.926825e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.939647e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.939647e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 2.382359 sec + 4,075,002,441 cycles # 1.707 GHz + 6,297,411,526 instructions # 1.55 insn per cycle + 2.387952463 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1516) (512y: 94) (512z: 9019) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266731198157309E-004 +Relative difference = 2.837296636563793e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_common.txt index c540183c31..2624aa384f 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_common.txt @@ -1,181 +1,223 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasNoCurand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) +RNDGEN=hasCurand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-01-28_13:56:28 +DATE: 2024-01-30_06:01:23 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.425613e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.588900e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.590114e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.497090e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.524372e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.526818e+05 ) sec^-1 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 0.647005 sec - 1,961,714,283 cycles:u # 2.934 GHz (74.69%) - 2,370,613 stalled-cycles-frontend:u # 0.12% frontend cycles idle (75.34%) - 37,765,063 stalled-cycles-backend:u # 1.93% backend cycles idle (75.10%) - 2,176,691,970 instructions:u # 1.11 insn per cycle - # 0.02 stalled cycles per insn (74.75%) - 0.689251287 seconds time elapsed +TOTAL : 0.512897 sec + 2,098,983,374 cycles # 2.834 GHz + 3,277,353,449 instructions # 1.56 insn per cycle + 0.803360908 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --common +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.243727e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.247395e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.247456e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.141054e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.174803e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.176256e+05 ) sec^-1 MeanMatrixElemValue = ( 1.252232e+02 +- 1.234346e+02 ) GeV^-4 -TOTAL : 8.384949 sec - 28,823,443,221 cycles:u # 3.423 GHz (75.00%) - 11,685,573 stalled-cycles-frontend:u # 0.04% frontend cycles idle (75.02%) - 1,119,948,130 stalled-cycles-backend:u # 3.89% backend cycles idle (75.00%) - 22,700,974,781 instructions:u # 0.79 insn per cycle - # 0.05 stalled cycles per insn (75.00%) - 8.437711468 seconds time elapsed +TOTAL : 3.134385 sec + 9,742,325,189 cycles # 2.872 GHz + 21,219,396,735 instructions # 2.18 insn per cycle + 3.451782991 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 6.626675e-04 -Avg ME (F77/CUDA) = 6.6266731198158101E-004 -Relative difference = 2.837296517127185e-07 +Avg ME (F77/CUDA) = 6.6266731198158133E-004 +Relative difference = 2.837296512218831e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe -p 64 256 1 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe -p 64 256 1 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.205977e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.206957e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.206957e+03 ) sec^-1 +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.789322e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.790133e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.790133e+03 ) sec^-1 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 7.441977 sec - 26,144,784,083 cycles:u # 3.503 GHz (74.97%) - 19,757,570 stalled-cycles-frontend:u # 0.08% frontend cycles idle (75.02%) - 3,926,698,235 stalled-cycles-backend:u # 15.02% backend cycles idle (75.03%) - 81,747,372,330 instructions:u # 3.13 insn per cycle - # 0.05 stalled cycles per insn (75.03%) - 7.465457036 seconds time elapsed +TOTAL : 9.178912 sec + 26,467,085,384 cycles # 2.885 GHz + 81,758,395,147 instructions # 3.09 insn per cycle + 9.184185479 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 6614) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141133E-004 Relative difference = 2.8372990776517314e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 1 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 1 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.016218e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.020848e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.020848e+03 ) sec^-1 +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.580434e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.583873e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.583873e+03 ) sec^-1 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 3.277056 sec - 11,521,723,848 cycles:u # 3.494 GHz (75.02%) - 1,067,616 stalled-cycles-frontend:u # 0.01% frontend cycles idle (75.02%) - 1,661,234,169 stalled-cycles-backend:u # 14.42% backend cycles idle (75.02%) - 39,259,477,727 instructions:u # 3.41 insn per cycle - # 0.04 stalled cycles per insn (75.02%) - 3.299831090 seconds time elapsed +TOTAL : 4.592041 sec + 12,908,303,532 cycles # 2.809 GHz + 39,241,301,392 instructions # 3.04 insn per cycle + 4.597199751 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:12814) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141122E-004 Relative difference = 2.837299079287849e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 1 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 1 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.202912e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.205517e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.205517e+04 ) sec^-1 +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 8.006274e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.022952e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.022952e+03 ) sec^-1 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 1.370984 sec - 4,837,328,450 cycles:u # 3.475 GHz (74.72%) - 783,029 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.76%) - 584,048,854 stalled-cycles-backend:u # 12.07% backend cycles idle (74.91%) - 13,836,924,664 instructions:u # 2.86 insn per cycle - # 0.04 stalled cycles per insn (75.16%) - 1.393799936 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11041) (512y: 0) (512z: 0) +TOTAL : 2.060575 sec + 5,561,277,799 cycles # 2.694 GHz + 13,787,529,346 instructions # 2.48 insn per cycle + 2.065507699 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11059) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198157309E-004 Relative difference = 2.837296636563793e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe -p 64 256 1 --common OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 9.108001e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.130506e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.130506e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 +TOTAL : 1.812494 sec + 4,903,037,786 cycles # 2.699 GHz + 12,315,866,756 instructions # 2.51 insn per cycle + 1.817504411 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 9762) (512y: 94) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266731198157309E-004 +Relative difference = 2.837296636563793e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe -p 64 256 1 --common OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 6.888313e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.900941e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.900941e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 +TOTAL : 2.393126 sec + 4,056,497,728 cycles # 1.692 GHz + 6,284,230,028 instructions # 1.55 insn per cycle + 2.398190383 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1516) (512y: 94) (512z: 9019) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266731198157309E-004 +Relative difference = 2.837296636563793e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_curhst.txt index 6361605e5d..711141aac6 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_curhst.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_curhst.txt @@ -1,143 +1,223 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasNoCurand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) +RNDGEN=hasCurand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-01-28_13:54:23 +DATE: 2024-01-30_05:57:54 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --curhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --curhst OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 3.493459e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.521487e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.524248e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 0.509745 sec + 2,103,800,649 cycles # 2.836 GHz + 3,325,789,020 instructions # 1.58 insn per cycle + 0.801087014 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --curhst WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe: Aborted - 54,670,069 cycles:u # 2.630 GHz (61.54%) - 34,994 stalled-cycles-frontend:u # 0.06% frontend cycles idle (61.54%) - 616,163 stalled-cycles-backend:u # 1.13% backend cycles idle (61.54%) - 42,435,100 instructions:u # 0.78 insn per cycle - # 0.01 stalled cycles per insn (63.55%) - 0.021700778 seconds time elapsed +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --curhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --curhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe: Aborted - 44,892,864 cycles:u # 2.190 GHz (61.00%) - 64,258 stalled-cycles-frontend:u # 0.14% frontend cycles idle (61.01%) - 460,979 stalled-cycles-backend:u # 1.03% backend cycles idle (61.00%) - 48,182,281 instructions:u # 1.07 insn per cycle - # 0.01 stalled cycles per insn (70.82%) - 0.021357952 seconds time elapsed +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 4.145753e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.180203e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.181702e+05 ) sec^-1 +MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 +TOTAL : 3.079125 sec + 9,600,126,952 cycles # 2.878 GHz + 21,681,876,510 instructions # 2.26 insn per cycle + 3.393235673 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 6.626675e-04 -Avg ME (F77/CUDA) = 6.6266731198158101E-004 -Relative difference = 2.837296517127185e-07 +Avg ME (F77/CUDA) = 6.6266731198158133E-004 +Relative difference = 2.837296512218831e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe: Aborted - 41,803,467 cycles:u # 2.035 GHz (61.10%) - 61,169 stalled-cycles-frontend:u # 0.15% frontend cycles idle (61.10%) - 353,876 stalled-cycles-backend:u # 0.85% backend cycles idle (61.27%) - 48,282,000 instructions:u # 1.15 insn per cycle - # 0.01 stalled cycles per insn (74.13%) - 0.022261377 seconds time elapsed +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.797758e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.798583e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.798583e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 9.135643 sec + 26,454,582,305 cycles # 2.896 GHz + 81,754,058,548 instructions # 3.09 insn per cycle + 9.140745485 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 6614) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141133E-004 Relative difference = 2.8372990776517314e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe: Aborted - 50,214,491 cycles:u # 2.486 GHz (60.43%) - 49,776 stalled-cycles-frontend:u # 0.10% frontend cycles idle (60.43%) - 575,575 stalled-cycles-backend:u # 1.15% backend cycles idle (60.43%) - 45,981,314 instructions:u # 0.92 insn per cycle - # 0.01 stalled cycles per insn (64.01%) - 0.021462604 seconds time elapsed +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.597207e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.600539e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.600539e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 4.569387 sec + 12,892,653,048 cycles # 2.819 GHz + 39,241,760,724 instructions # 3.04 insn per cycle + 4.574378716 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:12814) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141122E-004 Relative difference = 2.837299079287849e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe: Aborted - 39,374,385 cycles:u # 1.907 GHz (61.30%) - 58,366 stalled-cycles-frontend:u # 0.15% frontend cycles idle (61.30%) - 361,633 stalled-cycles-backend:u # 0.92% backend cycles idle (56.85%) - 49,010,405 instructions:u # 1.24 insn per cycle - # 0.01 stalled cycles per insn (76.20%) - 0.021854446 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11041) (512y: 0) (512z: 0) +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.978625e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.995447e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.995447e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 2.065609 sec + 5,559,302,417 cycles # 2.687 GHz + 13,789,202,442 instructions # 2.48 insn per cycle + 2.071000161 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11059) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198157309E-004 Relative difference = 2.837296636563793e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 9.097696e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.119952e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.119952e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 1.812868 sec + 4,896,837,509 cycles # 2.695 GHz + 12,317,770,581 instructions # 2.52 insn per cycle + 1.818257681 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 9762) (512y: 94) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266731198157309E-004 +Relative difference = 2.837296636563793e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 6.967466e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.979997e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.979997e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 2.364567 sec + 4,060,623,360 cycles # 1.715 GHz + 6,286,167,500 instructions # 1.55 insn per cycle + 2.369629620 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1516) (512y: 94) (512z: 9019) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266731198157309E-004 +Relative difference = 2.837296636563793e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_rmbhst.txt index 6617ca121c..de6151d7b3 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_rmbhst.txt @@ -1,181 +1,226 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasNoCurand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) +RNDGEN=hasCurand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-01-28_13:52:34 +DATE: 2024-01-30_05:54:28 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --rmbhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+MESDEV/none+NAVBRK +WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.455736e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.587953e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.589041e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 0.644943 sec - 1,960,906,381 cycles:u # 2.930 GHz (74.99%) - 2,678,732 stalled-cycles-frontend:u # 0.14% frontend cycles idle (75.16%) - 41,502,500 stalled-cycles-backend:u # 2.12% backend cycles idle (75.09%) - 2,164,012,883 instructions:u # 1.10 insn per cycle - # 0.02 stalled cycles per insn (74.77%) - 0.685694018 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 3.181803e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.496640e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.499302e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 0.513215 sec + 2,110,243,378 cycles # 2.841 GHz + 3,364,158,559 instructions # 1.59 insn per cycle + 0.803846009 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --rmbhst +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --rmbhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+MESDEV/none+NAVBRK +WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.214834e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.247081e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.247142e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.252232e+02 +- 1.234346e+02 ) GeV^-4 -TOTAL : 8.477859 sec - 29,184,033,298 cycles:u # 3.428 GHz (74.95%) - 23,042,044 stalled-cycles-frontend:u # 0.08% frontend cycles idle (74.96%) - 1,124,504,964 stalled-cycles-backend:u # 3.85% backend cycles idle (75.01%) - 23,480,301,219 instructions:u # 0.80 insn per cycle - # 0.05 stalled cycles per insn (75.04%) - 8.533410082 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 3.724341e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.176977e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.178501e+05 ) sec^-1 +MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 +TOTAL : 3.211617 sec + 9,930,008,722 cycles # 2.863 GHz + 21,629,593,771 instructions # 2.18 insn per cycle + 3.536993543 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 6.626675e-04 -Avg ME (F77/CUDA) = 6.6266731198158101E-004 -Relative difference = 2.837296517127185e-07 +Avg ME (F77/CUDA) = 6.6266731198158133E-004 +Relative difference = 2.837296512218831e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.203599e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.204480e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.204480e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 7.450038 sec - 26,153,434,105 cycles:u # 3.501 GHz (74.94%) - 21,271,365 stalled-cycles-frontend:u # 0.08% frontend cycles idle (74.95%) - 3,954,646,940 stalled-cycles-backend:u # 15.12% backend cycles idle (75.00%) - 81,744,725,764 instructions:u # 3.13 insn per cycle - # 0.05 stalled cycles per insn (75.05%) - 7.472964771 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.795958e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.796814e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.796814e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 9.141774 sec + 26,442,082,623 cycles # 2.892 GHz + 81,755,899,902 instructions # 3.09 insn per cycle + 9.146895276 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 6614) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141133E-004 Relative difference = 2.8372990776517314e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.015183e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.019740e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.019740e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 3.277876 sec - 11,548,473,130 cycles:u # 3.501 GHz (74.92%) - 1,063,310 stalled-cycles-frontend:u # 0.01% frontend cycles idle (75.03%) - 1,663,417,717 stalled-cycles-backend:u # 14.40% backend cycles idle (75.03%) - 39,248,215,923 instructions:u # 3.40 insn per cycle - # 0.04 stalled cycles per insn (75.03%) - 3.300947686 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.584252e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.587667e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.587667e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 4.585664 sec + 12,903,354,074 cycles # 2.812 GHz + 39,243,037,589 instructions # 3.04 insn per cycle + 4.591083081 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:12814) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141122E-004 Relative difference = 2.837299079287849e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.197867e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.200436e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.200436e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 1.376750 sec - 4,853,545,908 cycles:u # 3.473 GHz (74.82%) - 745,705 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.82%) - 584,157,548 stalled-cycles-backend:u # 12.04% backend cycles idle (74.69%) - 13,847,035,835 instructions:u # 2.85 insn per cycle - # 0.04 stalled cycles per insn (74.97%) - 1.399345898 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11041) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.993074e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.009513e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.009513e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 2.061753 sec + 5,556,410,491 cycles # 2.690 GHz + 13,788,754,708 instructions # 2.48 insn per cycle + 2.066810636 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11059) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198157309E-004 Relative difference = 2.837296636563793e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 9.089775e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.111272e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.111272e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 1.814284 sec + 4,898,229,262 cycles # 2.694 GHz + 12,317,871,193 instructions # 2.51 insn per cycle + 1.819291757 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 9762) (512y: 94) (512z: 0) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266731198157309E-004 +Relative difference = 2.837296636563793e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 6.893591e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.906421e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.906421e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 2.392306 sec + 4,056,818,337 cycles # 1.695 GHz + 6,287,135,022 instructions # 1.55 insn per cycle + 2.397424437 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1516) (512y: 94) (512z: 9019) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266731198157309E-004 +Relative difference = 2.837296636563793e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd1.txt index d771088f9e..ce8b9bfd9b 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd1.txt @@ -1,181 +1,223 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasNoCurand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) +RNDGEN=hasCurand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd1' +CUDACPP_BUILDDIR='build.512y_d_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-01-28_13:14:00 +DATE: 2024-01-30_05:01:27 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/gcheck.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/gcheck.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.380880e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.440819e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.441063e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 0.526573 sec - 1,550,068,082 cycles:u # 2.831 GHz (74.80%) - 2,379,457 stalled-cycles-frontend:u # 0.15% frontend cycles idle (74.92%) - 41,567,895 stalled-cycles-backend:u # 2.68% backend cycles idle (75.11%) - 1,842,006,083 instructions:u # 1.19 insn per cycle - # 0.02 stalled cycles per insn (75.31%) - 0.569023795 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 3.464704e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.493618e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.496312e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 0.530379 sec + 2,191,459,528 cycles # 2.836 GHz + 3,378,194,635 instructions # 1.54 insn per cycle + 0.862349447 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/gcheck.exe -p 64 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.738074e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.743433e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.743544e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.252232e+02 +- 1.234346e+02 ) GeV^-4 -TOTAL : 7.022676 sec - 24,105,089,398 cycles:u # 3.417 GHz (74.92%) - 11,392,847 stalled-cycles-frontend:u # 0.05% frontend cycles idle (74.96%) - 1,121,387,423 stalled-cycles-backend:u # 4.65% backend cycles idle (75.04%) - 19,016,168,216 instructions:u # 0.79 insn per cycle - # 0.06 stalled cycles per insn (75.06%) - 7.072805960 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 4.136041e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.170363e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.171805e+05 ) sec^-1 +MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 +TOTAL : 3.033894 sec + 9,468,330,012 cycles # 2.874 GHz + 21,262,061,450 instructions # 2.25 insn per cycle + 3.350179318 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 6.626675e-04 -Avg ME (F77/CUDA) = 6.6266731198158101E-004 -Relative difference = 2.837296517127185e-07 +Avg ME (F77/CUDA) = 6.6266731198158133E-004 +Relative difference = 2.837296512218831e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.209445e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.210340e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.210340e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 7.430105 sec - 26,085,936,101 cycles:u # 3.501 GHz (74.98%) - 11,236,918 stalled-cycles-frontend:u # 0.04% frontend cycles idle (74.98%) - 3,350,397,424 stalled-cycles-backend:u # 12.84% backend cycles idle (74.99%) - 81,795,777,968 instructions:u # 3.14 insn per cycle - # 0.04 stalled cycles per insn (74.99%) - 7.453494108 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.798336e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.799213e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.799213e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 9.130694 sec + 26,439,863,153 cycles # 2.895 GHz + 81,781,637,155 instructions # 3.09 insn per cycle + 9.163718345 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 6589) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141133E-004 Relative difference = 2.8372990776517314e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd1/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd1/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.970390e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.974839e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.974839e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 3.306999 sec - 11,648,485,988 cycles:u # 3.500 GHz (74.98%) - 9,866,301 stalled-cycles-frontend:u # 0.08% frontend cycles idle (75.00%) - 1,494,462,993 stalled-cycles-backend:u # 12.83% backend cycles idle (75.00%) - 39,254,097,381 instructions:u # 3.37 insn per cycle - # 0.04 stalled cycles per insn (75.00%) - 3.331484334 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.559639e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.562995e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.562995e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 4.616847 sec + 12,919,257,236 cycles # 2.796 GHz + 39,249,733,665 instructions # 3.04 insn per cycle + 4.636578065 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:12771) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141122E-004 Relative difference = 2.837299079287849e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.197117e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.199676e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.199676e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 1.377372 sec - 4,863,607,601 cycles:u # 3.477 GHz (74.84%) - 1,813,839 stalled-cycles-frontend:u # 0.04% frontend cycles idle (74.84%) - 599,987,605 stalled-cycles-backend:u # 12.34% backend cycles idle (74.84%) - 13,847,758,852 instructions:u # 2.85 insn per cycle - # 0.04 stalled cycles per insn (74.89%) - 1.401964703 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11030) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 8.030089e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.046612e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.046612e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 2.052281 sec + 5,556,604,473 cycles # 2.701 GHz + 13,805,088,947 instructions # 2.48 insn per cycle + 2.071717259 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11048) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198157309E-004 Relative difference = 2.837296636563793e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd1/check.exe -p 64 256 1 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 9.135265e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.157006e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.157006e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 1.804981 sec + 4,885,090,375 cycles # 2.700 GHz + 12,330,030,988 instructions # 2.52 insn per cycle + 1.821790981 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 9736) (512y: 94) (512z: 0) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266731198157309E-004 +Relative difference = 2.837296636563793e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd1/check.exe -p 64 256 1 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 6.917661e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.930225e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.930225e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 2.381269 sec + 4,053,625,505 cycles # 1.699 GHz + 6,293,972,632 instructions # 1.55 insn per cycle + 2.398074513 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1497) (512y: 94) (512z: 9019) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266731198157309E-004 +Relative difference = 2.837296636563793e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd0.txt index 5a0d1e0231..466f11943e 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd0.txt @@ -1,181 +1,223 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasNoCurand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) +RNDGEN=hasCurand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.avx2_d_inl1_hrd0' +CUDACPP_BUILDDIR='build.512y_d_inl1_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512y_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-01-28_13:33:33 +DATE: 2024-01-30_05:37:36 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/gcheck.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/gcheck.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.432864e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.607586e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.609131e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 0.647725 sec - 1,965,032,551 cycles:u # 2.938 GHz (74.40%) - 2,457,806 stalled-cycles-frontend:u # 0.13% frontend cycles idle (74.84%) - 41,722,257 stalled-cycles-backend:u # 2.12% backend cycles idle (74.83%) - 2,120,082,784 instructions:u # 1.08 insn per cycle - # 0.02 stalled cycles per insn (75.87%) - 0.693866618 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 3.224805e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.249067e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.252384e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 0.539977 sec + 2,169,108,553 cycles # 2.827 GHz + 3,309,870,321 instructions # 1.53 insn per cycle + 0.827427226 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/gcheck.exe -p 64 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/gcheck.exe -p 2048 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/gcheck.exe -p 2048 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.246947e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.249779e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.249840e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.252232e+02 +- 1.234346e+02 ) GeV^-4 -TOTAL : 8.369635 sec - 28,801,506,279 cycles:u # 3.428 GHz (74.94%) - 11,544,325 stalled-cycles-frontend:u # 0.04% frontend cycles idle (75.01%) - 1,125,531,027 stalled-cycles-backend:u # 3.91% backend cycles idle (75.06%) - 22,691,924,055 instructions:u # 0.79 insn per cycle - # 0.05 stalled cycles per insn (75.11%) - 8.425153536 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 3.771192e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.799798e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.801021e+05 ) sec^-1 +MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 +TOTAL : 3.309705 sec + 10,258,793,873 cycles # 2.876 GHz + 23,623,503,831 instructions # 2.30 insn per cycle + 3.624842760 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 6.626675e-04 -Avg ME (F77/CUDA) = 6.6266731198158101E-004 -Relative difference = 2.837296517127185e-07 +Avg ME (F77/CUDA) = 6.6266731198158122E-004 +Relative difference = 2.837296513854949e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 4.588775e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.589159e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.589159e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 35.747768 sec - 125,373,892,272 cycles:u # 3.505 GHz (75.00%) - 90,420,899 stalled-cycles-frontend:u # 0.07% frontend cycles idle (75.00%) - 18,811,489,489 stalled-cycles-backend:u # 15.00% backend cycles idle (74.99%) - 141,480,205,816 instructions:u # 1.13 insn per cycle - # 0.13 stalled cycles per insn (75.00%) - 35.771306562 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:21543) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 4.186471e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.186937e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.186937e+02 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 39.185516 sec + 112,945,518,025 cycles # 2.882 GHz + 141,519,786,794 instructions # 1.25 insn per cycle + 39.190901211 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:21365) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198140461E-004 Relative difference = 2.8372991790910424e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.643252e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.645635e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.645635e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 4.509497 sec - 15,846,751,189 cycles:u # 3.497 GHz (74.93%) - 1,161,342 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.93%) - 6,805,781,414 stalled-cycles-backend:u # 42.95% backend cycles idle (74.94%) - 37,571,663,160 instructions:u # 2.37 insn per cycle - # 0.18 stalled cycles per insn (74.99%) - 4.534445045 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.072790e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.075243e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.075243e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 5.347436 sec + 14,950,247,924 cycles # 2.794 GHz + 37,533,141,644 instructions # 2.51 insn per cycle + 5.352716029 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:68052) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141220E-004 Relative difference = 2.837299064562788e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.568945e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.579223e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.579223e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 2.174735 sec - 7,662,049,435 cycles:u # 3.489 GHz (74.93%) - 1,838,193 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.87%) - 4,393,495,514 stalled-cycles-backend:u # 57.34% backend cycles idle (74.87%) - 12,965,579,204 instructions:u # 1.69 insn per cycle - # 0.34 stalled cycles per insn (74.89%) - 2.199658630 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:46575) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.349404e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.363561e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.363561e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 2.242056 sec + 6,032,020,393 cycles # 2.685 GHz + 12,947,712,227 instructions # 2.15 insn per cycle + 2.247452761 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:46593) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198156778E-004 Relative difference = 2.837296716733571e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd0/check.exe -p 64 256 1 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 8.895381e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.916043e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.916043e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 1.857617 sec + 4,999,907,297 cycles # 2.689 GHz + 11,364,404,504 instructions # 2.27 insn per cycle + 1.863061758 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:40158) (512y: 279) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266731198156778E-004 +Relative difference = 2.837296716733571e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd0/check.exe -p 64 256 1 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.220172e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.234094e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.234094e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 2.282224 sec + 3,899,980,695 cycles # 1.706 GHz + 5,854,430,419 instructions # 1.50 insn per cycle + 2.287473513 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2112) (512y: 142) (512z:39211) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd0/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266731198156789E-004 +Relative difference = 2.837296715097453e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd1.txt index 4cdfc5e542..5156a1b6a3 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd1.txt @@ -1,181 +1,223 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasNoCurand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) +RNDGEN=hasCurand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.avx2_d_inl1_hrd1' +CUDACPP_BUILDDIR='build.512y_d_inl1_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512y_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-01-28_13:34:37 +DATE: 2024-01-30_05:38:48 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/gcheck.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/gcheck.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.418171e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.479591e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.480154e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 0.525474 sec - 1,515,282,519 cycles:u # 2.761 GHz (75.24%) - 2,314,963 stalled-cycles-frontend:u # 0.15% frontend cycles idle (75.16%) - 49,253,571 stalled-cycles-backend:u # 3.25% backend cycles idle (75.70%) - 1,836,383,781 instructions:u # 1.21 insn per cycle - # 0.03 stalled cycles per insn (75.69%) - 0.574068605 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 3.248555e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.273233e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.276174e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 0.534739 sec + 2,168,342,582 cycles # 2.838 GHz + 3,393,710,794 instructions # 1.57 insn per cycle + 0.822006178 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/gcheck.exe -p 64 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/gcheck.exe -p 2048 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/gcheck.exe -p 2048 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.738621e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.744167e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.744286e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.252232e+02 +- 1.234346e+02 ) GeV^-4 -TOTAL : 7.027729 sec - 24,090,392,814 cycles:u # 3.412 GHz (74.96%) - 11,578,237 stalled-cycles-frontend:u # 0.05% frontend cycles idle (74.97%) - 1,124,890,854 stalled-cycles-backend:u # 4.67% backend cycles idle (75.06%) - 18,999,230,554 instructions:u # 0.79 insn per cycle - # 0.06 stalled cycles per insn (75.08%) - 7.079579939 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 3.787191e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.816239e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.817488e+05 ) sec^-1 +MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 +TOTAL : 3.282278 sec + 10,172,932,501 cycles # 2.876 GHz + 20,641,658,708 instructions # 2.03 insn per cycle + 3.596374566 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 6.626675e-04 -Avg ME (F77/CUDA) = 6.6266731198158101E-004 -Relative difference = 2.837296517127185e-07 +Avg ME (F77/CUDA) = 6.6266731198158122E-004 +Relative difference = 2.837296513854949e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 4.525947e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.526320e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.526320e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 36.243408 sec - 127,088,498,504 cycles:u # 3.505 GHz (74.99%) - 73,503,075 stalled-cycles-frontend:u # 0.06% frontend cycles idle (75.00%) - 17,996,871,744 stalled-cycles-backend:u # 14.16% backend cycles idle (75.01%) - 141,684,901,564 instructions:u # 1.11 insn per cycle - # 0.13 stalled cycles per insn (75.01%) - 36.267040644 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:21831) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 4.152053e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.152498e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.152498e+02 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 39.509903 sec + 113,989,864,763 cycles # 2.886 GHz + 141,709,117,860 instructions # 1.24 insn per cycle + 39.515181315 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:21615) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198140461E-004 Relative difference = 2.8372991790910424e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.601252e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.603641e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.603641e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 4.561717 sec - 16,042,436,101 cycles:u # 3.500 GHz (74.91%) - 11,366,493 stalled-cycles-frontend:u # 0.07% frontend cycles idle (74.93%) - 6,568,426,217 stalled-cycles-backend:u # 40.94% backend cycles idle (75.00%) - 37,595,433,276 instructions:u # 2.34 insn per cycle - # 0.17 stalled cycles per insn (75.04%) - 4.586797272 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.077703e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.080226e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.080226e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 5.341972 sec + 14,900,472,017 cycles # 2.788 GHz + 37,594,155,695 instructions # 2.52 insn per cycle + 5.347186768 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:68056) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141220E-004 Relative difference = 2.837299064562788e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.703634e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.714267e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.714267e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 2.136636 sec - 7,530,800,749 cycles:u # 3.489 GHz (74.86%) - 765,598 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.80%) - 4,267,023,774 stalled-cycles-backend:u # 56.66% backend cycles idle (74.83%) - 12,869,855,404 instructions:u # 1.71 insn per cycle - # 0.33 stalled cycles per insn (74.99%) - 2.161501278 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:45645) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.479123e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.493428e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.493428e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 2.203003 sec + 5,937,038,542 cycles # 2.690 GHz + 12,831,821,287 instructions # 2.16 insn per cycle + 2.208347742 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:45663) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198156778E-004 Relative difference = 2.837296716733571e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd1/check.exe -p 64 256 1 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 8.959391e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.980227e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.980227e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 1.840604 sec + 4,989,362,539 cycles # 2.704 GHz + 11,359,801,014 instructions # 2.28 insn per cycle + 1.846082122 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:39855) (512y: 212) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd1/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266731198156778E-004 +Relative difference = 2.837296716733571e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd1/check.exe -p 64 256 1 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.264695e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.278525e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.278525e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 2.267781 sec + 3,893,427,498 cycles # 1.714 GHz + 5,843,815,532 instructions # 1.50 insn per cycle + 2.273034135 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1687) (512y: 116) (512z:38946) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd1/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266731198156789E-004 +Relative difference = 2.837296715097453e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt index 95d88aeb3b..aecab864cd 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt @@ -1,181 +1,223 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasNoCurand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) +RNDGEN=hasCurand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-01-28_13:14:29 +DATE: 2024-01-30_05:02:06 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.571689e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.781012e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.782456e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.202247e-01 +- 3.251485e-01 ) GeV^-4 -TOTAL : 0.433434 sec - 1,206,803,698 cycles:u # 2.663 GHz (75.31%) - 2,902,559 stalled-cycles-frontend:u # 0.24% frontend cycles idle (75.13%) - 50,941,529 stalled-cycles-backend:u # 4.22% backend cycles idle (75.27%) - 1,597,895,307 instructions:u # 1.32 insn per cycle - # 0.03 stalled cycles per insn (75.87%) - 0.476222044 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 6.329622e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.381296e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.387810e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4 +TOTAL : 0.486367 sec + 1,996,254,093 cycles # 2.831 GHz + 2,951,017,935 instructions # 1.48 insn per cycle + 0.792595596 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.697888e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.731044e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.731472e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.213664e+02 +- 1.195366e+02 ) GeV^-4 -TOTAL : 3.294945 sec - 11,116,405,588 cycles:u # 3.346 GHz (74.89%) - 27,989,650 stalled-cycles-frontend:u # 0.25% frontend cycles idle (75.02%) - 1,135,046,392 stalled-cycles-backend:u # 10.21% backend cycles idle (74.96%) - 8,992,520,587 instructions:u # 0.81 insn per cycle - # 0.13 stalled cycles per insn (74.96%) - 3.342805006 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 8.619469e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.695026e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.698446e+05 ) sec^-1 +MeanMatrixElemValue = ( 6.664703e+00 +- 5.072736e+00 ) GeV^-4 +TOTAL : 1.718352 sec + 5,604,348,056 cycles # 2.870 GHz + 11,484,891,091 instructions # 2.05 insn per cycle + 2.010002941 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 6.626791e-04 -Avg ME (F77/CUDA) = 6.6270899361878938E-004 -Relative difference = 4.511024836808726e-05 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 6.626454e-04 +Avg ME (F77/CUDA) = 6.6262659968156085E-004 +Relative difference = 2.8371612387547027e-05 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.453805e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.454845e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.454845e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.208458e-01 +- 3.253446e-01 ) GeV^-4 -TOTAL : 6.689474 sec - 23,500,390,704 cycles:u # 3.502 GHz (74.96%) - 1,349,268 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.96%) - 2,707,310,950 stalled-cycles-backend:u # 11.52% backend cycles idle (74.96%) - 75,903,291,879 instructions:u # 3.23 insn per cycle - # 0.04 stalled cycles per insn (74.97%) - 6.712419100 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.963446e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.964435e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.964435e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4 +TOTAL : 8.362127 sec + 24,202,873,915 cycles # 2.893 GHz + 75,878,244,924 instructions # 3.14 insn per cycle + 8.372784572 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 3898) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627487e-04 -Avg ME (F77/C++) = 6.6274866115424713E-004 -Relative difference = 5.861309557415831e-08 +Avg ME (F77/C++) = 6.6274870439686495E-004 +Relative difference = 6.634286759220428e-09 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 9.874746e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.892103e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.892103e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.208459e-01 +- 3.253446e-01 ) GeV^-4 -TOTAL : 1.666769 sec - 5,880,877,173 cycles:u # 3.485 GHz (74.88%) - 736,307 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.89%) - 919,739,361 stalled-cycles-backend:u # 15.64% backend cycles idle (74.89%) - 20,182,215,345 instructions:u # 3.43 insn per cycle - # 0.05 stalled cycles per insn (74.78%) - 1.690998280 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.122204e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.135618e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.135618e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.060119e+00 +- 2.367901e+00 ) GeV^-4 +TOTAL : 2.311249 sec + 6,498,315,380 cycles # 2.806 GHz + 20,115,878,445 instructions # 3.10 insn per cycle + 2.327706318 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:13237) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627485e-04 -Avg ME (F77/C++) = 6.6274845946848876E-004 -Relative difference = 6.115670001294808e-08 +Avg ME (F77/C++) = 6.6274853360924479E-004 +Relative difference = 5.071191384964548e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.349433e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.359555e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.359555e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.214980e-01 +- 3.255523e-01 ) GeV^-4 -TOTAL : 0.704009 sec - 2,508,283,415 cycles:u # 3.459 GHz (74.63%) - 560,879 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.63%) - 244,621,289 stalled-cycles-backend:u # 9.75% backend cycles idle (74.63%) - 7,097,199,877 instructions:u # 2.83 insn per cycle - # 0.03 stalled cycles per insn (74.72%) - 0.728264552 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11586) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.585863e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.592266e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.592266e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 +TOTAL : 1.042848 sec + 2,820,748,390 cycles # 2.693 GHz + 7,038,277,049 instructions # 2.50 insn per cycle + 1.060611053 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11604) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627195e-04 -Avg ME (F77/C++) = 6.6271947045332125E-004 -Relative difference = 4.4583988847766445e-08 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.627193e-04 +Avg ME (F77/C++) = 6.6271927529261421E-004 +Relative difference = 3.728182620967159e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe -p 64 256 1 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.805764e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.814413e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.814413e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 +TOTAL : 0.916917 sec + 2,479,527,909 cycles # 2.691 GHz + 6,280,728,930 instructions # 2.53 insn per cycle + 0.937569165 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10320) (512y: 50) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.627193e-04 +Avg ME (F77/C++) = 6.6271927529261421E-004 +Relative difference = 3.728182620967159e-08 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe -p 64 256 1 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.395801e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.400853e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.400853e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4 +TOTAL : 1.183787 sec + 2,037,112,677 cycles # 1.714 GHz + 3,249,000,234 instructions # 1.59 insn per cycle + 1.203517458 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2165) (512y: 48) (512z: 9219) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.627195e-04 +Avg ME (F77/C++) = 6.6271952818273971E-004 +Relative difference = 4.252589469696448e-08 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_bridge.txt index 122bc95d94..cfd5bd9f60 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_bridge.txt @@ -1,190 +1,240 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasNoCurand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) +RNDGEN=hasCurand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-01-28_13:47:53 +DATE: 2024-01-30_05:49:44 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) WARNING! 
Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.592507e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.767585e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.767585e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.202335e-01 +- 3.251521e-01 ) GeV^-4 -TOTAL : 0.438826 sec - 1,210,310,947 cycles:u # 2.619 GHz (75.63%) - 3,071,241 stalled-cycles-frontend:u # 0.25% frontend cycles idle (75.78%) - 46,204,045 stalled-cycles-backend:u # 3.82% backend cycles idle (75.54%) - 1,612,879,644 instructions:u # 1.33 insn per cycle - # 0.03 stalled cycles per insn (74.77%) - 0.482142933 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 5.575134e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.304295e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.304295e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.048178e+00 +- 2.364571e+00 ) GeV^-4 +TOTAL : 0.470892 sec + 1,938,590,562 cycles # 2.832 GHz + 2,932,139,577 instructions # 1.51 insn per cycle + 0.742517096 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) +WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.268741e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.711405e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.711405e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.213799e+02 +- 1.195366e+02 ) GeV^-4 -TOTAL : 3.442199 sec - 11,495,778,661 cycles:u # 3.309 GHz (74.91%) - 38,646,041 stalled-cycles-frontend:u # 0.34% frontend cycles idle (75.05%) - 1,141,934,285 stalled-cycles-backend:u # 9.93% backend cycles idle (75.05%) - 9,990,513,371 instructions:u # 0.87 insn per cycle - # 0.11 stalled cycles per insn (75.02%) - 3.496820444 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 7.189558e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.483327e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.483327e+05 ) sec^-1 +MeanMatrixElemValue = ( 6.641710e+00 +- 4.994249e+00 ) GeV^-4 +TOTAL : 1.911946 sec + 6,179,624,048 cycles # 2.874 GHz + 12,701,880,125 instructions # 2.06 insn per cycle + 2.209063416 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 6.626791e-04 -Avg ME (F77/CUDA) = 6.6270899361878938E-004 -Relative difference = 4.511024836808726e-05 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 6.626454e-04 +Avg ME (F77/CUDA) = 6.6262659968156085E-004 +Relative difference = 2.8371612387547027e-05 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.452635e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.453690e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.453690e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.208458e-01 +- 3.253446e-01 ) GeV^-4 -TOTAL : 6.694669 sec - 23,515,293,319 cycles:u # 3.502 GHz (74.99%) - 1,375,662 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.99%) - 2,849,268,325 stalled-cycles-backend:u # 12.12% backend cycles idle (74.99%) - 75,874,654,450 instructions:u # 3.23 insn per cycle - # 0.04 stalled cycles per insn (74.99%) - 6.718179574 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.966573e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.967552e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.967552e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4 +TOTAL : 8.353267 sec + 24,210,307,332 cycles # 2.898 GHz + 75,882,231,103 instructions # 3.13 insn per cycle + 8.358202878 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 3898) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627487e-04 -Avg ME (F77/C++) = 6.6274866115424713E-004 -Relative difference = 5.861309557415831e-08 +Avg ME (F77/C++) = 6.6274870439686495E-004 +Relative difference = 6.634286759220428e-09 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 9.888970e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.906318e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.906318e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.208459e-01 +- 3.253446e-01 ) GeV^-4 -TOTAL : 1.666375 sec - 5,877,271,334 cycles:u # 3.482 GHz (74.88%) - 717,296 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.88%) - 874,212,055 stalled-cycles-backend:u # 14.87% backend cycles idle (74.88%) - 20,187,147,538 instructions:u # 3.43 insn per cycle - # 0.04 stalled cycles per insn (74.74%) - 1.691273437 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.010932e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.023878e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.023878e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.060119e+00 +- 2.367901e+00 ) GeV^-4 +TOTAL : 2.350780 sec + 6,507,988,967 cycles # 2.764 GHz + 20,124,211,431 instructions # 3.09 insn per cycle + 2.355993372 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:13237) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627485e-04 -Avg ME (F77/C++) = 6.6274845946848876E-004 -Relative difference = 6.115670001294808e-08 +Avg ME (F77/C++) = 6.6274853360924479E-004 +Relative difference = 5.071191384964548e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.354855e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.364991e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.364991e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.214980e-01 +- 3.255523e-01 ) GeV^-4 -TOTAL : 0.704324 sec - 2,502,599,529 cycles:u # 3.449 GHz (74.65%) - 1,026,743 stalled-cycles-frontend:u # 0.04% frontend cycles idle (74.65%) - 245,383,676 stalled-cycles-backend:u # 9.81% backend cycles idle (74.65%) - 7,099,256,311 instructions:u # 2.84 insn per cycle - # 0.03 stalled cycles per insn (74.74%) - 0.729305773 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11586) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.585110e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.591932e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.591932e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 +TOTAL : 1.046222 sec + 2,830,060,229 cycles # 2.694 GHz + 7,047,238,365 instructions # 2.49 insn per cycle + 1.051506977 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11604) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627195e-04 -Avg ME (F77/C++) = 6.6271947045332125E-004 -Relative difference = 4.4583988847766445e-08 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.627193e-04 +Avg ME (F77/C++) = 6.6271927529261421E-004 +Relative difference = 3.728182620967159e-08 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! 
Instantiate host Bridge (nevt=16384) +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.805765e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.814390e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.814390e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 +TOTAL : 0.919776 sec + 2,488,595,721 cycles # 2.693 GHz + 6,289,461,030 instructions # 2.53 insn per cycle + 0.925186931 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10320) (512y: 50) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.627193e-04 +Avg ME (F77/C++) = 6.6271927529261421E-004 +Relative difference = 3.728182620967159e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! Instantiate host Bridge (nevt=16384) +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.390787e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.395884e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.395884e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4 +TOTAL : 1.191044 sec + 2,045,888,825 cycles # 1.712 GHz + 3,258,286,239 instructions # 1.59 insn per cycle + 1.196330024 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2165) (512y: 48) (512z: 9219) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
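(Aside on the Bridge warnings in this log: the grid parameters echoed there are consistent with "-p <gpublocks> <gputhreads> <iterations>", with the number of events per iteration equal to the product of the first two arguments, e.g. nevt=16384 for "-p 64 256 1". A tiny sketch of that constraint; the flag interpretation is inferred from the warnings, not from the tool's documentation.)

#include <cassert>
#include <cstdio>

int main()
{
  const int gpublocks = 64, gputhreads = 256; // "-p 64 256 1"
  const int nevt = gpublocks * gputhreads;    // events per iteration
  assert( nevt == 16384 );                    // matches "nevt=16384" in the warnings above
  std::printf( "nevt = %d\n", nevt );
  return 0;
}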
------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.627195e-04 +Avg ME (F77/C++) = 6.6271952818273971E-004 +Relative difference = 4.252589469696448e-08 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_common.txt index 6f501681db..18818d76f2 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_common.txt @@ -1,181 +1,223 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasNoCurand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) +RNDGEN=hasCurand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-01-28_13:56:58 +DATE: 2024-01-30_06:02:02 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.567605e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.783158e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.783801e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.202247e-01 +- 3.251485e-01 ) GeV^-4 -TOTAL : 0.433304 sec - 1,203,789,126 cycles:u # 2.646 GHz (75.11%) - 2,876,739 stalled-cycles-frontend:u # 0.24% frontend cycles idle (75.13%) - 47,664,150 stalled-cycles-backend:u # 3.96% backend cycles idle (75.36%) - 1,559,004,535 instructions:u # 1.30 insn per cycle - # 0.03 stalled cycles per insn (75.65%) - 0.473764291 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 6.319163e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.372298e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.378244e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.159397e-01 +- 3.238804e-01 ) GeV^-4 +TOTAL : 0.470285 sec + 1,953,187,910 cycles # 2.826 GHz + 2,879,626,230 instructions # 1.47 insn per cycle + 0.750725256 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --common +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.694456e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.725395e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.725816e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.213664e+02 +- 1.195366e+02 ) GeV^-4 -TOTAL : 3.304741 sec - 11,130,877,093 cycles:u # 3.339 GHz (74.92%) - 27,954,603 stalled-cycles-frontend:u # 0.25% frontend cycles idle (75.09%) - 1,134,042,955 stalled-cycles-backend:u # 10.19% backend cycles idle (75.10%) - 9,015,184,927 instructions:u # 0.81 insn per cycle - # 0.13 stalled cycles per insn (75.03%) - 3.353523598 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 8.571852e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.645768e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.649137e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.094367e+02 +- 1.071509e+02 ) GeV^-4 +TOTAL : 1.807519 sec + 5,850,952,354 cycles # 2.861 GHz + 11,909,032,858 instructions # 2.04 insn per cycle + 2.113707665 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 6.626791e-04 -Avg ME (F77/CUDA) = 6.6270899361878938E-004 -Relative difference = 4.511024836808726e-05 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 6.626454e-04 +Avg ME (F77/CUDA) = 6.6262659968156085E-004 +Relative difference = 2.8371612387547027e-05 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe -p 64 256 1 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe -p 64 256 1 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.452340e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.453385e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.453385e+03 ) sec^-1 +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.964186e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.965186e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.965186e+03 ) sec^-1 MeanMatrixElemValue = ( 4.208458e-01 +- 3.253446e-01 ) GeV^-4 -TOTAL : 6.693555 sec - 23,512,441,331 cycles:u # 3.502 GHz (74.98%) - 1,325,303 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.98%) - 2,795,705,162 stalled-cycles-backend:u # 11.89% backend cycles idle (74.98%) - 75,871,702,156 instructions:u # 3.23 insn per cycle - # 0.04 stalled cycles per insn (74.98%) - 6.716315041 seconds time elapsed +TOTAL : 8.362848 sec + 24,219,340,843 cycles # 2.896 GHz + 75,878,803,024 instructions # 3.13 insn per cycle + 8.367752014 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 3898) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627487e-04 -Avg ME (F77/C++) = 6.6274866115424713E-004 -Relative difference = 5.861309557415831e-08 +Avg ME (F77/C++) = 6.6274870439686495E-004 +Relative difference = 6.634286759220428e-09 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 1 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 1 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 9.836691e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.854293e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.854293e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.208459e-01 +- 3.253446e-01 ) GeV^-4 -TOTAL : 1.673211 sec - 5,901,611,042 cycles:u # 3.484 GHz (74.97%) - 2,928,497 stalled-cycles-frontend:u # 0.05% frontend cycles idle (74.97%) - 890,340,098 stalled-cycles-backend:u # 15.09% backend cycles idle (74.98%) - 20,135,891,180 instructions:u # 3.41 insn per cycle - # 0.04 stalled cycles per insn (74.98%) - 1.695783923 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.106063e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.119817e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.119817e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.208458e-01 +- 3.253446e-01 ) GeV^-4 +TOTAL : 2.317534 sec + 6,502,161,706 cycles # 2.801 GHz + 20,113,148,136 instructions # 3.09 insn per cycle + 2.322610994 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:13237) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627485e-04 -Avg ME (F77/C++) = 6.6274845946848876E-004 -Relative difference = 6.115670001294808e-08 +Avg ME (F77/C++) = 6.6274853360924479E-004 +Relative difference = 5.071191384964548e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 1 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 1 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.349784e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.359876e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.359876e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.214980e-01 +- 3.255523e-01 ) GeV^-4 -TOTAL : 0.703938 sec - 2,503,013,691 cycles:u # 3.454 GHz (74.62%) - 1,047,319 stalled-cycles-frontend:u # 0.04% frontend cycles idle (74.37%) - 252,615,998 stalled-cycles-backend:u # 10.09% backend cycles idle (74.38%) - 7,084,595,149 instructions:u # 2.83 insn per cycle - # 0.04 stalled cycles per insn (75.18%) - 0.726588217 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11586) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.586948e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.593562e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.593562e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.214979e-01 +- 3.255522e-01 ) GeV^-4 +TOTAL : 1.043186 sec + 2,822,730,977 cycles # 2.696 GHz + 7,035,059,102 instructions # 2.49 insn per cycle + 1.048122577 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11604) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627195e-04 -Avg ME (F77/C++) = 6.6271947045332125E-004 -Relative difference = 4.4583988847766445e-08 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.627193e-04 +Avg ME (F77/C++) = 6.6271927529261421E-004 +Relative difference = 3.728182620967159e-08 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe -p 64 256 1 --common OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.807119e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.816011e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.816011e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.214979e-01 +- 3.255522e-01 ) GeV^-4 +TOTAL : 0.917444 sec + 2,481,419,746 cycles # 2.693 GHz + 6,275,834,953 instructions # 2.53 insn per cycle + 0.922842065 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10320) (512y: 50) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.627193e-04 +Avg ME (F77/C++) = 6.6271927529261421E-004 +Relative difference = 3.728182620967159e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe -p 64 256 1 --common OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.399447e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.404609e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.404609e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.214981e-01 +- 3.255523e-01 ) GeV^-4 +TOTAL : 1.182503 sec + 2,042,245,375 cycles # 1.722 GHz + 3,246,419,225 instructions # 1.59 insn per cycle + 1.187753193 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2165) (512y: 48) (512z: 9219) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
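(Aside on the throughput lines: EvtsPerSec is of the order of events processed divided by elapsed time. Dividing the 16384 events of a "-p 64 256 1" run by the TOTAL wall time slightly underestimates the reported figure, since TOTAL also covers work outside the timed sections. A rough re-derivation for the 'none' (no-SIMD) run earlier in this log; this formula is an approximation, not the code's actual timer logic.)

#include <cstdio>

int main()
{
  const double nevt = 64.0 * 256.0; // events for "-p 64 256 1"
  const double totalSec = 8.362848; // TOTAL of the 'none' run above
  // ~1.959e+03, versus the reported EvtsPerSec[Rmb+ME] of 1.964186e+03
  std::printf( "approx EvtsPerSec = %.6e\n", nevt / totalSec );
  return 0;
}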
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.627195e-04 +Avg ME (F77/C++) = 6.6271952818273971E-004 +Relative difference = 4.252589469696448e-08 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_curhst.txt index 806e2754e6..e0bdb664e1 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_curhst.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_curhst.txt @@ -1,143 +1,223 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasNoCurand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) +RNDGEN=hasCurand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-01-28_13:54:33 +DATE: 2024-01-30_05:58:32 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --curhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --curhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe: Aborted - 53,369,864 cycles:u # 2.565 GHz (61.58%) - 40,139 stalled-cycles-frontend:u # 0.08% frontend cycles idle (61.59%) - 642,753 stalled-cycles-backend:u # 1.20% backend cycles idle (61.59%) - 43,640,815 instructions:u # 0.82 insn per cycle - # 0.01 stalled cycles per insn (63.54%) - 0.021718512 seconds time elapsed +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 6.316613e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.368482e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.375052e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4 +TOTAL : 0.466730 sec + 1,919,349,851 cycles # 2.829 GHz + 2,893,848,641 instructions # 1.51 insn per cycle + 0.736177730 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --curhst +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --curhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --curhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe: Aborted - 46,067,681 cycles:u # 2.248 GHz (60.99%) - 58,322 stalled-cycles-frontend:u # 0.13% frontend cycles idle (60.99%) - 517,074 stalled-cycles-backend:u # 1.12% backend cycles idle (60.99%) - 47,601,779 instructions:u # 1.03 insn per cycle - # 0.01 stalled cycles per insn (68.07%) - 0.021333955 seconds time elapsed +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 8.573214e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.646713e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.650131e+05 ) sec^-1 +MeanMatrixElemValue = ( 6.664703e+00 +- 5.072736e+00 ) GeV^-4 +TOTAL : 1.756106 sec + 5,695,142,145 cycles # 2.868 GHz + 11,326,470,226 instructions # 1.99 insn per cycle + 2.046387591 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 6.626791e-04 -Avg ME (F77/CUDA) = 6.6270899361878938E-004 -Relative difference = 4.511024836808726e-05 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 6.626454e-04 +Avg ME (F77/CUDA) = 6.6262659968156085E-004 +Relative difference = 2.8371612387547027e-05 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe: Aborted - 50,181,869 cycles:u # 2.416 GHz (61.53%) - 45,388 stalled-cycles-frontend:u # 0.09% frontend cycles idle (61.53%) - 590,959 stalled-cycles-backend:u # 1.18% backend cycles idle (61.53%) - 45,782,293 instructions:u # 0.91 insn per cycle - # 0.01 stalled cycles per insn (63.57%) - 0.024908860 seconds time elapsed +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.965319e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.966317e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.966317e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4 +TOTAL : 8.353416 sec + 24,206,918,909 cycles # 2.897 GHz + 75,878,282,077 instructions # 3.13 insn per cycle + 8.358253425 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 3898) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627487e-04 -Avg ME (F77/C++) = 6.6274866115424713E-004 -Relative difference = 5.861309557415831e-08 +Avg ME (F77/C++) = 6.6274870439686495E-004 +Relative difference = 6.634286759220428e-09 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe: Aborted - 39,368,932 cycles:u # 1.923 GHz (60.95%) - 59,541 stalled-cycles-frontend:u # 0.15% frontend cycles idle (60.95%) - 360,551 stalled-cycles-backend:u # 0.92% backend cycles idle (56.80%) - 48,866,933 instructions:u # 1.24 insn per cycle - # 0.01 stalled cycles per insn (76.32%) - 0.021733591 seconds time elapsed +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 6.994720e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.007761e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.007761e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.060119e+00 +- 2.367901e+00 ) GeV^-4 +TOTAL : 2.353332 sec + 6,524,875,303 cycles # 2.768 GHz + 20,114,868,262 instructions # 3.08 insn per cycle + 2.358279130 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:13237) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627485e-04 -Avg ME (F77/C++) = 6.6274845946848876E-004 -Relative difference = 6.115670001294808e-08 +Avg ME (F77/C++) = 6.6274853360924479E-004 +Relative difference = 5.071191384964548e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe: Aborted - 54,509,603 cycles:u # 2.643 GHz (61.24%) - 45,858 stalled-cycles-frontend:u # 0.08% frontend cycles idle (61.25%) - 657,609 stalled-cycles-backend:u # 1.21% backend cycles idle (61.25%) - 42,090,677 instructions:u # 0.77 insn per cycle - # 0.02 stalled cycles per insn (63.01%) - 0.021908360 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11586) (512y: 0) (512z: 0) +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.578556e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.585147e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.585147e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 +TOTAL : 1.047733 sec + 2,820,818,870 cycles # 2.682 GHz + 7,037,506,961 instructions # 2.49 insn per cycle + 1.053002937 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11604) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627195e-04 -Avg ME (F77/C++) = 6.6271947045332125E-004 -Relative difference = 4.4583988847766445e-08 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.627193e-04 +Avg ME (F77/C++) = 6.6271927529261421E-004 +Relative difference = 3.728182620967159e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.765542e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.773827e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.773827e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 +TOTAL : 0.937560 sec + 2,478,872,591 cycles # 2.633 GHz + 6,279,446,291 instructions # 2.53 insn per cycle + 0.942558881 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10320) (512y: 50) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.627193e-04 +Avg ME (F77/C++) = 6.6271927529261421E-004 +Relative difference = 3.728182620967159e-08 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.394421e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.399630e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.399630e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4 +TOTAL : 1.184942 sec + 2,037,351,256 cycles # 1.714 GHz + 3,247,924,134 instructions # 1.59 insn per cycle + 1.189828303 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2165) (512y: 48) (512z: 9219) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.627195e-04 +Avg ME (F77/C++) = 6.6271952818273971E-004 +Relative difference = 4.252589469696448e-08 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_rmbhst.txt index 8f4bea55a8..d4941d3986 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_rmbhst.txt @@ -1,181 +1,226 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasNoCurand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) +RNDGEN=hasCurand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-01-28_13:53:05 +DATE: 2024-01-30_05:55:07 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --rmbhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+MESDEV/none+NAVBRK +WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.594064e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.762645e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.763306e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.202335e-01 +- 3.251521e-01 ) GeV^-4 -TOTAL : 0.433891 sec - 1,224,430,513 cycles:u # 2.667 GHz (75.41%) - 2,966,487 stalled-cycles-frontend:u # 0.24% frontend cycles idle (75.62%) - 46,035,582 stalled-cycles-backend:u # 3.76% backend cycles idle (75.63%) - 1,580,446,250 instructions:u # 1.29 insn per cycle - # 0.03 stalled cycles per insn (75.55%) - 0.473902530 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 5.730552e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.395791e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.401561e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.048178e+00 +- 2.364571e+00 ) GeV^-4 +TOTAL : 0.472770 sec + 1,942,039,449 cycles # 2.839 GHz + 2,914,569,721 instructions # 1.50 insn per cycle + 0.744083634 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --rmbhst +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --rmbhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+MESDEV/none+NAVBRK +WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.299457e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.729068e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.729503e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.213799e+02 +- 1.195366e+02 ) GeV^-4 -TOTAL : 3.402827 sec - 11,510,047,857 cycles:u # 3.351 GHz (74.82%) - 38,841,732 stalled-cycles-frontend:u # 0.34% frontend cycles idle (75.05%) - 1,144,379,586 stalled-cycles-backend:u # 9.94% backend cycles idle (75.08%) - 9,781,410,998 instructions:u # 0.85 insn per cycle - # 0.12 stalled cycles per insn (75.20%) - 3.452964103 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 7.426812e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.621213e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.624728e+05 ) sec^-1 +MeanMatrixElemValue = ( 6.641710e+00 +- 4.994249e+00 ) GeV^-4 +TOTAL : 1.841459 sec + 5,951,272,102 cycles # 2.874 GHz + 12,317,260,326 instructions # 2.07 insn per cycle + 2.133121829 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 6.626791e-04 -Avg ME (F77/CUDA) = 6.6270899361878938E-004 -Relative difference = 4.511024836808726e-05 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 6.626454e-04 +Avg ME (F77/CUDA) = 6.6262659968156085E-004 +Relative difference = 2.8371612387547027e-05 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.454669e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.455720e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.455720e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.208458e-01 +- 3.253446e-01 ) GeV^-4 -TOTAL : 6.687141 sec - 23,500,859,274 cycles:u # 3.504 GHz (74.96%) - 1,332,192 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.96%) - 2,792,659,646 stalled-cycles-backend:u # 11.88% backend cycles idle (74.96%) - 75,931,914,319 instructions:u # 3.23 insn per cycle - # 0.04 stalled cycles per insn (74.97%) - 6.709819182 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.960613e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.961562e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.961562e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4 +TOTAL : 8.374119 sec + 24,216,955,817 cycles # 2.891 GHz + 75,878,033,044 instructions # 3.13 insn per cycle + 8.378947710 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 3898) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627487e-04 -Avg ME (F77/C++) = 6.6274866115424713E-004 -Relative difference = 5.861309557415831e-08 +Avg ME (F77/C++) = 6.6274870439686495E-004 +Relative difference = 6.634286759220428e-09 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 9.891791e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.909604e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.909604e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.208459e-01 +- 3.253446e-01 ) GeV^-4 -TOTAL : 1.664190 sec - 5,869,351,508 cycles:u # 3.483 GHz (74.84%) - 796,441 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.84%) - 858,273,259 stalled-cycles-backend:u # 14.62% backend cycles idle (74.74%) - 20,185,000,449 instructions:u # 3.44 insn per cycle - # 0.04 stalled cycles per insn (74.97%) - 1.687127609 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.136107e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.149132e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.149132e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.060119e+00 +- 2.367901e+00 ) GeV^-4 +TOTAL : 2.306790 sec + 6,504,696,579 cycles # 2.815 GHz + 20,114,676,918 instructions # 3.09 insn per cycle + 2.311724672 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:13237) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627485e-04 -Avg ME (F77/C++) = 6.6274845946848876E-004 -Relative difference = 6.115670001294808e-08 +Avg ME (F77/C++) = 6.6274853360924479E-004 +Relative difference = 5.071191384964548e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.353673e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.363916e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.363916e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.214980e-01 +- 3.255523e-01 ) GeV^-4 -TOTAL : 0.702805 sec - 2,501,986,684 cycles:u # 3.458 GHz (74.58%) - 1,043,648 stalled-cycles-frontend:u # 0.04% frontend cycles idle (74.58%) - 247,119,630 stalled-cycles-backend:u # 9.88% backend cycles idle (74.63%) - 7,101,537,921 instructions:u # 2.84 insn per cycle - # 0.03 stalled cycles per insn (74.88%) - 0.725376097 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11586) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.585387e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.592052e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.592052e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 +TOTAL : 1.043340 sec + 2,821,286,489 cycles # 2.694 GHz + 7,037,435,358 instructions # 2.49 insn per cycle + 1.048505999 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11604) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627195e-04 -Avg ME (F77/C++) = 6.6271947045332125E-004 -Relative difference = 4.4583988847766445e-08 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.627193e-04 +Avg ME (F77/C++) = 6.6271927529261421E-004 +Relative difference = 3.728182620967159e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.743919e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.751789e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.751789e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 +TOTAL : 0.949190 sec + 2,568,265,414 cycles # 2.694 GHz + 6,279,620,229 instructions # 2.45 insn per cycle + 0.954345697 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10320) (512y: 50) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.627193e-04 +Avg ME (F77/C++) = 6.6271927529261421E-004 +Relative difference = 3.728182620967159e-08 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.404393e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.409463e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.409463e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4 +TOTAL : 1.176805 sec + 2,037,562,738 cycles # 1.726 GHz + 3,247,895,210 instructions # 1.59 insn per cycle + 1.182054069 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2165) (512y: 48) (512z: 9219) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.627195e-04 +Avg ME (F77/C++) = 6.6271952818273971E-004 +Relative difference = 4.252589469696448e-08 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd1.txt index dadb2ed2c2..391ab3d24f 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd1.txt @@ -1,181 +1,223 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasNoCurand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) +RNDGEN=hasCurand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd1' +CUDACPP_BUILDDIR='build.512y_f_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-01-28_13:14:51 +DATE: 2024-01-30_05:02:36 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/gcheck.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/gcheck.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.550772e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.750895e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.752340e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.202247e-01 +- 3.251485e-01 ) GeV^-4 -TOTAL : 0.434974 sec - 1,204,988,361 cycles:u # 2.634 GHz (75.54%) - 3,007,154 stalled-cycles-frontend:u # 0.25% frontend cycles idle (75.35%) - 50,869,111 stalled-cycles-backend:u # 4.22% backend cycles idle (74.85%) - 1,617,036,515 instructions:u # 1.34 insn per cycle - # 0.03 stalled cycles per insn (74.93%) - 0.476981080 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 6.280133e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.331305e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.337921e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4 +TOTAL : 0.487324 sec + 1,984,658,948 cycles # 2.819 GHz + 2,919,152,547 instructions # 1.47 insn per cycle + 0.799038148 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/gcheck.exe -p 64 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.724634e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.755120e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.755557e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.213664e+02 +- 1.195366e+02 ) GeV^-4 -TOTAL : 3.286109 sec - 11,097,219,386 cycles:u # 3.349 GHz (74.96%) - 27,923,365 stalled-cycles-frontend:u # 0.25% frontend cycles idle (74.90%) - 1,135,219,349 stalled-cycles-backend:u # 10.23% backend cycles idle (74.93%) - 8,949,546,871 instructions:u # 0.81 insn per cycle - # 0.13 stalled cycles per insn (75.10%) - 3.333916008 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 8.572518e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.647175e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.650566e+05 ) sec^-1 +MeanMatrixElemValue = ( 6.664703e+00 +- 5.072736e+00 ) GeV^-4 +TOTAL : 1.731714 sec + 5,664,416,860 cycles # 2.869 GHz + 11,423,818,247 instructions # 2.02 insn per cycle + 2.033192051 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 6.626791e-04 -Avg ME (F77/CUDA) = 6.6270899361878938E-004 -Relative difference = 4.511024836808726e-05 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 6.626454e-04 +Avg ME (F77/CUDA) = 6.6262659968156085E-004 +Relative difference = 2.8371612387547027e-05 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.450904e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.451951e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.451951e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.208458e-01 +- 3.253446e-01 ) GeV^-4 -TOTAL : 6.697264 sec - 23,515,902,426 cycles:u # 3.501 GHz (74.99%) - 1,383,417 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.99%) - 2,323,931,551 stalled-cycles-backend:u # 9.88% backend cycles idle (75.00%) - 75,819,267,441 instructions:u # 3.22 insn per cycle - # 0.03 stalled cycles per insn (75.00%) - 6.720228011 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.928583e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.929543e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.929543e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4 +TOTAL : 8.512607 sec + 24,191,141,745 cycles # 2.843 GHz + 75,807,282,467 instructions # 3.13 insn per cycle + 8.524714483 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 3848) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627487e-04 -Avg ME (F77/C++) = 6.6274866108667618E-004 -Relative difference = 5.871505118544242e-08 +Avg ME (F77/C++) = 6.6274870430095556E-004 +Relative difference = 6.489572191632735e-09 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 9.891200e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.908824e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.908824e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.208459e-01 +- 3.253446e-01 ) GeV^-4 -TOTAL : 1.663830 sec - 5,882,752,910 cycles:u # 3.491 GHz (74.93%) - 765,449 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.84%) - 839,987,494 stalled-cycles-backend:u # 14.28% backend cycles idle (74.84%) - 20,180,285,105 instructions:u # 3.43 insn per cycle - # 0.04 stalled cycles per insn (74.67%) - 1.688465440 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.113368e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.126874e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.126874e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.060119e+00 +- 2.367901e+00 ) GeV^-4 +TOTAL : 2.313934 sec + 6,500,918,155 cycles # 2.804 GHz + 20,111,364,543 instructions # 3.09 insn per cycle + 2.332783497 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:13231) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627485e-04 -Avg ME (F77/C++) = 6.6274845946848876E-004 -Relative difference = 6.115670001294808e-08 +Avg ME (F77/C++) = 6.6274853360924479E-004 +Relative difference = 5.071191384964548e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.361789e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.371991e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.371991e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.214980e-01 +- 3.255523e-01 ) GeV^-4 -TOTAL : 0.700199 sec - 2,498,969,011 cycles:u # 3.464 GHz (74.61%) - 491,381 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.50%) - 310,187,854 stalled-cycles-backend:u # 12.41% backend cycles idle (74.23%) - 7,093,146,490 instructions:u # 2.84 insn per cycle - # 0.04 stalled cycles per insn (74.80%) - 0.724555225 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11569) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.589760e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.596530e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.596530e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 +TOTAL : 1.040223 sec + 2,815,442,217 cycles # 2.695 GHz + 7,038,519,370 instructions # 2.50 insn per cycle + 1.057514134 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11587) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627195e-04 -Avg ME (F77/C++) = 6.6271947045332125E-004 -Relative difference = 4.4583988847766445e-08 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.627193e-04 +Avg ME (F77/C++) = 6.6271927529261421E-004 +Relative difference = 3.728182620967159e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd1/check.exe -p 64 256 1 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.751311e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.759469e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.759469e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 +TOTAL : 0.945030 sec + 2,478,506,957 cycles # 2.610 GHz + 6,280,796,881 instructions # 2.53 insn per cycle + 0.988336476 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10302) (512y: 50) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.627193e-04 +Avg ME (F77/C++) = 6.6271927529261421E-004 +Relative difference = 3.728182620967159e-08 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd1/check.exe -p 64 256 1 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.386273e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.391271e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.391271e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4 +TOTAL : 1.191730 sec + 2,039,311,665 cycles # 1.704 GHz + 3,248,072,614 instructions # 1.59 insn per cycle + 1.208300824 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2140) (512y: 48) (512z: 9219) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.627195e-04 +Avg ME (F77/C++) = 6.6271952818273971E-004 +Relative difference = 4.252589469696448e-08 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd0.txt index 60de9f177c..77eae3ae9c 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd0.txt @@ -1,181 +1,223 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasNoCurand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) +RNDGEN=hasCurand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.avx2_f_inl1_hrd0' +CUDACPP_BUILDDIR='build.512y_f_inl1_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512y_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-01-28_13:35:40 +DATE: 2024-01-30_05:40:00 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/gcheck.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/gcheck.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.579542e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.779793e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.781143e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.202247e-01 +- 3.251485e-01 ) GeV^-4 -TOTAL : 0.431830 sec - 1,206,052,215 cycles:u # 2.657 GHz (75.12%) - 2,932,842 stalled-cycles-frontend:u # 0.24% frontend cycles idle (75.13%) - 51,275,589 stalled-cycles-backend:u # 4.25% backend cycles idle (75.31%) - 1,613,169,701 instructions:u # 1.34 insn per cycle - # 0.03 stalled cycles per insn (75.27%) - 0.476903668 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 5.547321e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.587547e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.593359e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4 +TOTAL : 0.493771 sec + 2,067,778,848 cycles # 2.808 GHz + 3,079,367,454 instructions # 1.49 insn per cycle + 0.793803934 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/gcheck.exe -p 64 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/gcheck.exe -p 2048 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/gcheck.exe -p 2048 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.695239e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.728316e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.728742e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.213664e+02 +- 1.195366e+02 ) GeV^-4 -TOTAL : 3.295663 sec - 11,138,698,712 cycles:u # 3.352 GHz (74.98%) - 27,875,365 stalled-cycles-frontend:u # 0.25% frontend cycles idle (74.98%) - 1,139,078,359 stalled-cycles-backend:u # 10.23% backend cycles idle (74.98%) - 9,010,928,708 instructions:u # 0.81 insn per cycle - # 0.13 stalled cycles per insn (74.95%) - 3.341936107 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 7.730139e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.790957e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.793762e+05 ) sec^-1 +MeanMatrixElemValue = ( 6.664703e+00 +- 5.072736e+00 ) GeV^-4 +TOTAL : 1.861027 sec + 6,035,258,548 cycles # 2.873 GHz + 13,088,532,370 instructions # 2.17 insn per cycle + 2.157681364 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 6.626791e-04 -Avg ME (F77/CUDA) = 6.6270899361878938E-004 -Relative difference = 4.511024836808726e-05 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 6.626454e-04 +Avg ME (F77/CUDA) = 6.6262660579844562E-004 +Relative difference = 2.836238137986709e-05 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 6.256291e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.256974e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.256974e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.204931e-01 +- 3.252404e-01 ) GeV^-4 -TOTAL : 26.220009 sec - 91,948,371,418 cycles:u # 3.504 GHz (75.00%) - 504,800,131 stalled-cycles-frontend:u # 0.55% frontend cycles idle (75.00%) - 5,921,064,920 stalled-cycles-backend:u # 6.44% backend cycles idle (75.00%) - 134,071,955,136 instructions:u # 1.46 insn per cycle - # 0.04 stalled cycles per insn (75.00%) - 26.243295454 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:16252) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 5.418156e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.418889e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.418889e+02 ) sec^-1 +MeanMatrixElemValue = ( 4.059968e+00 +- 2.367799e+00 ) GeV^-4 +TOTAL : 30.278912 sec + 87,193,893,967 cycles # 2.880 GHz + 133,999,567,781 instructions # 1.54 insn per cycle + 30.284052553 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:16123) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627534e-04 -Avg ME (F77/C++) = 6.6275340697351248E-004 -Relative difference = 1.052203199451665e-08 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.627535e-04 +Avg ME (F77/C++) = 6.6275354356437610E-004 +Relative difference = 6.573239683366044e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.174878e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.186794e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.186794e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.211992e-01 +- 3.254573e-01 ) GeV^-4 -TOTAL : 2.012411 sec - 7,095,681,533 cycles:u # 3.489 GHz (74.83%) - 5,456,142 stalled-cycles-frontend:u # 0.08% frontend cycles idle (74.83%) - 3,177,706,174 stalled-cycles-backend:u # 44.78% backend cycles idle (74.86%) - 19,220,540,740 instructions:u # 2.71 insn per cycle - # 0.17 stalled cycles per insn (75.01%) - 2.037081095 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 6.858617e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.871131e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.871131e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.059961e+00 +- 2.367791e+00 ) GeV^-4 +TOTAL : 2.400232 sec + 6,719,203,240 cycles # 2.795 GHz + 19,163,412,782 instructions # 2.85 insn per cycle + 2.405407499 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:68898) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627486e-04 -Avg ME (F77/C++) = 6.6274857053714997E-004 -Relative difference = 4.445554471174176e-08 +Avg ME (F77/C++) = 6.6274859783433532E-004 +Relative difference = 3.2677016209485094e-09 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.464288e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.468259e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.468259e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.211846e-01 +- 3.254638e-01 ) GeV^-4 -TOTAL : 1.126244 sec - 3,986,800,812 cycles:u # 3.474 GHz (74.91%) - 619,881 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.91%) - 2,256,644,385 stalled-cycles-backend:u # 56.60% backend cycles idle (74.91%) - 6,771,749,492 instructions:u # 1.70 insn per cycle - # 0.33 stalled cycles per insn (74.92%) - 1.150735594 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:48607) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.418642e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.423893e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.423893e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060903e+00 +- 2.367376e+00 ) GeV^-4 +TOTAL : 1.168526 sec + 3,140,858,608 cycles # 2.683 GHz + 6,747,205,943 instructions # 2.15 insn per cycle + 1.173847287 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:48625) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627274e-04 -Avg ME (F77/C++) = 6.6272735722101156E-004 -Relative difference = 6.454990161554483e-08 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.627272e-04 +Avg ME (F77/C++) = 6.6272724143469353E-004 +Relative difference = 6.252149235286529e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd0/check.exe -p 64 256 1 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.703185e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.710717e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.710717e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060903e+00 +- 2.367376e+00 ) GeV^-4 +TOTAL : 0.972109 sec + 2,610,520,883 cycles # 2.675 GHz + 5,931,408,487 instructions # 2.27 insn per cycle + 0.977161465 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:42219) (512y: 24) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.627272e-04 +Avg ME (F77/C++) = 6.6272724143469353E-004 +Relative difference = 6.252149235286529e-08 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd0/check.exe -p 64 256 1 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.380375e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.385342e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.385342e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060905e+00 +- 2.367377e+00 ) GeV^-4 +TOTAL : 1.197476 sec + 2,050,152,648 cycles # 1.706 GHz + 3,435,996,672 instructions # 1.68 insn per cycle + 1.202741015 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4188) (512y: 9) (512z:44489) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd0/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.627275e-04 +Avg ME (F77/C++) = 6.6272748295826550E-004 +Relative difference = 2.5714542480216212e-08 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd1.txt index 02fee90070..0e738d355a 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd1.txt @@ -1,181 +1,223 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasNoCurand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) +RNDGEN=hasCurand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.avx2_f_inl1_hrd1' +CUDACPP_BUILDDIR='build.512y_f_inl1_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512y_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-01-28_13:36:24 +DATE: 2024-01-30_05:40:55 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/gcheck.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/gcheck.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.572650e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.765978e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.767488e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.202247e-01 +- 3.251485e-01 ) GeV^-4 -TOTAL : 0.434017 sec - 1,239,819,457 cycles:u # 2.720 GHz (73.81%) - 2,801,408 stalled-cycles-frontend:u # 0.23% frontend cycles idle (75.26%) - 40,578,551 stalled-cycles-backend:u # 3.27% backend cycles idle (75.96%) - 1,570,168,333 instructions:u # 1.27 insn per cycle - # 0.03 stalled cycles per insn (76.03%) - 0.476665475 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 5.495403e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.535166e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.540654e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4 +TOTAL : 0.492553 sec + 2,044,553,803 cycles # 2.834 GHz + 3,023,997,415 instructions # 1.48 insn per cycle + 0.781011529 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/gcheck.exe -p 64 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/gcheck.exe -p 2048 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/gcheck.exe -p 2048 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.722157e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.753832e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.754275e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.213664e+02 +- 1.195366e+02 ) GeV^-4 -TOTAL : 3.288731 sec - 11,072,404,348 cycles:u # 3.338 GHz (74.93%) - 27,707,536 stalled-cycles-frontend:u # 0.25% frontend cycles idle (75.14%) - 1,125,717,462 stalled-cycles-backend:u # 10.17% backend cycles idle (75.14%) - 8,983,411,192 instructions:u # 0.81 insn per cycle - # 0.13 stalled cycles per insn (75.04%) - 3.337368964 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 7.639095e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.697524e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.700186e+05 ) sec^-1 +MeanMatrixElemValue = ( 6.664703e+00 +- 5.072736e+00 ) GeV^-4 +TOTAL : 1.866900 sec + 6,069,227,607 cycles # 2.871 GHz + 11,631,061,560 instructions # 1.92 insn per cycle + 2.173672620 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 6.626791e-04 -Avg ME (F77/CUDA) = 6.6270899361878938E-004 -Relative difference = 4.511024836808726e-05 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 6.626454e-04 +Avg ME (F77/CUDA) = 6.6262660579844562E-004 +Relative difference = 2.836238137986709e-05 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 6.152606e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.153268e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.153268e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.204931e-01 +- 3.252404e-01 ) GeV^-4 -TOTAL : 26.661499 sec - 93,493,285,356 cycles:u # 3.504 GHz (75.00%) - 446,484,923 stalled-cycles-frontend:u # 0.48% frontend cycles idle (75.00%) - 6,118,715,750 stalled-cycles-backend:u # 6.54% backend cycles idle (75.00%) - 133,974,562,362 instructions:u # 1.43 insn per cycle - # 0.05 stalled cycles per insn (75.00%) - 26.684773403 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:16105) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 5.528053e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.528817e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.528817e+02 ) sec^-1 +MeanMatrixElemValue = ( 4.059968e+00 +- 2.367799e+00 ) GeV^-4 +TOTAL : 29.676151 sec + 85,692,453,161 cycles # 2.888 GHz + 134,120,579,675 instructions # 1.57 insn per cycle + 29.681167734 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:16109) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627535e-04 -Avg ME (F77/C++) = 6.6275346486299042E-004 -Relative difference = 5.301670926116898e-08 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.627536e-04 +Avg ME (F77/C++) = 6.6275357377482830E-004 +Relative difference = 3.95700176737784e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.264797e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.277274e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.277274e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.211992e-01 +- 3.254573e-01 ) GeV^-4 -TOTAL : 1.990366 sec - 7,025,594,276 cycles:u # 3.493 GHz (75.03%) - 1,878,173 stalled-cycles-frontend:u # 0.03% frontend cycles idle (74.95%) - 3,085,704,467 stalled-cycles-backend:u # 43.92% backend cycles idle (74.95%) - 19,241,600,987 instructions:u # 2.74 insn per cycle - # 0.16 stalled cycles per insn (74.95%) - 2.014940966 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 6.924333e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.936823e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.936823e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.059961e+00 +- 2.367791e+00 ) GeV^-4 +TOTAL : 2.377362 sec + 6,721,293,685 cycles # 2.823 GHz + 19,223,635,236 instructions # 2.86 insn per cycle + 2.382317911 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:68882) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627486e-04 -Avg ME (F77/C++) = 6.6274857044990032E-004 -Relative difference = 4.4587192899226015e-08 +Avg ME (F77/C++) = 6.6274859765498573E-004 +Relative difference = 3.538316437387639e-09 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.497226e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.501331e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.501331e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.211846e-01 +- 3.254638e-01 ) GeV^-4 -TOTAL : 1.101382 sec - 3,919,935,527 cycles:u # 3.491 GHz (74.81%) - 1,339,023 stalled-cycles-frontend:u # 0.03% frontend cycles idle (74.99%) - 2,195,359,455 stalled-cycles-backend:u # 56.00% backend cycles idle (75.07%) - 6,707,924,892 instructions:u # 1.71 insn per cycle - # 0.33 stalled cycles per insn (75.07%) - 1.125989131 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:47398) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.449646e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.455242e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.455242e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060903e+00 +- 2.367376e+00 ) GeV^-4 +TOTAL : 1.140025 sec + 3,079,658,771 cycles # 2.692 GHz + 6,686,222,708 instructions # 2.17 insn per cycle + 1.145080651 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:47416) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627274e-04 -Avg ME (F77/C++) = 6.6272735755491807E-004 -Relative difference = 6.404606472340801e-08 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.627272e-04 +Avg ME (F77/C++) = 6.6272724133897148E-004 +Relative difference = 6.237705578619894e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd1/check.exe -p 64 256 1 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.717993e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.725785e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.725785e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060903e+00 +- 2.367376e+00 ) GeV^-4 +TOTAL : 0.963197 sec + 2,607,305,399 cycles # 2.696 GHz + 5,935,632,787 instructions # 2.28 insn per cycle + 0.968307475 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:41564) (512y: 18) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd1/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.627272e-04 +Avg ME (F77/C++) = 6.6272724133897148E-004 +Relative difference = 6.237705578619894e-08 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd1/check.exe -p 64 256 1 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.382587e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.387561e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.387561e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060905e+00 +- 2.367377e+00 ) GeV^-4 +TOTAL : 1.195178 sec + 2,050,651,524 cycles # 1.710 GHz + 3,422,960,187 instructions # 1.67 insn per cycle + 1.200266882 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3375) (512y: 11) (512z:43966) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd1/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.627275e-04 +Avg ME (F77/C++) = 6.6272749650985591E-004 +Relative difference = 5.26633351741962e-09 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt index 639a7fabfa..7714401e20 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt @@ -1,181 +1,223 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasNoCurand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) +RNDGEN=hasCurand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-01-28_13:15:13 +DATE: 2024-01-30_05:03:07 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/gcheck.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/gcheck.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.400495e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.571912e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.573006e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 0.644272 sec - 1,957,881,062 cycles:u # 2.941 GHz (74.75%) - 2,293,474 stalled-cycles-frontend:u # 0.12% frontend cycles idle (74.75%) - 42,495,223 stalled-cycles-backend:u # 2.17% backend cycles idle (74.69%) - 2,152,425,070 instructions:u # 1.10 insn per cycle - # 0.02 stalled cycles per insn (74.74%) - 0.688907615 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 3.456900e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.484722e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.487399e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 0.528831 sec + 2,192,645,567 cycles # 2.833 GHz + 3,378,106,633 instructions # 1.54 insn per cycle + 0.861052908 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/gcheck.exe -p 64 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.243610e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.246656e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.246717e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.252232e+02 +- 1.234346e+02 ) GeV^-4 -TOTAL : 8.369690 sec - 28,827,159,075 cycles:u # 3.432 GHz (74.95%) - 11,635,842 stalled-cycles-frontend:u # 0.04% frontend cycles idle (74.97%) - 1,119,173,150 stalled-cycles-backend:u # 3.88% backend cycles idle (75.04%) - 22,599,867,625 instructions:u # 0.78 insn per cycle - # 0.05 stalled cycles per insn (75.05%) - 8.420766112 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 4.113905e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.147620e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.149017e+05 ) sec^-1 +MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 +TOTAL : 3.049267 sec + 9,507,642,735 cycles # 2.871 GHz + 19,066,132,971 instructions # 2.01 insn per cycle + 3.371164553 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 6.626675e-04 Avg ME (F77/CUDA) = 6.6266732376103494E-004 Relative difference = 2.659538381540814e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.173414e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.174272e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.174272e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 7.553360 sec - 26,525,384,563 cycles:u # 3.502 GHz (74.97%) - 31,700,879 stalled-cycles-frontend:u # 0.12% frontend cycles idle (74.97%) - 3,907,005,284 stalled-cycles-backend:u # 14.73% backend cycles idle (74.97%) - 82,493,224,456 instructions:u # 3.11 insn per cycle - # 0.05 stalled cycles per insn (74.98%) - 7.576999233 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.769606e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.770411e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.770411e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 9.278224 sec + 26,812,823,901 cycles # 2.889 GHz + 82,462,709,559 instructions # 3.08 insn per cycle + 9.289930135 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 6623) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731406016235E-004 Relative difference = 2.8059296349552523e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.068024e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.072694e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.072694e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 3.243606 sec - 11,407,543,003 cycles:u # 3.494 GHz (75.01%) - 3,874,601 stalled-cycles-frontend:u # 0.03% frontend cycles idle (75.01%) - 1,233,532,875 stalled-cycles-backend:u # 10.81% backend cycles idle (75.01%) - 38,549,117,575 instructions:u # 3.38 insn per cycle - # 0.03 stalled cycles per insn (75.01%) - 3.268130380 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.509625e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.512894e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.512894e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 4.686363 sec + 12,638,766,565 cycles # 2.696 GHz + 38,538,047,706 instructions # 3.05 insn per cycle + 4.708715306 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:12755) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266730246908442E-004 Relative difference = 2.98084507782618e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.212166e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.214786e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.214786e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 1.360592 sec - 4,822,632,005 cycles:u # 3.490 GHz (74.69%) - 1,235,278 stalled-cycles-frontend:u # 0.03% frontend cycles idle (74.89%) - 582,075,803 stalled-cycles-backend:u # 12.07% backend cycles idle (75.11%) - 13,597,926,852 instructions:u # 2.82 insn per cycle - # 0.04 stalled cycles per insn (75.11%) - 1.385071998 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10926) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 8.005037e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.021640e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.021640e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 2.058850 sec + 5,538,789,085 cycles # 2.684 GHz + 13,583,257,196 instructions # 2.45 insn per cycle + 2.079297542 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10944) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266730409276836E-004 Relative difference = 2.9563428359824236e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/check.exe -p 64 256 1 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 9.175649e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.196938e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.196938e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 1.797590 sec + 4,843,535,516 cycles # 2.687 GHz + 12,110,039,110 instructions # 2.50 insn per cycle + 1.813279758 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 9682) (512y: 76) (512z: 0) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266730409276836E-004 +Relative difference = 2.9563428359824236e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/check.exe -p 64 256 1 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 6.862805e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.874864e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.874864e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 2.399875 sec + 4,096,013,404 cycles # 1.704 GHz + 6,283,624,620 instructions # 1.53 insn per cycle + 2.418716991 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1528) (512y: 76) (512z: 9010) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266730409276836E-004 +Relative difference = 2.9563428359824236e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd1.txt index 69a8ae3caf..9cdb5ea5b9 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd1.txt @@ -1,181 +1,223 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasNoCurand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) +RNDGEN=hasCurand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd1' +CUDACPP_BUILDDIR='build.512y_m_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-01-28_13:15:44 +DATE: 2024-01-30_05:03:46 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/gcheck.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/gcheck.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = HIP:MIX+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.393338e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.453086e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.453532e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 0.527403 sec - 1,560,784,590 cycles:u # 2.845 GHz (75.01%) - 2,427,471 stalled-cycles-frontend:u # 0.16% frontend cycles idle (75.23%) - 33,307,693 stalled-cycles-backend:u # 2.13% backend cycles idle (75.23%) - 1,859,467,187 instructions:u # 1.19 insn per cycle - # 0.02 stalled cycles per insn (75.12%) - 0.568964990 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 3.463401e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.491582e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.494105e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 0.528345 sec + 2,191,366,155 cycles # 2.835 GHz + 3,376,981,873 instructions # 1.54 insn per cycle + 0.868249282 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/gcheck.exe -p 64 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = HIP:MIX+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.734585e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.740297e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.740406e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.252232e+02 +- 1.234346e+02 ) GeV^-4 -TOTAL : 7.037365 sec - 24,134,509,336 cycles:u # 3.414 GHz (74.99%) - 11,447,435 stalled-cycles-frontend:u # 0.05% frontend cycles idle (75.02%) - 1,098,766,355 stalled-cycles-backend:u # 4.55% backend cycles idle (75.02%) - 19,073,912,349 instructions:u # 0.79 insn per cycle - # 0.06 stalled cycles per insn (74.99%) - 7.089527013 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 4.141311e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.175321e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.176779e+05 ) sec^-1 +MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 +TOTAL : 3.034775 sec + 9,461,569,572 cycles # 2.871 GHz + 21,570,730,622 instructions # 2.28 insn per cycle + 3.354365055 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 6.626675e-04 Avg ME (F77/CUDA) = 6.6266732376103494E-004 Relative difference = 2.659538381540814e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.198791e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.199654e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.199654e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 7.465950 sec - 26,214,724,863 cycles:u # 3.501 GHz (75.00%) - 9,362,926 stalled-cycles-frontend:u # 0.04% frontend cycles idle (75.00%) - 3,471,769,514 stalled-cycles-backend:u # 13.24% backend cycles idle (75.00%) - 82,354,955,326 instructions:u # 3.14 insn per cycle - # 0.04 stalled cycles per insn (75.00%) - 7.489806720 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.763986e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.764820e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.764820e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 9.308542 sec + 26,818,191,963 cycles # 2.880 GHz + 82,362,969,124 instructions # 3.07 insn per cycle + 9.331807277 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 6491) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731406016235E-004 Relative difference = 2.8059296349552523e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.054822e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.059451e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.059451e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 3.251859 sec - 11,458,443,041 cycles:u # 3.501 GHz (74.79%) - 5,136,562 stalled-cycles-frontend:u # 0.04% frontend cycles idle (74.88%) - 1,386,600,576 stalled-cycles-backend:u # 12.10% backend cycles idle (75.07%) - 38,564,688,924 instructions:u # 3.37 insn per cycle - # 0.04 stalled cycles per insn (75.07%) - 3.276523087 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.494755e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.497969e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.497969e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 4.702744 sec + 12,651,856,685 cycles # 2.688 GHz + 38,557,643,348 instructions # 3.05 insn per cycle + 4.723006762 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:12729) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266730246908442E-004 Relative difference = 2.98084507782618e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.217378e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.220027e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.220027e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 1.354625 sec - 4,781,705,033 cycles:u # 3.475 GHz (75.00%) - 852,815 stalled-cycles-frontend:u # 0.02% frontend cycles idle (75.00%) - 503,647,282 stalled-cycles-backend:u # 10.53% backend cycles idle (75.00%) - 13,628,789,014 instructions:u # 2.85 insn per cycle - # 0.04 stalled cycles per insn (75.01%) - 1.378976688 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10908) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 8.057026e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.073448e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.073448e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 2.045356 sec + 5,503,322,263 cycles # 2.685 GHz + 13,599,131,001 instructions # 2.47 insn per cycle + 2.065937163 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10926) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266730409276836E-004 Relative difference = 2.9563428359824236e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd1/check.exe -p 64 256 1 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 9.173965e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.195231e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.195231e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 1.797623 sec + 4,836,406,491 cycles # 2.684 GHz + 12,123,840,407 instructions # 2.51 insn per cycle + 1.816744592 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 9659) (512y: 76) (512z: 0) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266730409276836E-004 +Relative difference = 2.9563428359824236e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd1/check.exe -p 64 256 1 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 6.872297e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.884618e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.884618e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 2.396265 sec + 4,088,419,794 cycles # 1.703 GHz + 6,289,480,909 instructions # 1.54 insn per cycle + 2.414194012 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1508) (512y: 76) (512z: 9009) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266730409276836E-004 +Relative difference = 2.9563428359824236e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt index f084da33da..10dc25694a 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt @@ -1,181 +1,223 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasNoCurand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) +RNDGEN=hasCurand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2024-01-28_13:17:07 +DATE: 2024-01-30_05:06:15 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.216667e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.222480e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.222544e+01 ) sec^-1 -MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 -TOTAL : 9.245166 sec - 32,060,830,034 cycles:u # 3.459 GHz (74.96%) - 3,605,306 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.96%) - 7,624,427 stalled-cycles-backend:u # 0.02% backend cycles idle (75.00%) - 25,314,101,052 instructions:u # 0.79 insn per cycle - # 0.00 stalled cycles per insn (75.03%) - 9.290369416 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 4.064289e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.064686e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.064874e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 +TOTAL : 2.459219 sec + 7,914,579,350 cycles # 2.876 GHz + 17,414,362,649 instructions # 2.20 insn per cycle + 2.856648920 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe -p 1 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.552819e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.556593e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.556628e+03 ) sec^-1 -MeanMatrixElemValue = ( 1.221264e+00 +- 1.219329e+00 ) GeV^-6 -TOTAL : 8.972202 sec - 31,074,350,767 cycles:u # 3.455 GHz (75.00%) - 3,811,519 stalled-cycles-frontend:u # 0.01% frontend cycles idle (75.01%) - 47,730,762 stalled-cycles-backend:u # 0.15% backend cycles idle (74.97%) - 24,585,836,125 instructions:u # 0.79 insn per cycle - # 0.00 stalled cycles per insn (74.94%) - 9.016806724 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 9.261836e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.264181e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.264456e+03 ) sec^-1 +MeanMatrixElemValue = ( 1.856249e-04 +- 8.329951e-05 ) GeV^-6 +TOTAL : 4.001930 sec + 12,466,660,301 cycles # 2.881 GHz + 28,598,806,424 instructions # 2.29 insn per cycle + 4.385332309 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 9.872263e-03 -Avg ME (F77/CUDA) = 9.8722595284406710E-003 -Relative difference = 3.516477760164775e-07 +Avg ME (F77/CUDA) = 9.8722595284406640E-003 +Relative difference = 3.5164777671934515e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.024242e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.024269e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.024269e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 -TOTAL : 5.156616 sec - 18,118,586,594 cycles:u # 3.500 GHz (74.97%) - 30,638,307 stalled-cycles-frontend:u # 0.17% frontend cycles idle (74.97%) - 2,178,310,061 stalled-cycles-backend:u # 12.02% backend cycles idle (74.97%) - 55,166,446,419 instructions:u # 3.04 insn per cycle - # 0.04 stalled cycles per insn (74.98%) - 5.179757033 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.667093e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.667308e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.667308e+01 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 +TOTAL : 6.887821 sec + 18,997,365,246 cycles # 2.759 GHz + 55,182,817,229 instructions # 2.90 insn per cycle + 6.894930966 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:44874) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595285514851E-003 Relative difference = 3.5163655122073967e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.218638e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.218764e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.218764e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 -TOTAL : 2.381357 sec - 8,398,634,016 cycles:u # 3.496 GHz (75.00%) - 2,133,753 stalled-cycles-frontend:u # 0.03% frontend cycles idle (75.03%) - 856,109,149 stalled-cycles-backend:u # 10.19% backend cycles idle (75.03%) - 27,064,784,541 instructions:u # 3.22 insn per cycle - # 0.03 stalled cycles per insn (75.03%) - 2.405524535 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.565125e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.565211e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.565211e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 +TOTAL : 3.384131 sec + 9,789,568,447 cycles # 2.893 GHz + 27,057,217,068 instructions # 2.76 insn per cycle + 3.398188002 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:97234) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595285514851E-003 Relative difference = 3.5163655122073967e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.162229e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.162884e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.162884e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 -TOTAL : 1.024290 sec - 3,644,361,131 cycles:u # 3.487 GHz (74.90%) - 1,405,334 stalled-cycles-frontend:u # 0.04% frontend cycles idle (74.75%) - 329,012,774 stalled-cycles-backend:u # 9.03% backend cycles idle (74.75%) - 9,607,967,992 instructions:u # 2.64 insn per cycle - # 0.03 stalled cycles per insn (74.75%) - 1.048659292 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:84261) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.331784e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.332213e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.332213e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 +TOTAL : 1.593056 sec + 4,251,132,724 cycles # 2.667 GHz + 9,566,982,441 instructions # 2.25 insn per cycle + 1.603318722 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:84279) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595285411531E-003 Relative difference = 3.516375977906115e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/check.exe -p 1 256 2 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.782288e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.782847e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.782847e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 +TOTAL : 1.405349 sec + 3,719,980,949 cycles # 2.646 GHz + 8,451,730,597 instructions # 2.27 insn per cycle + 1.418908281 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:79441) (512y: 90) (512z: 0) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 9.872263e-03 +Avg ME (F77/C++) = 9.8722595285411531E-003 +Relative difference = 3.516375977906115e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/check.exe -p 1 256 2 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.332107e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.332611e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.332611e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 +TOTAL : 1.593467 sec + 2,690,971,905 cycles # 1.687 GHz + 4,249,909,932 instructions # 1.58 insn per cycle + 1.609272621 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2166) (512y: 90) (512z:78318) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 9.872263e-03 +Avg ME (F77/C++) = 9.8722595285411531E-003 +Relative difference = 3.516375977906115e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0_bridge.txt index fc839a4e6f..14598d99fd 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0_bridge.txt @@ -1,190 +1,240 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasNoCurand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) +RNDGEN=hasCurand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2024-01-28_13:48:15 +DATE: 2024-01-30_05:50:15 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe -p 1 256 2 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe -p 1 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=256, gpublocks=1, gputhreads=256, gpublocks*gputhreads=256) WARNING! 
Set grid in Bridge (nevt=256, gpublocks=1, gputhreads=256, gpublocks*gputhreads=256) -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.131548e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.132311e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.132311e+01 ) sec^-1 -MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 -TOTAL : 9.335186 sec - 32,340,592,683 cycles:u # 3.457 GHz (74.97%) - 3,545,492 stalled-cycles-frontend:u # 0.01% frontend cycles idle (75.00%) - 7,890,548 stalled-cycles-backend:u # 0.02% backend cycles idle (75.03%) - 25,521,625,842 instructions:u # 0.79 insn per cycle - # 0.00 stalled cycles per insn (75.01%) - 9.383613134 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 4.062580e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.063573e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.063573e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 +TOTAL : 2.393250 sec + 7,805,787,223 cycles # 2.878 GHz + 17,759,546,689 instructions # 2.28 insn per cycle + 2.771767839 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe -p 1 256 1 --bridge +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +WARNING! Instantiate device Bridge (nevt=256, gpublocks=1, gputhreads=256, gpublocks*gputhreads=256) +WARNING! Set grid in Bridge (nevt=256, gpublocks=1, gputhreads=256, gpublocks*gputhreads=256) +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) WARNING! 
Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.563489e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.567198e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.567198e+03 ) sec^-1 -MeanMatrixElemValue = ( 1.221264e+00 +- 1.219329e+00 ) GeV^-6 -TOTAL : 8.943439 sec - 30,975,452,744 cycles:u # 3.454 GHz (74.91%) - 4,035,364 stalled-cycles-frontend:u # 0.01% frontend cycles idle (75.04%) - 50,378,747 stalled-cycles-backend:u # 0.16% backend cycles idle (75.06%) - 24,492,244,124 instructions:u # 0.79 insn per cycle - # 0.00 stalled cycles per insn (75.07%) - 8.988581838 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 9.205412e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.241153e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.241153e+03 ) sec^-1 +MeanMatrixElemValue = ( 1.856249e-04 +- 8.329951e-05 ) GeV^-6 +TOTAL : 3.999133 sec + 12,487,046,648 cycles # 2.887 GHz + 29,181,392,973 instructions # 2.34 insn per cycle + 4.379902707 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 9.872263e-03 -Avg ME (F77/CUDA) = 9.8722595284406710E-003 -Relative difference = 3.516477760164775e-07 +Avg ME (F77/CUDA) = 9.8722595284406640E-003 +Relative difference = 3.5164777671934515e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=256) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.018492e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.018519e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.018519e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 -TOTAL : 5.185948 sec - 18,221,133,254 cycles:u # 3.500 GHz (74.96%) - 32,518,833 stalled-cycles-frontend:u # 0.18% frontend cycles idle (74.96%) - 2,083,666,753 stalled-cycles-backend:u # 11.44% backend cycles idle (74.96%) - 55,214,432,417 instructions:u # 3.03 insn per cycle - # 0.04 stalled cycles per insn (74.97%) - 5.209020766 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.924049e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.924280e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.924280e+01 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 +TOTAL : 6.668799 sec + 18,978,883,548 cycles # 2.845 GHz + 55,181,310,686 instructions # 2.91 insn per cycle + 6.673958990 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:44874) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595285514851E-003 Relative difference = 3.5163655122073967e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=256) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.240039e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.240165e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.240165e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 -TOTAL : 2.358616 sec - 8,325,714,841 cycles:u # 3.499 GHz (74.72%) - 1,164,186 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.88%) - 804,408,576 stalled-cycles-backend:u # 9.66% backend cycles idle (75.13%) - 27,068,505,907 instructions:u # 3.25 insn per cycle - # 0.03 stalled cycles per insn (75.13%) - 2.382968221 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.558442e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.558530e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.558530e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 +TOTAL : 3.394020 sec + 9,815,752,501 cycles # 2.889 GHz + 27,056,612,659 instructions # 2.76 insn per cycle + 3.399148950 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:97234) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595285514851E-003 Relative difference = 3.5163655122073967e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=256) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.196627e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.197309e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.197309e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 -TOTAL : 1.017831 sec - 3,611,820,548 cycles:u # 3.477 GHz (74.59%) - 1,044,871 stalled-cycles-frontend:u # 0.03% frontend cycles idle (74.65%) - 294,792,388 stalled-cycles-backend:u # 8.16% backend cycles idle (74.90%) - 9,609,700,878 instructions:u # 2.66 insn per cycle - # 0.03 stalled cycles per insn (75.23%) - 1.042164219 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:84261) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.345002e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.345461e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.345461e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 +TOTAL : 1.587609 sec + 4,248,692,453 cycles # 2.674 GHz + 9,567,437,136 instructions # 2.25 insn per cycle + 1.592590793 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:84279) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 9.872263e-03 +Avg ME (F77/C++) = 9.8722595285411531E-003 +Relative difference = 3.516375977906115e-07 +OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! 
Instantiate host Bridge (nevt=256) +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.873515e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.874138e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.874138e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 +TOTAL : 1.369431 sec + 3,692,449,005 cycles # 2.689 GHz + 8,450,968,058 instructions # 2.29 insn per cycle + 1.374284426 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:79441) (512y: 90) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595285411531E-003 Relative difference = 3.516375977906115e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! Instantiate host Bridge (nevt=256) +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.369341e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.369854e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.369854e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 +TOTAL : 1.574041 sec + 2,686,211,452 cycles # 1.702 GHz + 4,249,274,815 instructions # 1.58 insn per cycle + 1.579137128 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2166) (512y: 90) (512z:78318) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 9.872263e-03 +Avg ME (F77/C++) = 9.8722595285411531E-003 +Relative difference = 3.516375977906115e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd1.txt index 6421475a26..869fccfa2f 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd1.txt @@ -1,181 +1,223 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasNoCurand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) +RNDGEN=hasCurand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd1' +CUDACPP_BUILDDIR='build.512y_d_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.none_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2024-01-28_13:18:41 +DATE: 2024-01-30_05:07:21 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/gcheck.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/gcheck.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.166030e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.172178e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.172267e+01 ) sec^-1 -MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 -TOTAL : 9.270503 sec - 32,155,147,021 cycles:u # 3.460 GHz (74.94%) - 3,651,238 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.94%) - 8,764,172 stalled-cycles-backend:u # 0.03% backend cycles idle (74.99%) - 25,360,799,852 instructions:u # 0.79 insn per cycle - # 0.00 stalled cycles per insn (75.06%) - 9.315668997 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 4.062893e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.063296e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.063519e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 +TOTAL : 2.463696 sec + 7,904,124,830 cycles # 2.867 GHz + 17,962,469,169 instructions # 2.27 insn per cycle + 2.863348110 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/gcheck.exe -p 1 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/gcheck.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/gcheck.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.562603e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.566151e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.566185e+03 ) sec^-1 -MeanMatrixElemValue = ( 1.221264e+00 +- 1.219329e+00 ) GeV^-6 -TOTAL : 8.944569 sec - 31,010,331,579 cycles:u # 3.458 GHz (74.92%) - 3,835,223 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.97%) - 48,978,807 stalled-cycles-backend:u # 0.16% backend cycles idle (75.02%) - 24,484,773,895 instructions:u # 0.79 insn per cycle - # 0.00 stalled cycles per insn (75.08%) - 8.988702643 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 9.275434e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.277655e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.278066e+03 ) sec^-1 +MeanMatrixElemValue = ( 1.856249e-04 +- 8.329951e-05 ) GeV^-6 +TOTAL : 4.004406 sec + 12,472,043,203 cycles # 2.872 GHz + 27,476,431,943 instructions # 2.20 insn per cycle + 4.397866058 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 9.872263e-03 -Avg ME (F77/CUDA) = 9.8722595284406710E-003 -Relative difference = 3.516477760164775e-07 +Avg ME (F77/CUDA) = 9.8722595284406640E-003 +Relative difference = 3.5164777671934515e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.023285e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.023312e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.023312e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 -TOTAL : 5.161102 sec - 18,130,215,132 cycles:u # 3.499 GHz (74.99%) - 28,376,417 stalled-cycles-frontend:u # 0.16% frontend cycles idle (74.99%) - 2,222,279,148 stalled-cycles-backend:u # 12.26% backend cycles idle (74.99%) - 55,131,264,986 instructions:u # 3.04 insn per cycle - # 0.04 stalled cycles per insn (74.99%) - 5.191594222 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.993181e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.993429e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.993429e+01 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 +TOTAL : 6.616864 sec + 18,937,214,023 cycles # 2.863 GHz + 55,162,675,285 instructions # 2.91 insn per cycle + 6.624084944 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:44747) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595285514851E-003 Relative difference = 3.5163655122073967e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd1/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd1/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.239446e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.239571e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.239571e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 -TOTAL : 2.358940 sec - 8,320,304,916 cycles:u # 3.494 GHz (74.81%) - 674,914 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.85%) - 760,756,627 stalled-cycles-backend:u # 9.14% backend cycles idle (75.02%) - 27,081,649,620 instructions:u # 3.25 insn per cycle - # 0.03 stalled cycles per insn (75.14%) - 2.400337906 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.560244e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.560337e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.560337e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 +TOTAL : 3.391033 sec + 9,810,909,577 cycles # 2.891 GHz + 27,064,931,751 instructions # 2.76 insn per cycle + 3.404410372 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:97230) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595285514851E-003 Relative difference = 3.5163655122073967e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd1/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd1/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.235519e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.236216e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.236216e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 -TOTAL : 1.009895 sec - 3,594,869,031 cycles:u # 3.487 GHz (74.78%) - 651,676 stalled-cycles-frontend:u # 0.02% frontend cycles idle (75.11%) - 292,999,809 stalled-cycles-backend:u # 8.15% backend cycles idle (75.17%) - 9,584,413,482 instructions:u # 2.67 insn per cycle - # 0.03 stalled cycles per insn (75.18%) - 1.033862475 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:84231) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.366743e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.367151e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.367151e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 +TOTAL : 1.577213 sec + 4,241,194,499 cycles # 2.687 GHz + 9,570,392,055 instructions # 2.26 insn per cycle + 1.590680511 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:84249) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595285411531E-003 Relative difference = 3.516375977906115e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd1/check.exe -p 1 256 2 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.823083e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.823621e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.823621e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 +TOTAL : 1.389663 sec + 3,742,544,913 cycles # 2.690 GHz + 8,455,558,047 instructions # 2.26 insn per cycle + 1.401942381 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:79386) (512y: 90) (512z: 0) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 9.872263e-03 +Avg ME (F77/C++) = 9.8722595285411531E-003 +Relative difference = 3.516375977906115e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd1/check.exe -p 1 256 2 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.367545e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.368096e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.368096e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 +TOTAL : 1.578445 sec + 2,686,793,480 cycles # 1.702 GHz + 4,251,847,609 instructions # 1.58 insn per cycle + 1.591347897 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2130) (512y: 90) (512z:78289) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 9.872263e-03 +Avg ME (F77/C++) = 9.8722595285411531E-003 +Relative difference = 3.516375977906115e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt index bbce0efa88..a75bd83e48 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt @@ -1,181 +1,223 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasNoCurand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) +RNDGEN=hasCurand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2024-01-28_13:20:15 +DATE: 2024-01-30_05:08:27 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.873337e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.877052e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.877089e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.927921e-03 +- 4.922372e-03 ) GeV^-6 -TOTAL : 4.380410 sec - 15,030,892,755 cycles:u # 3.412 GHz (75.01%) - 2,743,911 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.95%) - 6,200,580 stalled-cycles-backend:u # 0.04% backend cycles idle (74.87%) - 12,235,328,339 instructions:u # 0.81 insn per cycle - # 0.00 stalled cycles per insn (74.95%) - 4.427625896 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 6.769847e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.770754e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.771164e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.186984e-05 +- 9.824899e-06 ) GeV^-6 +TOTAL : 1.703094 sec + 5,571,181,653 cycles # 2.867 GHz + 11,974,166,174 instructions # 2.15 insn per cycle + 2.057946232 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe -p 1 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.374633e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.392593e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.392749e+03 ) sec^-1 -MeanMatrixElemValue = ( 1.216523e+00 +- 1.214588e+00 ) GeV^-6 -TOTAL : 4.650818 sec - 15,950,335,393 cycles:u # 3.415 GHz (75.00%) - 3,205,766 stalled-cycles-frontend:u # 0.02% frontend cycles idle (75.00%) - 59,561,725 stalled-cycles-backend:u # 0.37% backend cycles idle (75.03%) - 12,953,310,098 instructions:u # 0.81 insn per cycle - # 0.00 stalled cycles per insn (75.02%) - 4.691000798 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 2.318486e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.319261e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.319430e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.856829e-04 +- 8.333435e-05 ) GeV^-6 +TOTAL : 1.904733 sec + 6,266,697,659 cycles # 2.868 GHz + 13,596,680,456 instructions # 2.17 insn per cycle + 2.241129899 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 9.855155e-03 -Avg ME (F77/CUDA) = 9.8696023209835834E-003 -Relative difference = 0.0014659658811639687 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 9.849636e-03 +Avg ME (F77/CUDA) = 9.8712405367667715E-003 +Relative difference = 0.0021934350433631634 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.091774e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.091804e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.091804e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.924324e-03 +- 4.918778e-03 ) GeV^-6 -TOTAL : 4.837783 sec - 17,016,597,288 cycles:u # 3.503 GHz (74.98%) - 15,666,867 stalled-cycles-frontend:u # 0.09% frontend cycles idle (74.97%) - 1,915,711,543 stalled-cycles-backend:u # 11.26% backend cycles idle (74.97%) - 51,809,157,108 instructions:u # 3.04 insn per cycle - # 0.04 stalled cycles per insn (74.97%) - 4.860535262 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 8.651013e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.651286e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.651286e+01 ) sec^-1 +MeanMatrixElemValue = ( 1.187013e-05 +- 9.825040e-06 ) GeV^-6 +TOTAL : 6.115216 sec + 17,580,950,028 cycles # 2.876 GHz + 51,788,424,956 instructions # 2.95 insn per cycle + 6.122234952 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:27812) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.847961e-03 -Avg ME (F77/C++) = 9.8479612087414119E-003 -Relative difference = 2.1196409216982896e-08 +Avg ME (F77/C++) = 9.8479612087330436E-003 +Relative difference = 2.119555946686223e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.584820e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.585355e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.585355e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.924322e-03 +- 4.918776e-03 ) GeV^-6 -TOTAL : 1.153192 sec - 4,090,224,367 cycles:u # 3.483 GHz (74.80%) - 419,368 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.80%) - 378,967,939 stalled-cycles-backend:u # 9.27% backend cycles idle (74.80%) - 13,802,300,433 instructions:u # 3.37 insn per cycle - # 0.03 stalled cycles per insn (74.85%) - 1.177425051 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.365857e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.366295e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.366295e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187013e-05 +- 9.825038e-06 ) GeV^-6 +TOTAL : 1.576617 sec + 4,544,162,423 cycles # 2.878 GHz + 13,760,085,205 instructions # 3.03 insn per cycle + 1.587566374 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:97762) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 9.847957e-03 -Avg ME (F77/C++) = 9.8479574833965355E-003 -Relative difference = 4.9085971470122835e-08 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 9.847955e-03 +Avg ME (F77/C++) = 9.8479546894727158E-003 +Relative difference = 3.1532159158088894e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.028883e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.029149e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.029149e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.946830e-03 +- 4.941261e-03 ) GeV^-6 -TOTAL : 0.514884 sec - 1,859,416,874 cycles:u # 3.470 GHz (74.84%) - 756,642 stalled-cycles-frontend:u # 0.04% frontend cycles idle (74.63%) - 160,714,291 stalled-cycles-backend:u # 8.64% backend cycles idle (74.62%) - 4,863,962,985 instructions:u # 2.62 insn per cycle - # 0.03 stalled cycles per insn (74.62%) - 0.538920469 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:84813) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 6.652038e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.653755e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.653755e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187187e-05 +- 9.826763e-06 ) GeV^-6 +TOTAL : 0.803941 sec + 2,147,173,176 cycles # 2.667 GHz + 4,827,637,015 instructions # 2.25 insn per cycle + 0.818354401 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:84831) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 9.892973e-03 +Avg ME (F77/C++) = 9.8929728159608508E-003 +Relative difference = 1.8603017364363385e-08 +OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/check.exe -p 1 256 2 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.264093e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.266084e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.266084e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187187e-05 +- 9.826763e-06 ) GeV^-6 +TOTAL : 0.735161 sec + 1,890,652,826 cycles # 2.565 GHz + 4,260,215,320 instructions # 2.25 insn per cycle + 0.752160652 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:80038) (512y: 46) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.892973e-03 -Avg ME (F77/C++) = 9.8929728161012351E-003 -Relative difference = 1.8588827066662492e-08 +Avg ME (F77/C++) = 9.8929728159608508E-003 +Relative difference = 1.8603017364363385e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/check.exe -p 1 256 2 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 6.595587e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.597618e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.597618e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187188e-05 +- 9.826770e-06 ) GeV^-6 +TOTAL : 0.809620 sec + 1,357,631,253 cycles # 1.673 GHz + 2,149,171,041 instructions # 1.58 insn per cycle + 0.843747051 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2820) (512y: 44) (512z:78510) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 9.892980e-03 +Avg ME (F77/C++) = 9.8929802670331551E-003 +Relative difference = 2.699218597469717e-08 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0_bridge.txt index 36f73d1d8e..dd846fe890 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0_bridge.txt @@ -1,190 +1,240 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasNoCurand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) +RNDGEN=hasCurand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2024-01-28_13:49:49 +DATE: 2024-01-30_05:51:20 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe -p 1 256 2 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe -p 1 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=256, gpublocks=1, gputhreads=256, gpublocks*gputhreads=256) WARNING! 
Set grid in Bridge (nevt=256, gpublocks=1, gputhreads=256, gpublocks*gputhreads=256) -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.851281e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.851679e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.851679e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.935145e-03 +- 4.929588e-03 ) GeV^-6 -TOTAL : 4.420923 sec - 15,136,537,681 cycles:u # 3.405 GHz (75.00%) - 2,717,273 stalled-cycles-frontend:u # 0.02% frontend cycles idle (75.00%) - 7,517,456 stalled-cycles-backend:u # 0.05% backend cycles idle (75.04%) - 12,304,157,182 instructions:u # 0.81 insn per cycle - # 0.00 stalled cycles per insn (75.01%) - 4.467019241 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 6.783457e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.785575e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.785575e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187094e-05 +- 9.825664e-06 ) GeV^-6 +TOTAL : 1.618103 sec + 5,426,392,715 cycles # 2.867 GHz + 11,041,442,286 instructions # 2.03 insn per cycle + 1.951599799 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe -p 1 256 1 --bridge +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +WARNING! Instantiate device Bridge (nevt=256, gpublocks=1, gputhreads=256, gpublocks*gputhreads=256) +WARNING! Set grid in Bridge (nevt=256, gpublocks=1, gputhreads=256, gpublocks*gputhreads=256) +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) WARNING! 
Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.366666e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.382282e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.382282e+03 ) sec^-1 -MeanMatrixElemValue = ( 1.258769e+00 +- 1.256832e+00 ) GeV^-6 -TOTAL : 4.659656 sec - 15,995,419,535 cycles:u # 3.416 GHz (74.89%) - 3,646,164 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.97%) - 57,220,683 stalled-cycles-backend:u # 0.36% backend cycles idle (75.04%) - 12,945,746,501 instructions:u # 0.81 insn per cycle - # 0.00 stalled cycles per insn (75.11%) - 4.704619645 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 2.306053e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.319762e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.319762e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.856441e-04 +- 8.331096e-05 ) GeV^-6 +TOTAL : 1.925915 sec + 6,319,647,912 cycles # 2.872 GHz + 13,785,417,374 instructions # 2.18 insn per cycle + 2.259607907 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 9.855155e-03 -Avg ME (F77/CUDA) = 9.8696023209835834E-003 -Relative difference = 0.0014659658811639687 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 9.849636e-03 +Avg ME (F77/CUDA) = 9.8712405367667715E-003 +Relative difference = 0.0021934350433631634 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=256) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.090660e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.090691e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.090691e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.924324e-03 +- 4.918778e-03 ) GeV^-6 -TOTAL : 4.842680 sec - 17,022,170,540 cycles:u # 3.500 GHz (75.00%) - 16,161,742 stalled-cycles-frontend:u # 0.09% frontend cycles idle (75.00%) - 1,836,330,502 stalled-cycles-backend:u # 10.79% backend cycles idle (75.00%) - 51,787,562,489 instructions:u # 3.04 insn per cycle - # 0.04 stalled cycles per insn (75.00%) - 4.865545899 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 8.635189e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.635467e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.635467e+01 ) sec^-1 +MeanMatrixElemValue = ( 1.187013e-05 +- 9.825040e-06 ) GeV^-6 +TOTAL : 6.119375 sec + 17,637,027,949 cycles # 2.881 GHz + 51,787,792,256 instructions # 2.94 insn per cycle + 6.124243714 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:27812) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.847961e-03 -Avg ME (F77/C++) = 9.8479612087414119E-003 -Relative difference = 2.1196409216982896e-08 +Avg ME (F77/C++) = 9.8479612087330436E-003 +Relative difference = 2.119555946686223e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=256) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.577962e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.578491e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.578491e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.924322e-03 +- 4.918776e-03 ) GeV^-6 -TOTAL : 1.155076 sec - 4,094,436,058 cycles:u # 3.482 GHz (74.83%) - 878,794 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.83%) - 393,883,989 stalled-cycles-backend:u # 9.62% backend cycles idle (74.83%) - 13,810,704,758 instructions:u # 3.37 insn per cycle - # 0.03 stalled cycles per insn (74.73%) - 1.179224752 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.362357e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.362789e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.362789e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187013e-05 +- 9.825038e-06 ) GeV^-6 +TOTAL : 1.576085 sec + 4,544,551,937 cycles # 2.877 GHz + 13,759,350,934 instructions # 3.03 insn per cycle + 1.581388093 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:97762) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 9.847957e-03 -Avg ME (F77/C++) = 9.8479574833965355E-003 -Relative difference = 4.9085971470122835e-08 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 9.847955e-03 +Avg ME (F77/C++) = 9.8479546894727158E-003 +Relative difference = 3.1532159158088894e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=256) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.036106e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.036381e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.036381e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.946830e-03 +- 4.941261e-03 ) GeV^-6 -TOTAL : 0.511703 sec - 1,838,496,544 cycles:u # 3.451 GHz (74.48%) - 814,316 stalled-cycles-frontend:u # 0.04% frontend cycles idle (74.48%) - 161,202,908 stalled-cycles-backend:u # 8.77% backend cycles idle (74.01%) - 4,877,499,492 instructions:u # 2.65 insn per cycle - # 0.03 stalled cycles per insn (74.77%) - 0.535953009 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:84813) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 6.701025e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.702845e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.702845e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187187e-05 +- 9.826763e-06 ) GeV^-6 +TOTAL : 0.794089 sec + 2,138,661,629 cycles # 2.680 GHz + 4,826,930,405 instructions # 2.26 insn per cycle + 0.798991405 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:84831) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.892973e-03 -Avg ME (F77/C++) = 9.8929728161012351E-003 -Relative difference = 1.8588827066662492e-08 +Avg ME (F77/C++) = 9.8929728159608508E-003 +Relative difference = 1.8603017364363385e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! 
Instantiate host Bridge (nevt=256) +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.613418e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.615510e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.615510e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187187e-05 +- 9.826763e-06 ) GeV^-6 +TOTAL : 0.699716 sec + 1,882,009,512 cycles # 2.675 GHz + 4,259,439,384 instructions # 2.26 insn per cycle + 0.704552121 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:80038) (512y: 46) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 9.892973e-03 +Avg ME (F77/C++) = 9.8929728159608508E-003 +Relative difference = 1.8603017364363385e-08 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! Instantiate host Bridge (nevt=256) +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 6.688489e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.690546e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.690546e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187188e-05 +- 9.826770e-06 ) GeV^-6 +TOTAL : 0.795848 sec + 1,355,819,871 cycles # 1.696 GHz + 2,148,215,879 instructions # 1.58 insn per cycle + 0.800761416 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2820) (512y: 44) (512z:78510) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 9.892980e-03 +Avg ME (F77/C++) = 9.8929802670331551E-003 +Relative difference = 2.699218597469717e-08 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd1.txt index fe846c064e..90b9187b98 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd1.txt @@ -1,181 +1,223 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasNoCurand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) +RNDGEN=hasCurand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd1' +CUDACPP_BUILDDIR='build.512y_f_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.none_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2024-01-28_13:21:12 +DATE: 2024-01-30_05:09:16 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/gcheck.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/gcheck.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.835082e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.838474e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.838511e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.927921e-03 +- 4.922372e-03 ) GeV^-6 -TOTAL : 4.428832 sec - 15,205,315,081 cycles:u # 3.416 GHz (74.97%) - 2,734,544 stalled-cycles-frontend:u # 0.02% frontend cycles idle (75.02%) - 6,675,019 stalled-cycles-backend:u # 0.04% backend cycles idle (75.02%) - 12,381,921,660 instructions:u # 0.81 insn per cycle - # 0.00 stalled cycles per insn (75.01%) - 4.474102149 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 6.764318e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.765250e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.765666e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.186984e-05 +- 9.824899e-06 ) GeV^-6 +TOTAL : 1.705167 sec + 5,556,067,435 cycles # 2.852 GHz + 10,985,634,618 instructions # 1.98 insn per cycle + 2.060310498 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/gcheck.exe -p 1 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/gcheck.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/gcheck.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.370419e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.388480e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.388559e+03 ) sec^-1 -MeanMatrixElemValue = ( 1.216523e+00 +- 1.214588e+00 ) GeV^-6 -TOTAL : 4.639032 sec - 15,939,291,676 cycles:u # 3.420 GHz (74.96%) - 3,288,892 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.93%) - 53,583,558 stalled-cycles-backend:u # 0.34% backend cycles idle (74.87%) - 12,929,386,951 instructions:u # 0.81 insn per cycle - # 0.00 stalled cycles per insn (75.02%) - 4.680410352 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 2.344230e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.345038e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.345205e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.856829e-04 +- 8.333435e-05 ) GeV^-6 +TOTAL : 1.928465 sec + 6,365,812,176 cycles # 2.870 GHz + 12,742,048,160 instructions # 2.00 insn per cycle + 2.275067290 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 9.855155e-03 -Avg ME (F77/CUDA) = 9.8696023209835834E-003 -Relative difference = 0.0014659658811639687 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 9.849636e-03 +Avg ME (F77/CUDA) = 9.8712405367667715E-003 +Relative difference = 0.0021934350433631634 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.088738e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.088769e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.088769e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.924324e-03 +- 4.918778e-03 ) GeV^-6 -TOTAL : 4.850413 sec - 17,074,036,603 cycles:u # 3.505 GHz (74.89%) - 17,496,903 stalled-cycles-frontend:u # 0.10% frontend cycles idle (74.97%) - 1,831,911,248 stalled-cycles-backend:u # 10.73% backend cycles idle (75.04%) - 51,777,914,318 instructions:u # 3.03 insn per cycle - # 0.04 stalled cycles per insn (75.04%) - 4.873262774 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 8.700294e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.700564e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.700564e+01 ) sec^-1 +MeanMatrixElemValue = ( 1.187013e-05 +- 9.825040e-06 ) GeV^-6 +TOTAL : 6.077926 sec + 17,558,502,709 cycles # 2.889 GHz + 51,759,109,121 instructions # 2.95 insn per cycle + 6.085026833 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:27678) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.847961e-03 -Avg ME (F77/C++) = 9.8479612087396841E-003 -Relative difference = 2.119623377106246e-08 +Avg ME (F77/C++) = 9.8479612087313262E-003 +Relative difference = 2.1195385077844924e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.573905e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.574445e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.574445e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.924322e-03 +- 4.918776e-03 ) GeV^-6 -TOTAL : 1.155677 sec - 4,095,922,221 cycles:u # 3.482 GHz (74.84%) - 659,909 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.84%) - 437,658,752 stalled-cycles-backend:u # 10.69% backend cycles idle (74.85%) - 13,797,253,519 instructions:u # 3.37 insn per cycle - # 0.03 stalled cycles per insn (74.74%) - 1.179554790 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.376771e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.377174e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.377174e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187013e-05 +- 9.825038e-06 ) GeV^-6 +TOTAL : 1.572289 sec + 4,548,603,521 cycles # 2.891 GHz + 13,758,604,883 instructions # 3.02 insn per cycle + 1.583710945 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:97728) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 9.847957e-03 -Avg ME (F77/C++) = 9.8479574833965355E-003 -Relative difference = 4.9085971470122835e-08 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 9.847955e-03 +Avg ME (F77/C++) = 9.8479546894727158E-003 +Relative difference = 3.1532159158088894e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.033303e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.033568e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.033568e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.946830e-03 +- 4.941261e-03 ) GeV^-6 -TOTAL : 0.512278 sec - 1,841,408,148 cycles:u # 3.454 GHz (74.50%) - 748,846 stalled-cycles-frontend:u # 0.04% frontend cycles idle (74.50%) - 156,642,601 stalled-cycles-backend:u # 8.51% backend cycles idle (74.50%) - 4,868,507,625 instructions:u # 2.64 insn per cycle - # 0.03 stalled cycles per insn (74.81%) - 0.536248778 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:84775) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 6.592179e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.593820e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.593820e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187187e-05 +- 9.826763e-06 ) GeV^-6 +TOTAL : 0.808909 sec + 2,140,416,404 cycles # 2.637 GHz + 4,826,824,873 instructions # 2.26 insn per cycle + 0.906681144 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:84793) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 9.892973e-03 +Avg ME (F77/C++) = 9.8929728159608508E-003 +Relative difference = 1.8603017364363385e-08 +OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd1/check.exe -p 1 256 2 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.677326e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.679741e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.679741e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187187e-05 +- 9.826763e-06 ) GeV^-6 +TOTAL : 0.695609 sec + 1,868,752,206 cycles # 2.678 GHz + 4,259,067,854 instructions # 2.28 insn per cycle + 0.708960929 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:79978) (512y: 46) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.892973e-03 -Avg ME (F77/C++) = 9.8929728161012351E-003 -Relative difference = 1.8588827066662492e-08 +Avg ME (F77/C++) = 9.8929728159608508E-003 +Relative difference = 1.8603017364363385e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd1/check.exe -p 1 256 2 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 6.775075e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.777182e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.777182e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187188e-05 +- 9.826770e-06 ) GeV^-6 +TOTAL : 0.788693 sec + 1,354,650,321 cycles # 1.715 GHz + 2,148,091,187 instructions # 1.59 insn per cycle + 0.801177717 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2776) (512y: 44) (512z:78501) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 9.892980e-03 +Avg ME (F77/C++) = 9.8929802670331551E-003 +Relative difference = 2.699218597469717e-08 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt index e92987a82e..4eda45e114 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt @@ -1,181 +1,223 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasNoCurand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) +RNDGEN=hasCurand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2024-01-28_13:22:10 +DATE: 2024-01-30_05:10:05 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/gcheck.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/gcheck.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.693759e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.698928e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.698967e+01 ) sec^-1 -MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 -TOTAL : 9.637826 sec - 33,435,746,973 cycles:u # 3.461 GHz (74.92%) - 3,516,680 stalled-cycles-frontend:u # 0.01% frontend cycles idle (75.00%) - 9,721,702 stalled-cycles-backend:u # 0.03% backend cycles idle (75.00%) - 26,386,914,135 instructions:u # 0.79 insn per cycle - # 0.00 stalled cycles per insn (75.01%) - 9.682444440 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 4.692959e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.693612e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.693848e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 +TOTAL : 2.179406 sec + 7,155,207,889 cycles # 2.861 GHz + 14,615,335,571 instructions # 2.04 insn per cycle + 2.559881855 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/gcheck.exe -p 1 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/gcheck.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/gcheck.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.328832e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.332021e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.332051e+03 ) sec^-1 -MeanMatrixElemValue = ( 1.221264e+00 +- 1.219329e+00 ) GeV^-6 -TOTAL : 9.254842 sec - 32,049,610,632 cycles:u # 3.455 GHz (75.00%) - 3,777,778 stalled-cycles-frontend:u # 0.01% frontend cycles idle (75.01%) - 47,547,022 stalled-cycles-backend:u # 0.15% backend cycles idle (74.99%) - 25,349,562,849 instructions:u # 0.79 insn per cycle - # 0.00 stalled cycles per insn (75.02%) - 9.296621371 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 1.111470e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.111782e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.111825e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.856249e-04 +- 8.329951e-05 ) GeV^-6 +TOTAL : 3.413893 sec + 10,746,707,284 cycles # 2.875 GHz + 23,674,149,917 instructions # 2.20 insn per cycle + 3.796927749 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 9.872263e-03 -Avg ME (F77/CUDA) = 9.8722599015656533E-003 -Relative difference = 3.138524921691728e-07 +Avg ME (F77/CUDA) = 9.8722599015656498E-003 +Relative difference = 3.1385249252060663e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.015381e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.015408e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.015408e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 -TOTAL : 5.201480 sec - 18,286,175,540 cycles:u # 3.502 GHz (74.98%) - 33,801,798 stalled-cycles-frontend:u # 0.18% frontend cycles idle (75.03%) - 2,216,780,930 stalled-cycles-backend:u # 12.12% backend cycles idle (75.03%) - 55,369,852,802 instructions:u # 3.03 insn per cycle - # 0.04 stalled cycles per insn (75.03%) - 5.224195387 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.884803e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.885022e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.885022e+01 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 +TOTAL : 6.704107 sec + 19,257,123,030 cycles # 2.874 GHz + 55,394,447,460 instructions # 2.88 insn per cycle + 6.709385430 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:44898) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595861831675E-003 Relative difference = 3.457988134687711e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.348883e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.349021e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.349021e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 -TOTAL : 2.249693 sec - 7,932,943,263 cycles:u # 3.494 GHz (74.99%) - 565,471 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.99%) - 777,441,520 stalled-cycles-backend:u # 9.80% backend cycles idle (74.99%) - 25,879,995,949 instructions:u # 3.26 insn per cycle - # 0.03 stalled cycles per insn (74.99%) - 2.273845421 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.509946e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.510039e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.510039e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 +TOTAL : 3.502177 sec + 9,384,694,038 cycles # 2.677 GHz + 25,874,743,625 instructions # 2.76 insn per cycle + 3.507349921 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:96804) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722594844308162E-003 Relative difference = 3.5610570575237004e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.486925e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.487671e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.487671e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 -TOTAL : 0.964011 sec - 3,425,216,696 cycles:u # 3.478 GHz (74.83%) - 1,589,039 stalled-cycles-frontend:u # 0.05% frontend cycles idle (74.83%) - 294,874,215 stalled-cycles-backend:u # 8.61% backend cycles idle (74.83%) - 9,130,471,265 instructions:u # 2.67 insn per cycle - # 0.03 stalled cycles per insn (74.83%) - 0.988073640 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:83802) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.557555e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.558062e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.558062e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 +TOTAL : 1.490188 sec + 4,000,749,453 cycles # 2.678 GHz + 9,119,038,902 instructions # 2.28 insn per cycle + 1.495279789 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:83820) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722594324461913E-003 Relative difference = 3.613714310412983e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd0/check.exe -p 1 256 2 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 4.057405e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.058069e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.058069e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 +TOTAL : 1.307627 sec + 3,513,640,690 cycles # 2.679 GHz + 8,029,011,845 instructions # 2.29 insn per cycle + 1.312711431 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:79028) (512y: 70) (512z: 0) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 9.872263e-03 +Avg ME (F77/C++) = 9.8722594324461913E-003 +Relative difference = 3.613714310412983e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd0/check.exe -p 1 256 2 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.350506e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.351010e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.351010e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 +TOTAL : 1.581908 sec + 2,606,864,065 cycles # 1.673 GHz + 4,077,382,976 instructions # 1.56 insn per cycle + 1.587144818 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1903) (512y: 70) (512z:78042) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 9.872263e-03 +Avg ME (F77/C++) = 9.8722594324461913E-003 +Relative difference = 3.613714310412983e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd1.txt index ac7918cccb..328b61834e 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd1.txt @@ -1,181 +1,223 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasNoCurand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) +RNDGEN=hasCurand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd1' +CUDACPP_BUILDDIR='build.512y_m_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.none_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2024-01-28_13:23:45 +DATE: 2024-01-30_05:11:08 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/gcheck.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/gcheck.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = HIP:MIX+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.823155e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.829000e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.829089e+01 ) sec^-1 -MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 -TOTAL : 9.511767 sec - 32,984,403,917 cycles:u # 3.459 GHz (74.98%) - 3,473,100 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.98%) - 7,692,372 stalled-cycles-backend:u # 0.02% backend cycles idle (75.00%) - 26,075,654,881 instructions:u # 0.79 insn per cycle - # 0.00 stalled cycles per insn (74.99%) - 9.559020948 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 4.684370e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.684951e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.685153e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 +TOTAL : 2.181079 sec + 7,148,088,261 cycles # 2.853 GHz + 14,239,530,947 instructions # 1.99 insn per cycle + 2.562146879 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/gcheck.exe -p 1 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/gcheck.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/gcheck.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = HIP:MIX+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.339492e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.347850e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.347878e+03 ) sec^-1 -MeanMatrixElemValue = ( 1.221264e+00 +- 1.219329e+00 ) GeV^-6 -TOTAL : 9.263600 sec - 32,074,175,723 cycles:u # 3.457 GHz (74.96%) - 3,798,204 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.96%) - 47,349,192 stalled-cycles-backend:u # 0.15% backend cycles idle (74.99%) - 25,336,600,811 instructions:u # 0.79 insn per cycle - # 0.00 stalled cycles per insn (74.98%) - 9.309754183 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 1.111591e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.111914e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.111956e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.856249e-04 +- 8.329951e-05 ) GeV^-6 +TOTAL : 3.413150 sec + 10,755,861,454 cycles # 2.876 GHz + 23,518,245,564 instructions # 2.19 insn per cycle + 3.796500341 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 9.872263e-03 -Avg ME (F77/CUDA) = 9.8722599015656533E-003 -Relative difference = 3.138524921691728e-07 +Avg ME (F77/CUDA) = 9.8722599015656498E-003 +Relative difference = 3.1385249252060663e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.024597e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.024625e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.024625e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 -TOTAL : 5.154454 sec - 18,118,041,561 cycles:u # 3.501 GHz (74.96%) - 27,127,211 stalled-cycles-frontend:u # 0.15% frontend cycles idle (74.96%) - 2,172,862,275 stalled-cycles-backend:u # 11.99% backend cycles idle (74.96%) - 55,424,029,564 instructions:u # 3.06 insn per cycle - # 0.04 stalled cycles per insn (74.97%) - 5.177178107 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.912565e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.912803e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.912803e+01 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 +TOTAL : 6.680088 sec + 19,228,329,737 cycles # 2.877 GHz + 55,419,296,273 instructions # 2.88 insn per cycle + 6.685533383 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:44806) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595861831675E-003 Relative difference = 3.457988134687711e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.352094e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.352235e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.352235e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 -TOTAL : 2.246098 sec - 7,928,069,177 cycles:u # 3.497 GHz (75.01%) - 1,035,429 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.95%) - 811,803,832 stalled-cycles-backend:u # 10.24% backend cycles idle (74.95%) - 25,862,084,099 instructions:u # 3.26 insn per cycle - # 0.03 stalled cycles per insn (74.95%) - 2.270192131 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.515454e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.515537e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.515537e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 +TOTAL : 3.490021 sec + 9,348,051,078 cycles # 2.676 GHz + 25,823,110,897 instructions # 2.76 insn per cycle + 3.495053121 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:96765) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722594844308162E-003 Relative difference = 3.5610570575237004e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.488000e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.488738e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.488738e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 -TOTAL : 0.963504 sec - 3,426,340,904 cycles:u # 3.481 GHz (74.81%) - 1,278,872 stalled-cycles-frontend:u # 0.04% frontend cycles idle (74.81%) - 300,861,366 stalled-cycles-backend:u # 8.78% backend cycles idle (74.81%) - 9,116,212,703 instructions:u # 2.66 insn per cycle - # 0.03 stalled cycles per insn (74.82%) - 0.987905106 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:83360) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.556805e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.557285e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.557285e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 +TOTAL : 1.490221 sec + 4,003,060,439 cycles # 2.680 GHz + 9,098,942,911 instructions # 2.27 insn per cycle + 1.495311791 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:83378) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722594324461913E-003 Relative difference = 3.613714310412983e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd1/check.exe -p 1 256 2 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 4.083203e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.083821e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.083821e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 +TOTAL : 1.299137 sec + 3,488,850,980 cycles # 2.678 GHz + 8,010,474,997 instructions # 2.30 insn per cycle + 1.304443015 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:78540) (512y: 70) (512z: 0) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 9.872263e-03 +Avg ME (F77/C++) = 9.8722594324461913E-003 +Relative difference = 3.613714310412983e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd1/check.exe -p 1 256 2 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.440905e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.441442e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.441442e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 +TOTAL : 1.541232 sec + 2,598,862,718 cycles # 1.682 GHz + 4,064,975,706 instructions # 1.56 insn per cycle + 1.546247038 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1420) (512y: 70) (512z:78026) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 9.872263e-03 +Avg ME (F77/C++) = 9.8722594324461913E-003 +Relative difference = 3.613714310412983e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt index a1484b6f26..5667ce458e 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt @@ -1,110 +1,223 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasNoCurand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) +RNDGEN=hasCurand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2024-01-28_13:16:13 +DATE: 2024-01-30_05:04:25 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 10 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 2.650880e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.304183e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.677107e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.453043 sec + 1,889,864,608 cycles # 2.824 GHz + 2,684,689,341 instructions # 1.42 insn per cycle + 0.749142975 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe: Segmentation fault - 742,362,445 cycles:u # 0.759 GHz (75.80%) - 2,746,219 stalled-cycles-frontend:u # 0.37% frontend cycles idle (75.82%) - 37,618,922 stalled-cycles-backend:u # 5.07% backend cycles idle (75.84%) - 1,244,048,946 instructions:u # 1.68 insn per cycle - # 0.03 stalled cycles per insn (74.19%) - 1.005676312 seconds time elapsed +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe: Segmentation fault - 2,664,918,285 cycles:u # 2.823 GHz (74.95%) - 21,119,267 stalled-cycles-frontend:u # 0.79% frontend cycles idle (75.24%) - 863,828,171 stalled-cycles-backend:u # 32.41% backend cycles idle (74.29%) - 2,528,341,905 instructions:u # 0.95 insn per cycle - # 0.34 stalled cycles per insn (74.95%) - 0.965928637 seconds time elapsed +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 3.266493e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.111955e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.526543e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.602505e+02 +- 2.116328e+02 ) GeV^-2 +TOTAL : 0.538455 sec + 2,216,644,376 cycles # 2.828 GHz + 3,102,394,165 instructions # 1.40 insn per cycle + 0.841378524 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 -Memory access fault by GPU node-4 (Agent handle: 0x693a290) on address 0x1460ca129000. Reason: Unknown. - -Program received signal SIGABRT: Process abort signal. - -Backtrace for this error: -#0 0x146460faf372 in ??? -#1 0x146460fae505 in ??? -#2 0x14645f4a2dbf in ??? -#3 0x14645f4a2d2b in ??? -#4 0x14645f4a43e4 in ??? -#5 0x146457975b64 in ??? -#6 0x146457972b38 in ??? -#7 0x146457930496 in ??? -#8 0x14645f43c6e9 in ??? -#9 0x14645f57049e in ??? -#10 0xffffffffffffffff in ??? -Avg ME (C++/CUDA) = -Avg ME (F77/CUDA) = -ERROR! 
Fortran calculation (F77/CUDA) crashed +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 1.424749e-01 +Avg ME (F77/CUDA) = 0.14247482467490466 +Relative difference = 5.286902838873106e-07 +OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.173059e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.192781e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.192781e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.914935e+02 +- 1.163297e+02 ) GeV^-2 -TOTAL : 1.419980 sec - 5,025,685,634 cycles:u # 3.485 GHz (74.91%) - 2,331,310 stalled-cycles-frontend:u # 0.05% frontend cycles idle (75.04%) - 671,647,332 stalled-cycles-backend:u # 13.36% backend cycles idle (75.04%) - 13,827,806,422 instructions:u # 2.75 insn per cycle - # 0.05 stalled cycles per insn (75.04%) - 1.444043088 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 9.822300e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.003024e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.003024e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 1.695633 sec + 4,892,910,077 cycles # 2.883 GHz + 13,801,787,359 instructions # 2.82 insn per cycle + 1.705964185 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1166) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/runTest.exe -Memory access fault by GPU node-4 (Agent handle: 0x667850) on address 0x1454f3e09000. Reason: Unknown. +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.424749e-01 +Avg ME (F77/C++) = 0.14247482467499481 +Relative difference = 5.286896511435107e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/check.exe -p 64 256 10 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.896648e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.972375e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.972375e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.887238 sec + 2,571,261,116 cycles # 2.883 GHz + 7,401,200,610 instructions # 2.88 insn per cycle + 0.906229412 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 2895) (avx2: 0) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.424749e-01 +Avg ME (F77/C++) = 0.14247482467499475 +Relative difference = 5.286896515331313e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/check.exe -p 64 256 10 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.154928e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.367723e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.367723e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.543357 sec + 1,480,133,709 cycles # 2.701 GHz + 3,136,765,286 instructions # 2.12 insn per cycle + 0.561297241 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2890) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.424749e-01 +Avg ME (F77/C++) = 0.14247482467492595 +Relative difference = 5.286901344678233e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/check.exe -p 64 256 10 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.571891e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.844626e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.844626e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.482249 sec + 1,314,348,676 cycles # 2.699 GHz + 2,923,288,921 instructions # 2.22 insn per cycle + 0.498803372 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2543) (512y: 93) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.424749e-01 +Avg ME (F77/C++) = 0.14247482467492595 +Relative difference = 5.286901344678233e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/check.exe -p 64 256 10 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.408041e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.532332e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.532332e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.706292 sec + 1,273,944,985 cycles # 1.792 GHz + 1,900,262,296 instructions # 1.49 insn per cycle + 0.723222352 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1135) (512y: 62) (512z: 2165) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.424749e-01 +Avg ME (F77/C++) = 0.14247482467492595 +Relative difference = 5.286901344678233e-07 +OK (relative difference <= 5E-3) +========================================================================= + +TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0_bridge.txt index 68e3ce30cc..7b59743406 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0_bridge.txt @@ -1,117 +1,240 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasNoCurand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) +RNDGEN=hasCurand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2024-01-28_13:47:03 +DATE: 2024-01-30_05:48:29 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 10 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 10 --bridge OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) +WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 3.408359e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.101986e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.101986e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.481872 sec + 1,962,034,459 cycles # 2.824 GHz + 2,925,170,965 instructions # 1.49 insn per cycle + 0.753942373 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe: Aborted - 955,437,706 cycles:u # 2.461 GHz (75.24%) - 2,971,678 stalled-cycles-frontend:u # 0.31% frontend cycles idle (75.50%) - 29,904,537 stalled-cycles-backend:u # 3.13% backend cycles idle (74.96%) - 1,413,759,904 instructions:u # 1.48 insn per cycle - # 0.02 stalled cycles per insn (74.79%) - 0.553131920 seconds time elapsed +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe: Aborted - 3,241,797,323 cycles:u # 2.861 GHz (74.89%) - 30,143,228 stalled-cycles-frontend:u # 0.93% frontend cycles idle (75.30%) - 856,597,171 stalled-cycles-backend:u # 26.42% backend cycles idle (75.27%) - 3,336,680,100 instructions:u # 1.03 insn per cycle - # 0.26 stalled cycles per insn (75.27%) - 1.407429317 seconds time elapsed +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 3.119182e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.257748e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.257748e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.602505e+02 +- 2.116328e+02 ) GeV^-2 +TOTAL : 0.770186 sec + 2,924,566,680 cycles # 2.837 GHz + 4,475,846,392 instructions # 1.53 insn per cycle + 1.089161093 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 -Memory access fault by GPU node-4 (Agent handle: 0x693a290) on address 0x14694c129000. Reason: Unknown. - -Program received signal SIGABRT: Process abort signal. - -Backtrace for this error: -#0 0x146ce2f9f372 in ??? -#1 0x146ce2f9e505 in ??? -#2 0x146ce1492dbf in ??? -#3 0x146ce1492d2b in ??? -#4 0x146ce14943e4 in ??? -#5 0x146cd9965b64 in ??? -#6 0x146cd9962b38 in ??? -#7 0x146cd9920496 in ??? -#8 0x146ce142c6e9 in ??? -#9 0x146ce156049e in ??? -#10 0xffffffffffffffff in ??? -Avg ME (C++/CUDA) = -Avg ME (F77/CUDA) = -ERROR! Fortran calculation (F77/CUDA) crashed +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 1.424749e-01 +Avg ME (F77/CUDA) = 0.14247482467490466 +Relative difference = 5.286902838873106e-07 +OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.173197e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.192894e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.192894e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.914935e+02 +- 1.163297e+02 ) GeV^-2 -TOTAL : 1.423616 sec - 5,029,275,447 cycles:u # 3.480 GHz (74.88%) - 2,400,452 stalled-cycles-frontend:u # 0.05% frontend cycles idle (75.09%) - 687,995,818 stalled-cycles-backend:u # 13.68% backend cycles idle (75.09%) - 13,816,443,402 instructions:u # 2.75 insn per cycle - # 0.05 stalled cycles per insn (75.10%) - 1.447260926 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 9.824024e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.002926e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.002926e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 1.698586 sec + 4,927,814,709 cycles # 2.894 GHz + 13,806,118,322 instructions # 2.80 insn per cycle + 1.704123738 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1166) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/runTest.exe -Memory access fault by GPU node-4 (Agent handle: 0x667850) on address 0x1467fdb79000. Reason: Unknown. +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.424749e-01 +Avg ME (F77/C++) = 0.14247482467499481 +Relative difference = 5.286896511435107e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! 
Instantiate host Bridge (nevt=16384) +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.886173e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.963508e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.963508e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.900867 sec + 2,618,017,951 cycles # 2.892 GHz + 7,450,102,141 instructions # 2.85 insn per cycle + 0.906367581 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 2895) (avx2: 0) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.424749e-01 +Avg ME (F77/C++) = 0.14247482467499475 +Relative difference = 5.286896515331313e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! Instantiate host Bridge (nevt=16384) +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.122916e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.345144e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.345144e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.557525 sec + 1,528,674,468 cycles # 2.721 GHz + 3,187,083,360 instructions # 2.08 insn per cycle + 0.563020024 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2890) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.424749e-01 +Avg ME (F77/C++) = 0.14247482467492595 +Relative difference = 5.286901344678233e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! Instantiate host Bridge (nevt=16384) +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.528840e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.810605e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.810605e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.496872 sec + 1,359,999,193 cycles # 2.712 GHz + 2,973,904,476 instructions # 2.19 insn per cycle + 0.502643224 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2543) (512y: 93) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.424749e-01 +Avg ME (F77/C++) = 0.14247482467492595 +Relative difference = 5.286901344678233e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! 
Instantiate host Bridge (nevt=16384) +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.332416e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.457397e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.457397e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.738182 sec + 1,327,509,915 cycles # 1.788 GHz + 1,939,124,841 instructions # 1.46 insn per cycle + 0.743808066 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1135) (512y: 62) (512z: 2165) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.424749e-01 +Avg ME (F77/C++) = 0.14247482467492595 +Relative difference = 5.286901344678233e-07 +OK (relative difference <= 5E-3) +========================================================================= + +TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd1.txt index 2b6e489945..4deacb88f2 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd1.txt @@ -1,110 +1,223 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasNoCurand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) +RNDGEN=hasCurand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd1' +CUDACPP_BUILDDIR='build.512y_d_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.none_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2024-01-28_13:16:22 +DATE: 2024-01-30_05:04:44 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/gcheck.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/gcheck.exe -p 64 256 10 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 2.642894e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.200887e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.567165e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.451244 sec + 1,883,873,657 cycles # 2.821 GHz + 2,671,262,226 instructions # 1.42 insn per cycle + 0.747348766 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/gcheck.exe -p 64 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/gcheck.exe: Segmentation fault - 722,585,158 cycles:u # 2.137 GHz (75.90%) - 2,599,585 stalled-cycles-frontend:u # 0.36% frontend cycles idle (76.10%) - 39,487,772 stalled-cycles-backend:u # 5.46% backend cycles idle (75.40%) - 1,266,219,372 instructions:u # 1.75 insn per cycle - # 0.03 stalled cycles per insn (71.84%) - 0.360995823 seconds time elapsed +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/gcheck.exe: Segmentation fault - 2,673,014,627 cycles:u # 2.809 GHz (74.77%) - 21,614,665 stalled-cycles-frontend:u # 0.81% frontend cycles idle (73.29%) - 863,129,959 stalled-cycles-backend:u # 32.29% backend cycles idle (74.35%) - 2,509,612,374 instructions:u # 0.94 insn per cycle - # 0.34 stalled cycles per insn (76.13%) - 0.970743750 seconds time elapsed +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 3.228371e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.990649e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.395918e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.602505e+02 +- 2.116328e+02 ) GeV^-2 +TOTAL : 0.540844 sec + 2,218,030,903 cycles # 2.829 GHz + 3,154,136,532 instructions # 1.42 insn per cycle + 0.843504278 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/fgcheck.exe 2 64 2 -Memory access fault by GPU node-4 (Agent handle: 0x693a1e0) on address 0x14e47c679000. Reason: Unknown. - -Program received signal SIGABRT: Process abort signal. 
- -Backtrace for this error: -#0 0x14e7134f5372 in ??? -#1 0x14e7134f4505 in ??? -#2 0x14e7119e8dbf in ??? -#3 0x14e7119e8d2b in ??? -#4 0x14e7119ea3e4 in ??? -#5 0x14e709ebbb64 in ??? -#6 0x14e709eb8b38 in ??? -#7 0x14e709e76496 in ??? -#8 0x14e7119826e9 in ??? -#9 0x14e711ab649e in ??? -#10 0xffffffffffffffff in ??? -Avg ME (C++/CUDA) = -Avg ME (F77/CUDA) = -ERROR! Fortran calculation (F77/CUDA) crashed +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 1.424749e-01 +Avg ME (F77/CUDA) = 0.14247482467490466 +Relative difference = 5.286902838873106e-07 +OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.173950e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.193612e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.193612e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.914935e+02 +- 1.163297e+02 ) GeV^-2 -TOTAL : 1.418629 sec - 5,015,341,842 cycles:u # 3.484 GHz (75.00%) - 2,143,149 stalled-cycles-frontend:u # 0.04% frontend cycles idle (75.00%) - 879,065,341 stalled-cycles-backend:u # 17.53% backend cycles idle (75.00%) - 13,836,256,912 instructions:u # 2.76 insn per cycle - # 0.06 stalled cycles per insn (75.00%) - 1.441391086 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 9.831536e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.003712e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.003712e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 1.690067 sec + 4,884,610,591 cycles # 2.883 GHz + 13,807,943,276 instructions # 2.83 insn per cycle + 1.700194727 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1161) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/runTest.exe -Memory access fault by GPU node-4 (Agent handle: 0x6664d0) on address 0x14bb63199000. Reason: Unknown. +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.424749e-01 +Avg ME (F77/C++) = 0.14247482467499481 +Relative difference = 5.286896511435107e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd1/check.exe -p 64 256 10 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.876876e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.953061e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.953061e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.896918 sec + 2,573,000,483 cycles # 2.854 GHz + 7,407,132,972 instructions # 2.88 insn per cycle + 0.971480588 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 2892) (avx2: 0) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.424749e-01 +Avg ME (F77/C++) = 0.14247482467499475 +Relative difference = 5.286896515331313e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd1/check.exe -p 64 256 10 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.133331e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.344053e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.344053e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.546739 sec + 1,486,856,812 cycles # 2.696 GHz + 3,137,676,944 instructions # 2.11 insn per cycle + 0.563341736 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2875) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.424749e-01 +Avg ME (F77/C++) = 0.14247482467492595 +Relative difference = 5.286901344678233e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd1/check.exe -p 64 256 10 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.567673e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.839669e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.839669e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.482732 sec + 1,314,507,412 cycles # 2.697 GHz + 2,925,746,939 instructions # 2.23 insn per cycle + 0.501062508 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2527) (512y: 93) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.424749e-01 +Avg ME (F77/C++) = 0.14247482467492595 +Relative difference = 5.286901344678233e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd1/check.exe -p 64 256 10 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.394430e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.516439e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.516439e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.710071 sec + 1,273,890,672 cycles # 1.782 GHz + 1,899,944,131 instructions # 1.49 insn per cycle + 0.727352268 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1118) (512y: 62) (512z: 2165) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.424749e-01 +Avg ME (F77/C++) = 0.14247482467492595 +Relative difference = 5.286901344678233e-07 +OK (relative difference <= 5E-3) +========================================================================= + +TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt index 1700562c87..1362a87ac8 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt @@ -1,110 +1,223 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasNoCurand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) +RNDGEN=hasCurand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2024-01-28_13:16:31 +DATE: 2024-01-30_05:05:03 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 10 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 5.327203e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.210086e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.349272e+08 ) sec^-1 +MeanMatrixElemValue = ( 2.018174e+01 +- 1.429492e+01 ) GeV^-2 +TOTAL : 0.446159 sec + 1,908,363,704 cycles # 2.829 GHz + 2,678,040,252 instructions # 1.40 insn per cycle + 0.749417997 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe: Segmentation fault - 749,803,031 cycles:u # 2.249 GHz (74.35%) - 2,542,073 stalled-cycles-frontend:u # 0.34% frontend cycles idle (75.95%) - 32,510,347 stalled-cycles-backend:u # 4.34% backend cycles idle (76.93%) - 1,235,780,352 instructions:u # 1.65 insn per cycle - # 0.03 stalled cycles per insn (75.33%) - 0.357181764 seconds time elapsed +==PROF== Profiling "sigmaKin": launch__registers_per_thread 167 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe: Segmentation fault - 2,487,432,579 cycles:u # 2.810 GHz (75.50%) - 21,154,833 stalled-cycles-frontend:u # 0.85% frontend cycles idle (75.28%) - 860,477,044 stalled-cycles-backend:u # 34.59% backend cycles idle (74.55%) - 2,480,935,938 instructions:u # 1.00 insn per cycle - # 0.35 stalled cycles per insn (75.06%) - 0.905942536 seconds time elapsed +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 7.267889e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.817352e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.969269e+08 ) sec^-1 +MeanMatrixElemValue = ( 2.571361e+02 +- 2.114021e+02 ) GeV^-2 +TOTAL : 0.483640 sec + 2,013,701,507 cycles # 2.833 GHz + 2,869,047,503 instructions # 1.42 insn per cycle + 0.770237631 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 -Memory access fault by GPU node-4 (Agent handle: 0x69382b0) on address 0x14696161c000. Reason: Unknown. - -Program received signal SIGABRT: Process abort signal. - -Backtrace for this error: -#0 0x146bf848d372 in ??? -#1 0x146bf848c505 in ??? -#2 0x146bf6982dbf in ??? -#3 0x146bf6982d2b in ??? -#4 0x146bf69843e4 in ??? -#5 0x146beee55b64 in ??? -#6 0x146beee52b38 in ??? -#7 0x146beee10496 in ??? -#8 0x146bf691c6e9 in ??? -#9 0x146bf6a5049e in ??? -#10 0xffffffffffffffff in ??? -Avg ME (C++/CUDA) = -Avg ME (F77/CUDA) = -ERROR! Fortran calculation (F77/CUDA) crashed +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 1.424226e-01 +Avg ME (F77/CUDA) = 0.14247488790821983 +Relative difference = 0.00036713209996037764 +OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.428982e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.459192e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.459192e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.945525e+02 +- 1.186197e+02 ) GeV^-2 -TOTAL : 1.167732 sec - 4,148,960,302 cycles:u # 3.491 GHz (74.76%) - 2,278,304 stalled-cycles-frontend:u # 0.05% frontend cycles idle (75.05%) - 246,543,940 stalled-cycles-backend:u # 5.94% backend cycles idle (75.10%) - 12,642,512,101 instructions:u # 3.05 insn per cycle - # 0.02 stalled cycles per insn (75.11%) - 1.190458747 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.109983e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.136218e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.136218e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018563e+01 +- 1.429902e+01 ) GeV^-2 +TOTAL : 1.498447 sec + 4,345,988,139 cycles # 2.893 GHz + 12,596,967,872 instructions # 2.90 insn per cycle + 1.511882134 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 773) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/runTest.exe -Memory access fault by GPU node-4 (Agent handle: 0x645490) on address 0x14bb8f15c000. Reason: Unknown. +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.424686e-01 +Avg ME (F77/C++) = 0.14246860569653919 +Relative difference = 3.998452420257791e-08 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/check.exe -p 64 256 10 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.116392e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.330955e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.330955e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018563e+01 +- 1.429902e+01 ) GeV^-2 +TOTAL : 0.547469 sec + 1,595,191,710 cycles # 2.889 GHz + 4,246,785,925 instructions # 2.66 insn per cycle + 0.566121323 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 3265) (avx2: 0) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.424686e-01 +Avg ME (F77/C++) = 0.14246860808920836 +Relative difference = 5.677888572434963e-08 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/check.exe -p 64 256 10 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 5.705122e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.431194e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.431194e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2 +TOTAL : 0.309154 sec + 853,106,357 cycles # 2.719 GHz + 1,916,236,758 instructions # 2.25 insn per cycle + 0.322202646 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3488) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.424749e-01 +Avg ME (F77/C++) = 0.14247490815036912 +Relative difference = 5.7205649062398515e-08 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/check.exe -p 64 256 10 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 6.291153e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.186493e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.186493e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2 +TOTAL : 0.282114 sec + 781,605,305 cycles # 2.726 GHz + 1,797,850,243 instructions # 2.30 insn per cycle + 0.301017972 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3186) (512y: 15) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.424749e-01 +Avg ME (F77/C++) = 0.14247490815036912 +Relative difference = 5.7205649062398515e-08 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/check.exe -p 64 256 10 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 4.544342e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.998908e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.998908e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018829e+01 +- 1.429922e+01 ) GeV^-2 +TOTAL : 0.384301 sec + 720,859,118 cycles # 1.854 GHz + 1,288,039,773 instructions # 1.79 insn per cycle + 0.402338897 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1730) (512y: 24) (512z: 2387) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.424749e-01 +Avg ME (F77/C++) = 0.14247490450137867 +Relative difference = 3.159418737238044e-08 +OK (relative difference <= 5E-3) +========================================================================= + +TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0_bridge.txt index f7ef249b0b..8cb59221d4 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0_bridge.txt @@ -1,117 +1,240 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasNoCurand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) +RNDGEN=hasCurand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2024-01-28_13:47:13 +DATE: 2024-01-30_05:48:48 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 10 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 10 --bridge OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost +WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) +WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 5.444132e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.000397e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.000397e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.017654e+01 +- 1.429184e+01 ) GeV^-2 +TOTAL : 0.460393 sec + 1,902,959,760 cycles # 2.835 GHz + 2,813,040,217 instructions # 1.48 insn per cycle + 0.731144371 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe: Segmentation fault - 733,284,638 cycles:u # 2.220 GHz (75.38%) - 2,873,339 stalled-cycles-frontend:u # 0.39% frontend cycles idle (75.28%) - 39,969,279 stalled-cycles-backend:u # 5.45% backend cycles idle (74.66%) - 1,260,934,352 instructions:u # 1.72 insn per cycle - # 0.03 stalled cycles per insn (75.93%) - 0.359797530 seconds time elapsed +==PROF== Profiling "sigmaKin": launch__registers_per_thread 167 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe: Segmentation fault - 2,819,121,395 cycles:u # 2.794 GHz (75.71%) - 28,954,822 stalled-cycles-frontend:u # 1.03% frontend cycles idle (75.44%) - 853,775,539 stalled-cycles-backend:u # 30.29% backend cycles idle (75.44%) - 3,119,187,632 instructions:u # 1.11 insn per cycle - # 0.27 stalled cycles per insn (75.40%) - 1.028219917 seconds time elapsed +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 4.962207e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.533260e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.533260e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.609942e+02 +- 2.115590e+02 ) GeV^-2 +TOTAL : 0.631701 sec + 2,471,933,418 cycles # 2.836 GHz + 3,725,494,141 instructions # 1.51 insn per cycle + 0.929474422 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 -Memory access fault by GPU node-4 (Agent handle: 0x69382b0) on address 0x15013b1bc000. Reason: Unknown. - -Program received signal SIGABRT: Process abort signal. - -Backtrace for this error: -#0 0x1503d202b372 in ??? -#1 0x1503d202a505 in ??? -#2 0x1503d0520dbf in ??? -#3 0x1503d0520d2b in ??? -#4 0x1503d05223e4 in ??? -#5 0x1503c89f3b64 in ??? -#6 0x1503c89f0b38 in ??? -#7 0x1503c89ae496 in ??? -#8 0x1503d04ba6e9 in ??? -#9 0x1503d05ee49e in ??? -#10 0xffffffffffffffff in ??? -Avg ME (C++/CUDA) = -Avg ME (F77/CUDA) = -ERROR! Fortran calculation (F77/CUDA) crashed +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 1.424226e-01 +Avg ME (F77/CUDA) = 0.14247488790821983 +Relative difference = 0.00036713209996037764 +OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.427180e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.457374e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.457374e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.945525e+02 +- 1.186197e+02 ) GeV^-2 -TOTAL : 1.171262 sec - 4,152,238,121 cycles:u # 3.482 GHz (74.65%) - 2,263,727 stalled-cycles-frontend:u # 0.05% frontend cycles idle (74.95%) - 248,276,893 stalled-cycles-backend:u # 5.98% backend cycles idle (75.18%) - 12,634,997,297 instructions:u # 3.04 insn per cycle - # 0.02 stalled cycles per insn (75.19%) - 1.194742461 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.095228e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.121318e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.121318e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018563e+01 +- 1.429902e+01 ) GeV^-2 +TOTAL : 1.522701 sec + 4,367,827,655 cycles # 2.862 GHz + 12,601,331,452 instructions # 2.89 insn per cycle + 1.527862957 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 773) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/runTest.exe -Memory access fault by GPU node-4 (Agent handle: 0x645490) on address 0x1493c8844000. Reason: Unknown. +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.424686e-01 +Avg ME (F77/C++) = 0.14246860569653919 +Relative difference = 3.998452420257791e-08 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! 
Instantiate host Bridge (nevt=16384) +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.075499e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.292736e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.292736e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018563e+01 +- 1.429902e+01 ) GeV^-2 +TOTAL : 0.559885 sec + 1,623,222,211 cycles # 2.878 GHz + 4,293,732,841 instructions # 2.65 insn per cycle + 0.565168184 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 3265) (avx2: 0) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.424686e-01 +Avg ME (F77/C++) = 0.14246860808920836 +Relative difference = 5.677888572434963e-08 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! Instantiate host Bridge (nevt=16384) +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 5.618798e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.338072e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.338072e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2 +TOTAL : 0.317968 sec + 874,954,516 cycles # 2.715 GHz + 1,952,010,632 instructions # 2.23 insn per cycle + 0.323135602 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3488) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.424749e-01 +Avg ME (F77/C++) = 0.14247490815036912 +Relative difference = 5.7205649062398515e-08 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! Instantiate host Bridge (nevt=16384) +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 6.140069e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.015278e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.015278e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2 +TOTAL : 0.296217 sec + 805,080,990 cycles # 2.697 GHz + 1,834,280,964 instructions # 2.28 insn per cycle + 0.301462842 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3186) (512y: 15) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.424749e-01 +Avg ME (F77/C++) = 0.14247490815036912 +Relative difference = 5.7205649062398515e-08 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! 
Instantiate host Bridge (nevt=16384) +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 4.472935e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.920053e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.920053e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018829e+01 +- 1.429922e+01 ) GeV^-2 +TOTAL : 0.395002 sec + 745,120,207 cycles # 1.866 GHz + 1,329,072,598 instructions # 1.78 insn per cycle + 0.400211929 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1730) (512y: 24) (512z: 2387) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.424749e-01 +Avg ME (F77/C++) = 0.14247490450137867 +Relative difference = 3.159418737238044e-08 +OK (relative difference <= 5E-3) +========================================================================= + +TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd1.txt index d199db87ba..a71ead3e03 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd1.txt @@ -1,110 +1,223 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasNoCurand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) +RNDGEN=hasCurand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd1' +CUDACPP_BUILDDIR='build.512y_f_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.none_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2024-01-28_13:16:40 +DATE: 2024-01-30_05:05:20 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/gcheck.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/gcheck.exe -p 64 256 10 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 5.328749e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.215965e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.352409e+08 ) sec^-1 +MeanMatrixElemValue = ( 2.018174e+01 +- 1.429492e+01 ) GeV^-2 +TOTAL : 0.447516 sec + 1,904,038,107 cycles # 2.819 GHz + 2,679,740,960 instructions # 1.41 insn per cycle + 0.754557698 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/gcheck.exe -p 64 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/gcheck.exe: Segmentation fault - 729,890,155 cycles:u # 2.185 GHz (76.20%) - 2,776,032 stalled-cycles-frontend:u # 0.38% frontend cycles idle (74.23%) - 41,483,645 stalled-cycles-backend:u # 5.68% backend cycles idle (71.44%) - 1,265,802,703 instructions:u # 1.73 insn per cycle - # 0.03 stalled cycles per insn (73.74%) - 0.356567682 seconds time elapsed +==PROF== Profiling "sigmaKin": launch__registers_per_thread 167 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/gcheck.exe: Segmentation fault - 2,586,507,860 cycles:u # 2.874 GHz (74.53%) - 20,964,775 stalled-cycles-frontend:u # 0.81% frontend cycles idle (75.43%) - 845,541,977 stalled-cycles-backend:u # 32.69% backend cycles idle (75.13%) - 2,441,025,850 instructions:u # 0.94 insn per cycle - # 0.35 stalled cycles per insn (75.35%) - 0.918884236 seconds time elapsed +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 7.182679e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.774687e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.914662e+08 ) sec^-1 +MeanMatrixElemValue = ( 2.571361e+02 +- 2.114021e+02 ) GeV^-2 +TOTAL : 0.483893 sec + 2,007,596,287 cycles # 2.824 GHz + 2,863,986,921 instructions # 1.43 insn per cycle + 0.770182944 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/fgcheck.exe 2 64 2 -Memory access fault by GPU node-4 (Agent handle: 0x6938200) on address 0x14a208bdc000. Reason: Unknown. - -Program received signal SIGABRT: Process abort signal. 
- -Backtrace for this error: -#0 0x14a49fa51372 in ??? -#1 0x14a49fa50505 in ??? -#2 0x14a49df46dbf in ??? -#3 0x14a49df46d2b in ??? -#4 0x14a49df483e4 in ??? -#5 0x14a496419b64 in ??? -#6 0x14a496416b38 in ??? -#7 0x14a4963d4496 in ??? -#8 0x14a49dee06e9 in ??? -#9 0x14a49e01449e in ??? -#10 0xffffffffffffffff in ??? -Avg ME (C++/CUDA) = -Avg ME (F77/CUDA) = -ERROR! Fortran calculation (F77/CUDA) crashed +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 1.424226e-01 +Avg ME (F77/CUDA) = 0.14247488790821983 +Relative difference = 0.00036713209996037764 +OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.424910e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.455027e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.455027e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.945525e+02 +- 1.186197e+02 ) GeV^-2 -TOTAL : 1.170740 sec - 4,155,869,791 cycles:u # 3.488 GHz (74.64%) - 1,914,547 stalled-cycles-frontend:u # 0.05% frontend cycles idle (74.96%) - 509,366,618 stalled-cycles-backend:u # 12.26% backend cycles idle (75.16%) - 12,626,103,775 instructions:u # 3.04 insn per cycle - # 0.04 stalled cycles per insn (75.17%) - 1.193298382 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.104449e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.131163e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.131163e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018563e+01 +- 1.429902e+01 ) GeV^-2 +TOTAL : 1.505306 sec + 4,350,737,729 cycles # 2.883 GHz + 12,588,700,465 instructions # 2.89 insn per cycle + 1.517040580 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 759) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/runTest.exe -Memory access fault by GPU node-4 (Agent handle: 0x644060) on address 0x152b34aec000. Reason: Unknown. +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.424686e-01 +Avg ME (F77/C++) = 0.14246860569653919 +Relative difference = 3.998452420257791e-08 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd1/check.exe -p 64 256 10 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.107801e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.322563e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.322563e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018563e+01 +- 1.429902e+01 ) GeV^-2 +TOTAL : 0.548678 sec + 1,589,053,041 cycles # 2.872 GHz + 4,241,478,972 instructions # 2.67 insn per cycle + 0.565533397 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 3248) (avx2: 0) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.424686e-01 +Avg ME (F77/C++) = 0.14246860808920836 +Relative difference = 5.677888572434963e-08 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd1/check.exe -p 64 256 10 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 5.682195e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.406347e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.406347e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2 +TOTAL : 0.310286 sec + 851,032,417 cycles # 2.702 GHz + 1,913,907,734 instructions # 2.25 insn per cycle + 0.327654627 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3463) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.424749e-01 +Avg ME (F77/C++) = 0.14247490815036912 +Relative difference = 5.7205649062398515e-08 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd1/check.exe -p 64 256 10 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 6.251030e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.131063e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.131063e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2 +TOTAL : 0.283621 sec + 779,432,148 cycles # 2.704 GHz + 1,795,928,128 instructions # 2.30 insn per cycle + 0.301196370 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3164) (512y: 15) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.424749e-01 +Avg ME (F77/C++) = 0.14247490815036912 +Relative difference = 5.7205649062398515e-08 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd1/check.exe -p 64 256 10 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 4.530328e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.979352e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.979352e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018829e+01 +- 1.429922e+01 ) GeV^-2 +TOTAL : 0.386557 sec + 722,333,254 cycles # 1.844 GHz + 1,287,373,146 instructions # 1.78 insn per cycle + 0.407217093 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1709) (512y: 24) (512z: 2387) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.424749e-01 +Avg ME (F77/C++) = 0.14247490450137867 +Relative difference = 3.159418737238044e-08 +OK (relative difference <= 5E-3) +========================================================================= + +TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt index 1f1050ab1d..3f17b073e2 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt @@ -1,110 +1,223 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasNoCurand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) +RNDGEN=hasCurand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2024-01-28_13:16:49 +DATE: 2024-01-30_05:05:38 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/gcheck.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/gcheck.exe -p 64 256 10 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 2.696364e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.334716e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.710197e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.455743 sec + 1,899,569,009 cycles # 2.822 GHz + 2,690,270,670 instructions # 1.42 insn per cycle + 0.752301124 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/gcheck.exe -p 64 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/gcheck.exe: Segmentation fault - 726,421,911 cycles:u # 2.167 GHz (76.90%) - 2,699,812 stalled-cycles-frontend:u # 0.37% frontend cycles idle (76.94%) - 39,906,193 stalled-cycles-backend:u # 5.49% backend cycles idle (74.90%) - 1,292,140,442 instructions:u # 1.78 insn per cycle - # 0.03 stalled cycles per insn (72.28%) - 0.356876792 seconds time elapsed +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/gcheck.exe: Segmentation fault - 2,747,293,496 cycles:u # 2.868 GHz (72.77%) - 21,347,674 stalled-cycles-frontend:u # 0.78% frontend cycles idle (74.38%) - 843,785,312 stalled-cycles-backend:u # 30.71% backend cycles idle (76.08%) - 2,518,401,996 instructions:u # 0.92 insn per cycle - # 0.34 stalled cycles per insn (75.72%) - 0.977993834 seconds time elapsed +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 3.256330e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.134663e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.562668e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.602505e+02 +- 2.116328e+02 ) GeV^-2 +TOTAL : 0.544746 sec + 2,203,075,600 cycles # 2.810 GHz + 3,150,811,707 instructions # 1.43 insn per cycle + 0.843284004 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/fgcheck.exe 2 64 2 -Memory access fault by GPU node-4 (Agent handle: 0x693a290) on address 0x147ca1d69000. Reason: Unknown. - -Program received signal SIGABRT: Process abort signal. - -Backtrace for this error: -#0 0x147f38bda372 in ??? -#1 0x147f38bd9505 in ??? -#2 0x147f370cddbf in ??? -#3 0x147f370cdd2b in ??? -#4 0x147f370cf3e4 in ??? -#5 0x147f2f5a0b64 in ??? -#6 0x147f2f59db38 in ??? -#7 0x147f2f55b496 in ??? -#8 0x147f370676e9 in ??? -#9 0x147f3719b49e in ??? -#10 0xffffffffffffffff in ??? -Avg ME (C++/CUDA) = -Avg ME (F77/CUDA) = -ERROR! Fortran calculation (F77/CUDA) crashed +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 1.424749e-01 +Avg ME (F77/CUDA) = 0.14247482577104625 +Relative difference = 5.209967070245855e-07 +OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.168410e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.187978e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.187978e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.914935e+02 +- 1.163297e+02 ) GeV^-2 -TOTAL : 1.425565 sec - 5,053,853,375 cycles:u # 3.490 GHz (74.65%) - 2,147,400 stalled-cycles-frontend:u # 0.04% frontend cycles idle (74.86%) - 857,993,584 stalled-cycles-backend:u # 16.98% backend cycles idle (75.11%) - 13,839,852,873 instructions:u # 2.74 insn per cycle - # 0.06 stalled cycles per insn (75.15%) - 1.450066232 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 9.796791e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.000139e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.000139e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 1.696226 sec + 4,903,205,903 cycles # 2.884 GHz + 13,824,553,372 instructions # 2.82 insn per cycle + 1.707005330 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1135) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/runTest.exe -Memory access fault by GPU node-4 (Agent handle: 0x667850) on address 0x1477c4599000. Reason: Unknown. +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.424749e-01 +Avg ME (F77/C++) = 0.14247482734618697 +Relative difference = 5.099411406595165e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd0/check.exe -p 64 256 10 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.870381e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.944831e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.944831e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.899365 sec + 2,603,553,029 cycles # 2.880 GHz + 7,349,607,266 instructions # 2.82 insn per cycle + 0.916195330 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 2967) (avx2: 0) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.424749e-01 +Avg ME (F77/C++) = 0.14247482734618697 +Relative difference = 5.099411406595165e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd0/check.exe -p 64 256 10 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.167537e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.382178e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.382178e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.541013 sec + 1,471,630,021 cycles # 2.697 GHz + 3,084,577,547 instructions # 2.10 insn per cycle + 0.558891839 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3008) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.424749e-01 +Avg ME (F77/C++) = 0.14247482643254802 +Relative difference = 5.163537715318965e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd0/check.exe -p 64 256 10 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.661938e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.948590e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.948590e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.471097 sec + 1,285,426,170 cycles # 2.700 GHz + 2,873,286,331 instructions # 2.24 insn per cycle + 0.489244149 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2653) (512y: 96) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.424749e-01 +Avg ME (F77/C++) = 0.14247482643254802 +Relative difference = 5.163537715318965e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd0/check.exe -p 64 256 10 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.322096e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.437722e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.437722e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.731479 sec + 1,311,962,532 cycles # 1.782 GHz + 1,915,335,630 instructions # 1.46 insn per cycle + 0.746286183 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1493) (512y: 70) (512z: 2164) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.424749e-01 +Avg ME (F77/C++) = 0.14247482643254802 +Relative difference = 5.163537715318965e-07 +OK (relative difference <= 5E-3) +========================================================================= + +TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd1.txt index f8c7f13430..7294ddea09 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd1.txt @@ -1,110 +1,223 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasNoCurand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) +RNDGEN=hasCurand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd1' +CUDACPP_BUILDDIR='build.512y_m_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.none_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2024-01-28_13:16:58 +DATE: 2024-01-30_05:05:56 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/gcheck.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/gcheck.exe -p 64 256 10 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 2.635631e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.151573e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.502163e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.454560 sec + 1,887,319,720 cycles # 2.810 GHz + 2,686,521,155 instructions # 1.42 insn per cycle + 0.777570467 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/gcheck.exe -p 64 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/gcheck.exe: Segmentation fault - 735,921,519 cycles:u # 2.199 GHz (76.38%) - 2,583,381 stalled-cycles-frontend:u # 0.35% frontend cycles idle (75.85%) - 39,817,056 stalled-cycles-backend:u # 5.41% backend cycles idle (74.64%) - 1,284,923,774 instructions:u # 1.75 insn per cycle - # 0.03 stalled cycles per insn (72.51%) - 0.356777627 seconds time elapsed +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/gcheck.exe: Segmentation fault - 2,678,401,129 cycles:u # 2.826 GHz (74.65%) - 21,071,117 stalled-cycles-frontend:u # 0.79% frontend cycles idle (75.83%) - 849,774,878 stalled-cycles-backend:u # 31.73% backend cycles idle (75.53%) - 2,561,641,144 instructions:u # 0.96 insn per cycle - # 0.33 stalled cycles per insn (75.48%) - 0.986518250 seconds time elapsed +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 3.262333e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.007147e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.410963e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.602505e+02 +- 2.116328e+02 ) GeV^-2 +TOTAL : 0.538673 sec + 2,205,224,099 cycles # 2.822 GHz + 3,150,366,927 instructions # 1.43 insn per cycle + 0.838863876 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/fgcheck.exe 2 64 2 -Memory access fault by GPU node-4 (Agent handle: 0x693a1e0) on address 0x154e15329000. Reason: Unknown. - -Program received signal SIGABRT: Process abort signal. 
- -Backtrace for this error: -#0 0x1551ac1af372 in ??? -#1 0x1551ac1ae505 in ??? -#2 0x1551aa6a2dbf in ??? -#3 0x1551aa6a2d2b in ??? -#4 0x1551aa6a43e4 in ??? -#5 0x1551a2b75b64 in ??? -#6 0x1551a2b72b38 in ??? -#7 0x1551a2b30496 in ??? -#8 0x1551aa63c6e9 in ??? -#9 0x1551aa77049e in ??? -#10 0xffffffffffffffff in ??? -Avg ME (C++/CUDA) = -Avg ME (F77/CUDA) = -ERROR! Fortran calculation (F77/CUDA) crashed +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 1.424749e-01 +Avg ME (F77/CUDA) = 0.14247482577104625 +Relative difference = 5.209967070245855e-07 +OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.169455e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.189040e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.189040e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.914935e+02 +- 1.163297e+02 ) GeV^-2 -TOTAL : 1.424073 sec - 5,045,556,451 cycles:u # 3.490 GHz (74.78%) - 2,489,730 stalled-cycles-frontend:u # 0.05% frontend cycles idle (75.01%) - 792,720,433 stalled-cycles-backend:u # 15.71% backend cycles idle (75.11%) - 13,852,579,012 instructions:u # 2.75 insn per cycle - # 0.06 stalled cycles per insn (75.11%) - 1.447924472 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 9.769998e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.971532e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.971532e+04 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 1.700454 sec + 4,910,062,395 cycles # 2.880 GHz + 13,831,764,171 instructions # 2.82 insn per cycle + 1.712052278 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1130) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/runTest.exe -Memory access fault by GPU node-4 (Agent handle: 0x6664d0) on address 0x14ef36f79000. Reason: Unknown. +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. 
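The "Avg ME" stanzas above encode the acceptance criterion used throughout these logs: the Fortran-bridge average matrix element is compared against the C++/CUDA one, and the run passes when their relative difference stays below 5E-3. A minimal Python sketch, assuming the check is a plain relative difference normalized by the C++/CUDA value (the comparison script's exact normalization is an assumption):

  def rel_diff(a, b):
      # assumed normalization: |a - b| / |a|
      return abs(a - b) / abs(a)

  avg_me_cuda = 1.424749e-01          # "Avg ME (C++/CUDA)" from the log above
  avg_me_f77 = 0.14247482577104625    # "Avg ME (F77/CUDA)" from the log above

  d = rel_diff(avg_me_cuda, avg_me_f77)
  print(f"Relative difference = {d}")  # ~5.209967e-07, matching the log
  assert d <= 5e-3
  print("OK (relative difference <= 5E-3)")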
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.424749e-01 +Avg ME (F77/C++) = 0.14247482734618697 +Relative difference = 5.099411406595165e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd1/check.exe -p 64 256 10 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.857842e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.932046e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.932046e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.905414 sec + 2,615,099,772 cycles # 2.873 GHz + 7,353,136,311 instructions # 2.81 insn per cycle + 0.925236073 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 2957) (avx2: 0) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.424749e-01 +Avg ME (F77/C++) = 0.14247482734618697 +Relative difference = 5.099411406595165e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd1/check.exe -p 64 256 10 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.160999e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.374264e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.374264e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.541919 sec + 1,475,084,747 cycles # 2.698 GHz + 3,084,915,220 instructions # 2.09 insn per cycle + 0.559487031 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2986) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.424749e-01 +Avg ME (F77/C++) = 0.14247482643254802 +Relative difference = 5.163537715318965e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd1/check.exe -p 64 256 10 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.676411e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.967587e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.967587e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.469154 sec + 1,285,211,957 cycles # 2.712 GHz + 2,875,140,516 instructions # 2.24 insn per cycle + 0.485058196 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2636) (512y: 96) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. 
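The "=Symbols in CPPProcess.o=" lines above tally SIMD instruction usage per build. As a rough illustration only (the repo's actual classification of sse4/avx2/512y/512z mnemonics is not shown in these logs and is assumed to be more refined), a crude proxy is to disassemble the object file and count 256-bit versus 512-bit register operands:

  import re
  import subprocess

  # Hypothetical path; point this at any CPPProcess.o produced by the builds above.
  obj = "build.512y_m_inl0_hrd1/CPPProcess.o"
  asm = subprocess.run(["objdump", "-d", obj],
                       capture_output=True, text=True).stdout
  ymm = len(re.findall(r"%ymm", asm))  # 256-bit operands (avx2/512y-style code)
  zmm = len(re.findall(r"%zmm", asm))  # 512-bit operands (512z-style code)
  print(f"ymm refs: {ymm}  zmm refs: {zmm}")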
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.424749e-01 +Avg ME (F77/C++) = 0.14247482643254802 +Relative difference = 5.163537715318965e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd1/check.exe -p 64 256 10 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.334432e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.451352e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.451352e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.728206 sec + 1,313,839,367 cycles # 1.794 GHz + 1,915,620,790 instructions # 1.46 insn per cycle + 0.743678029 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1476) (512y: 70) (512z: 2164) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. 
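For context on the throughput figures, the numbers above are internally consistent if "-p 64 256 10" is read as blocks x threads x iterations (an assumption about the check.exe arguments), giving 163840 events per run; each EvtsPerSec figure is then events divided by the time spent in the corresponding phase. A quick Python cross-check against the 512z block above:

  blocks, threads, iterations = 64, 256, 10
  nevt = blocks * threads * iterations    # 163840 events, assuming -p semantics
  evts_per_sec = 2.451352e+05             # EvtsPerSec[MatrixElems] from the log
  t_me = nevt / evts_per_sec
  print(f"ME time ~ {t_me:.3f}s of TOTAL 0.728206s")  # ~0.668s; the rest is overhead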
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.424749e-01 +Avg ME (F77/C++) = 0.14247482643254802 +Relative difference = 5.163537715318965e-07 +OK (relative difference <= 5E-3) +========================================================================= + +TEST COMPLETED From 6c42fc2a62535e86bf02acfb39a5803ca98d2d50 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Tue, 30 Jan 2024 10:41:38 +0100 Subject: [PATCH 92/96] [jt774] rerun 18 tmad tests on itscrd90, all ok [NB using code generated before Olivier's commit] STARTED AT Tue Jan 30 06:02:30 AM CET 2024 ENDED AT Tue Jan 30 10:37:48 AM CET 2024 Status=0 24 /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt 24 /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt 24 /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt 24 /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt 24 /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt 24 /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt 24 /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt 24 /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt 24 /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt 24 /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt 24 /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt 24 /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt 24 /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt 24 /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt 24 /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt 24 /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt 24 /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt 24 /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt --- .../log_eemumu_mad_d_inl0_hrd0.txt | 390 +++++++++++----- .../log_eemumu_mad_f_inl0_hrd0.txt | 418 +++++++++++------ .../log_eemumu_mad_m_inl0_hrd0.txt | 388 ++++++++++----- .../log_ggtt_mad_d_inl0_hrd0.txt | 390 +++++++++++----- .../log_ggtt_mad_f_inl0_hrd0.txt | 416 +++++++++++------ .../log_ggtt_mad_m_inl0_hrd0.txt | 392 +++++++++++----- .../log_ggttg_mad_d_inl0_hrd0.txt | 390 +++++++++++----- .../log_ggttg_mad_f_inl0_hrd0.txt | 414 ++++++++++------ .../log_ggttg_mad_m_inl0_hrd0.txt | 390 +++++++++++----- .../log_ggttgg_mad_d_inl0_hrd0.txt | 394 +++++++++++----- .../log_ggttgg_mad_f_inl0_hrd0.txt | 416 +++++++++++------ .../log_ggttgg_mad_m_inl0_hrd0.txt | 394 +++++++++++----- 
.../log_ggttggg_mad_d_inl0_hrd0.txt | 394 +++++++++++----- .../log_ggttggg_mad_f_inl0_hrd0.txt | 418 +++++++++++------ .../log_ggttggg_mad_m_inl0_hrd0.txt | 392 +++++++++++----- .../log_gqttq_mad_d_inl0_hrd0.txt | 417 +++++++++++++---- .../log_gqttq_mad_f_inl0_hrd0.txt | 441 +++++++++++++----- .../log_gqttq_mad_m_inl0_hrd0.txt | 415 ++++++++++++---- 18 files changed, 5091 insertions(+), 2178 deletions(-) diff --git a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt index 6e36ff4f89..459e70d382 100644 --- a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum CUDACPP_BUILDDIR='.' make USEBUILDDIR=1 AVX=none make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make USEBUILDDIR=1 AVX=avx2 +make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' OMP_NUM_THREADS= -DATE: 2024-01-28_14:10:41 +DATE: 2024-01-30_06:09:28 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: -Working directory (run): /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -50,8 +50,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_eemumu_x1_fortran > /tmp/valassia/output_eemumu_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/avalassi/output_eemumu_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/valassia/input_eemumu_x1_fortran > /tmp/va [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169064681776] fbridge_mode=0 [UNWEIGHT] Wrote 3893 events (found 7395 events) - [COUNTERS] PROGRAM TOTAL : 0.4854s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4783s - [COUNTERS] Fortran MEs ( 1 ) : 0.0071s for 8192 events => throughput is 1.15E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.6491s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6403s + [COUNTERS] Fortran MEs ( 1 ) : 0.0088s for 8192 events => throughput is 9.31E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -75,8 +75,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_eemumu_x1_fortran > /tmp/valassia/output_eemumu_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/avalassi/output_eemumu_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/valassia/input_eemumu_x1_fortran > /tmp/va [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169064681776] fbridge_mode=0 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1406s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1335s - [COUNTERS] Fortran MEs ( 1 ) : 0.0071s for 8192 events => throughput is 1.15E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1850s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1762s + [COUNTERS] Fortran MEs ( 1 ) : 0.0089s for 8192 events => throughput is 9.22E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -100,8 +100,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_eemumu_x10_fortran > /tmp/valassia/output_eemumu_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x10_fortran > /tmp/avalassi/output_eemumu_x10_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/valassia/input_eemumu_x10_fortran > /tmp/v [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501919904813669E-002] fbridge_mode=0 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.3372s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2609s - [COUNTERS] Fortran MEs ( 1 ) : 0.0763s for 90112 events => throughput is 1.18E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.4417s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3482s + [COUNTERS] Fortran MEs ( 1 ) : 0.0935s for 90112 events => throughput is 9.64E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -125,8 +125,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x1_cudacpp > /tmp/valassia/output_eemumu_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -134,9 +134,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169064681779] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1478s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1419s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0059s for 8192 events => throughput is 1.38E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1896s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1825s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0071s for 8192 events => throughput is 1.15E+06 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -158,8 +158,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x10_cudacpp > /tmp/valassia/output_eemumu_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -167,9 +167,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501919904813669E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.3339s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2687s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0652s for 90112 events => throughput is 1.38E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.4377s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3566s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0811s for 90112 events => throughput is 1.11E+06 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -180,14 +180,14 @@ OK! xsec from fortran (9.1501919904813669E-002) and cpp (9.1501919904813669E-002 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.415290e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.115404e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.439208e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.135181e+06 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -201,8 +201,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x1_cudacpp > /tmp/valassia/output_eemumu_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -210,9 +210,9 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169064681779] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1436s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1401s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0035s for 8192 events => throughput is 2.33E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1854s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1812s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0042s for 8192 events => throughput is 1.96E+06 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -234,8 +234,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x10_cudacpp > /tmp/valassia/output_eemumu_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -243,9 +243,9 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501919904813656E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.3044s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2659s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0385s for 90112 events => throughput is 2.34E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.4018s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3542s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0476s for 90112 events => throughput is 1.89E+06 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -256,14 +256,14 @@ OK! xsec from fortran (9.1501919904813669E-002) and cpp (9.1501919904813656E-002 OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.384930e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.873422e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.430126e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.997339e+06 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -277,8 +277,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x1_cudacpp > /tmp/valassia/output_eemumu_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -286,9 +286,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169064681776] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1405s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1381s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0025s for 8192 events => throughput is 3.34E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1841s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1809s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0031s for 8192 events => throughput is 2.62E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -310,8 +310,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x10_cudacpp > /tmp/valassia/output_eemumu_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -319,9 +319,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501919904813669E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.2972s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2702s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0271s for 90112 events => throughput is 3.33E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3915s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3558s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0357s for 90112 events => throughput is 2.53E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -332,18 +332,166 @@ OK! xsec from fortran (9.1501919904813669E-002) and cpp (9.1501919904813669E-002 OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.422146e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.563581e+06 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.701936e+06 ) sec^-1 + +*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! 
Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 4/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.2175 [0.21747169064681776] fbridge_mode=1 + [UNWEIGHT] Wrote 1611 events (found 1616 events) + [COUNTERS] PROGRAM TOTAL : 0.1820s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1791s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0030s for 8192 events => throughput is 2.77E+06 events/s + +*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (0.21747169064681776) and cpp (0.21747169064681776) differ by less than 2E-14 (0.0) + +*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 4/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.0915 [9.1501919904813669E-002] fbridge_mode=1 + [UNWEIGHT] Wrote 1803 events (found 1808 events) + [COUNTERS] PROGRAM TOTAL : 0.3886s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3550s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0336s for 90112 events => throughput is 2.68E+06 events/s + +*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (9.1501919904813669E-002) and cpp (9.1501919904813669E-002) differ by less than 2E-14 (0.0) + +*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.733041e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.551254e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.837193e+06 ) sec^-1 + +*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 4/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.2175 [0.21747169064681776] fbridge_mode=1 + [UNWEIGHT] Wrote 1611 events (found 1616 events) + [COUNTERS] PROGRAM TOTAL : 0.1852s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1819s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0033s for 8192 events => throughput is 2.50E+06 events/s + +*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (0.21747169064681776) and cpp (0.21747169064681776) differ by less than 2E-14 (0.0) + +*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 4/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.0915 [9.1501919904813669E-002] fbridge_mode=1 + [UNWEIGHT] Wrote 1803 events (found 1808 events) + [COUNTERS] PROGRAM TOTAL : 0.3986s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3589s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0397s for 90112 events => throughput is 2.27E+06 events/s + +*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (9.1501919904813669E-002) and cpp (9.1501919904813669E-002) differ by less than 2E-14 (0.0) -*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) *** +*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** -*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) *** +OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.277912e+06 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.388817e+06 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -357,18 +505,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/valassia/input_eemumu_x1_cudacpp > /tmp/valassia/output_eemumu_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 - [NGOODHEL] ngoodhel/ncomb = 16/16 +Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169064681776] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.4130s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4126s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0004s for 8192 events => throughput is 2.02E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.6311s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6306s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0005s for 8192 events => throughput is 1.64E+07 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -390,65 +538,65 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/valassia/input_eemumu_x10_cudacpp > /tmp/valassia/output_eemumu_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 - [NGOODHEL] ngoodhel/ncomb = 16/16 +Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.0915 [9.1501919904813669E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.0915 [9.1501919904813656E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.5450s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5405s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0045s for 90112 events => throughput is 2.02E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.7893s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7841s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0052s for 90112 events => throughput is 1.73E+07 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.1501919904813669E-002) and cpp (9.1501919904813669E-002) differ by less than 2E-14 (0.0) +OK! xsec from fortran (9.1501919904813669E-002) and cpp (9.1501919904813656E-002) differ by less than 2E-14 (1.1102230246251565e-16) *** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.227145e+07 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.924722e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.604349e+07 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.934037e+08 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.300423e+07 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.691813e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.897629e+07 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.449601e+08 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** 
-Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.338370e+07 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.683159e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.949984e+07 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.033957e+08 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.235340e+07 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.708340e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.550220e+07 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.130631e+08 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt index af7e96ea68..161c62cc9b 100644 --- a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum CUDACPP_BUILDDIR='.' 
+ make USEBUILDDIR=1 AVX=none -make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make USEBUILDDIR=1 AVX=sse4 make USEBUILDDIR=1 AVX=avx2 - make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' OMP_NUM_THREADS= -DATE: 2024-01-28_14:10:55 +DATE: 2024-01-30_06:09:46 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: -Working directory (run): /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -50,8 +50,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_eemumu_x1_fortran > /tmp/valassia/output_eemumu_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/avalassi/output_eemumu_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/valassia/input_eemumu_x1_fortran > /tmp/va [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169064681776] fbridge_mode=0 [UNWEIGHT] Wrote 3893 events (found 7395 events) - [COUNTERS] PROGRAM TOTAL : 0.4650s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4578s - [COUNTERS] Fortran MEs ( 1 ) : 0.0072s for 8192 events => throughput is 1.15E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.6495s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6408s + [COUNTERS] Fortran MEs ( 1 ) : 0.0086s for 8192 events => throughput is 9.47E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -75,8 +75,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_eemumu_x1_fortran > /tmp/valassia/output_eemumu_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/avalassi/output_eemumu_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/valassia/input_eemumu_x1_fortran > /tmp/va [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169064681776] fbridge_mode=0 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1413s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1342s - [COUNTERS] Fortran MEs ( 1 ) : 0.0071s for 8192 events => throughput is 1.15E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1848s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1762s + [COUNTERS] Fortran MEs ( 1 ) : 0.0085s for 8192 events => throughput is 9.59E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -100,8 +100,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! 
Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_eemumu_x10_fortran > /tmp/valassia/output_eemumu_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x10_fortran > /tmp/avalassi/output_eemumu_x10_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/valassia/input_eemumu_x10_fortran > /tmp/v [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501919904813669E-002] fbridge_mode=0 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.3373s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2610s - [COUNTERS] Fortran MEs ( 1 ) : 0.0764s for 90112 events => throughput is 1.18E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.4421s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3487s + [COUNTERS] Fortran MEs ( 1 ) : 0.0934s for 90112 events => throughput is 9.65E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -125,22 +125,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x1_cudacpp > /tmp/valassia/output_eemumu_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2175 [0.21747165804194712] fbridge_mode=1 + [XSECTION] Cross section = 0.2175 [0.21747165492032638] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1458s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1406s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0052s for 8192 events => throughput is 1.59E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1909s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1840s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0069s for 8192 events => throughput is 1.19E+06 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.21747169064681776) and cpp (0.21747165804194712) differ by less than 4E-4 (1.4992696539817274e-07) +OK! xsec from fortran (0.21747169064681776) and cpp (0.21747165492032638) differ by less than 4E-4 (1.6428111293542713e-07) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -158,36 +158,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x10_cudacpp > /tmp/valassia/output_eemumu_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.0915 [9.1501906417650977E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.0915 [9.1501905274264717E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.3287s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2722s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0565s for 90112 events => throughput is 1.60E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.4341s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3569s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0773s for 90112 events => throughput is 1.17E+06 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.1501919904813669E-002) and cpp (9.1501906417650977E-002) differ by less than 4E-4 (1.473975923538262e-07) +OK! xsec from fortran (9.1501919904813669E-002) and cpp (9.1501905274264717E-002) differ by less than 4E-4 (1.5989335488963974e-07) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.646088e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.185144e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.666997e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.215006e+06 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -201,22 +201,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x1_cudacpp > /tmp/valassia/output_eemumu_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2175 [0.21747170102104579] fbridge_mode=1 + [XSECTION] Cross section = 0.2175 [0.21747165570339780] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1404s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1383s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0021s for 8192 events => throughput is 3.90E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1811s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1786s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0025s for 8192 events => throughput is 3.30E+06 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.21747169064681776) and cpp (0.21747170102104579) differ by less than 4E-4 (4.77038091251103e-08) +OK! xsec from fortran (0.21747169064681776) and cpp (0.21747165570339780) differ by less than 4E-4 (1.6068031594151932e-07) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -234,36 +234,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x10_cudacpp > /tmp/valassia/output_eemumu_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.0915 [9.1501924220365086E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.0915 [9.1501905322826635E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.2878s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2644s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0234s for 90112 events => throughput is 3.85E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3824s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3531s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0293s for 90112 events => throughput is 3.08E+06 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.1501919904813669E-002) and cpp (9.1501924220365086E-002) differ by less than 4E-4 (4.716350665567859e-08) +OK! xsec from fortran (9.1501919904813669E-002) and cpp (9.1501905322826635E-002) differ by less than 4E-4 (1.5936263464411127e-07) *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.048771e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.133589e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.232321e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.329458e+06 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -277,22 +277,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x1_cudacpp > /tmp/valassia/output_eemumu_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2175 [0.21747170107722075] fbridge_mode=1 + [XSECTION] Cross section = 0.2175 [0.21747165593922979] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1388s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1370s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0018s for 8192 events => throughput is 4.66E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1817s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1793s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0024s for 8192 events => throughput is 3.43E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.21747169064681776) and cpp (0.21747170107722075) differ by less than 4E-4 (4.7962118499000894e-08) +OK! xsec from fortran (0.21747169064681776) and cpp (0.21747165593922979) differ by less than 4E-4 (1.5959588972602745e-07) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -310,40 +310,188 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x10_cudacpp > /tmp/valassia/output_eemumu_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.0915 [9.1501924223714337E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.0915 [9.1501905316084181E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.2845s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2651s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0194s for 90112 events => throughput is 4.64E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3770s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3509s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0262s for 90112 events => throughput is 3.44E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.1501919904813669E-002) and cpp (9.1501924223714337E-002) differ by less than 4E-4 (4.720010982062206e-08) +OK! xsec from fortran (9.1501919904813669E-002) and cpp (9.1501905316084181E-002) differ by less than 4E-4 (1.5943632114545636e-07) *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.878508e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.621496e+06 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.716307e+06 ) sec^-1 + +*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 4/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.2175 [0.21747165593922979] fbridge_mode=1 + [UNWEIGHT] Wrote 1611 events (found 1616 events) + [COUNTERS] PROGRAM TOTAL : 0.1846s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1824s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0023s for 8192 events => throughput is 3.63E+06 events/s + +*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (0.21747169064681776) and cpp (0.21747165593922979) differ by less than 4E-4 (1.5959588972602745e-07) + +*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 4/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.0915 [9.1501905316084181E-002] fbridge_mode=1 + [UNWEIGHT] Wrote 1803 events (found 1808 events) + [COUNTERS] PROGRAM TOTAL : 0.3795s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3546s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0248s for 90112 events => throughput is 3.63E+06 events/s + +*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (9.1501919904813669E-002) and cpp (9.1501905316084181E-002) differ by less than 4E-4 (1.5943632114545636e-07) + +*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.744053e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.048712e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.971296e+06 ) sec^-1 + +*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 4/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.2175 [0.21747166446533123] fbridge_mode=1 + [UNWEIGHT] Wrote 1611 events (found 1616 events) + [COUNTERS] PROGRAM TOTAL : 0.1822s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1798s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0024s for 8192 events => throughput is 3.36E+06 events/s + +*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (0.21747169064681776) and cpp (0.21747166446533123) differ by less than 4E-4 (1.2039032049049325e-07) + +*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical -*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) *** +*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 4/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.0915 [9.1501908990866423E-002] fbridge_mode=1 + [UNWEIGHT] Wrote 1803 events (found 1808 events) + [COUNTERS] PROGRAM TOTAL : 0.3805s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3541s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0263s for 90112 events => throughput is 3.42E+06 events/s + +*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) *** +OK! xsec from fortran (9.1501919904813669E-002) and cpp (9.1501908990866423E-002) differ by less than 4E-4 (1.1927560927826875e-07) + +*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.615840e+06 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.903927e+06 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -357,22 +505,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/valassia/input_eemumu_x1_cudacpp > /tmp/valassia/output_eemumu_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 - [NGOODHEL] ngoodhel/ncomb = 16/16 +Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2175 [0.21747166473699145] fbridge_mode=1 + [XSECTION] Cross section = 0.2175 [0.21747166823487174] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.4212s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4209s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0003s for 8192 events => throughput is 2.93E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.6073s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6068s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0005s for 8192 events => throughput is 1.61E+07 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.21747169064681776) and cpp (0.21747166473699145) differ by less than 4E-4 (1.1914114539379739e-07) +OK! xsec from fortran (0.21747169064681776) and cpp (0.21747166823487174) differ by less than 4E-4 (1.0305684361444634e-07) *** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -390,65 +538,65 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! 
Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/valassia/input_eemumu_x10_cudacpp > /tmp/valassia/output_eemumu_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 - [NGOODHEL] ngoodhel/ncomb = 16/16 +Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.0915 [9.1501909133729520E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.0915 [9.1501910542849674E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.5440s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5410s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0030s for 90112 events => throughput is 3.01E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.7841s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7794s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0048s for 90112 events => throughput is 1.89E+07 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.1501919904813669E-002) and cpp (9.1501909133729520E-002) differ by less than 4E-4 (1.1771429675455636e-07) +OK! xsec from fortran (9.1501919904813669E-002) and cpp (9.1501910542849674E-002) differ by less than 4E-4 (1.0231439961927435e-07) *** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.778755e+07 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.032746e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.253473e+07 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.810870e+08 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.336334e+08 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.874936e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.699323e+08 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = 
CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.028452e+09 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.325474e+08 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.891915e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.852032e+08 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.234607e+09 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.106896e+08 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.256002e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.950493e+07 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.441320e+08 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt index cf5ccd68a6..f51b70af46 100644 --- a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum CUDACPP_BUILDDIR='.' 
+ + make USEBUILDDIR=1 AVX=none make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' - make USEBUILDDIR=1 AVX=avx2 - make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' -CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' OMP_NUM_THREADS= -DATE: 2024-01-28_14:11:09 +DATE: 2024-01-30_06:10:03 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: -Working directory (run): /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -50,8 +50,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_eemumu_x1_fortran > /tmp/valassia/output_eemumu_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/avalassi/output_eemumu_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/valassia/input_eemumu_x1_fortran > /tmp/va [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169064681776] fbridge_mode=0 [UNWEIGHT] Wrote 3893 events (found 7395 events) - [COUNTERS] PROGRAM TOTAL : 0.4758s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4687s - [COUNTERS] Fortran MEs ( 1 ) : 0.0071s for 8192 events => throughput is 1.15E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.6504s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6416s + [COUNTERS] Fortran MEs ( 1 ) : 0.0088s for 8192 events => throughput is 9.35E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -75,8 +75,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_eemumu_x1_fortran > /tmp/valassia/output_eemumu_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/avalassi/output_eemumu_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/valassia/input_eemumu_x1_fortran > /tmp/va [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169064681776] fbridge_mode=0 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1415s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1343s - [COUNTERS] Fortran MEs ( 1 ) : 0.0071s for 8192 events => throughput is 1.15E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1846s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1761s + [COUNTERS] Fortran MEs ( 1 ) : 0.0086s for 8192 events => throughput is 9.55E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -100,8 +100,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! 
Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_eemumu_x10_fortran > /tmp/valassia/output_eemumu_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x10_fortran > /tmp/avalassi/output_eemumu_x10_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/valassia/input_eemumu_x10_fortran > /tmp/v [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501919904813669E-002] fbridge_mode=0 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.3384s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2620s - [COUNTERS] Fortran MEs ( 1 ) : 0.0763s for 90112 events => throughput is 1.18E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.4424s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3488s + [COUNTERS] Fortran MEs ( 1 ) : 0.0936s for 90112 events => throughput is 9.63E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -125,8 +125,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x1_cudacpp > /tmp/valassia/output_eemumu_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -134,9 +134,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169074211736] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1475s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1415s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0060s for 8192 events => throughput is 1.38E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1932s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1857s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0075s for 8192 events => throughput is 1.09E+06 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -158,8 +158,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x10_cudacpp > /tmp/valassia/output_eemumu_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -167,9 +167,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501919915927155E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.3333s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2679s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0655s for 90112 events => throughput is 1.38E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.4421s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3600s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0820s for 90112 events => throughput is 1.10E+06 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -180,14 +180,14 @@ OK! xsec from fortran (9.1501919904813669E-002) and cpp (9.1501919915927155E-002 OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.418226e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.101615e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.426804e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.125271e+06 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -201,8 +201,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x1_cudacpp > /tmp/valassia/output_eemumu_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -210,9 +210,9 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169074211734] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1425s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1390s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0035s for 8192 events => throughput is 2.34E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1841s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1799s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0042s for 8192 events => throughput is 1.94E+06 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -234,8 +234,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x10_cudacpp > /tmp/valassia/output_eemumu_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -243,9 +243,9 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501919915927155E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.3047s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2664s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0383s for 90112 events => throughput is 2.35E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.4024s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3561s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0464s for 90112 events => throughput is 1.94E+06 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -256,14 +256,14 @@ OK! xsec from fortran (9.1501919904813669E-002) and cpp (9.1501919915927155E-002 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.503318e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.982524e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.509365e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.072335e+06 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -277,8 +277,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x1_cudacpp > /tmp/valassia/output_eemumu_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -286,9 +286,9 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169063975949] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1441s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1416s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0026s for 8192 events => throughput is 3.20E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1864s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1831s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0033s for 8192 events => throughput is 2.46E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -310,8 +310,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x10_cudacpp > /tmp/valassia/output_eemumu_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -319,9 +319,9 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501919908700741E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.2933s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2653s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0280s for 90112 events => throughput is 3.22E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3912s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3545s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0368s for 90112 events => throughput is 2.45E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -332,18 +332,166 @@ OK! xsec from fortran (9.1501919904813669E-002) and cpp (9.1501919908700741E-002 OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.355475e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.431608e+06 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.633191e+06 ) sec^-1 + +*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 4/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.2175 [0.21747169063975949] fbridge_mode=1 + [UNWEIGHT] Wrote 1611 events (found 1616 events) + [COUNTERS] PROGRAM TOTAL : 0.1842s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1811s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0031s for 8192 events => throughput is 2.65E+06 events/s + +*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! 
xsec from fortran (0.21747169064681776) and cpp (0.21747169063975949) differ by less than 2E-4 (3.24560378572869e-11) + +*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 4/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.0915 [9.1501919908700741E-002] fbridge_mode=1 + [UNWEIGHT] Wrote 1803 events (found 1808 events) + [COUNTERS] PROGRAM TOTAL : 0.3888s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3546s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0343s for 90112 events => throughput is 2.63E+06 events/s + +*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (9.1501919904813669E-002) and cpp (9.1501919908700741E-002) differ by less than 2E-4 (4.248068563583729e-11) + +*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.754832e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.423534e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.841477e+06 ) sec^-1 -*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) *** +*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 4/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.2175 [0.21747169063975949] fbridge_mode=1 + [UNWEIGHT] Wrote 1611 events (found 1616 events) + [COUNTERS] PROGRAM TOTAL : 0.1840s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1805s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0035s for 8192 events => throughput is 2.33E+06 events/s + +*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) *** +OK! xsec from fortran (0.21747169064681776) and cpp (0.21747169063975949) differ by less than 2E-4 (3.24560378572869e-11) + +*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 4/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.0915 [9.1501919908700741E-002] fbridge_mode=1 + [UNWEIGHT] Wrote 1803 events (found 1808 events) + [COUNTERS] PROGRAM TOTAL : 0.3972s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3574s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0398s for 90112 events => throughput is 2.26E+06 events/s + +*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (9.1501919904813669E-002) and cpp (9.1501919908700741E-002) differ by less than 2E-4 (4.248068563583729e-11) + +*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.333320e+06 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.456277e+06 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -357,18 +505,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! 
Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/valassia/input_eemumu_x1_cudacpp > /tmp/valassia/output_eemumu_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 - [NGOODHEL] ngoodhel/ncomb = 16/16 +Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169066587257] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.4127s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4122s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0004s for 8192 events => throughput is 1.96E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.6100s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6095s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0005s for 8192 events => throughput is 1.62E+07 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -390,65 +538,65 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/valassia/input_eemumu_x10_cudacpp > /tmp/valassia/output_eemumu_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 - [NGOODHEL] ngoodhel/ncomb = 16/16 +Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.0915 [9.1501919911173596E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.0915 [9.1501919911173610E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.5439s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5395s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0044s for 90112 events => throughput is 2.07E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.7889s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7838s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0052s for 90112 events => throughput is 1.74E+07 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.1501919904813669E-002) and cpp (9.1501919911173596E-002) differ by less than 2E-4 (6.950595654586778e-11) +OK! xsec from fortran (9.1501919904813669E-002) and cpp (9.1501919911173610E-002) differ by less than 2E-4 (6.95061785904727e-11) *** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.213267e+07 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.926990e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.613214e+07 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.883911e+08 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.277391e+07 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.714682e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.902459e+07 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.463238e+08 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.271988e+07 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.709935e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.955276e+07 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.999719e+08 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.232579e+07 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.716961e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK 
-EvtsPerSec[MECalcOnly] (3a) = ( 2.550864e+07 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.154425e+08 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt index da4f7b996b..6a2d60f404 100644 --- a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx CUDACPP_BUILDDIR='.' + + make USEBUILDDIR=1 AVX=none make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' - make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' - make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' OMP_NUM_THREADS= -DATE: 2024-01-28_14:11:24 +DATE: 2024-01-30_06:10:22 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: -Working directory (run): /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -50,8 +50,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggtt_x1_fortran > /tmp/valassia/output_ggtt_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/avalassi/output_ggtt_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/valassia/input_ggtt_x1_fortran > /tmp/vala [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690708277600116] fbridge_mode=0 [UNWEIGHT] Wrote 420 events (found 1577 events) - [COUNTERS] PROGRAM TOTAL : 0.3530s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3174s - [COUNTERS] Fortran MEs ( 1 ) : 0.0356s for 8192 events => throughput is 2.30E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4078s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3635s + [COUNTERS] Fortran MEs ( 1 ) : 0.0443s for 8192 events => throughput is 1.85E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -75,8 +75,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggtt_x1_fortran > /tmp/valassia/output_ggtt_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/avalassi/output_ggtt_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/valassia/input_ggtt_x1_fortran > /tmp/vala [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690708277600116] fbridge_mode=0 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.2591s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2234s - [COUNTERS] Fortran MEs ( 1 ) : 0.0356s for 8192 events => throughput is 2.30E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3357s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2918s + [COUNTERS] Fortran MEs ( 1 ) : 0.0439s for 8192 events => throughput is 1.87E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -100,8 +100,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggtt_x10_fortran > /tmp/valassia/output_ggtt_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x10_fortran > /tmp/avalassi/output_ggtt_x10_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/valassia/input_ggtt_x10_fortran > /tmp/val [XSECTION] ChannelId = 1 [XSECTION] Cross section = 46.22 [46.223782291775372] fbridge_mode=0 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.4005s - [COUNTERS] Fortran Overhead ( 0 ) : 1.0114s - [COUNTERS] Fortran MEs ( 1 ) : 0.3891s for 90112 events => throughput is 2.32E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.8736s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3937s + [COUNTERS] Fortran MEs ( 1 ) : 0.4799s for 90112 events => throughput is 1.88E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -125,8 +125,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x1_cudacpp > /tmp/valassia/output_ggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -134,9 +134,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690708277600102] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.2907s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2587s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0320s for 8192 events => throughput is 2.56E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3701s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3307s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0394s for 8192 events => throughput is 2.08E+05 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -158,8 +158,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x10_cudacpp > /tmp/valassia/output_ggtt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -167,9 +167,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 46.22 [46.223782291775379] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.3973s - [COUNTERS] Fortran Overhead ( 0 ) : 1.0452s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3521s for 90112 events => throughput is 2.56E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.8532s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4213s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.4319s for 90112 events => throughput is 2.09E+05 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -180,14 +180,14 @@ OK! xsec from fortran (46.223782291775372) and cpp (46.223782291775379) differ b OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.583416e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.120291e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.597741e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.118287e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -201,8 +201,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x1_cudacpp > /tmp/valassia/output_ggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -210,9 +210,9 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690708277600109] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.2630s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2453s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0177s for 8192 events => throughput is 4.63E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3362s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3135s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0226s for 8192 events => throughput is 3.62E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -234,8 +234,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x10_cudacpp > /tmp/valassia/output_ggtt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -243,9 +243,9 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 46.22 [46.223782291775379] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.2297s - [COUNTERS] Fortran Overhead ( 0 ) : 1.0339s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1958s for 90112 events => throughput is 4.60E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.6510s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4023s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2487s for 90112 events => throughput is 3.62E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -256,14 +256,14 @@ OK! xsec from fortran (46.223782291775372) and cpp (46.223782291775379) differ b OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.723096e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.657266e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.742482e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.745669e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -277,8 +277,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x1_cudacpp > /tmp/valassia/output_ggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -286,9 +286,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690708277600109] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.2471s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2369s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0102s for 8192 events => throughput is 7.99E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3190s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3046s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0144s for 8192 events => throughput is 5.69E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -310,8 +310,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x10_cudacpp > /tmp/valassia/output_ggtt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -319,9 +319,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 46.22 [46.223782291775393] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.1362s - [COUNTERS] Fortran Overhead ( 0 ) : 1.0239s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1123s for 90112 events => throughput is 8.02E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.5517s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3928s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1589s for 90112 events => throughput is 5.67E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -332,18 +332,166 @@ OK! xsec from fortran (46.223782291775372) and cpp (46.223782291775393) differ b OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.230738e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.690369e+05 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.998122e+05 ) sec^-1 + +*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! 
Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 47.69 [47.690708277600109] fbridge_mode=1 + [UNWEIGHT] Wrote 434 events (found 1125 events) + [COUNTERS] PROGRAM TOTAL : 0.3210s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3086s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0123s for 8192 events => throughput is 6.64E+05 events/s + +*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (47.690708277600116) and cpp (47.690708277600109) differ by less than 2E-14 (1.1102230246251565e-16) + +*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 46.22 [46.223782291775393] fbridge_mode=1 + [UNWEIGHT] Wrote 1727 events (found 1732 events) + [COUNTERS] PROGRAM TOTAL : 1.5235s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3878s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1357s for 90112 events => throughput is 6.64E+05 events/s + +*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (46.223782291775372) and cpp (46.223782291775393) differ by less than 2E-14 (4.440892098500626e-16) + +*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.740357e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.316019e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.934348e+05 ) sec^-1 -*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) *** +*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 47.69 [47.690708277600109] fbridge_mode=1 + [UNWEIGHT] Wrote 434 events (found 1125 events) + [COUNTERS] PROGRAM TOTAL : 0.3284s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3089s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0194s for 8192 events => throughput is 4.22E+05 events/s + +*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) *** +OK! xsec from fortran (47.690708277600116) and cpp (47.690708277600109) differ by less than 2E-14 (1.1102230246251565e-16) + +*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 46.22 [46.223782291775393] fbridge_mode=1 + [UNWEIGHT] Wrote 1727 events (found 1732 events) + [COUNTERS] PROGRAM TOTAL : 1.6116s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3942s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2174s for 90112 events => throughput is 4.15E+05 events/s + +*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (46.223782291775372) and cpp (46.223782291775393) differ by less than 2E-14 (4.440892098500626e-16) + +*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.190038e+05 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.219873e+05 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -357,22 +505,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggtt_x1_cudacpp > /tmp/valassia/output_ggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.69 [47.690708277600102] fbridge_mode=1 + [XSECTION] Cross section = 47.69 [47.690708277600109] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.5082s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5076s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.20E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.7212s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7206s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0006s for 8192 events => throughput is 1.39E+07 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.690708277600116) and cpp (47.690708277600102) differ by less than 2E-14 (3.3306690738754696e-16) +OK! xsec from fortran (47.690708277600116) and cpp (47.690708277600109) differ by less than 2E-14 (1.1102230246251565e-16) *** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -390,65 +538,65 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggtt_x10_cudacpp > /tmp/valassia/output_ggtt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 46.22 [46.223782291775379] fbridge_mode=1 + [XSECTION] Cross section = 46.22 [46.223782291775393] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.3011s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2934s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0076s for 90112 events => throughput is 1.18E+07 events/s + [COUNTERS] PROGRAM TOTAL : 1.8076s + [COUNTERS] Fortran Overhead ( 0 ) : 1.8005s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0071s for 90112 events => throughput is 1.27E+07 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (46.223782291775372) and cpp (46.223782291775379) differ by less than 2E-14 (2.220446049250313e-16) +OK! xsec from fortran (46.223782291775372) and cpp (46.223782291775393) differ by less than 2E-14 (4.440892098500626e-16) *** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.602602e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.036838e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.046178e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.660860e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.787613e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.989378e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.754824e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.069849e+08 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = 
HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.790261e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.996952e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.940647e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.150687e+08 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.756503e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.991689e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.140255e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.999027e+07 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt index 5e558ca142..fe11b37e1c 100644 --- a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx CUDACPP_BUILDDIR='.' 
+ + make USEBUILDDIR=1 AVX=none make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' - make USEBUILDDIR=1 AVX=avx2 - make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' OMP_NUM_THREADS= -DATE: 2024-01-28_14:11:44 +DATE: 2024-01-30_06:10:49 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: -Working directory (run): /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -50,8 +50,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggtt_x1_fortran > /tmp/valassia/output_ggtt_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/avalassi/output_ggtt_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/valassia/input_ggtt_x1_fortran > /tmp/vala [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690708277600116] fbridge_mode=0 [UNWEIGHT] Wrote 420 events (found 1577 events) - [COUNTERS] PROGRAM TOTAL : 0.2922s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2567s - [COUNTERS] Fortran MEs ( 1 ) : 0.0355s for 8192 events => throughput is 2.31E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3836s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3391s + [COUNTERS] Fortran MEs ( 1 ) : 0.0445s for 8192 events => throughput is 1.84E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -75,8 +75,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggtt_x1_fortran > /tmp/valassia/output_ggtt_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/avalassi/output_ggtt_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/valassia/input_ggtt_x1_fortran > /tmp/vala [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690708277600116] fbridge_mode=0 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.2631s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2275s - [COUNTERS] Fortran MEs ( 1 ) : 0.0355s for 8192 events => throughput is 2.30E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3381s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2936s + [COUNTERS] Fortran MEs ( 1 ) : 0.0445s for 8192 events => throughput is 1.84E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -100,8 +100,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! 
Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggtt_x10_fortran > /tmp/valassia/output_ggtt_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x10_fortran > /tmp/avalassi/output_ggtt_x10_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/valassia/input_ggtt_x10_fortran > /tmp/val [XSECTION] ChannelId = 1 [XSECTION] Cross section = 46.22 [46.223782291775372] fbridge_mode=0 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.4061s - [COUNTERS] Fortran Overhead ( 0 ) : 1.0173s - [COUNTERS] Fortran MEs ( 1 ) : 0.3888s for 90112 events => throughput is 2.32E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.8752s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3952s + [COUNTERS] Fortran MEs ( 1 ) : 0.4799s for 90112 events => throughput is 1.88E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -125,22 +125,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x1_cudacpp > /tmp/valassia/output_ggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.69 [47.690704859565443] fbridge_mode=1 + [XSECTION] Cross section = 47.69 [47.690703999052587] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.2843s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2568s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0275s for 8192 events => throughput is 2.98E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3627s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3261s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0366s for 8192 events => throughput is 2.24E+05 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.690708277600116) and cpp (47.690704859565443) differ by less than 4E-4 (7.167087245907311e-08) +OK! xsec from fortran (47.690708277600116) and cpp (47.690703999052587) differ by less than 4E-4 (8.971448917094449e-08) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -158,36 +158,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x10_cudacpp > /tmp/valassia/output_ggtt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 46.22 [46.223780988783773] fbridge_mode=1 + [XSECTION] Cross section = 46.22 [46.223780103711483] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.3409s - [COUNTERS] Fortran Overhead ( 0 ) : 1.0385s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3024s for 90112 events => throughput is 2.98E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.8092s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4081s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.4011s for 90112 events => throughput is 2.25E+05 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (46.223782291775372) and cpp (46.223780988783773) differ by less than 4E-4 (2.818877065102754e-08) +OK! xsec from fortran (46.223782291775372) and cpp (46.223780103711483) differ by less than 4E-4 (4.733632297249102e-08) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.089416e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.286506e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.098377e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.292834e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -201,22 +201,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x1_cudacpp > /tmp/valassia/output_ggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.69 [47.690703261737937] fbridge_mode=1 + [XSECTION] Cross section = 47.69 [47.690699958440689] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.2530s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2406s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0124s for 8192 events => throughput is 6.60E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3194s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3043s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0151s for 8192 events => throughput is 5.42E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.690708277600116) and cpp (47.690703261737937) differ by less than 4E-4 (1.0517483095551228e-07) +OK! xsec from fortran (47.690708277600116) and cpp (47.690699958440689) differ by less than 4E-4 (1.744398380187917e-07) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -234,36 +234,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x10_cudacpp > /tmp/valassia/output_ggtt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 46.22 [46.223779141681696] fbridge_mode=1 + [XSECTION] Cross section = 46.22 [46.223776162337749] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.1645s - [COUNTERS] Fortran Overhead ( 0 ) : 1.0282s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1363s for 90112 events => throughput is 6.61E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.5555s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3898s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1657s for 90112 events => throughput is 5.44E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (46.223782291775372) and cpp (46.223779141681696) differ by less than 4E-4 (6.814876496452626e-08) +OK! xsec from fortran (46.223782291775372) and cpp (46.223776162337749) differ by less than 4E-4 (1.326035499182865e-07) *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.730231e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.487444e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.517443e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.523247e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -277,22 +277,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x1_cudacpp > /tmp/valassia/output_ggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.69 [47.690694815027769] fbridge_mode=1 + [XSECTION] Cross section = 47.69 [47.690691653203835] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.2387s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2325s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0062s for 8192 events => throughput is 1.32E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3066s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2985s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0082s for 8192 events => throughput is 1.00E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.690708277600116) and cpp (47.690694815027769) differ by less than 4E-4 (2.822892096743246e-07) +OK! xsec from fortran (47.690708277600116) and cpp (47.690691653203835) differ by less than 4E-4 (3.48587741338946e-07) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -310,40 +310,188 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x10_cudacpp > /tmp/valassia/output_ggtt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 46.22 [46.223776468660162] fbridge_mode=1 + [XSECTION] Cross section = 46.22 [46.223773576247488] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.0930s - [COUNTERS] Fortran Overhead ( 0 ) : 1.0245s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0685s for 90112 events => throughput is 1.32E+06 events/s + [COUNTERS] PROGRAM TOTAL : 1.4802s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3878s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0925s for 90112 events => throughput is 9.74E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (46.223782291775372) and cpp (46.223776468660162) differ by less than 4E-4 (1.2597660603574923e-07) +OK! xsec from fortran (46.223782291775372) and cpp (46.223773576247488) differ by less than 4E-4 (1.885507298071687e-07) *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.369414e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.007163e+06 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.012972e+06 ) sec^-1 + +*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
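
The "OK! xsec from fortran (...) and cpp (...) differ by less than 4E-4 (...)" lines above are relative-difference checks between the Fortran and CudaCpp cross sections; the float (f) builds in this log are held to a 4E-4 tolerance, while the mixed-precision (m) builds further below use a tighter 2E-4. A minimal sketch of the comparison, assuming the quoted number is |cpp - fortran| / fortran (values taken from the avx2 x1 run above):

#include <cmath>
#include <cstdio>

int main()
{
  const double xsecFortran = 47.690708277600116; // fortran reference above
  const double xsecCpp = 47.690691653203835;     // avx2_f x1 value above
  const double tolerance = 4e-4;                 // 2e-4 for the 'm' logs below
  // Relative difference, as quoted in parentheses in the OK! lines
  const double relDiff = std::abs( xsecCpp - xsecFortran ) / xsecFortran;
  std::printf( "%.15g => %s\n", relDiff, relDiff < tolerance ? "OK!" : "FAIL" );
  return 0; // prints 3.48587741338946e-07 => OK!, matching the log
}
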
+-------------------- +Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 47.69 [47.690691653203835] fbridge_mode=1 + [UNWEIGHT] Wrote 434 events (found 1125 events) + [COUNTERS] PROGRAM TOTAL : 0.3042s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2966s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0076s for 8192 events => throughput is 1.08E+06 events/s + +*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (47.690708277600116) and cpp (47.690691653203835) differ by less than 4E-4 (3.48587741338946e-07) + +*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 46.22 [46.223773576247488] fbridge_mode=1 + [UNWEIGHT] Wrote 1727 events (found 1732 events) + [COUNTERS] PROGRAM TOTAL : 1.4699s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3862s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0836s for 90112 events => throughput is 1.08E+06 events/s + +*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (46.223782291775372) and cpp (46.223773576247488) differ by less than 4E-4 (1.885507298071687e-07) + +*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.018834e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.383290e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.101159e+06 ) sec^-1 + +*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 47.69 [47.690698822141186] fbridge_mode=1 + [UNWEIGHT] Wrote 434 events (found 1125 events) + [COUNTERS] PROGRAM TOTAL : 0.3134s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3023s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0112s for 8192 events => throughput is 7.33E+05 events/s + +*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (47.690708277600116) and cpp (47.690698822141186) differ by less than 4E-4 (1.982662718447159e-07) + +*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** -*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) *** +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical -*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) *** +*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 46.22 [46.223780266165058] fbridge_mode=1 + [UNWEIGHT] Wrote 1727 events (found 1732 events) + [COUNTERS] PROGRAM TOTAL : 1.5088s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3876s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1212s for 90112 events => throughput is 7.43E+05 events/s + +*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (46.223782291775372) and cpp (46.223780266165058) differ by less than 4E-4 (4.382182106077437e-08) + +*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.590700e+05 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.687831e+05 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -357,22 +505,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggtt_x1_cudacpp > /tmp/valassia/output_ggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.69 [47.690697792016209] fbridge_mode=1 + [XSECTION] Cross section = 47.69 [47.690703397697987] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.5094s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5090s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0004s for 8192 events => throughput is 2.04E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.7336s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7331s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0006s for 8192 events => throughput is 1.45E+07 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.690708277600116) and cpp (47.690697792016209) differ by less than 4E-4 (2.1986639087145932e-07) +OK! xsec from fortran (47.690708277600116) and cpp (47.690703397697987) differ by less than 4E-4 (1.0232396008280631e-07) *** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -390,65 +538,65 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
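
In the CHECK and GCHECK labels above and below, the event count in parentheses appears to follow from the '-p <blocks> <threads> <iterations>' arguments as blocks x threads x iterations: CHECK(8192) uses -p 256 32 1, and the GCHECK(MAX) runs below use -p 16384 32 1. A small sketch of that assumed relation (an inference from the numbers shown, not a quote of the actual test driver):

#include <cstdio>

int main()
{
  // CHECK(8192) -p 256 32 1: one event per GPU thread per iteration
  const int gpublocks = 256, gputhreads = 32, niter = 1;
  std::printf( "nevt = %d\n", gpublocks * gputhreads * niter ); // 8192
  std::printf( "MAX  = %d\n", 16384 * 32 * 1 );                 // 524288
  return 0;
}
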
-------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggtt_x10_cudacpp > /tmp/valassia/output_ggtt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 46.22 [46.223779043453291] fbridge_mode=1 + [XSECTION] Cross section = 46.22 [46.223786763175951] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.3022s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2984s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0038s for 90112 events => throughput is 2.38E+07 events/s + [COUNTERS] PROGRAM TOTAL : 1.8220s + [COUNTERS] Fortran Overhead ( 0 ) : 1.8158s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0061s for 90112 events => throughput is 1.47E+07 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (46.223782291775372) and cpp (46.223779043453291) differ by less than 4E-4 (7.027382697977202e-08) +OK! xsec from fortran (46.223782291775372) and cpp (46.223786763175951) differ by less than 4E-4 (9.673376677454826e-08) *** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.878193e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.211265e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.214469e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.993599e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.092154e+08 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.733080e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.038194e+08 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.767769e+08 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = 
HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.066187e+08 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.726692e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.098451e+08 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.882266e+08 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.314044e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.370639e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.528445e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.407782e+07 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt index 48535fd982..a855e5b8c2 100644 --- a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx CUDACPP_BUILDDIR='.' 
-make USEBUILDDIR=1 AVX=none + +make USEBUILDDIR=1 AVX=none make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 - make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' -CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' -CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' -CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' OMP_NUM_THREADS= -DATE: 2024-01-28_14:12:04 +DATE: 2024-01-30_06:11:16 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: -Working directory (run): /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -50,8 +50,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggtt_x1_fortran > /tmp/valassia/output_ggtt_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/avalassi/output_ggtt_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/valassia/input_ggtt_x1_fortran > /tmp/vala [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690708277600116] fbridge_mode=0 [UNWEIGHT] Wrote 420 events (found 1577 events) - [COUNTERS] PROGRAM TOTAL : 0.2926s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2570s - [COUNTERS] Fortran MEs ( 1 ) : 0.0355s for 8192 events => throughput is 2.31E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3775s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3336s + [COUNTERS] Fortran MEs ( 1 ) : 0.0439s for 8192 events => throughput is 1.87E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -75,8 +75,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggtt_x1_fortran > /tmp/valassia/output_ggtt_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/avalassi/output_ggtt_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/valassia/input_ggtt_x1_fortran > /tmp/vala [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690708277600116] fbridge_mode=0 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.2602s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2247s - [COUNTERS] Fortran MEs ( 1 ) : 0.0355s for 8192 events => throughput is 2.31E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3333s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2891s + [COUNTERS] Fortran MEs ( 1 ) : 0.0441s for 8192 events => throughput is 1.86E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -100,8 +100,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! 
Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggtt_x10_fortran > /tmp/valassia/output_ggtt_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x10_fortran > /tmp/avalassi/output_ggtt_x10_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/valassia/input_ggtt_x10_fortran > /tmp/val [XSECTION] ChannelId = 1 [XSECTION] Cross section = 46.22 [46.223782291775372] fbridge_mode=0 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.4015s - [COUNTERS] Fortran Overhead ( 0 ) : 1.0124s - [COUNTERS] Fortran MEs ( 1 ) : 0.3891s for 90112 events => throughput is 2.32E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.8813s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4017s + [COUNTERS] Fortran MEs ( 1 ) : 0.4796s for 90112 events => throughput is 1.88E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -125,8 +125,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x1_cudacpp > /tmp/valassia/output_ggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -134,9 +134,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690709601032019] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.2916s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2585s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0330s for 8192 events => throughput is 2.48E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3684s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3289s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0395s for 8192 events => throughput is 2.07E+05 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -158,8 +158,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x10_cudacpp > /tmp/valassia/output_ggtt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -167,9 +167,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 46.22 [46.223783635280974] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.4064s - [COUNTERS] Fortran Overhead ( 0 ) : 1.0435s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3629s for 90112 events => throughput is 2.48E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.8620s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4222s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.4398s for 90112 events => throughput is 2.05E+05 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -180,14 +180,14 @@ OK! xsec from fortran (46.223782291775372) and cpp (46.223783635280974) differ b OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.545875e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.094601e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.552634e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.081772e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -201,8 +201,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x1_cudacpp > /tmp/valassia/output_ggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -210,9 +210,9 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690709601032026] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.2622s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2449s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0173s for 8192 events => throughput is 4.73E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3338s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3117s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0221s for 8192 events => throughput is 3.70E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -234,8 +234,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x10_cudacpp > /tmp/valassia/output_ggtt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -243,9 +243,9 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 46.22 [46.223783635280974] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.2218s - [COUNTERS] Fortran Overhead ( 0 ) : 1.0319s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1899s for 90112 events => throughput is 4.75E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.6431s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3996s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2435s for 90112 events => throughput is 3.70E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -256,14 +256,14 @@ OK! xsec from fortran (46.223782291775372) and cpp (46.223783635280974) differ b OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.798699e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.699859e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.790478e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.728578e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -277,8 +277,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x1_cudacpp > /tmp/valassia/output_ggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -286,9 +286,9 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690709643441508] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.2482s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2382s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0100s for 8192 events => throughput is 8.17E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3207s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3066s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0141s for 8192 events => throughput is 5.81E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -310,8 +310,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x10_cudacpp > /tmp/valassia/output_ggtt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -319,9 +319,9 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 46.22 [46.223783660238851] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.1353s - [COUNTERS] Fortran Overhead ( 0 ) : 1.0253s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1100s for 90112 events => throughput is 8.19E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.5524s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3953s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1572s for 90112 events => throughput is 5.73E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -332,18 +332,166 @@ OK! xsec from fortran (46.223782291775372) and cpp (46.223783660238851) differ b OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.461642e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.934952e+05 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.894708e+05 ) sec^-1 + +*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 47.69 [47.690709643441508] fbridge_mode=1 + [UNWEIGHT] Wrote 434 events (found 1125 events) + [COUNTERS] PROGRAM TOTAL : 0.3189s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3068s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0122s for 8192 events => throughput is 6.74E+05 events/s + +*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! 
xsec from fortran (47.690708277600116) and cpp (47.690709643441508) differ by less than 2E-4 (2.863957027088304e-08) + +*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 46.22 [46.223783660238851] fbridge_mode=1 + [UNWEIGHT] Wrote 1727 events (found 1732 events) + [COUNTERS] PROGRAM TOTAL : 1.5411s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4039s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1372s for 90112 events => throughput is 6.57E+05 events/s + +*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (46.223782291775372) and cpp (46.223783660238851) differ by less than 2E-4 (2.9605181861569463e-08) + +*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.847513e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.506859e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.835511e+05 ) sec^-1 + +*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 47.69 [47.690709643441508] fbridge_mode=1 + [UNWEIGHT] Wrote 434 events (found 1125 events) + [COUNTERS] PROGRAM TOTAL : 0.3300s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3106s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0194s for 8192 events => throughput is 4.22E+05 events/s + +*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (47.690708277600116) and cpp (47.690709643441508) differ by less than 2E-4 (2.863957027088304e-08) + +*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** -*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) *** +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical -*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) *** +*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 46.22 [46.223783660238851] fbridge_mode=1 + [UNWEIGHT] Wrote 1727 events (found 1732 events) + [COUNTERS] PROGRAM TOTAL : 1.6128s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4020s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2108s for 90112 events => throughput is 4.27E+05 events/s + +*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (46.223782291775372) and cpp (46.223783660238851) differ by less than 2E-4 (2.9605181861569463e-08) + +*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.284904e+05 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.343219e+05 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -357,22 +505,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! 
Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggtt_x1_cudacpp > /tmp/valassia/output_ggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.69 [47.690708266690713] fbridge_mode=1 + [XSECTION] Cross section = 47.69 [47.690708266690706] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.5105s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5098s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.18E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.7257s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7251s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0006s for 8192 events => throughput is 1.43E+07 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.690708277600116) and cpp (47.690708266690713) differ by less than 2E-4 (2.2875323857363128e-10) +OK! xsec from fortran (47.690708277600116) and cpp (47.690708266690706) differ by less than 2E-4 (2.2875334959593374e-10) *** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -390,65 +538,65 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggtt_x10_cudacpp > /tmp/valassia/output_ggtt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 46.22 [46.223782303744798] fbridge_mode=1 + [XSECTION] Cross section = 46.22 [46.223782303744791] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.3005s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2929s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0076s for 90112 events => throughput is 1.18E+07 events/s + [COUNTERS] PROGRAM TOTAL : 1.8142s + [COUNTERS] Fortran Overhead ( 0 ) : 1.8072s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0070s for 90112 events => throughput is 1.29E+07 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (46.223782291775372) and cpp (46.223782303744798) differ by less than 2E-4 (2.5894508759449764e-10) +OK! xsec from fortran (46.223782291775372) and cpp (46.223782303744791) differ by less than 2E-4 (2.5894508759449764e-10) *** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.621016e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.009523e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.051466e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.569379e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.842315e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.989648e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.799914e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.069002e+08 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.799451e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.991680e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.032122e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.142771e+08 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.783243e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.993358e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.161657e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CUDA 
[nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.008591e+07 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt index e6ba098a45..ad1d0f839b 100644 --- a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg CUDACPP_BUILDDIR='.' + make USEBUILDDIR=1 AVX=none -make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=avx2 - +make USEBUILDDIR=1 AVX=sse4 make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' OMP_NUM_THREADS= -DATE: 2024-01-28_14:12:24 +DATE: 2024-01-30_06:11:43 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: -Working directory (run): /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -50,8 +50,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggttg_x1_fortran > /tmp/valassia/output_ggttg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/avalassi/output_ggttg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/valassia/input_ggttg_x1_fortran > /tmp/val [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0972 [9.7196357922470764E-002] fbridge_mode=0 [UNWEIGHT] Wrote 42 events (found 469 events) - [COUNTERS] PROGRAM TOTAL : 0.5756s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3051s - [COUNTERS] Fortran MEs ( 1 ) : 0.2704s for 8192 events => throughput is 3.03E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.5934s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2449s + [COUNTERS] Fortran MEs ( 1 ) : 0.3485s for 8192 events => throughput is 2.35E+04 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -75,8 +75,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggttg_x1_fortran > /tmp/valassia/output_ggttg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/avalassi/output_ggttg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/valassia/input_ggttg_x1_fortran > /tmp/val [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0972 [9.7196357922470764E-002] fbridge_mode=0 [UNWEIGHT] Wrote 41 events (found 467 events) - [COUNTERS] PROGRAM TOTAL : 0.4621s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1916s - [COUNTERS] Fortran MEs ( 1 ) : 0.2705s for 8192 events => throughput is 3.03E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.5871s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2379s + [COUNTERS] Fortran MEs ( 1 ) : 0.3492s for 8192 events => throughput is 2.35E+04 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -100,8 +100,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggttg_x10_fortran > /tmp/valassia/output_ggttg_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x10_fortran > /tmp/avalassi/output_ggttg_x10_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/valassia/input_ggttg_x10_fortran > /tmp/va [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.08131 [8.1310872077655569E-002] fbridge_mode=0 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 4.1239s - [COUNTERS] Fortran Overhead ( 0 ) : 1.1526s - [COUNTERS] Fortran MEs ( 1 ) : 2.9713s for 90112 events => throughput is 3.03E+04 events/s + [COUNTERS] PROGRAM TOTAL : 5.3849s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5565s + [COUNTERS] Fortran MEs ( 1 ) : 3.8284s for 90112 events => throughput is 2.35E+04 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -125,8 +125,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x1_cudacpp > /tmp/valassia/output_ggttg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -134,9 +134,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0972 [9.7196357922470777E-002] fbridge_mode=1 [UNWEIGHT] Wrote 41 events (found 467 events) - [COUNTERS] PROGRAM TOTAL : 0.7513s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4690s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2822s for 8192 events => throughput is 2.90E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.9218s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5788s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3431s for 8192 events => throughput is 2.39E+04 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -158,8 +158,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x10_cudacpp > /tmp/valassia/output_ggttg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -167,9 +167,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.08131 [8.1310872077655597E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 4.5538s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4326s - [COUNTERS] CudaCpp MEs ( 2 ) : 3.1212s for 90112 events => throughput is 2.89E+04 events/s + [COUNTERS] PROGRAM TOTAL : 5.7045s + [COUNTERS] Fortran Overhead ( 0 ) : 1.9313s + [COUNTERS] CudaCpp MEs ( 2 ) : 3.7731s for 90112 events => throughput is 2.39E+04 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -180,14 +180,14 @@ OK! xsec from fortran (8.1310872077655569E-002) and cpp (8.1310872077655597E-002 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.975541e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.459637e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.988809e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.457759e+04 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -201,8 +201,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x1_cudacpp > /tmp/valassia/output_ggttg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -210,9 +210,9 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0972 [9.7196357922470777E-002] fbridge_mode=1 [UNWEIGHT] Wrote 41 events (found 467 events) - [COUNTERS] PROGRAM TOTAL : 0.4640s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3283s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1357s for 8192 events => throughput is 6.04E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.5930s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4166s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1764s for 8192 events => throughput is 4.64E+04 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -234,8 +234,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x10_cudacpp > /tmp/valassia/output_ggttg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -243,9 +243,9 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.08131 [8.1310872077655555E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 2.7692s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2857s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.4836s for 90112 events => throughput is 6.07E+04 events/s + [COUNTERS] PROGRAM TOTAL : 3.7266s + [COUNTERS] Fortran Overhead ( 0 ) : 1.7761s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.9506s for 90112 events => throughput is 4.62E+04 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -256,14 +256,14 @@ OK! xsec from fortran (8.1310872077655569E-002) and cpp (8.1310872077655555E-002 OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.146640e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.735934e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.143050e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.731299e+04 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -277,8 +277,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x1_cudacpp > /tmp/valassia/output_ggttg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -286,9 +286,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0972 [9.7196357922470750E-002] fbridge_mode=1 [UNWEIGHT] Wrote 41 events (found 467 events) - [COUNTERS] PROGRAM TOTAL : 0.3267s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2594s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0672s for 8192 events => throughput is 1.22E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4266s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3331s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0935s for 8192 events => throughput is 8.76E+04 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -310,8 +310,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x10_cudacpp > /tmp/valassia/output_ggttg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -319,9 +319,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.08131 [8.1310872077655555E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 1.9501s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2143s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.7358s for 90112 events => throughput is 1.22E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.6908s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6878s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.0029s for 90112 events => throughput is 8.98E+04 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -332,20 +332,92 @@ OK! xsec from fortran (8.1310872077655569E-002) and cpp (8.1310872077655555E-002 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.271439e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.260876e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.275338e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.205229e+04 ) sec^-1 + +*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 32/32 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.0972 [9.7196357922470750E-002] fbridge_mode=1 + [UNWEIGHT] Wrote 41 events (found 467 events) + [COUNTERS] PROGRAM TOTAL : 0.4089s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3259s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0830s for 8192 events => throughput is 9.87E+04 events/s -*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) *** +*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) *** +OK! xsec from fortran (9.7196357922470764E-002) and cpp (9.7196357922470750E-002) differ by less than 2E-14 (1.1102230246251565e-16) -*** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** +*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 32/32 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.08131 [8.1310872077655555E-002] fbridge_mode=1 + [UNWEIGHT] Wrote 679 events (found 1787 events) + [COUNTERS] PROGRAM TOTAL : 2.5356s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6675s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.8681s for 90112 events => throughput is 1.04E+05 events/s + +*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (8.1310872077655569E-002) and cpp (8.1310872077655555E-002) differ by less than 2E-14 (2.220446049250313e-16) + +*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.066040e+05 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.071802e+05 ) sec^-1 + +*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- CUDACPP_RUNTIME_FBRIDGEMODE = (not set) CUDACPP_RUNTIME_VECSIZEUSED = 8192 @@ -357,8 +429,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggttg_x1_cudacpp > /tmp/valassia/output_ggttg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -366,14 +438,90 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggttg_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0972 [9.7196357922470750E-002] fbridge_mode=1 [UNWEIGHT] Wrote 41 events (found 467 events) - [COUNTERS] PROGRAM TOTAL : 0.4985s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4910s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0075s for 8192 events => throughput is 1.09E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.4633s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3521s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1112s for 8192 events => throughput is 7.37E+04 events/s -*** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** +*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** OK! xsec from fortran (9.7196357922470764E-002) and cpp (9.7196357922470750E-002) differ by less than 2E-14 (1.1102230246251565e-16) +*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 32/32 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.08131 [8.1310872077655555E-002] fbridge_mode=1 + [UNWEIGHT] Wrote 679 events (found 1787 events) + [COUNTERS] PROGRAM TOTAL : 2.9267s + [COUNTERS] Fortran Overhead ( 0 ) : 1.7103s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.2164s for 90112 events => throughput is 7.41E+04 events/s + +*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (8.1310872077655569E-002) and cpp (8.1310872077655555E-002) differ by less than 2E-14 (2.220446049250313e-16) + +*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.521586e+04 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.533151e+04 ) sec^-1 + +*** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 32/32 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.0972 [9.7196357922470764E-002] fbridge_mode=1 + [UNWEIGHT] Wrote 41 events (found 467 events) + [COUNTERS] PROGRAM TOTAL : 0.6854s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6800s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0055s for 8192 events => throughput is 1.50E+06 events/s + +*** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (9.7196357922470764E-002) and cpp (9.7196357922470764E-002) differ by less than 2E-14 (0.0) + *** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical @@ -390,65 +538,65 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggttg_x10_cudacpp > /tmp/valassia/output_ggttg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.08131 [8.1310872077655555E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.08131 [8.1310872077655610E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 1.5299s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4472s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0826s for 90112 events => throughput is 1.09E+06 events/s + [COUNTERS] PROGRAM TOTAL : 2.0439s + [COUNTERS] Fortran Overhead ( 0 ) : 2.0206s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0234s for 90112 events => throughput is 3.85E+06 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (8.1310872077655569E-002) and cpp (8.1310872077655555E-002) differ by less than 2E-14 (2.220446049250313e-16) +OK! xsec from fortran (8.1310872077655569E-002) and cpp (8.1310872077655610E-002) differ by less than 2E-14 (4.440892098500626e-16) *** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.137973e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.630499e+06 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.164776e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.083902e+06 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.677550e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.662154e+06 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.306439e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.243596e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.677909e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.668083e+06 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.842160e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.255740e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.664344e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.633959e+06 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.408517e+05 ) sec^-1 +Process = 
SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.773665e+06 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt index 4aced905e9..c17be1788d 100644 --- a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg CUDACPP_BUILDDIR='.' + make USEBUILDDIR=1 AVX=none make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=avx2 - make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' OMP_NUM_THREADS= -DATE: 2024-01-28_14:12:59 +DATE: 2024-01-30_06:12:28 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: -Working directory (run): /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -50,8 +50,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggttg_x1_fortran > /tmp/valassia/output_ggttg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/avalassi/output_ggttg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/valassia/input_ggttg_x1_fortran > /tmp/val [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0972 [9.7196357922470764E-002] fbridge_mode=0 [UNWEIGHT] Wrote 42 events (found 469 events) - [COUNTERS] PROGRAM TOTAL : 0.4651s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1945s - [COUNTERS] Fortran MEs ( 1 ) : 0.2707s for 8192 events => throughput is 3.03E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.5874s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2394s + [COUNTERS] Fortran MEs ( 1 ) : 0.3480s for 8192 events => throughput is 2.35E+04 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -75,8 +75,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggttg_x1_fortran > /tmp/valassia/output_ggttg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/avalassi/output_ggttg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/valassia/input_ggttg_x1_fortran > /tmp/val [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0972 [9.7196357922470764E-002] fbridge_mode=0 [UNWEIGHT] Wrote 41 events (found 467 events) - [COUNTERS] PROGRAM TOTAL : 0.4641s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1936s - [COUNTERS] Fortran MEs ( 1 ) : 0.2705s for 8192 events => throughput is 3.03E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.5918s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2437s + [COUNTERS] Fortran MEs ( 1 ) : 0.3481s for 8192 events => throughput is 2.35E+04 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -100,8 +100,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggttg_x10_fortran > /tmp/valassia/output_ggttg_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x10_fortran > /tmp/avalassi/output_ggttg_x10_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/valassia/input_ggttg_x10_fortran > /tmp/va [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.08131 [8.1310872077655569E-002] fbridge_mode=0 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 4.1307s - [COUNTERS] Fortran Overhead ( 0 ) : 1.1595s - [COUNTERS] Fortran MEs ( 1 ) : 2.9712s for 90112 events => throughput is 3.03E+04 events/s + [COUNTERS] PROGRAM TOTAL : 5.3901s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5607s + [COUNTERS] Fortran MEs ( 1 ) : 3.8294s for 90112 events => throughput is 2.35E+04 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -125,22 +125,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x1_cudacpp > /tmp/valassia/output_ggttg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.0972 [9.7196349351077960E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.0972 [9.7196347758884971E-002] fbridge_mode=1 [UNWEIGHT] Wrote 41 events (found 467 events) - [COUNTERS] PROGRAM TOTAL : 0.7014s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4439s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2575s for 8192 events => throughput is 3.18E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.8754s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5552s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3203s for 8192 events => throughput is 2.56E+04 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.7196357922470764E-002) and cpp (9.7196349351077960E-002) differ by less than 4E-4 (8.818635788276907e-08) +OK! xsec from fortran (9.7196357922470764E-002) and cpp (9.7196347758884971E-002) differ by less than 4E-4 (1.0456755794585604e-07) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -158,36 +158,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x10_cudacpp > /tmp/valassia/output_ggttg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.08131 [8.1310859412953768E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.08131 [8.1310858119443913E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 4.2334s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4007s - [COUNTERS] CudaCpp MEs ( 2 ) : 2.8326s for 90112 events => throughput is 3.18E+04 events/s + [COUNTERS] PROGRAM TOTAL : 5.4534s + [COUNTERS] Fortran Overhead ( 0 ) : 1.9123s + [COUNTERS] CudaCpp MEs ( 2 ) : 3.5411s for 90112 events => throughput is 2.54E+04 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (8.1310872077655569E-002) and cpp (8.1310859412953768E-002) differ by less than 4E-4 (1.5575656098221202e-07) +OK! xsec from fortran (8.1310872077655569E-002) and cpp (8.1310858119443913E-002) differ by less than 4E-4 (1.7166476384833373e-07) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.291524e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.651171e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.300074e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.640512e+04 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -201,22 +201,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x1_cudacpp > /tmp/valassia/output_ggttg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.0972 [9.7196335877214046E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.0972 [9.7196323434217816E-002] fbridge_mode=1 [UNWEIGHT] Wrote 41 events (found 467 events) - [COUNTERS] PROGRAM TOTAL : 0.3440s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2686s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0754s for 8192 events => throughput is 1.09E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4354s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3378s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0976s for 8192 events => throughput is 8.39E+04 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.7196357922470764E-002) and cpp (9.7196335877214046E-002) differ by less than 4E-4 (2.2681155120718444e-07) +OK! xsec from fortran (9.7196357922470764E-002) and cpp (9.7196323434217816E-002) differ by less than 4E-4 (3.548307125900152e-07) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -234,36 +234,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x10_cudacpp > /tmp/valassia/output_ggttg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.08131 [8.1310850963848921E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.08131 [8.1310842598054087E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 2.0566s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2267s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.8299s for 90112 events => throughput is 1.09E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.7743s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6927s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.0816s for 90112 events => throughput is 8.33E+04 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (8.1310872077655569E-002) and cpp (8.1310850963848921E-002) differ by less than 4E-4 (2.596676940136433e-07) +OK! xsec from fortran (8.1310872077655569E-002) and cpp (8.1310842598054087E-002) differ by less than 4E-4 (3.625542406293647e-07) *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.105529e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.607319e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.107065e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.623071e+04 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -277,22 +277,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x1_cudacpp > /tmp/valassia/output_ggttg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.0972 [9.7196334589088509E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.0972 [9.7196325695161859E-002] fbridge_mode=1 [UNWEIGHT] Wrote 41 events (found 467 events) - [COUNTERS] PROGRAM TOTAL : 0.2611s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2269s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0342s for 8192 events => throughput is 2.40E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3345s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2882s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0463s for 8192 events => throughput is 1.77E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.7196357922470764E-002) and cpp (9.7196334589088509E-002) differ by less than 4E-4 (2.400643681621517e-07) +OK! xsec from fortran (9.7196357922470764E-002) and cpp (9.7196325695161859E-002) differ by less than 4E-4 (3.3156909984288774e-07) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -310,40 +310,188 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x10_cudacpp > /tmp/valassia/output_ggttg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.08131 [8.1310850363433287E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.08131 [8.1310842393515825E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 1.5565s - [COUNTERS] Fortran Overhead ( 0 ) : 1.1812s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3753s for 90112 events => throughput is 2.40E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.1519s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6391s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.5128s for 90112 events => throughput is 1.76E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (8.1310872077655569E-002) and cpp (8.1310850363433287E-002) differ by less than 4E-4 (2.6705189259956796e-07) +OK! xsec from fortran (8.1310872077655569E-002) and cpp (8.1310842393515825E-002) differ by less than 4E-4 (3.650697499857358e-07) *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.459238e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.801449e+05 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.824776e+05 ) sec^-1 + +*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 32/32 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.0972 [9.7196325695161859E-002] fbridge_mode=1 + [UNWEIGHT] Wrote 41 events (found 467 events) + [COUNTERS] PROGRAM TOTAL : 0.3234s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2821s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0413s for 8192 events => throughput is 1.98E+05 events/s + +*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (9.7196357922470764E-002) and cpp (9.7196325695161859E-002) differ by less than 4E-4 (3.3156909984288774e-07) + +*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 32/32 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.08131 [8.1310842393515825E-002] fbridge_mode=1 + [UNWEIGHT] Wrote 679 events (found 1787 events) + [COUNTERS] PROGRAM TOTAL : 2.0864s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6356s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.4508s for 90112 events => throughput is 2.00E+05 events/s + +*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (8.1310872077655569E-002) and cpp (8.1310842393515825E-002) differ by less than 4E-4 (3.650697499857358e-07) + +*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.066287e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.460557e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.073001e+05 ) sec^-1 -*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) *** +*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 32/32 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.0972 [9.7196344080460087E-002] fbridge_mode=1 + [UNWEIGHT] Wrote 41 events (found 467 events) + [COUNTERS] PROGRAM TOTAL : 0.3518s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2972s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0546s for 8192 events => throughput is 1.50E+05 events/s + +*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (9.7196357922470764E-002) and cpp (9.7196344080460087E-002) differ by less than 4E-4 (1.4241285339888776e-07) + +*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical -*** (2-512z) WARNING! 
SKIP MADEVENT_CPP (512z is not supported on this node) *** +*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 32/32 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.08131 [8.1310857813116089E-002] fbridge_mode=1 + [UNWEIGHT] Wrote 679 events (found 1787 events) + [COUNTERS] PROGRAM TOTAL : 2.2586s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6507s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.6080s for 90112 events => throughput is 1.48E+05 events/s + +*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (8.1310872077655569E-002) and cpp (8.1310857813116089E-002) differ by less than 4E-4 (1.754321300451167e-07) + +*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.497722e+05 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.492408e+05 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -357,22 +505,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggttg_x1_cudacpp > /tmp/valassia/output_ggttg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.0972 [9.7196347207304232E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.0972 [9.7196349366366022E-002] fbridge_mode=1 [UNWEIGHT] Wrote 41 events (found 467 events) - [COUNTERS] PROGRAM TOTAL : 0.4824s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4804s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0020s for 8192 events => throughput is 4.18E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.6751s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6742s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0008s for 8192 events => throughput is 9.66E+06 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.7196357922470764E-002) and cpp (9.7196347207304232E-002) differ by less than 4E-4 (1.1024246959756567e-07) +OK! xsec from fortran (9.7196357922470764E-002) and cpp (9.7196349366366022E-002) differ by less than 4E-4 (8.802906736882221e-08) *** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -390,65 +538,65 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggttg_x10_cudacpp > /tmp/valassia/output_ggttg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.08131 [8.1310859763686641E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.08131 [8.1310864949473954E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 1.4785s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4568s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0217s for 90112 events => throughput is 4.15E+06 events/s + [COUNTERS] PROGRAM TOTAL : 2.0322s + [COUNTERS] Fortran Overhead ( 0 ) : 2.0221s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0102s for 90112 events => throughput is 8.88E+06 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (8.1310872077655569E-002) and cpp (8.1310859763686641E-002) differ by less than 4E-4 (1.5144308029846343e-07) +OK! xsec from fortran (8.1310872077655569E-002) and cpp (8.1310864949473954E-002) differ by less than 4E-4 (8.766578729613173e-08) *** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.767897e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.288695e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.558247e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.864373e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.470164e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.630957e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.087387e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.365812e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.452879e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.633946e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.637837e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.471906e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.425767e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.509519e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.262697e+06 ) sec^-1 +Process = 
SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.624050e+07 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt index 1f4f9c704d..daa5ca9a3d 100644 --- a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg CUDACPP_BUILDDIR='.' + + make USEBUILDDIR=1 AVX=none make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' - make USEBUILDDIR=1 AVX=avx2 - make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' -CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' OMP_NUM_THREADS= -DATE: 2024-01-28_14:13:29 +DATE: 2024-01-30_06:13:08 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: -Working directory (run): /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -50,8 +50,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggttg_x1_fortran > /tmp/valassia/output_ggttg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/avalassi/output_ggttg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/valassia/input_ggttg_x1_fortran > /tmp/val [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0972 [9.7196357922470764E-002] fbridge_mode=0 [UNWEIGHT] Wrote 42 events (found 469 events) - [COUNTERS] PROGRAM TOTAL : 0.4648s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1944s - [COUNTERS] Fortran MEs ( 1 ) : 0.2704s for 8192 events => throughput is 3.03E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.5921s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2413s + [COUNTERS] Fortran MEs ( 1 ) : 0.3507s for 8192 events => throughput is 2.34E+04 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -75,8 +75,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggttg_x1_fortran > /tmp/valassia/output_ggttg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/avalassi/output_ggttg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/valassia/input_ggttg_x1_fortran > /tmp/val [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0972 [9.7196357922470764E-002] fbridge_mode=0 [UNWEIGHT] Wrote 41 events (found 467 events) - [COUNTERS] PROGRAM TOTAL : 0.4660s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1951s - [COUNTERS] Fortran MEs ( 1 ) : 0.2708s for 8192 events => throughput is 3.02E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.5875s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2393s + [COUNTERS] Fortran MEs ( 1 ) : 0.3482s for 8192 events => throughput is 2.35E+04 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -100,8 +100,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggttg_x10_fortran > /tmp/valassia/output_ggttg_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x10_fortran > /tmp/avalassi/output_ggttg_x10_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/valassia/input_ggttg_x10_fortran > /tmp/va [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.08131 [8.1310872077655569E-002] fbridge_mode=0 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 4.1289s - [COUNTERS] Fortran Overhead ( 0 ) : 1.1582s - [COUNTERS] Fortran MEs ( 1 ) : 2.9707s for 90112 events => throughput is 3.03E+04 events/s + [COUNTERS] PROGRAM TOTAL : 5.3863s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5579s + [COUNTERS] Fortran MEs ( 1 ) : 3.8284s for 90112 events => throughput is 2.35E+04 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -125,8 +125,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x1_cudacpp > /tmp/valassia/output_ggttg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -134,9 +134,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0972 [9.7196358763382021E-002] fbridge_mode=1 [UNWEIGHT] Wrote 41 events (found 467 events) - [COUNTERS] PROGRAM TOTAL : 0.7677s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4784s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2893s for 8192 events => throughput is 2.83E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.9324s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5834s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3490s for 8192 events => throughput is 2.35E+04 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -158,8 +158,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x10_cudacpp > /tmp/valassia/output_ggttg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -167,9 +167,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.08131 [8.1310872835011053E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 4.5962s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4302s - [COUNTERS] CudaCpp MEs ( 2 ) : 3.1661s for 90112 events => throughput is 2.85E+04 events/s + [COUNTERS] PROGRAM TOTAL : 5.7762s + [COUNTERS] Fortran Overhead ( 0 ) : 1.9305s + [COUNTERS] CudaCpp MEs ( 2 ) : 3.8457s for 90112 events => throughput is 2.34E+04 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -180,14 +180,14 @@ OK! xsec from fortran (8.1310872077655569E-002) and cpp (8.1310872835011053E-002 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.921173e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.405521e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.914494e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.412039e+04 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -201,8 +201,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x1_cudacpp > /tmp/valassia/output_ggttg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -210,9 +210,9 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0972 [9.7196358804670424E-002] fbridge_mode=1 [UNWEIGHT] Wrote 41 events (found 467 events) - [COUNTERS] PROGRAM TOTAL : 0.4632s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3274s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1357s for 8192 events => throughput is 6.03E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.5895s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4134s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1762s for 8192 events => throughput is 4.65E+04 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -234,8 +234,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x10_cudacpp > /tmp/valassia/output_ggttg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -243,9 +243,9 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.08131 [8.1310872836789727E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 2.7762s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2846s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.4916s for 90112 events => throughput is 6.04E+04 events/s + [COUNTERS] PROGRAM TOTAL : 3.7571s + [COUNTERS] Fortran Overhead ( 0 ) : 1.7661s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.9910s for 90112 events => throughput is 4.53E+04 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -256,14 +256,14 @@ OK! xsec from fortran (8.1310872077655569E-002) and cpp (8.1310872836789727E-002 OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.188450e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.812264e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.198899e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.790866e+04 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -277,8 +277,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x1_cudacpp > /tmp/valassia/output_ggttg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -286,9 +286,9 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0972 [9.7196358586501386E-002] fbridge_mode=1 [UNWEIGHT] Wrote 41 events (found 467 events) - [COUNTERS] PROGRAM TOTAL : 0.3249s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2592s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0657s for 8192 events => throughput is 1.25E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4175s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3288s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0887s for 8192 events => throughput is 9.23E+04 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -310,8 +310,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x10_cudacpp > /tmp/valassia/output_ggttg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -319,9 +319,9 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.08131 [8.1310872708918305E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 1.9390s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2156s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.7234s for 90112 events => throughput is 1.25E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.6655s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6760s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.9895s for 90112 events => throughput is 9.11E+04 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -332,18 +332,166 @@ OK! xsec from fortran (8.1310872077655569E-002) and cpp (8.1310872708918305E-002 OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.277475e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.369009e+04 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.305282e+04 ) sec^-1 + +*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! 
Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 32/32 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.0972 [9.7196358586501386E-002] fbridge_mode=1 + [UNWEIGHT] Wrote 41 events (found 467 events) + [COUNTERS] PROGRAM TOTAL : 0.3954s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3173s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0781s for 8192 events => throughput is 1.05E+05 events/s + +*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (9.7196357922470764E-002) and cpp (9.7196358586501386E-002) differ by less than 2E-4 (6.831846643962081e-09) + +*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 32/32 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.08131 [8.1310872708918305E-002] fbridge_mode=1 + [UNWEIGHT] Wrote 679 events (found 1787 events) + [COUNTERS] PROGRAM TOTAL : 2.5272s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6689s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.8582s for 90112 events => throughput is 1.05E+05 events/s + +*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (8.1310872077655569E-002) and cpp (8.1310872708918305E-002) differ by less than 2E-4 (7.763571119312473e-09) + +*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.086568e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.279177e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.099441e+05 ) sec^-1 -*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) *** +*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 32/32 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.0972 [9.7196358586501386E-002] fbridge_mode=1 + [UNWEIGHT] Wrote 41 events (found 467 events) + [COUNTERS] PROGRAM TOTAL : 0.4677s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3542s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1134s for 8192 events => throughput is 7.22E+04 events/s + +*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (9.7196357922470764E-002) and cpp (9.7196358586501386E-002) differ by less than 2E-4 (6.831846643962081e-09) + +*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical -*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) *** +*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 32/32 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.08131 [8.1310872708918305E-002] fbridge_mode=1 + [UNWEIGHT] Wrote 679 events (found 1787 events) + [COUNTERS] PROGRAM TOTAL : 2.9514s + [COUNTERS] Fortran Overhead ( 0 ) : 1.7052s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.2463s for 90112 events => throughput is 7.23E+04 events/s + +*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (8.1310872077655569E-002) and cpp (8.1310872708918305E-002) differ by less than 2E-4 (7.763571119312473e-09) + +*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.364043e+04 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.347069e+04 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -357,22 +505,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggttg_x1_cudacpp > /tmp/valassia/output_ggttg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.0972 [9.7196358102981259E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.0972 [9.7196358102981231E-002] fbridge_mode=1 [UNWEIGHT] Wrote 41 events (found 467 events) - [COUNTERS] PROGRAM TOTAL : 0.4978s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4903s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0075s for 8192 events => throughput is 1.09E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.6813s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6759s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0054s for 8192 events => throughput is 1.52E+06 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.7196357922470764E-002) and cpp (9.7196358102981259E-002) differ by less than 2E-4 (1.8571735260763944e-09) +OK! xsec from fortran (9.7196357922470764E-002) and cpp (9.7196358102981231E-002) differ by less than 2E-4 (1.8571730819871846e-09) *** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -390,65 +538,65 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! 
Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggttg_x10_cudacpp > /tmp/valassia/output_ggttg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.08131 [8.1310872068634174E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.08131 [8.1310872068634160E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 1.5310s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4486s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0824s for 90112 events => throughput is 1.09E+06 events/s + [COUNTERS] PROGRAM TOTAL : 2.0449s + [COUNTERS] Fortran Overhead ( 0 ) : 2.0215s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0233s for 90112 events => throughput is 3.86E+06 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (8.1310872077655569E-002) and cpp (8.1310872068634174E-002) differ by less than 2E-4 (1.1094947183210024e-10) +OK! xsec from fortran (8.1310872077655569E-002) and cpp (8.1310872068634160E-002) differ by less than 2E-4 (1.109495828544027e-10) *** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.136935e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.624283e+06 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.155884e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.218126e+06 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.677853e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.599538e+06 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.303184e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.232652e+07 ) sec^-1 *** 
EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.678438e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.616286e+06 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.841214e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.243703e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.663129e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.609732e+06 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.401211e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.728637e+06 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt index 61ceb23aba..930476d789 100644 --- a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg CUDACPP_BUILDDIR='.' 
-make USEBUILDDIR=1 AVX=none -make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make USEBUILDDIR=1 AVX=avx2 +make USEBUILDDIR=1 AVX=none +make USEBUILDDIR=1 AVX=sse4 +make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' OMP_NUM_THREADS= -DATE: 2024-01-28_14:14:03 +DATE: 2024-01-30_06:13:52 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: -Working directory (run): /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -50,8 +50,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggttgg_x1_fortran > /tmp/valassia/output_ggttgg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/avalassi/output_ggttgg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/valassia/input_ggttgg_x1_fortran > /tmp/va [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [3.6277277311352982E-004] fbridge_mode=0 [UNWEIGHT] Wrote 48 events (found 439 events) - [COUNTERS] PROGRAM TOTAL : 3.6756s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3091s - [COUNTERS] Fortran MEs ( 1 ) : 3.3665s for 8192 events => throughput is 2.43E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.7127s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3035s + [COUNTERS] Fortran MEs ( 1 ) : 4.4093s for 8192 events => throughput is 1.86E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -75,8 +75,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggttgg_x1_fortran > /tmp/valassia/output_ggttgg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/avalassi/output_ggttgg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/valassia/input_ggttgg_x1_fortran > /tmp/va [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [3.6277277311352982E-004] fbridge_mode=0 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 3.5983s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2320s - [COUNTERS] Fortran MEs ( 1 ) : 3.3663s for 8192 events => throughput is 2.43E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.7196s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2934s + [COUNTERS] Fortran MEs ( 1 ) : 4.4261s for 8192 events => throughput is 1.85E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -100,8 +100,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! 
Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggttgg_x10_fortran > /tmp/valassia/output_ggttgg_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x10_fortran > /tmp/avalassi/output_ggttgg_x10_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/valassia/input_ggttgg_x10_fortran > /tmp/v [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000158 [1.5803725748421158E-004] fbridge_mode=0 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 38.5532s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5391s - [COUNTERS] Fortran MEs ( 1 ) : 37.0141s for 90112 events => throughput is 2.43E+03 events/s + [COUNTERS] PROGRAM TOTAL : 50.8146s + [COUNTERS] Fortran Overhead ( 0 ) : 2.0980s + [COUNTERS] Fortran MEs ( 1 ) : 48.7166s for 90112 events => throughput is 1.85E+03 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -125,8 +125,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x1_cudacpp > /tmp/valassia/output_ggttgg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -134,9 +134,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [3.6277277311352993E-004] fbridge_mode=1 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 7.6872s - [COUNTERS] Fortran Overhead ( 0 ) : 3.9109s - [COUNTERS] CudaCpp MEs ( 2 ) : 3.7763s for 8192 events => throughput is 2.17E+03 events/s + [COUNTERS] PROGRAM TOTAL : 9.5114s + [COUNTERS] Fortran Overhead ( 0 ) : 4.8392s + [COUNTERS] CudaCpp MEs ( 2 ) : 4.6722s for 8192 events => throughput is 1.75E+03 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -158,8 +158,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x10_cudacpp > /tmp/valassia/output_ggttgg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -167,9 +167,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000158 [1.5803725748421150E-004] fbridge_mode=1 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 46.7704s - [COUNTERS] Fortran Overhead ( 0 ) : 5.1883s - [COUNTERS] CudaCpp MEs ( 2 ) : 41.5821s for 90112 events => throughput is 2.17E+03 events/s + [COUNTERS] PROGRAM TOTAL : 57.8440s + [COUNTERS] Fortran Overhead ( 0 ) : 6.6165s + [COUNTERS] CudaCpp MEs ( 2 ) : 51.2275s for 90112 events => throughput is 1.76E+03 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -180,14 +180,14 @@ OK! xsec from fortran (1.5803725748421158E-004) and cpp (1.5803725748421150E-004 OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.230275e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.804400e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.229678e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.805886e+03 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -201,8 +201,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x1_cudacpp > /tmp/valassia/output_ggttgg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -210,9 +210,9 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [3.6277277311352998E-004] fbridge_mode=1 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 3.5098s - [COUNTERS] Fortran Overhead ( 0 ) : 1.8498s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.6599s for 8192 events => throughput is 4.94E+03 events/s + [COUNTERS] PROGRAM TOTAL : 5.0600s + [COUNTERS] Fortran Overhead ( 0 ) : 2.6542s + [COUNTERS] CudaCpp MEs ( 2 ) : 2.4058s for 8192 events => throughput is 3.41E+03 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -234,8 +234,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x10_cudacpp > /tmp/valassia/output_ggttgg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -243,9 +243,9 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000158 [1.5803725748421156E-004] fbridge_mode=1 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 22.3971s - [COUNTERS] Fortran Overhead ( 0 ) : 4.1665s - [COUNTERS] CudaCpp MEs ( 2 ) : 18.2306s for 90112 events => throughput is 4.94E+03 events/s + [COUNTERS] PROGRAM TOTAL : 30.6354s + [COUNTERS] Fortran Overhead ( 0 ) : 4.3949s + [COUNTERS] CudaCpp MEs ( 2 ) : 26.2405s for 90112 events => throughput is 3.43E+03 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -256,14 +256,14 @@ OK! xsec from fortran (1.5803725748421158E-004) and cpp (1.5803725748421156E-004 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.069087e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.603718e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.072108e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.615187e+03 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -277,8 +277,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x1_cudacpp > /tmp/valassia/output_ggttgg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -286,9 +286,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [3.6277277311353009E-004] fbridge_mode=1 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 1.6321s - [COUNTERS] Fortran Overhead ( 0 ) : 0.9441s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.6880s for 8192 events => throughput is 1.19E+04 events/s + [COUNTERS] PROGRAM TOTAL : 2.3469s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3045s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.0424s for 8192 events => throughput is 7.86E+03 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -310,8 +310,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x10_cudacpp > /tmp/valassia/output_ggttgg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -319,9 +319,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000158 [1.5803725748421158E-004] fbridge_mode=1 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 11.1120s - [COUNTERS] Fortran Overhead ( 0 ) : 3.5133s - [COUNTERS] CudaCpp MEs ( 2 ) : 7.5988s for 90112 events => throughput is 1.19E+04 events/s + [COUNTERS] PROGRAM TOTAL : 14.6028s + [COUNTERS] Fortran Overhead ( 0 ) : 3.0873s + [COUNTERS] CudaCpp MEs ( 2 ) : 11.5154s for 90112 events => throughput is 7.83E+03 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -332,18 +332,166 @@ OK! xsec from fortran (1.5803725748421158E-004) and cpp (1.5803725748421158E-004 OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.214250e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.099934e+03 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.056733e+03 ) sec^-1 + +*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 64/64 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 2 + [XSECTION] Cross section = 0.0003628 [3.6277277311353009E-004] fbridge_mode=1 + [UNWEIGHT] Wrote 59 events (found 420 events) + [COUNTERS] PROGRAM TOTAL : 2.1181s + [COUNTERS] Fortran Overhead ( 0 ) : 1.1916s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.9265s for 8192 events => throughput is 8.84E+03 events/s + +*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! 
xsec from fortran (3.6277277311352982E-004) and cpp (3.6277277311353009E-004) differ by less than 2E-14 (6.661338147750939e-16) + +*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 64/64 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 2 + [XSECTION] Cross section = 0.000158 [1.5803725748421158E-004] fbridge_mode=1 + [UNWEIGHT] Wrote 207 events (found 1235 events) + [COUNTERS] PROGRAM TOTAL : 13.1007s + [COUNTERS] Fortran Overhead ( 0 ) : 2.9669s + [COUNTERS] CudaCpp MEs ( 2 ) : 10.1338s for 90112 events => throughput is 8.89E+03 events/s + +*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (1.5803725748421158E-004) and cpp (1.5803725748421158E-004) differ by less than 2E-14 (0.0) + +*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.168199e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.218576e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.198606e+03 ) sec^-1 -*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) *** +*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 64/64 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 2 + [XSECTION] Cross section = 0.0003628 [3.6277277311353009E-004] fbridge_mode=1 + [UNWEIGHT] Wrote 59 events (found 420 events) + [COUNTERS] PROGRAM TOTAL : 2.6312s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4569s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.1743s for 8192 events => throughput is 6.98E+03 events/s -*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) *** +*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (3.6277277311352982E-004) and cpp (3.6277277311353009E-004) differ by less than 2E-14 (6.661338147750939e-16) + +*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 64/64 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 2 + [XSECTION] Cross section = 0.000158 [1.5803725748421158E-004] fbridge_mode=1 + [UNWEIGHT] Wrote 207 events (found 1235 events) + [COUNTERS] PROGRAM TOTAL : 16.1783s + [COUNTERS] Fortran Overhead ( 0 ) : 3.2363s + [COUNTERS] CudaCpp MEs ( 2 ) : 12.9420s for 90112 events => throughput is 6.96E+03 events/s + +*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (1.5803725748421158E-004) and cpp (1.5803725748421158E-004) differ by less than 2E-14 (0.0) + +*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.039911e+03 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.063323e+03 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -357,22 +505,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! 
Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggttgg_x1_cudacpp > /tmp/valassia/output_ggttgg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0003628 [3.6277277311352993E-004] fbridge_mode=1 + [XSECTION] Cross section = 0.0003628 [3.6277277311352998E-004] fbridge_mode=1 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 3.9935s - [COUNTERS] Fortran Overhead ( 0 ) : 3.8815s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1121s for 8192 events => throughput is 7.31E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.8345s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8014s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0331s for 8192 events => throughput is 2.47E+05 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (3.6277277311352982E-004) and cpp (3.6277277311352993E-004) differ by less than 2E-14 (2.220446049250313e-16) +OK! xsec from fortran (3.6277277311352982E-004) and cpp (3.6277277311352998E-004) differ by less than 2E-14 (4.440892098500626e-16) *** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -390,65 +538,65 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggttgg_x10_cudacpp > /tmp/valassia/output_ggttgg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000158 [1.5803725748421164E-004] fbridge_mode=1 + [XSECTION] Cross section = 0.000158 [1.5803725748421166E-004] fbridge_mode=1 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 3.8764s - [COUNTERS] Fortran Overhead ( 0 ) : 2.6410s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.2354s for 90112 events => throughput is 7.29E+04 events/s + [COUNTERS] PROGRAM TOTAL : 2.9375s + [COUNTERS] Fortran Overhead ( 0 ) : 2.5725s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3650s for 90112 events => throughput is 2.47E+05 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.5803725748421158E-004) and cpp (1.5803725748421164E-004) differ by less than 2E-14 (4.440892098500626e-16) +OK! xsec from fortran (1.5803725748421158E-004) and cpp (1.5803725748421166E-004) differ by less than 2E-14 (4.440892098500626e-16) *** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.308550e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.275540e+05 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.526993e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.510631e+05 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.247957e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.114953e+05 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.040930e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.167963e+05 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.241118e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.105704e+05 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.230304e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.168745e+05 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.245366e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.099883e+05 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.390859e+04 ) sec^-1 
+Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.425014e+05 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt index b70116e765..5e8ad575df 100644 --- a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg CUDACPP_BUILDDIR='.' -make USEBUILDDIR=1 AVX=none +make USEBUILDDIR=1 AVX=none make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make USEBUILDDIR=1 AVX=avx2 +make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' OMP_NUM_THREADS= -DATE: 2024-01-28_14:18:17 +DATE: 2024-01-30_06:18:23 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: -Working directory (run): /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -50,8 +50,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggttgg_x1_fortran > /tmp/valassia/output_ggttgg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/avalassi/output_ggttgg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/valassia/input_ggttgg_x1_fortran > /tmp/va [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [3.6277277311352982E-004] fbridge_mode=0 [UNWEIGHT] Wrote 48 events (found 439 events) - [COUNTERS] PROGRAM TOTAL : 3.6085s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2340s - [COUNTERS] Fortran MEs ( 1 ) : 3.3745s for 8192 events => throughput is 2.43E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.7226s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2990s + [COUNTERS] Fortran MEs ( 1 ) : 4.4236s for 8192 events => throughput is 1.85E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -75,8 +75,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggttgg_x1_fortran > /tmp/valassia/output_ggttgg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/avalassi/output_ggttgg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/valassia/input_ggttgg_x1_fortran > /tmp/va [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [3.6277277311352982E-004] fbridge_mode=0 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 5.9463s - [COUNTERS] Fortran Overhead ( 0 ) : 2.5818s - [COUNTERS] Fortran MEs ( 1 ) : 3.3644s for 8192 events => throughput is 2.43E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.7231s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2979s + [COUNTERS] Fortran MEs ( 1 ) : 4.4253s for 8192 events => throughput is 1.85E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -100,8 +100,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggttgg_x10_fortran > /tmp/valassia/output_ggttgg_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x10_fortran > /tmp/avalassi/output_ggttgg_x10_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/valassia/input_ggttgg_x10_fortran > /tmp/v [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000158 [1.5803725748421158E-004] fbridge_mode=0 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 38.5378s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5009s - [COUNTERS] Fortran MEs ( 1 ) : 37.0369s for 90112 events => throughput is 2.43E+03 events/s + [COUNTERS] PROGRAM TOTAL : 50.8315s + [COUNTERS] Fortran Overhead ( 0 ) : 2.1047s + [COUNTERS] Fortran MEs ( 1 ) : 48.7267s for 90112 events => throughput is 1.85E+03 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -125,22 +125,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x1_cudacpp > /tmp/valassia/output_ggttgg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0003628 [3.6277396734396344E-004] fbridge_mode=1 + [XSECTION] Cross section = 0.0003628 [3.6277396352122325E-004] fbridge_mode=1 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 6.9160s - [COUNTERS] Fortran Overhead ( 0 ) : 3.5393s - [COUNTERS] CudaCpp MEs ( 2 ) : 3.3767s for 8192 events => throughput is 2.43E+03 events/s + [COUNTERS] PROGRAM TOTAL : 8.7124s + [COUNTERS] Fortran Overhead ( 0 ) : 4.4408s + [COUNTERS] CudaCpp MEs ( 2 ) : 4.2715s for 8192 events => throughput is 1.92E+03 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (3.6277277311352982E-004) and cpp (3.6277396734396344E-004) differ by less than 4E-4 (3.2919516625984357e-06) +OK! xsec from fortran (3.6277277311352982E-004) and cpp (3.6277396352122325E-004) differ by less than 4E-4 (3.2814141017745158e-06) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -158,36 +158,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x10_cudacpp > /tmp/valassia/output_ggttgg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000158 [1.5803774245774590E-004] fbridge_mode=1 + [XSECTION] Cross section = 0.000158 [1.5803774048965294E-004] fbridge_mode=1 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 41.9342s - [COUNTERS] Fortran Overhead ( 0 ) : 4.8227s - [COUNTERS] CudaCpp MEs ( 2 ) : 37.1115s for 90112 events => throughput is 2.43E+03 events/s + [COUNTERS] PROGRAM TOTAL : 53.4678s + [COUNTERS] Fortran Overhead ( 0 ) : 6.2349s + [COUNTERS] CudaCpp MEs ( 2 ) : 47.2329s for 90112 events => throughput is 1.91E+03 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.5803725748421158E-004) and cpp (1.5803774245774590E-004) differ by less than 4E-4 (3.0687291214803736e-06) +OK! xsec from fortran (1.5803725748421158E-004) and cpp (1.5803774048965294E-004) differ by less than 4E-4 (3.056275773571926e-06) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.484000e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.973797e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.486894e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.974372e+03 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -201,22 +201,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x1_cudacpp > /tmp/valassia/output_ggttgg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0003628 [3.6277390210387336E-004] fbridge_mode=1 + [XSECTION] Cross section = 0.0003628 [3.6277387698033752E-004] fbridge_mode=1 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 1.8879s - [COUNTERS] Fortran Overhead ( 0 ) : 1.0502s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.8377s for 8192 events => throughput is 9.78E+03 events/s + [COUNTERS] PROGRAM TOTAL : 2.6573s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4642s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.1930s for 8192 events => throughput is 6.87E+03 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (3.6277277311352982E-004) and cpp (3.6277390210387336E-004) differ by less than 4E-4 (3.1121143240220306e-06) +OK! xsec from fortran (3.6277277311352982E-004) and cpp (3.6277387698033752E-004) differ by less than 4E-4 (3.0428601303089664e-06) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -234,36 +234,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x10_cudacpp > /tmp/valassia/output_ggttgg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000158 [1.5803772192716622E-004] fbridge_mode=1 + [XSECTION] Cross section = 0.000158 [1.5803770691658365E-004] fbridge_mode=1 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 11.5533s - [COUNTERS] Fortran Overhead ( 0 ) : 2.3160s - [COUNTERS] CudaCpp MEs ( 2 ) : 9.2373s for 90112 events => throughput is 9.76E+03 events/s + [COUNTERS] PROGRAM TOTAL : 16.3220s + [COUNTERS] Fortran Overhead ( 0 ) : 3.2267s + [COUNTERS] CudaCpp MEs ( 2 ) : 13.0952s for 90112 events => throughput is 6.88E+03 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.5803725748421158E-004) and cpp (1.5803772192716622E-004) differ by less than 4E-4 (2.9388193774071425e-06) +OK! xsec from fortran (1.5803725748421158E-004) and cpp (1.5803770691658365E-004) differ by less than 4E-4 (2.8438380874629132e-06) *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.002165e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.126754e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.003724e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.170734e+03 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -277,22 +277,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x1_cudacpp > /tmp/valassia/output_ggttgg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0003628 [3.6277391351528001E-004] fbridge_mode=1 + [XSECTION] Cross section = 0.0003628 [3.6277388844638422E-004] fbridge_mode=1 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 0.9282s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5762s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3520s for 8192 events => throughput is 2.33E+04 events/s + [COUNTERS] PROGRAM TOTAL : 1.3306s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8070s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.5236s for 8192 events => throughput is 1.56E+04 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (3.6277277311352982E-004) and cpp (3.6277391351528001E-004) differ by less than 4E-4 (3.1435703964355355e-06) +OK! xsec from fortran (3.6277277311352982E-004) and cpp (3.6277388844638422E-004) differ by less than 4E-4 (3.074466820685018e-06) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -310,40 +310,188 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x10_cudacpp > /tmp/valassia/output_ggttgg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000158 [1.5803774950753991E-004] fbridge_mode=1 + [XSECTION] Cross section = 0.000158 [1.5803773310773457E-004] fbridge_mode=1 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 5.7143s - [COUNTERS] Fortran Overhead ( 0 ) : 1.8410s - [COUNTERS] CudaCpp MEs ( 2 ) : 3.8733s for 90112 events => throughput is 2.33E+04 events/s + [COUNTERS] PROGRAM TOTAL : 8.3793s + [COUNTERS] Fortran Overhead ( 0 ) : 2.5829s + [COUNTERS] CudaCpp MEs ( 2 ) : 5.7964s for 90112 events => throughput is 1.55E+04 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.5803725748421158E-004) and cpp (1.5803774950753991E-004) differ by less than 4E-4 (3.1133375519853956e-06) +OK! xsec from fortran (1.5803725748421158E-004) and cpp (1.5803773310773457E-004) differ by less than 4E-4 (3.0095657856943347e-06) *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.394756e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.587083e+04 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.594558e+04 ) sec^-1 + +*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 64/64 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 2 + [XSECTION] Cross section = 0.0003628 [3.6277388844638422E-004] fbridge_mode=1 + [UNWEIGHT] Wrote 59 events (found 420 events) + [COUNTERS] PROGRAM TOTAL : 1.2103s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7457s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.4646s for 8192 events => throughput is 1.76E+04 events/s + +*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (3.6277277311352982E-004) and cpp (3.6277388844638422E-004) differ by less than 4E-4 (3.074466820685018e-06) + +*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 64/64 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 2 + [XSECTION] Cross section = 0.000158 [1.5803773310773457E-004] fbridge_mode=1 + [UNWEIGHT] Wrote 207 events (found 1235 events) + [COUNTERS] PROGRAM TOTAL : 7.6352s + [COUNTERS] Fortran Overhead ( 0 ) : 2.5218s + [COUNTERS] CudaCpp MEs ( 2 ) : 5.1134s for 90112 events => throughput is 1.76E+04 events/s + +*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (1.5803725748421158E-004) and cpp (1.5803773310773457E-004) differ by less than 4E-4 (3.0095657856943347e-06) + +*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.751909e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.383836e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.760230e+04 ) sec^-1 + +*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 64/64 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 2 + [XSECTION] Cross section = 0.0003628 [3.6277396133530942E-004] fbridge_mode=1 + [UNWEIGHT] Wrote 59 events (found 420 events) + [COUNTERS] PROGRAM TOTAL : 1.4750s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8777s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.5973s for 8192 events => throughput is 1.37E+04 events/s + +*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (3.6277277311352982E-004) and cpp (3.6277396133530942E-004) differ by less than 4E-4 (3.2753885288450135e-06) + +*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** -*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) *** +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical -*** (2-512z) WARNING! 
SKIP MADEVENT_CPP (512z is not supported on this node) *** +*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 64/64 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 2 + [XSECTION] Cross section = 0.000158 [1.5803777739454609E-004] fbridge_mode=1 + [UNWEIGHT] Wrote 207 events (found 1235 events) + [COUNTERS] PROGRAM TOTAL : 9.1796s + [COUNTERS] Fortran Overhead ( 0 ) : 2.7011s + [COUNTERS] CudaCpp MEs ( 2 ) : 6.4785s for 90112 events => throughput is 1.39E+04 events/s + +*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (1.5803725748421158E-004) and cpp (1.5803777739454609E-004) differ by less than 4E-4 (3.2897959809652377e-06) + +*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.411903e+04 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.410632e+04 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -357,22 +505,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggttgg_x1_cudacpp > /tmp/valassia/output_ggttgg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0003628 [3.6277395812950292E-004] fbridge_mode=1 + [XSECTION] Cross section = 0.0003628 [3.6277400478491265E-004] fbridge_mode=1 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 0.6895s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6341s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0553s for 8192 events => throughput is 1.48E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.7967s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7754s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0214s for 8192 events => throughput is 3.83E+05 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (3.6277277311352982E-004) and cpp (3.6277395812950292E-004) differ by less than 4E-4 (3.266551574121479e-06) +OK! xsec from fortran (3.6277277311352982E-004) and cpp (3.6277400478491265E-004) differ by less than 4E-4 (3.395159378305479e-06) *** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -390,65 +538,65 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggttgg_x10_cudacpp > /tmp/valassia/output_ggttgg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000158 [1.5803778304590137E-004] fbridge_mode=1 + [XSECTION] Cross section = 0.000158 [1.5803779990154892E-004] fbridge_mode=1 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 3.1923s - [COUNTERS] Fortran Overhead ( 0 ) : 2.5815s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.6107s for 90112 events => throughput is 1.48E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.7835s + [COUNTERS] Fortran Overhead ( 0 ) : 2.5473s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2361s for 90112 events => throughput is 3.82E+05 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.5803725748421158E-004) and cpp (1.5803778304590137E-004) differ by less than 4E-4 (3.325555619992926e-06) +OK! xsec from fortran (1.5803725748421158E-004) and cpp (1.5803779990154892E-004) differ by less than 4E-4 (3.432211783227501e-06) *** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.476169e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.582485e+05 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.004299e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.942798e+05 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.720996e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.492976e+05 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.318368e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.638150e+05 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.714749e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.493239e+05 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.074459e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.638925e+05 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.709070e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.453709e+05 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.434688e+04 ) sec^-1 
+Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.527726e+05 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt index f1516a5257..a372850ebe 100644 --- a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg CUDACPP_BUILDDIR='.' -make USEBUILDDIR=1 AVX=none -make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make USEBUILDDIR=1 AVX=avx2 +make USEBUILDDIR=1 AVX=none +make USEBUILDDIR=1 AVX=sse4 +make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' -CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' -CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' OMP_NUM_THREADS= -DATE: 2024-01-28_14:21:38 +DATE: 2024-01-30_06:21:53 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: -Working directory (run): /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -50,8 +50,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggttgg_x1_fortran > /tmp/valassia/output_ggttgg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/avalassi/output_ggttgg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/valassia/input_ggttgg_x1_fortran > /tmp/va [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [3.6277277311352982E-004] fbridge_mode=0 [UNWEIGHT] Wrote 48 events (found 439 events) - [COUNTERS] PROGRAM TOTAL : 3.5990s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2317s - [COUNTERS] Fortran MEs ( 1 ) : 3.3672s for 8192 events => throughput is 2.43E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.7321s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3006s + [COUNTERS] Fortran MEs ( 1 ) : 4.4315s for 8192 events => throughput is 1.85E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -75,8 +75,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggttgg_x1_fortran > /tmp/valassia/output_ggttgg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/avalassi/output_ggttgg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/valassia/input_ggttgg_x1_fortran > /tmp/va [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [3.6277277311352982E-004] fbridge_mode=0 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 3.5984s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2322s - [COUNTERS] Fortran MEs ( 1 ) : 3.3662s for 8192 events => throughput is 2.43E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.7365s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2954s + [COUNTERS] Fortran MEs ( 1 ) : 4.4410s for 8192 events => throughput is 1.84E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -100,8 +100,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggttgg_x10_fortran > /tmp/valassia/output_ggttgg_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x10_fortran > /tmp/avalassi/output_ggttgg_x10_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/valassia/input_ggttgg_x10_fortran > /tmp/v [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000158 [1.5803725748421158E-004] fbridge_mode=0 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 38.5218s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4972s - [COUNTERS] Fortran MEs ( 1 ) : 37.0246s for 90112 events => throughput is 2.43E+03 events/s + [COUNTERS] PROGRAM TOTAL : 50.8810s + [COUNTERS] Fortran Overhead ( 0 ) : 2.1085s + [COUNTERS] Fortran MEs ( 1 ) : 48.7725s for 90112 events => throughput is 1.85E+03 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -125,8 +125,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x1_cudacpp > /tmp/valassia/output_ggttgg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -134,9 +134,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [3.6277277432965013E-004] fbridge_mode=1 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 7.8050s - [COUNTERS] Fortran Overhead ( 0 ) : 3.9849s - [COUNTERS] CudaCpp MEs ( 2 ) : 3.8201s for 8192 events => throughput is 2.14E+03 events/s + [COUNTERS] PROGRAM TOTAL : 9.6340s + [COUNTERS] Fortran Overhead ( 0 ) : 4.9027s + [COUNTERS] CudaCpp MEs ( 2 ) : 4.7313s for 8192 events => throughput is 1.73E+03 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -158,8 +158,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x10_cudacpp > /tmp/valassia/output_ggttgg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -167,9 +167,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000158 [1.5803725813026107E-004] fbridge_mode=1 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 51.7126s - [COUNTERS] Fortran Overhead ( 0 ) : 9.5711s - [COUNTERS] CudaCpp MEs ( 2 ) : 42.1415s for 90112 events => throughput is 2.14E+03 events/s + [COUNTERS] PROGRAM TOTAL : 58.7075s + [COUNTERS] Fortran Overhead ( 0 ) : 6.6716s + [COUNTERS] CudaCpp MEs ( 2 ) : 52.0359s for 90112 events => throughput is 1.73E+03 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -180,14 +180,14 @@ OK! xsec from fortran (1.5803725748421158E-004) and cpp (1.5803725813026107E-004 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.188928e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.784786e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.187259e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.785841e+03 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -201,8 +201,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x1_cudacpp > /tmp/valassia/output_ggttgg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -210,9 +210,9 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [3.6277277430934459E-004] fbridge_mode=1 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 11.3935s - [COUNTERS] Fortran Overhead ( 0 ) : 9.7620s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.6315s for 8192 events => throughput is 5.02E+03 events/s + [COUNTERS] PROGRAM TOTAL : 5.0010s + [COUNTERS] Fortran Overhead ( 0 ) : 2.6173s + [COUNTERS] CudaCpp MEs ( 2 ) : 2.3837s for 8192 events => throughput is 3.44E+03 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -234,8 +234,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x10_cudacpp > /tmp/valassia/output_ggttgg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -243,9 +243,9 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000158 [1.5803725816246315E-004] fbridge_mode=1 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 26.6256s - [COUNTERS] Fortran Overhead ( 0 ) : 8.6133s - [COUNTERS] CudaCpp MEs ( 2 ) : 18.0123s for 90112 events => throughput is 5.00E+03 events/s + [COUNTERS] PROGRAM TOTAL : 30.6721s + [COUNTERS] Fortran Overhead ( 0 ) : 4.4448s + [COUNTERS] CudaCpp MEs ( 2 ) : 26.2273s for 90112 events => throughput is 3.44E+03 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -256,14 +256,14 @@ OK! xsec from fortran (1.5803725748421158E-004) and cpp (1.5803725816246315E-004 OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.138707e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.519557e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.171098e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.539213e+03 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -277,8 +277,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x1_cudacpp > /tmp/valassia/output_ggttgg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -286,9 +286,9 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [3.6277277419683297E-004] fbridge_mode=1 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 5.6899s - [COUNTERS] Fortran Overhead ( 0 ) : 5.0088s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.6811s for 8192 events => throughput is 1.20E+04 events/s + [COUNTERS] PROGRAM TOTAL : 2.3362s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3065s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.0298s for 8192 events => throughput is 7.96E+03 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -310,8 +310,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x10_cudacpp > /tmp/valassia/output_ggttgg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -319,9 +319,9 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000158 [1.5803725810769321E-004] fbridge_mode=1 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 13.1637s - [COUNTERS] Fortran Overhead ( 0 ) : 5.6687s - [COUNTERS] CudaCpp MEs ( 2 ) : 7.4950s for 90112 events => throughput is 1.20E+04 events/s + [COUNTERS] PROGRAM TOTAL : 14.5206s + [COUNTERS] Fortran Overhead ( 0 ) : 3.0822s + [COUNTERS] CudaCpp MEs ( 2 ) : 11.4384s for 90112 events => throughput is 7.88E+03 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -332,18 +332,166 @@ OK! xsec from fortran (1.5803725748421158E-004) and cpp (1.5803725810769321E-004 OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.236682e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.081577e+03 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.110336e+03 ) sec^-1 + +*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! 
Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 64/64 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 2 + [XSECTION] Cross section = 0.0003628 [3.6277277419683297E-004] fbridge_mode=1 + [UNWEIGHT] Wrote 59 events (found 420 events) + [COUNTERS] PROGRAM TOTAL : 2.0953s + [COUNTERS] Fortran Overhead ( 0 ) : 1.1807s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.9146s for 8192 events => throughput is 8.96E+03 events/s + +*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (3.6277277311352982E-004) and cpp (3.6277277419683297E-004) differ by less than 2E-4 (2.9861755290738756e-09) + +*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 64/64 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 2 + [XSECTION] Cross section = 0.000158 [1.5803725810769321E-004] fbridge_mode=1 + [UNWEIGHT] Wrote 207 events (found 1235 events) + [COUNTERS] PROGRAM TOTAL : 13.0092s + [COUNTERS] Fortran Overhead ( 0 ) : 2.9623s + [COUNTERS] CudaCpp MEs ( 2 ) : 10.0469s for 90112 events => throughput is 8.97E+03 events/s + +*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (1.5803725748421158E-004) and cpp (1.5803725810769321E-004) differ by less than 2E-4 (3.945155979678816e-09) + +*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.305280e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.238440e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.278381e+03 ) sec^-1 + +*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 64/64 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 2 + [XSECTION] Cross section = 0.0003628 [3.6277277419683297E-004] fbridge_mode=1 + [UNWEIGHT] Wrote 59 events (found 420 events) + [COUNTERS] PROGRAM TOTAL : 2.6707s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4755s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.1952s for 8192 events => throughput is 6.85E+03 events/s -*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) *** +*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) *** +OK! xsec from fortran (3.6277277311352982E-004) and cpp (3.6277277419683297E-004) differ by less than 2E-4 (2.9861755290738756e-09) + +*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 64/64 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 2 + [XSECTION] Cross section = 0.000158 [1.5803725810769321E-004] fbridge_mode=1 + [UNWEIGHT] Wrote 207 events (found 1235 events) + [COUNTERS] PROGRAM TOTAL : 16.3107s + [COUNTERS] Fortran Overhead ( 0 ) : 3.2346s + [COUNTERS] CudaCpp MEs ( 2 ) : 13.0761s for 90112 events => throughput is 6.89E+03 events/s + +*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (1.5803725748421158E-004) and cpp (1.5803725810769321E-004) differ by less than 2E-4 (3.945155979678816e-09) + +*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.979300e+03 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.981216e+03 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -357,22 +505,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggttgg_x1_cudacpp > /tmp/valassia/output_ggttgg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0003628 [3.6277277293084696E-004] fbridge_mode=1 + [XSECTION] Cross section = 0.0003628 [3.6277277293084701E-004] fbridge_mode=1 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 2.3684s - [COUNTERS] Fortran Overhead ( 0 ) : 2.2556s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1128s for 8192 events => throughput is 7.27E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.8344s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8014s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0331s for 8192 events => throughput is 2.48E+05 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (3.6277277311352982E-004) and cpp (3.6277277293084696E-004) differ by less than 2E-4 (5.035738492864539e-10) +OK! xsec from fortran (3.6277277311352982E-004) and cpp (3.6277277293084701E-004) differ by less than 2E-4 (5.03573627241849e-10) *** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -390,65 +538,65 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! 
Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggttgg_x10_cudacpp > /tmp/valassia/output_ggttgg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000158 [1.5803725738731031E-004] fbridge_mode=1 + [XSECTION] Cross section = 0.000158 [1.5803725738731039E-004] fbridge_mode=1 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 5.3501s - [COUNTERS] Fortran Overhead ( 0 ) : 4.1104s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.2398s for 90112 events => throughput is 7.27E+04 events/s + [COUNTERS] PROGRAM TOTAL : 2.9283s + [COUNTERS] Fortran Overhead ( 0 ) : 2.5651s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3632s for 90112 events => throughput is 2.48E+05 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.5803725748421158E-004) and cpp (1.5803725738731031E-004) differ by less than 2E-4 (6.131546381737962e-10) +OK! xsec from fortran (1.5803725748421158E-004) and cpp (1.5803725738731039E-004) differ by less than 2E-4 (6.131540830622839e-10) *** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.296213e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.286633e+05 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.548555e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.522365e+05 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.245057e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.122442e+05 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.025106e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.148040e+05 ) 
sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.247798e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.112221e+05 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.229717e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.164785e+05 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.248252e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.108117e+05 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.380128e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.430780e+05 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt index bbe2d399e3..bc47a109df 100644 --- a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg CUDACPP_BUILDDIR='.' 
+ make USEBUILDDIR=1 AVX=none -make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make USEBUILDDIR=1 AVX=sse4 make USEBUILDDIR=1 AVX=avx2 - make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' OMP_NUM_THREADS= -DATE: 2024-01-28_14:29:48 +DATE: 2024-01-30_06:27:59 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: -Working directory (run): /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -50,8 +50,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggttggg_x1_fortran > /tmp/valassia/output_ggttggg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/avalassi/output_ggttggg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/valassia/input_ggttggg_x1_fortran > /tmp/v [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.169e-06 [1.1693100945435806E-006] fbridge_mode=0 [UNWEIGHT] Wrote 1 events (found 166 events) - [COUNTERS] PROGRAM TOTAL : 73.0391s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4667s - [COUNTERS] Fortran MEs ( 1 ) : 72.5725s for 8192 events => throughput is 1.13E+02 events/s + [COUNTERS] PROGRAM TOTAL : 101.9143s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4823s + [COUNTERS] Fortran MEs ( 1 ) : 101.4320s for 8192 events => throughput is 8.08E+01 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -75,8 +75,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggttggg_x1_fortran > /tmp/valassia/output_ggttggg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/avalassi/output_ggttggg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/valassia/input_ggttggg_x1_fortran > /tmp/v [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.169e-06 [1.1693100945435806E-006] fbridge_mode=0 [UNWEIGHT] Wrote 15 events (found 163 events) - [COUNTERS] PROGRAM TOTAL : 72.9463s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3446s - [COUNTERS] Fortran MEs ( 1 ) : 72.6017s for 8192 events => throughput is 1.13E+02 events/s + [COUNTERS] PROGRAM TOTAL : 101.8572s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4740s + [COUNTERS] Fortran MEs ( 1 ) : 101.3832s for 8192 events => throughput is 8.08E+01 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -100,8 +100,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! 
Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggttggg_x10_fortran > /tmp/valassia/output_ggttggg_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x10_fortran > /tmp/avalassi/output_ggttggg_x10_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/valassia/input_ggttggg_x10_fortran > /tmp/ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.136e-07 [2.1358436158813979E-007] fbridge_mode=0 [UNWEIGHT] Wrote 84 events (found 808 events) - [COUNTERS] PROGRAM TOTAL : 800.6448s - [COUNTERS] Fortran Overhead ( 0 ) : 3.1320s - [COUNTERS] Fortran MEs ( 1 ) : 797.5128s for 90112 events => throughput is 1.13E+02 events/s + [COUNTERS] PROGRAM TOTAL : 1118.0575s + [COUNTERS] Fortran Overhead ( 0 ) : 4.3730s + [COUNTERS] Fortran MEs ( 1 ) : 1113.6844s for 90112 events => throughput is 8.09E+01 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -125,8 +125,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_x1_cudacpp > /tmp/valassia/output_ggttggg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -134,9 +134,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.169e-06 [1.1693100945435831E-006] fbridge_mode=1 [UNWEIGHT] Wrote 15 events (found 163 events) - [COUNTERS] PROGRAM TOTAL : 175.0060s - [COUNTERS] Fortran Overhead ( 0 ) : 80.0186s - [COUNTERS] CudaCpp MEs ( 2 ) : 94.9874s for 8192 events => throughput is 8.62E+01 events/s + [COUNTERS] PROGRAM TOTAL : 222.3358s + [COUNTERS] Fortran Overhead ( 0 ) : 102.7450s + [COUNTERS] CudaCpp MEs ( 2 ) : 119.5908s for 8192 events => throughput is 6.85E+01 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -158,8 +158,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_x10_cudacpp > /tmp/valassia/output_ggttggg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -167,9 +167,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.136e-07 [2.1358436158813950E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 808 events) - [COUNTERS] PROGRAM TOTAL : 1135.2325s - [COUNTERS] Fortran Overhead ( 0 ) : 82.5281s - [COUNTERS] CudaCpp MEs ( 2 ) : 1052.7045s for 90112 events => throughput is 8.56E+01 events/s + [COUNTERS] PROGRAM TOTAL : 1439.6055s + [COUNTERS] Fortran Overhead ( 0 ) : 107.7197s + [COUNTERS] CudaCpp MEs ( 2 ) : 1331.8857s for 90112 events => throughput is 6.77E+01 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -180,14 +180,14 @@ OK! xsec from fortran (2.1358436158813979E-007) and cpp (2.1358436158813950E-007 OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.031958e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.948640e+01 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.030043e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.570768e+01 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -201,8 +201,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_x1_cudacpp > /tmp/valassia/output_ggttggg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -210,9 +210,9 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.169e-06 [1.1693100945435831E-006] fbridge_mode=1 [UNWEIGHT] Wrote 15 events (found 163 events) - [COUNTERS] PROGRAM TOTAL : 96.4180s - [COUNTERS] Fortran Overhead ( 0 ) : 51.6087s - [COUNTERS] CudaCpp MEs ( 2 ) : 44.8093s for 8192 events => throughput is 1.83E+02 events/s + [COUNTERS] PROGRAM TOTAL : 115.4155s + [COUNTERS] Fortran Overhead ( 0 ) : 52.8722s + [COUNTERS] CudaCpp MEs ( 2 ) : 62.5433s for 8192 events => throughput is 1.31E+02 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -234,8 +234,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_x10_cudacpp > /tmp/valassia/output_ggttggg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -243,9 +243,9 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.136e-07 [2.1358436158813958E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 808 events) - [COUNTERS] PROGRAM TOTAL : 530.6261s - [COUNTERS] Fortran Overhead ( 0 ) : 39.2305s - [COUNTERS] CudaCpp MEs ( 2 ) : 491.3956s for 90112 events => throughput is 1.83E+02 events/s + [COUNTERS] PROGRAM TOTAL : 742.6112s + [COUNTERS] Fortran Overhead ( 0 ) : 56.7177s + [COUNTERS] CudaCpp MEs ( 2 ) : 685.8936s for 90112 events => throughput is 1.31E+02 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -256,14 +256,14 @@ OK! xsec from fortran (2.1358436158813979E-007) and cpp (2.1358436158813958E-007 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.249882e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.569503e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.266535e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.568080e+02 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -277,8 +277,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_x1_cudacpp > /tmp/valassia/output_ggttggg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -286,9 +286,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.169e-06 [1.1693100945435827E-006] fbridge_mode=1 [UNWEIGHT] Wrote 15 events (found 163 events) - [COUNTERS] PROGRAM TOTAL : 50.7969s - [COUNTERS] Fortran Overhead ( 0 ) : 31.7782s - [COUNTERS] CudaCpp MEs ( 2 ) : 19.0187s for 8192 events => throughput is 4.31E+02 events/s + [COUNTERS] PROGRAM TOTAL : 53.0772s + [COUNTERS] Fortran Overhead ( 0 ) : 24.7915s + [COUNTERS] CudaCpp MEs ( 2 ) : 28.2857s for 8192 events => throughput is 2.90E+02 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -310,8 +310,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_x10_cudacpp > /tmp/valassia/output_ggttggg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -319,9 +319,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.136e-07 [2.1358436158813958E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 808 events) - [COUNTERS] PROGRAM TOTAL : 228.8024s - [COUNTERS] Fortran Overhead ( 0 ) : 18.4929s - [COUNTERS] CudaCpp MEs ( 2 ) : 210.3095s for 90112 events => throughput is 4.28E+02 events/s + [COUNTERS] PROGRAM TOTAL : 340.2848s + [COUNTERS] Fortran Overhead ( 0 ) : 28.6537s + [COUNTERS] CudaCpp MEs ( 2 ) : 311.6311s for 90112 events => throughput is 2.89E+02 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -332,18 +332,166 @@ OK! xsec from fortran (2.1358436158813979E-007) and cpp (2.1358436158813958E-007 OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.297354e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.398486e+02 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.399363e+02 ) sec^-1 + +*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 1.169e-06 [1.1693100945435827E-006] fbridge_mode=1 + [UNWEIGHT] Wrote 15 events (found 163 events) + [COUNTERS] PROGRAM TOTAL : 46.8647s + [COUNTERS] Fortran Overhead ( 0 ) : 21.6683s + [COUNTERS] CudaCpp MEs ( 2 ) : 25.1965s for 8192 events => throughput is 3.25E+02 events/s + +*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! 
xsec from fortran (1.1693100945435806E-006) and cpp (1.1693100945435827E-006) differ by less than 2E-14 (1.7763568394002505e-15) + +*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 2.136e-07 [2.1358436158813958E-007] fbridge_mode=1 + [UNWEIGHT] Wrote 84 events (found 808 events) + [COUNTERS] PROGRAM TOTAL : 302.3706s + [COUNTERS] Fortran Overhead ( 0 ) : 25.4811s + [COUNTERS] CudaCpp MEs ( 2 ) : 276.8895s for 90112 events => throughput is 3.25E+02 events/s + +*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (2.1358436158813979E-007) and cpp (2.1358436158813958E-007) differ by less than 2E-14 (9.992007221626409e-16) + +*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.882449e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.301798e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.889860e+02 ) sec^-1 + +*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 1.169e-06 [1.1693100945435827E-006] fbridge_mode=1 + [UNWEIGHT] Wrote 15 events (found 163 events) + [COUNTERS] PROGRAM TOTAL : 50.0231s + [COUNTERS] Fortran Overhead ( 0 ) : 24.6335s + [COUNTERS] CudaCpp MEs ( 2 ) : 25.3895s for 8192 events => throughput is 3.23E+02 events/s + +*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (1.1693100945435806E-006) and cpp (1.1693100945435827E-006) differ by less than 2E-14 (1.7763568394002505e-15) + +*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 2.136e-07 [2.1358436158813958E-007] fbridge_mode=1 + [UNWEIGHT] Wrote 84 events (found 808 events) + [COUNTERS] PROGRAM TOTAL : 308.5181s + [COUNTERS] Fortran Overhead ( 0 ) : 28.4720s + [COUNTERS] CudaCpp MEs ( 2 ) : 280.0461s for 90112 events => throughput is 3.22E+02 events/s + +*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) *** +OK! xsec from fortran (2.1358436158813979E-007) and cpp (2.1358436158813958E-007) differ by less than 2E-14 (9.992007221626409e-16) -*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) *** +*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.386684e+02 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.385729e+02 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -357,22 +505,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggttggg_x1_cudacpp > /tmp/valassia/output_ggttggg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.169e-06 [1.1693100945435825E-006] fbridge_mode=1 + [XSECTION] Cross section = 1.169e-06 [1.1693100945435829E-006] fbridge_mode=1 [UNWEIGHT] Wrote 15 events (found 163 events) - [COUNTERS] PROGRAM TOTAL : 11.1780s - [COUNTERS] Fortran Overhead ( 0 ) : 7.3812s - [COUNTERS] CudaCpp MEs ( 2 ) : 3.7969s for 8192 events => throughput is 2.16E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.2467s + [COUNTERS] Fortran Overhead ( 0 ) : 3.1625s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.0842s for 8192 events => throughput is 7.56E+03 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.1693100945435806E-006) and cpp (1.1693100945435825E-006) differ by less than 2E-14 (1.5543122344752192e-15) +OK! xsec from fortran (1.1693100945435806E-006) and cpp (1.1693100945435829E-006) differ by less than 2E-14 (1.9984014443252818e-15) *** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -390,65 +538,65 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggttggg_x10_cudacpp > /tmp/valassia/output_ggttggg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.136e-07 [2.1358436158813958E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.136e-07 [2.1358436158813960E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 808 events) - [COUNTERS] PROGRAM TOTAL : 51.7876s - [COUNTERS] Fortran Overhead ( 0 ) : 10.0015s - [COUNTERS] CudaCpp MEs ( 2 ) : 41.7860s for 90112 events => throughput is 2.16E+03 events/s + [COUNTERS] PROGRAM TOTAL : 18.9518s + [COUNTERS] Fortran Overhead ( 0 ) : 7.0338s + [COUNTERS] CudaCpp MEs ( 2 ) : 11.9179s for 90112 events => throughput is 7.56E+03 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.1358436158813979E-007) and cpp (2.1358436158813958E-007) differ by less than 2E-14 (9.992007221626409e-16) +OK! xsec from fortran (2.1358436158813979E-007) and cpp (2.1358436158813960E-007) differ by less than 2E-14 (8.881784197001252e-16) *** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.170313e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.528868e+03 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.233689e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.249701e+03 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 512 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.570446e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.231891e+03 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 512 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.473042e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.557033e+03 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 128 128 1 --bridge *** -Process = 
SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.567032e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.244700e+03 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 128 128 1 *** -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.557123e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.446855e+03 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.564602e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.214530e+03 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 *** -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.127454e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.244468e+03 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt index 8016b0a70e..c35aa0a017 100644 --- a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg CUDACPP_BUILDDIR='.' 
+ + make USEBUILDDIR=1 AVX=none make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' - make USEBUILDDIR=1 AVX=avx2 - make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' OMP_NUM_THREADS= -DATE: 2024-01-28_15:32:38 +DATE: 2024-01-30_07:59:04 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: -Working directory (run): /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -50,8 +50,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggttggg_x1_fortran > /tmp/valassia/output_ggttggg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/avalassi/output_ggttggg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/valassia/input_ggttggg_x1_fortran > /tmp/v [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.169e-06 [1.1693100945435806E-006] fbridge_mode=0 [UNWEIGHT] Wrote 1 events (found 166 events) - [COUNTERS] PROGRAM TOTAL : 72.8549s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3442s - [COUNTERS] Fortran MEs ( 1 ) : 72.5108s for 8192 events => throughput is 1.13E+02 events/s + [COUNTERS] PROGRAM TOTAL : 101.8466s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4779s + [COUNTERS] Fortran MEs ( 1 ) : 101.3687s for 8192 events => throughput is 8.08E+01 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -75,8 +75,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggttggg_x1_fortran > /tmp/valassia/output_ggttggg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/avalassi/output_ggttggg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/valassia/input_ggttggg_x1_fortran > /tmp/v [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.169e-06 [1.1693100945435806E-006] fbridge_mode=0 [UNWEIGHT] Wrote 15 events (found 163 events) - [COUNTERS] PROGRAM TOTAL : 121.1698s - [COUNTERS] Fortran Overhead ( 0 ) : 48.6282s - [COUNTERS] Fortran MEs ( 1 ) : 72.5416s for 8192 events => throughput is 1.13E+02 events/s + [COUNTERS] PROGRAM TOTAL : 101.7818s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4752s + [COUNTERS] Fortran MEs ( 1 ) : 101.3066s for 8192 events => throughput is 8.09E+01 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -100,8 +100,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! 
Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggttggg_x10_fortran > /tmp/valassia/output_ggttggg_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x10_fortran > /tmp/avalassi/output_ggttggg_x10_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/valassia/input_ggttggg_x10_fortran > /tmp/ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.136e-07 [2.1358436158813979E-007] fbridge_mode=0 [UNWEIGHT] Wrote 84 events (found 808 events) - [COUNTERS] PROGRAM TOTAL : 810.5414s - [COUNTERS] Fortran Overhead ( 0 ) : 13.4189s - [COUNTERS] Fortran MEs ( 1 ) : 797.1225s for 90112 events => throughput is 1.13E+02 events/s + [COUNTERS] PROGRAM TOTAL : 1118.6550s + [COUNTERS] Fortran Overhead ( 0 ) : 4.3767s + [COUNTERS] Fortran MEs ( 1 ) : 1114.2783s for 90112 events => throughput is 8.09E+01 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -125,22 +125,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_x1_cudacpp > /tmp/valassia/output_ggttggg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.169e-06 [1.1694768412243468E-006] fbridge_mode=1 + [XSECTION] Cross section = 1.169e-06 [1.1694768374083672E-006] fbridge_mode=1 [UNWEIGHT] Wrote 15 events (found 163 events) - [COUNTERS] PROGRAM TOTAL : 162.3306s - [COUNTERS] Fortran Overhead ( 0 ) : 74.4438s - [COUNTERS] CudaCpp MEs ( 2 ) : 87.8867s for 8192 events => throughput is 9.32E+01 events/s + [COUNTERS] PROGRAM TOTAL : 205.5797s + [COUNTERS] Fortran Overhead ( 0 ) : 95.6059s + [COUNTERS] CudaCpp MEs ( 2 ) : 109.9738s for 8192 events => throughput is 7.45E+01 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.1693100945435806E-006) and cpp (1.1694768412243468E-006) differ by less than 4E-4 (0.00014260261802601093) +OK! xsec from fortran (1.1693100945435806E-006) and cpp (1.1694768374083672E-006) differ by less than 4E-4 (0.00014259935458071915) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -158,36 +158,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_x10_cudacpp > /tmp/valassia/output_ggttggg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.136e-07 [2.1361436028353404E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.136e-07 [2.1361435710758843E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 808 events) - [COUNTERS] PROGRAM TOTAL : 1056.0720s - [COUNTERS] Fortran Overhead ( 0 ) : 90.0693s - [COUNTERS] CudaCpp MEs ( 2 ) : 966.0027s for 90112 events => throughput is 9.33E+01 events/s + [COUNTERS] PROGRAM TOTAL : 1305.3326s + [COUNTERS] Fortran Overhead ( 0 ) : 98.5377s + [COUNTERS] CudaCpp MEs ( 2 ) : 1206.7949s for 90112 events => throughput is 7.47E+01 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.1358436158813979E-007) and cpp (2.1361436028353404E-007) differ by less than 4E-4 (0.0001404536136035972) +OK! xsec from fortran (2.1358436158813979E-007) and cpp (2.1361435710758843E-007) differ by less than 4E-4 (0.0001404387438554977) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.111130e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.692219e+01 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.111642e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.699275e+01 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -201,22 +201,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_x1_cudacpp > /tmp/valassia/output_ggttggg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.169e-06 [1.1694767325083535E-006] fbridge_mode=1 + [XSECTION] Cross section = 1.169e-06 [1.1694765360831655E-006] fbridge_mode=1 [UNWEIGHT] Wrote 15 events (found 163 events) - [COUNTERS] PROGRAM TOTAL : 39.5912s - [COUNTERS] Fortran Overhead ( 0 ) : 18.2165s - [COUNTERS] CudaCpp MEs ( 2 ) : 21.3747s for 8192 events => throughput is 3.83E+02 events/s + [COUNTERS] PROGRAM TOTAL : 52.0497s + [COUNTERS] Fortran Overhead ( 0 ) : 24.6638s + [COUNTERS] CudaCpp MEs ( 2 ) : 27.3859s for 8192 events => throughput is 2.99E+02 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.1693100945435806E-006) and cpp (1.1694767325083535E-006) differ by less than 4E-4 (0.00014250964355011497) +OK! xsec from fortran (1.1693100945435806E-006) and cpp (1.1694765360831655E-006) differ by less than 4E-4 (0.00014234165972015766) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -234,36 +234,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_x10_cudacpp > /tmp/valassia/output_ggttggg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.136e-07 [2.1361431788761647E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.136e-07 [2.1361429212586563E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 808 events) - [COUNTERS] PROGRAM TOTAL : 292.9265s - [COUNTERS] Fortran Overhead ( 0 ) : 55.9222s - [COUNTERS] CudaCpp MEs ( 2 ) : 237.0042s for 90112 events => throughput is 3.80E+02 events/s + [COUNTERS] PROGRAM TOTAL : 336.4854s + [COUNTERS] Fortran Overhead ( 0 ) : 29.3396s + [COUNTERS] CudaCpp MEs ( 2 ) : 307.1459s for 90112 events => throughput is 2.93E+02 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.1358436158813979E-007) and cpp (2.1361431788761647E-007) differ by less than 4E-4 (0.00014025511631077237) +OK! xsec from fortran (2.1358436158813979E-007) and cpp (2.1361429212586563E-007) differ by less than 4E-4 (0.00014013450003202976) *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.620389e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.371429e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.592869e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.391230e+02 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -277,22 +277,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_x1_cudacpp > /tmp/valassia/output_ggttggg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.169e-06 [1.1694766288507467E-006] fbridge_mode=1 + [XSECTION] Cross section = 1.169e-06 [1.1694764906356561E-006] fbridge_mode=1 [UNWEIGHT] Wrote 15 events (found 163 events) - [COUNTERS] PROGRAM TOTAL : 23.5101s - [COUNTERS] Fortran Overhead ( 0 ) : 13.7778s - [COUNTERS] CudaCpp MEs ( 2 ) : 9.7323s for 8192 events => throughput is 8.42E+02 events/s + [COUNTERS] PROGRAM TOTAL : 27.0721s + [COUNTERS] Fortran Overhead ( 0 ) : 12.6637s + [COUNTERS] CudaCpp MEs ( 2 ) : 14.4085s for 8192 events => throughput is 5.69E+02 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.1693100945435806E-006) and cpp (1.1694766288507467E-006) differ by less than 4E-4 (0.00014242099503225525) +OK! xsec from fortran (1.1693100945435806E-006) and cpp (1.1694764906356561E-006) differ by less than 4E-4 (0.0001423027927767162) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -310,40 +310,188 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_x10_cudacpp > /tmp/valassia/output_ggttggg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.136e-07 [2.1361431260588202E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.136e-07 [2.1361429111797059E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 808 events) - [COUNTERS] PROGRAM TOTAL : 116.7734s - [COUNTERS] Fortran Overhead ( 0 ) : 10.7208s - [COUNTERS] CudaCpp MEs ( 2 ) : 106.0525s for 90112 events => throughput is 8.50E+02 events/s + [COUNTERS] PROGRAM TOTAL : 174.3265s + [COUNTERS] Fortran Overhead ( 0 ) : 16.5080s + [COUNTERS] CudaCpp MEs ( 2 ) : 157.8185s for 90112 events => throughput is 5.71E+02 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.1358436158813979E-007) and cpp (2.1361431260588202E-007) differ by less than 4E-4 (0.00014023038727883907) +OK! xsec from fortran (2.1358436158813979E-007) and cpp (2.1361429111797059E-007) differ by less than 4E-4 (0.00014012978107680318) *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.049493e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.735262e+02 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.739370e+02 ) sec^-1 + +*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 1.169e-06 [1.1694764906356561E-006] fbridge_mode=1 + [UNWEIGHT] Wrote 15 events (found 163 events) + [COUNTERS] PROGRAM TOTAL : 23.8611s + [COUNTERS] Fortran Overhead ( 0 ) : 11.1738s + [COUNTERS] CudaCpp MEs ( 2 ) : 12.6873s for 8192 events => throughput is 6.46E+02 events/s + +*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (1.1693100945435806E-006) and cpp (1.1694764906356561E-006) differ by less than 4E-4 (0.0001423027927767162) + +*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 2.136e-07 [2.1361429111797059E-007] fbridge_mode=1 + [UNWEIGHT] Wrote 84 events (found 808 events) + [COUNTERS] PROGRAM TOTAL : 154.4819s + [COUNTERS] Fortran Overhead ( 0 ) : 15.1638s + [COUNTERS] CudaCpp MEs ( 2 ) : 139.3180s for 90112 events => throughput is 6.47E+02 events/s + +*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (2.1358436158813979E-007) and cpp (2.1361429111797059E-007) differ by less than 4E-4 (0.00014012978107680318) + +*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.672552e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.056012e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.678478e+02 ) sec^-1 + +*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 1.169e-06 [1.1694768276769753E-006] fbridge_mode=1 + [UNWEIGHT] Wrote 15 events (found 163 events) + [COUNTERS] PROGRAM TOTAL : 25.4092s + [COUNTERS] Fortran Overhead ( 0 ) : 12.6834s + [COUNTERS] CudaCpp MEs ( 2 ) : 12.7257s for 8192 events => throughput is 6.44E+02 events/s -*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) *** +*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) *** +OK! xsec from fortran (1.1693100945435806E-006) and cpp (1.1694768276769753E-006) differ by less than 4E-4 (0.00014259103224434355) + +*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 2.136e-07 [2.1361435948756818E-007] fbridge_mode=1 + [UNWEIGHT] Wrote 84 events (found 808 events) + [COUNTERS] PROGRAM TOTAL : 156.3015s + [COUNTERS] Fortran Overhead ( 0 ) : 16.4754s + [COUNTERS] CudaCpp MEs ( 2 ) : 139.8261s for 90112 events => throughput is 6.44E+02 events/s + +*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (2.1358436158813979E-007) and cpp (2.1361435948756818E-007) differ by less than 4E-4 (0.00014044988689865257) + +*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.776081e+02 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.750726e+02 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -357,22 +505,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggttggg_x1_cudacpp > /tmp/valassia/output_ggttggg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.169e-06 [1.1694768512039880E-006] fbridge_mode=1 + [XSECTION] Cross section = 1.169e-06 [1.1694770708194997E-006] fbridge_mode=1 [UNWEIGHT] Wrote 15 events (found 163 events) - [COUNTERS] PROGRAM TOTAL : 6.0902s - [COUNTERS] Fortran Overhead ( 0 ) : 4.2956s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.7947s for 8192 events => throughput is 4.56E+03 events/s + [COUNTERS] PROGRAM TOTAL : 2.5003s + [COUNTERS] Fortran Overhead ( 0 ) : 2.0019s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.4984s for 8192 events => throughput is 1.64E+04 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.1693100945435806E-006) and cpp (1.1694768512039880E-006) differ by less than 4E-4 (0.00014261115266633873) +OK! xsec from fortran (1.1693100945435806E-006) and cpp (1.1694770708194997E-006) differ by less than 4E-4 (0.00014279896898039546) *** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -390,65 +538,65 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! 
Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggttggg_x10_cudacpp > /tmp/valassia/output_ggttggg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.136e-07 [2.1361438292717214E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.136e-07 [2.1361443477565656E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 808 events) - [COUNTERS] PROGRAM TOTAL : 26.7087s - [COUNTERS] Fortran Overhead ( 0 ) : 6.8261s - [COUNTERS] CudaCpp MEs ( 2 ) : 19.8826s for 90112 events => throughput is 4.53E+03 events/s + [COUNTERS] PROGRAM TOTAL : 11.2881s + [COUNTERS] Fortran Overhead ( 0 ) : 5.8695s + [COUNTERS] CudaCpp MEs ( 2 ) : 5.4186s for 90112 events => throughput is 1.66E+04 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.1358436158813979E-007) and cpp (2.1361438292717214E-007) differ by less than 4E-4 (0.00014055963090697787) +OK! xsec from fortran (2.1358436158813979E-007) and cpp (2.1361443477565656E-007) differ by less than 4E-4 (0.00014080238503022535) *** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.539126e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.635547e+04 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.545967e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.633264e+04 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 512 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.382515e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.309560e+04 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 512 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.535405e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 
2.405493e+04 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 128 128 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.397695e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.341562e+04 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 128 128 1 *** -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.072776e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.341458e+04 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.409435e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.336833e+04 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 *** -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.095250e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.413620e+03 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt index a5bfae67ec..b9faa14c51 100644 --- a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg CUDACPP_BUILDDIR='.' 
-make USEBUILDDIR=1 AVX=none + +make USEBUILDDIR=1 AVX=none make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=avx2 - make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' -CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' OMP_NUM_THREADS= -DATE: 2024-01-28_16:27:50 +DATE: 2024-01-30_09:07:55 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: -Working directory (run): /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -50,8 +50,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggttggg_x1_fortran > /tmp/valassia/output_ggttggg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/avalassi/output_ggttggg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/valassia/input_ggttggg_x1_fortran > /tmp/v [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.169e-06 [1.1693100945435806E-006] fbridge_mode=0 [UNWEIGHT] Wrote 1 events (found 166 events) - [COUNTERS] PROGRAM TOTAL : 73.1885s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3462s - [COUNTERS] Fortran MEs ( 1 ) : 72.8423s for 8192 events => throughput is 1.12E+02 events/s + [COUNTERS] PROGRAM TOTAL : 101.9697s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4771s + [COUNTERS] Fortran MEs ( 1 ) : 101.4926s for 8192 events => throughput is 8.07E+01 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -75,8 +75,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggttggg_x1_fortran > /tmp/valassia/output_ggttggg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/avalassi/output_ggttggg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/valassia/input_ggttggg_x1_fortran > /tmp/v [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.169e-06 [1.1693100945435806E-006] fbridge_mode=0 [UNWEIGHT] Wrote 15 events (found 163 events) - [COUNTERS] PROGRAM TOTAL : 74.1853s - [COUNTERS] Fortran Overhead ( 0 ) : 1.6252s - [COUNTERS] Fortran MEs ( 1 ) : 72.5601s for 8192 events => throughput is 1.13E+02 events/s + [COUNTERS] PROGRAM TOTAL : 101.6914s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4764s + [COUNTERS] Fortran MEs ( 1 ) : 101.2150s for 8192 events => throughput is 8.09E+01 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -100,8 +100,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! 
Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggttggg_x10_fortran > /tmp/valassia/output_ggttggg_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x10_fortran > /tmp/avalassi/output_ggttggg_x10_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/valassia/input_ggttggg_x10_fortran > /tmp/ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.136e-07 [2.1358436158813979E-007] fbridge_mode=0 [UNWEIGHT] Wrote 84 events (found 808 events) - [COUNTERS] PROGRAM TOTAL : 802.1686s - [COUNTERS] Fortran Overhead ( 0 ) : 2.9649s - [COUNTERS] Fortran MEs ( 1 ) : 799.2037s for 90112 events => throughput is 1.13E+02 events/s + [COUNTERS] PROGRAM TOTAL : 1118.2550s + [COUNTERS] Fortran Overhead ( 0 ) : 4.3831s + [COUNTERS] Fortran MEs ( 1 ) : 1113.8719s for 90112 events => throughput is 8.09E+01 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -125,8 +125,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_x1_cudacpp > /tmp/valassia/output_ggttggg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -134,9 +134,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.169e-06 [1.1693101016896844E-006] fbridge_mode=1 [UNWEIGHT] Wrote 15 events (found 163 events) - [COUNTERS] PROGRAM TOTAL : 220.0110s - [COUNTERS] Fortran Overhead ( 0 ) : 83.6850s - [COUNTERS] CudaCpp MEs ( 2 ) : 136.3260s for 8192 events => throughput is 6.01E+01 events/s + [COUNTERS] PROGRAM TOTAL : 224.2502s + [COUNTERS] Fortran Overhead ( 0 ) : 103.5045s + [COUNTERS] CudaCpp MEs ( 2 ) : 120.7457s for 8192 events => throughput is 6.78E+01 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -158,8 +158,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_x10_cudacpp > /tmp/valassia/output_ggttggg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -167,9 +167,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.136e-07 [2.1358436275882778E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 808 events) - [COUNTERS] PROGRAM TOTAL : 1142.6014s - [COUNTERS] Fortran Overhead ( 0 ) : 94.5505s - [COUNTERS] CudaCpp MEs ( 2 ) : 1048.0509s for 90112 events => throughput is 8.60E+01 events/s + [COUNTERS] PROGRAM TOTAL : 1439.9019s + [COUNTERS] Fortran Overhead ( 0 ) : 107.2148s + [COUNTERS] CudaCpp MEs ( 2 ) : 1332.6870s for 90112 events => throughput is 6.76E+01 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -180,14 +180,14 @@ OK! xsec from fortran (2.1358436158813979E-007) and cpp (2.1358436275882778E-007 OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.027921e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.977492e+01 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.024344e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.962894e+01 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -201,8 +201,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_x1_cudacpp > /tmp/valassia/output_ggttggg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -210,9 +210,9 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.169e-06 [1.1693101020910778E-006] fbridge_mode=1 [UNWEIGHT] Wrote 15 events (found 163 events) - [COUNTERS] PROGRAM TOTAL : 78.7418s - [COUNTERS] Fortran Overhead ( 0 ) : 35.2581s - [COUNTERS] CudaCpp MEs ( 2 ) : 43.4836s for 8192 events => throughput is 1.88E+02 events/s + [COUNTERS] PROGRAM TOTAL : 114.2875s + [COUNTERS] Fortran Overhead ( 0 ) : 54.1504s + [COUNTERS] CudaCpp MEs ( 2 ) : 60.1370s for 8192 events => throughput is 1.36E+02 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -234,8 +234,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_x10_cudacpp > /tmp/valassia/output_ggttggg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -243,9 +243,9 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.136e-07 [2.1358436284111587E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 808 events) - [COUNTERS] PROGRAM TOTAL : 521.0989s - [COUNTERS] Fortran Overhead ( 0 ) : 41.7682s - [COUNTERS] CudaCpp MEs ( 2 ) : 479.3307s for 90112 events => throughput is 1.88E+02 events/s + [COUNTERS] PROGRAM TOTAL : 713.5498s + [COUNTERS] Fortran Overhead ( 0 ) : 58.1042s + [COUNTERS] CudaCpp MEs ( 2 ) : 655.4456s for 90112 events => throughput is 1.37E+02 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -256,14 +256,14 @@ OK! xsec from fortran (2.1358436158813979E-007) and cpp (2.1358436284111587E-007 OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.353681e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.524488e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.355520e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.529646e+02 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -277,8 +277,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_x1_cudacpp > /tmp/valassia/output_ggttggg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -286,9 +286,9 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.169e-06 [1.1693101021831069E-006] fbridge_mode=1 [UNWEIGHT] Wrote 15 events (found 163 events) - [COUNTERS] PROGRAM TOTAL : 34.2475s - [COUNTERS] Fortran Overhead ( 0 ) : 15.3222s - [COUNTERS] CudaCpp MEs ( 2 ) : 18.9253s for 8192 events => throughput is 4.33E+02 events/s + [COUNTERS] PROGRAM TOTAL : 50.7719s + [COUNTERS] Fortran Overhead ( 0 ) : 23.4236s + [COUNTERS] CudaCpp MEs ( 2 ) : 27.3483s for 8192 events => throughput is 3.00E+02 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -310,8 +310,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_x10_cudacpp > /tmp/valassia/output_ggttggg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -319,9 +319,9 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.136e-07 [2.1358436281462147E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 808 events) - [COUNTERS] PROGRAM TOTAL : 240.9335s - [COUNTERS] Fortran Overhead ( 0 ) : 34.1423s - [COUNTERS] CudaCpp MEs ( 2 ) : 206.7912s for 90112 events => throughput is 4.36E+02 events/s + [COUNTERS] PROGRAM TOTAL : 326.3386s + [COUNTERS] Fortran Overhead ( 0 ) : 27.1544s + [COUNTERS] CudaCpp MEs ( 2 ) : 299.1842s for 90112 events => throughput is 3.01E+02 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -332,18 +332,166 @@ OK! xsec from fortran (2.1358436158813979E-007) and cpp (2.1358436281462147E-007 OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.593063e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.567666e+02 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.597479e+02 ) sec^-1 + +*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 1.169e-06 [1.1693101021831069E-006] fbridge_mode=1 + [UNWEIGHT] Wrote 15 events (found 163 events) + [COUNTERS] PROGRAM TOTAL : 45.3921s + [COUNTERS] Fortran Overhead ( 0 ) : 20.5216s + [COUNTERS] CudaCpp MEs ( 2 ) : 24.8705s for 8192 events => throughput is 3.29E+02 events/s + +*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! 
xsec from fortran (1.1693100945435806E-006) and cpp (1.1693101021831069E-006) differ by less than 2E-4 (6.533362073568583e-09) + +*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 2.136e-07 [2.1358436281462147E-007] fbridge_mode=1 + [UNWEIGHT] Wrote 84 events (found 808 events) + [COUNTERS] PROGRAM TOTAL : 301.9509s + [COUNTERS] Fortran Overhead ( 0 ) : 24.4062s + [COUNTERS] CudaCpp MEs ( 2 ) : 277.5446s for 90112 events => throughput is 3.25E+02 events/s + +*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (2.1358436158813979E-007) and cpp (2.1358436281462147E-007) differ by less than 2E-4 (5.7423759081132175e-09) + +*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.101718e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.582873e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.081457e+02 ) sec^-1 + +*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 1.169e-06 [1.1693101021831069E-006] fbridge_mode=1 + [UNWEIGHT] Wrote 15 events (found 163 events) + [COUNTERS] PROGRAM TOTAL : 48.9503s + [COUNTERS] Fortran Overhead ( 0 ) : 23.9070s + [COUNTERS] CudaCpp MEs ( 2 ) : 25.0433s for 8192 events => throughput is 3.27E+02 events/s + +*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (1.1693100945435806E-006) and cpp (1.1693101021831069E-006) differ by less than 2E-4 (6.533362073568583e-09) + +*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** -*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) *** +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical -*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) *** +*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 2.136e-07 [2.1358436281462147E-007] fbridge_mode=1 + [UNWEIGHT] Wrote 84 events (found 808 events) + [COUNTERS] PROGRAM TOTAL : 301.8849s + [COUNTERS] Fortran Overhead ( 0 ) : 27.8392s + [COUNTERS] CudaCpp MEs ( 2 ) : 274.0457s for 90112 events => throughput is 3.29E+02 events/s + +*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (2.1358436158813979E-007) and cpp (2.1358436281462147E-007) differ by less than 2E-4 (5.7423759081132175e-09) + +*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.501339e+02 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.509355e+02 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -357,22 +505,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggttggg_x1_cudacpp > /tmp/valassia/output_ggttggg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.169e-06 [1.1693100942770682E-006] fbridge_mode=1 + [XSECTION] Cross section = 1.169e-06 [1.1693100942770687E-006] fbridge_mode=1 [UNWEIGHT] Wrote 15 events (found 163 events) - [COUNTERS] PROGRAM TOTAL : 119.9683s - [COUNTERS] Fortran Overhead ( 0 ) : 115.7986s - [COUNTERS] CudaCpp MEs ( 2 ) : 4.1696s for 8192 events => throughput is 1.96E+03 events/s + [COUNTERS] PROGRAM TOTAL : 3.5767s + [COUNTERS] Fortran Overhead ( 0 ) : 2.7138s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.8630s for 8192 events => throughput is 9.49E+03 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.1693100945435806E-006) and cpp (1.1693100942770682E-006) differ by less than 2E-4 (2.279226807289092e-10) +OK! xsec from fortran (1.1693100945435806E-006) and cpp (1.1693100942770687E-006) differ by less than 2E-4 (2.279223476620018e-10) *** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -390,65 +538,65 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggttggg_x10_cudacpp > /tmp/valassia/output_ggttggg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.136e-07 [2.1358436157495363E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.136e-07 [2.1358436157495368E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 808 events) - [COUNTERS] PROGRAM TOTAL : 56.4992s - [COUNTERS] Fortran Overhead ( 0 ) : 10.4671s - [COUNTERS] CudaCpp MEs ( 2 ) : 46.0321s for 90112 events => throughput is 1.96E+03 events/s + [COUNTERS] PROGRAM TOTAL : 16.0811s + [COUNTERS] Fortran Overhead ( 0 ) : 6.5876s + [COUNTERS] CudaCpp MEs ( 2 ) : 9.4935s for 90112 events => throughput is 9.49E+03 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.1358436158813979E-007) and cpp (2.1358436157495363E-007) differ by less than 2E-4 (6.173750399796063e-11) +OK! xsec from fortran (2.1358436158813979E-007) and cpp (2.1358436157495368E-007) differ by less than 2E-4 (6.173717093105324e-11) *** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.983806e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.427839e+03 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.003315e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.087147e+04 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 512 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.341441e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.109838e+04 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 512 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.390860e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.157967e+04 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 128 128 1 --bridge *** -Process = 
SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.328426e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.106789e+04 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 128 128 1 *** -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.240330e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.114403e+04 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.338967e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.111816e+04 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 *** -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.083611e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.650481e+03 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt index 7119205a5c..1fb13570ed 100644 --- a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu +Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu CUDACPP_BUILDDIR='.' 
-make USEBUILDDIR=1 AVX=none +make USEBUILDDIR=1 AVX=none make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make USEBUILDDIR=1 AVX=avx2 +make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' OMP_NUM_THREADS= -DATE: 2024-01-28_14:28:43 +DATE: 2024-01-30_06:26:25 -On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: -Working directory (run): /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -50,8 +50,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_gqttq_x1_fortran > /tmp/valassia/output_gqttq_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/avalassi/output_gqttq_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/valassia/input_gqttq_x1_fortran > /tmp/val [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050333309703716] fbridge_mode=0 [UNWEIGHT] Wrote 78 events (found 561 events) - [COUNTERS] PROGRAM TOTAL : 0.4291s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3683s - [COUNTERS] Fortran MEs ( 1 ) : 0.0608s for 8192 events => throughput is 1.35E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3322s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2581s + [COUNTERS] Fortran MEs ( 1 ) : 0.0741s for 8192 events => throughput is 1.11E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -75,8 +75,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_gqttq_x1_fortran > /tmp/valassia/output_gqttq_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/avalassi/output_gqttq_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/valassia/input_gqttq_x1_fortran > /tmp/val [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050333309703716] fbridge_mode=0 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.2600s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1993s - [COUNTERS] Fortran MEs ( 1 ) : 0.0608s for 8192 events => throughput is 1.35E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3239s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2500s + [COUNTERS] Fortran MEs ( 1 ) : 0.0739s for 8192 events => throughput is 1.11E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -100,8 +100,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! 
Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_gqttq_x10_fortran > /tmp/valassia/output_gqttq_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x10_fortran > /tmp/avalassi/output_gqttq_x10_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/valassia/input_gqttq_x10_fortran > /tmp/va [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.218 [0.21801182648615872] fbridge_mode=0 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 1.8397s - [COUNTERS] Fortran Overhead ( 0 ) : 1.1741s - [COUNTERS] Fortran MEs ( 1 ) : 0.6656s for 90112 events => throughput is 1.35E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.4169s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6004s + [COUNTERS] Fortran MEs ( 1 ) : 0.8166s for 90112 events => throughput is 1.10E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -125,8 +125,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x1_cudacpp > /tmp/valassia/output_gqttq_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -134,9 +134,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050333309703710] fbridge_mode=1 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.3408s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2711s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0697s for 8192 events => throughput is 1.18E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4205s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3380s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0825s for 8192 events => throughput is 9.92E+04 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -158,8 +158,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x10_cudacpp > /tmp/valassia/output_gqttq_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -167,9 +167,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.218 [0.21801182648615872] fbridge_mode=1 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 2.0097s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2428s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.7669s for 90112 events => throughput is 1.18E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.6157s + [COUNTERS] Fortran Overhead ( 0 ) : 1.7094s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.9063s for 90112 events => throughput is 9.94E+04 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -180,14 +180,14 @@ OK! xsec from fortran (0.21801182648615872) and cpp (0.21801182648615872) differ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.199596e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.011343e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.199561e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.009603e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -201,8 +201,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x1_cudacpp > /tmp/valassia/output_gqttq_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -210,9 +210,9 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050333309703727] fbridge_mode=1 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.2699s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2365s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0334s for 8192 events => throughput is 2.45E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3388s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2965s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0423s for 8192 events => throughput is 1.94E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -234,8 +234,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x10_cudacpp > /tmp/valassia/output_gqttq_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -243,9 +243,9 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.218 [0.21801182648615872] fbridge_mode=1 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 1.5755s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2065s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3690s for 90112 events => throughput is 2.44E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.1314s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6642s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.4672s for 90112 events => throughput is 1.93E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -256,14 +256,14 @@ OK! xsec from fortran (0.21801182648615872) and cpp (0.21801182648615872) differ OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.473704e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.951695e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.482626e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.960178e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -277,8 +277,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x1_cudacpp > /tmp/valassia/output_gqttq_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -286,9 +286,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050333309703727] fbridge_mode=1 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.2371s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2198s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0173s for 8192 events => throughput is 4.74E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3038s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2788s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0250s for 8192 events => throughput is 3.28E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -310,8 +310,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x10_cudacpp > /tmp/valassia/output_gqttq_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -319,9 +319,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.218 [0.21801182648615869] fbridge_mode=1 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 1.3894s - [COUNTERS] Fortran Overhead ( 0 ) : 1.1994s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1900s for 90112 events => throughput is 4.74E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.9222s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6468s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2754s for 90112 events => throughput is 3.27E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -332,18 +332,166 @@ OK! xsec from fortran (0.21801182648615872) and cpp (0.21801182648615869) differ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.838763e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.305728e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.851433e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.308522e+05 ) sec^-1 + +*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/32 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.2605 [0.26050333309703727] fbridge_mode=1 + [UNWEIGHT] Wrote 81 events (found 540 events) + [COUNTERS] PROGRAM TOTAL : 0.2978s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2759s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0219s for 8192 events => throughput is 3.74E+05 events/s + +*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! 
xsec from fortran (0.26050333309703716) and cpp (0.26050333309703727) differ by less than 2E-14 (4.440892098500626e-16) + +*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/32 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.218 [0.21801182648615869] fbridge_mode=1 + [UNWEIGHT] Wrote 853 events (found 1849 events) + [COUNTERS] PROGRAM TOTAL : 1.8887s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6471s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2415s for 90112 events => throughput is 3.73E+05 events/s + +*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (0.21801182648615872) and cpp (0.21801182648615869) differ by less than 2E-14 (1.1102230246251565e-16) -*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) *** +*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** -*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) *** +OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.792718e+05 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.861917e+05 ) sec^-1 + +*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/32 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.2605 [0.26050333309703727] fbridge_mode=1 + [UNWEIGHT] Wrote 81 events (found 540 events) + [COUNTERS] PROGRAM TOTAL : 0.3207s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2877s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0330s for 8192 events => throughput is 2.48E+05 events/s + +*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (0.26050333309703716) and cpp (0.26050333309703727) differ by less than 2E-14 (4.440892098500626e-16) + +*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/32 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.218 [0.21801182648615869] fbridge_mode=1 + [UNWEIGHT] Wrote 853 events (found 1849 events) + [COUNTERS] PROGRAM TOTAL : 2.0189s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6567s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3622s for 90112 events => throughput is 2.49E+05 events/s + +*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (0.21801182648615872) and cpp (0.21801182648615869) differ by less than 2E-14 (1.1102230246251565e-16) + +*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.535117e+05 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.517520e+05 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -357,15 +505,98 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/valassia/input_gqttq_x1_cudacpp > /tmp/valassia/output_gqttq_x1_cudacpp' -ERROR! ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/valassia/input_gqttq_x1_cudacpp > /tmp/valassia/output_gqttq_x1_cudacpp' failed - PDF set = nn23lo1 - alpha_s(Mz)= 0.1300 running at 2 loops. - alpha_s(Mz)= 0.1300 running at 2 loops. - Renormalization scale set on event-by-event basis - Factorization scale set on event-by-event basis +Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/32 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.2605 [0.26050333309703733] fbridge_mode=1 + [UNWEIGHT] Wrote 81 events (found 540 events) + [COUNTERS] PROGRAM TOTAL : 0.6879s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6872s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.21E+07 events/s + +*** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (0.26050333309703716) and cpp (0.26050333309703733) differ by less than 2E-14 (6.661338147750939e-16) + +*** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical + +*** (3) EXECUTE MADEVENT_CUDA x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/32 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.218 [0.21801182648615869] fbridge_mode=1 + [UNWEIGHT] Wrote 853 events (found 1849 events) + [COUNTERS] PROGRAM TOTAL : 2.0638s + [COUNTERS] Fortran Overhead ( 0 ) : 2.0555s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0083s for 90112 events => throughput is 1.08E+07 events/s + +*** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (0.21801182648615872) and cpp (0.21801182648615869) differ by less than 2E-14 (1.1102230246251565e-16) + +*** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical + +*** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.535376e+07 ) sec^-1 + +*** EXECUTE GCHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.131781e+07 ) sec^-1 + +*** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.380880e+07 ) sec^-1 + +*** EXECUTE GCHECK(MAX) -p 16384 32 1 *** +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.511409e+07 ) sec^-1 + +*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.374806e+07 ) sec^-1 + +*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.787335e+07 ) sec^-1 + +*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.380768e+07 ) sec^-1 +*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.782273e+07 ) sec^-1 - getting user params -Enter number of events and max and min iterations: - Number of events and iterations 8192 1 1 +TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt index 828e2e75d4..4985f151b2 100644 --- a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu +Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu CUDACPP_BUILDDIR='.' 
-make USEBUILDDIR=1 AVX=none
+make USEBUILDDIR=1 AVX=none
 make USEBUILDDIR=1 AVX=sse4
-make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu'
-make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu'
-make USEBUILDDIR=1 AVX=avx2
+make USEBUILDDIR=1 AVX=avx2
 make USEBUILDDIR=1 AVX=512y
-make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu'
-make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu'
 make USEBUILDDIR=1 AVX=512z
-make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu'
-CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0'
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu'
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu'
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu'
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu'
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu'
 CUDACPP_BUILDDIR='build.none_f_inl0_hrd0'
 CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0'
 CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0'
-CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu'
+make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu'
+CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0'
+CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu'
+make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu'
+make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu'
+make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu'
+make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu'
 
 OMP_NUM_THREADS=
 
-DATE: 2024-01-28_14:29:09
+DATE: 2024-01-30_06:26:56
 
-On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]:
-Working directory (run): /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu
+On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
+Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu
 
 *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) ***
 --------------------
@@ -50,8 +50,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
 0 ! Helicity Sum/event 0=exact
 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
 --------------------
-Executing ' ./madevent_fortran < /tmp/valassia/input_gqttq_x1_fortran > /tmp/valassia/output_gqttq_x1_fortran'
- [OPENMPTH] omp_get_max_threads/nproc = 1/128
+Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/avalassi/output_gqttq_x1_fortran'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/4
  [NGOODHEL] ngoodhel/ncomb = 16/32
  [XSECTION] VECSIZE_USED = 8192
  [XSECTION] MultiChannel = TRUE
@@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/valassia/input_gqttq_x1_fortran > /tmp/val
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 0.2605 [0.26050333309703716] fbridge_mode=0
  [UNWEIGHT] Wrote 78 events (found 561 events)
- [COUNTERS] PROGRAM TOTAL : 0.2648s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.2041s
- [COUNTERS] Fortran MEs ( 1 ) : 0.0607s for 8192 events => throughput is 1.35E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.3277s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.2529s
+ [COUNTERS] Fortran MEs ( 1 ) : 0.0748s for 8192 events => throughput is 1.10E+05 events/s
 
 *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) ***
 --------------------
@@ -75,8 +75,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
 0 ! Helicity Sum/event 0=exact
 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
 --------------------
-Executing ' ./madevent_fortran < /tmp/valassia/input_gqttq_x1_fortran > /tmp/valassia/output_gqttq_x1_fortran'
- [OPENMPTH] omp_get_max_threads/nproc = 1/128
+Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/avalassi/output_gqttq_x1_fortran'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/4
  [NGOODHEL] ngoodhel/ncomb = 16/32
  [XSECTION] VECSIZE_USED = 8192
  [XSECTION] MultiChannel = TRUE
@@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/valassia/input_gqttq_x1_fortran > /tmp/val
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 0.2605 [0.26050333309703716] fbridge_mode=0
  [UNWEIGHT] Wrote 81 events (found 540 events)
- [COUNTERS] PROGRAM TOTAL : 0.2760s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.2153s
- [COUNTERS] Fortran MEs ( 1 ) : 0.0608s for 8192 events => throughput is 1.35E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.3225s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.2485s
+ [COUNTERS] Fortran MEs ( 1 ) : 0.0740s for 8192 events => throughput is 1.11E+05 events/s
 
 *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) ***
 --------------------
@@ -100,8 +100,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
 0 ! Helicity Sum/event 0=exact
 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
 --------------------
-Executing ' ./madevent_fortran < /tmp/valassia/input_gqttq_x10_fortran > /tmp/valassia/output_gqttq_x10_fortran'
- [OPENMPTH] omp_get_max_threads/nproc = 1/128
+Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x10_fortran > /tmp/avalassi/output_gqttq_x10_fortran'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/4
  [NGOODHEL] ngoodhel/ncomb = 16/32
  [XSECTION] VECSIZE_USED = 8192
  [XSECTION] MultiChannel = TRUE
@@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/valassia/input_gqttq_x10_fortran > /tmp/va
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 0.218 [0.21801182648615872] fbridge_mode=0
  [UNWEIGHT] Wrote 853 events (found 1849 events)
- [COUNTERS] PROGRAM TOTAL : 1.8359s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.1714s
- [COUNTERS] Fortran MEs ( 1 ) : 0.6644s for 90112 events => throughput is 1.36E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 2.4071s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.5926s
+ [COUNTERS] Fortran MEs ( 1 ) : 0.8145s for 90112 events => throughput is 1.11E+05 events/s
 
 *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -125,22 +125,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
 0 ! Helicity Sum/event 0=exact
 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
 --------------------
-Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x1_cudacpp > /tmp/valassia/output_gqttq_x1_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/128
+Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/4
  [NGOODHEL] ngoodhel/ncomb = 16/32
  [XSECTION] VECSIZE_USED = 8192
  [XSECTION] MultiChannel = TRUE
  [XSECTION] Configuration = 1
  [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 0.2605 [0.26050315080224007] fbridge_mode=1
+ [XSECTION] Cross section = 0.2605 [0.26050314903825744] fbridge_mode=1
  [UNWEIGHT] Wrote 81 events (found 540 events)
- [COUNTERS] PROGRAM TOTAL : 0.3172s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.2605s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0567s for 8192 events => throughput is 1.44E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.4015s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.3277s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0738s for 8192 events => throughput is 1.11E+05 events/s
 
 *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
 
-OK! xsec from fortran (0.26050333309703716) and cpp (0.26050315080224007) differ by less than 4E-4 (6.997791349716564e-07)
+OK! xsec from fortran (0.26050333309703716) and cpp (0.26050314903825744) differ by less than 4E-4 (7.065505747139156e-07)
 
 *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
 
@@ -158,36 +158,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
 0 ! Helicity Sum/event 0=exact
 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
 --------------------
-Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x10_cudacpp > /tmp/valassia/output_gqttq_x10_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/128
+Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/4
  [NGOODHEL] ngoodhel/ncomb = 16/32
  [XSECTION] VECSIZE_USED = 8192
  [XSECTION] MultiChannel = TRUE
  [XSECTION] Configuration = 1
  [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 0.218 [0.21801182183053122] fbridge_mode=1
+ [XSECTION] Cross section = 0.218 [0.21801181770186087] fbridge_mode=1
  [UNWEIGHT] Wrote 853 events (found 1849 events)
- [COUNTERS] PROGRAM TOTAL : 1.8506s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.2301s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.6205s for 90112 events => throughput is 1.45E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 2.5139s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.7000s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.8139s for 90112 events => throughput is 1.11E+05 events/s
 
 *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
 
-OK! xsec from fortran (0.21801182648615872) and cpp (0.21801182183053122) differ by less than 4E-4 (2.135493093469165e-08)
+OK! xsec from fortran (0.21801182648615872) and cpp (0.21801181770186087) differ by less than 4E-4 (4.0292758352045155e-08)
 
 *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
 
 OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.481535e+05 ) sec^-1
+Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 1.131056e+05 ) sec^-1
 
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.484510e+05 ) sec^-1
+Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 1.131141e+05 ) sec^-1
 
 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -201,8 +201,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
 0 ! Helicity Sum/event 0=exact
 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
 --------------------
-Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x1_cudacpp > /tmp/valassia/output_gqttq_x1_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/128
+Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/4
  [NGOODHEL] ngoodhel/ncomb = 16/32
  [XSECTION] VECSIZE_USED = 8192
  [XSECTION] MultiChannel = TRUE
  [XSECTION] Configuration = 1
  [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 0.2605 [0.26050313441464579] fbridge_mode=1
+ [XSECTION] Cross section = 0.2605 [0.26050310835231938] fbridge_mode=1
  [UNWEIGHT] Wrote 81 events (found 540 events)
- [COUNTERS] PROGRAM TOTAL : 0.2412s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.2217s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0194s for 8192 events => throughput is 4.21E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.3069s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.2800s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0269s for 8192 events => throughput is 3.04E+05 events/s
 
 *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
 
-OK! xsec from fortran (0.26050333309703716) and cpp (0.26050313441464579) differ by less than 4E-4 (7.626865614618339e-07)
+OK! xsec from fortran (0.26050333309703716) and cpp (0.26050310835231938) differ by less than 4E-4 (8.627325996934943e-07)
 
 *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
 
@@ -234,36 +234,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
 0 ! Helicity Sum/event 0=exact
 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
 --------------------
-Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x10_cudacpp > /tmp/valassia/output_gqttq_x10_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/128
+Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/4
  [NGOODHEL] ngoodhel/ncomb = 16/32
  [XSECTION] VECSIZE_USED = 8192
  [XSECTION] MultiChannel = TRUE
  [XSECTION] Configuration = 1
  [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 0.218 [0.21801180175915433] fbridge_mode=1
+ [XSECTION] Cross section = 0.218 [0.21801177817838580] fbridge_mode=1
  [UNWEIGHT] Wrote 853 events (found 1849 events)
- [COUNTERS] PROGRAM TOTAL : 1.4109s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.1913s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.2196s for 90112 events => throughput is 4.10E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 1.9246s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.6468s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.2778s for 90112 events => throughput is 3.24E+05 events/s
 
 *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
 
-OK! xsec from fortran (0.21801182648615872) and cpp (0.21801180175915433) differ by less than 4E-4 (1.1342047256945875e-07)
+OK! xsec from fortran (0.21801182648615872) and cpp (0.21801177817838580) differ by less than 4E-4 (2.2158326773435988e-07)
 
 *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
 
 OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 4.159063e+05 ) sec^-1
+Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 3.299610e+05 ) sec^-1
 
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 4.116683e+05 ) sec^-1
+Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 3.290596e+05 ) sec^-1
 
 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -277,22 +277,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
 0 ! Helicity Sum/event 0=exact
 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
 --------------------
-Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x1_cudacpp > /tmp/valassia/output_gqttq_x1_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/128
+Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/4
  [NGOODHEL] ngoodhel/ncomb = 16/32
  [XSECTION] VECSIZE_USED = 8192
  [XSECTION] MultiChannel = TRUE
  [XSECTION] Configuration = 1
  [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 0.2605 [0.26050313305934997] fbridge_mode=1
+ [XSECTION] Cross section = 0.2605 [0.26050310803492405] fbridge_mode=1
  [UNWEIGHT] Wrote 81 events (found 540 events)
- [COUNTERS] PROGRAM TOTAL : 0.2207s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.2111s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0096s for 8192 events => throughput is 8.52E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.2803s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.2672s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0131s for 8192 events => throughput is 6.25E+05 events/s
 
 *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
 
-OK! xsec from fortran (0.26050333309703716) and cpp (0.26050313305934997) differ by less than 4E-4 (7.678891660312104e-07)
+OK! xsec from fortran (0.26050333309703716) and cpp (0.26050310803492405) differ by less than 4E-4 (8.639509921914978e-07)
 
 *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
 
@@ -310,40 +310,188 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
 0 ! Helicity Sum/event 0=exact
 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
 --------------------
-Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x10_cudacpp > /tmp/valassia/output_gqttq_x10_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/128
+Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/4
  [NGOODHEL] ngoodhel/ncomb = 16/32
  [XSECTION] VECSIZE_USED = 8192
  [XSECTION] MultiChannel = TRUE
  [XSECTION] Configuration = 1
  [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 0.218 [0.21801179852567595] fbridge_mode=1
+ [XSECTION] Cross section = 0.218 [0.21801177493542723] fbridge_mode=1
  [UNWEIGHT] Wrote 853 events (found 1849 events)
- [COUNTERS] PROGRAM TOTAL : 1.2865s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.1811s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.1055s for 90112 events => throughput is 8.54E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 1.7784s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.6325s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.1459s for 90112 events => throughput is 6.18E+05 events/s
 
 *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
 
-OK! xsec from fortran (0.21801182648615872) and cpp (0.21801179852567595) differ by less than 4E-4 (1.2825213757672316e-07)
+OK! xsec from fortran (0.21801182648615872) and cpp (0.21801177493542723) differ by less than 4E-4 (2.364584175129636e-07)
 
 *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
 
 OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 8.738486e+05 ) sec^-1
+Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 6.309270e+05 ) sec^-1
 
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 8.778124e+05 ) sec^-1
+Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 6.331612e+05 ) sec^-1
+
-*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) ***
+*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
+--------------------
+CUDACPP_RUNTIME_FBRIDGEMODE = (not set)
+CUDACPP_RUNTIME_VECSIZEUSED = 8192
+--------------------
+8192 1 1 ! Number of events and max and min iterations
+0.000001 ! Accuracy (ignored because max iterations = min iterations)
+0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present)
+1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement)
+0 ! Helicity Sum/event 0=exact
+1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
+--------------------
+Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/4
+ [NGOODHEL] ngoodhel/ncomb = 16/32
+ [XSECTION] VECSIZE_USED = 8192
+ [XSECTION] MultiChannel = TRUE
+ [XSECTION] Configuration = 1
+ [XSECTION] ChannelId = 1
+ [XSECTION] Cross section = 0.2605 [0.26050310803492405] fbridge_mode=1
+ [UNWEIGHT] Wrote 81 events (found 540 events)
+ [COUNTERS] PROGRAM TOTAL : 0.2783s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.2663s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0120s for 8192 events => throughput is 6.84E+05 events/s
+
+*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
+
+OK! xsec from fortran (0.26050333309703716) and cpp (0.26050310803492405) differ by less than 4E-4 (8.639509921914978e-07)
+
+*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
-*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) ***
+OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical
+
+*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) ***
+--------------------
+CUDACPP_RUNTIME_FBRIDGEMODE = (not set)
+CUDACPP_RUNTIME_VECSIZEUSED = 8192
+--------------------
+81920 1 1 ! Number of events and max and min iterations
+0.000001 ! Accuracy (ignored because max iterations = min iterations)
+0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present)
+1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement)
+0 ! Helicity Sum/event 0=exact
+1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
+--------------------
+Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/4
+ [NGOODHEL] ngoodhel/ncomb = 16/32
+ [XSECTION] VECSIZE_USED = 8192
+ [XSECTION] MultiChannel = TRUE
+ [XSECTION] Configuration = 1
+ [XSECTION] ChannelId = 1
+ [XSECTION] Cross section = 0.218 [0.21801177493542723] fbridge_mode=1
+ [UNWEIGHT] Wrote 853 events (found 1849 events)
+ [COUNTERS] PROGRAM TOTAL : 1.7659s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.6337s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.1322s for 90112 events => throughput is 6.82E+05 events/s
+
+*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
+
+OK! xsec from fortran (0.21801182648615872) and cpp (0.21801177493542723) differ by less than 4E-4 (2.364584175129636e-07)
+
+*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
+
+OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
+
+*** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
+Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 6.987405e+05 ) sec^-1
+
+*** EXECUTE CHECK(8192) -p 256 32 1 ***
+Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 7.079276e+05 ) sec^-1
+
+*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
+--------------------
+CUDACPP_RUNTIME_FBRIDGEMODE = (not set)
+CUDACPP_RUNTIME_VECSIZEUSED = 8192
+--------------------
+8192 1 1 ! Number of events and max and min iterations
+0.000001 ! Accuracy (ignored because max iterations = min iterations)
+0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present)
+1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement)
+0 ! Helicity Sum/event 0=exact
+1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
+--------------------
+Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/4
+ [NGOODHEL] ngoodhel/ncomb = 16/32
+ [XSECTION] VECSIZE_USED = 8192
+ [XSECTION] MultiChannel = TRUE
+ [XSECTION] Configuration = 1
+ [XSECTION] ChannelId = 1
+ [XSECTION] Cross section = 0.2605 [0.26050317064561834] fbridge_mode=1
+ [UNWEIGHT] Wrote 81 events (found 540 events)
+ [COUNTERS] PROGRAM TOTAL : 0.2869s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.2697s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0173s for 8192 events => throughput is 4.74E+05 events/s
+
+*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
+
+OK! xsec from fortran (0.26050333309703716) and cpp (0.26050317064561834) differ by less than 4E-4 (6.236059127973093e-07)
+
+*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
+
+OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical
+
+*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) ***
+--------------------
+CUDACPP_RUNTIME_FBRIDGEMODE = (not set)
+CUDACPP_RUNTIME_VECSIZEUSED = 8192
+--------------------
+81920 1 1 ! Number of events and max and min iterations
+0.000001 ! Accuracy (ignored because max iterations = min iterations)
+0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present)
+1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement)
+0 ! Helicity Sum/event 0=exact
+1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
+--------------------
+Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/4
+ [NGOODHEL] ngoodhel/ncomb = 16/32
+ [XSECTION] VECSIZE_USED = 8192
+ [XSECTION] MultiChannel = TRUE
+ [XSECTION] Configuration = 1
+ [XSECTION] ChannelId = 1
+ [XSECTION] Cross section = 0.218 [0.21801182143140752] fbridge_mode=1
+ [UNWEIGHT] Wrote 853 events (found 1849 events)
+ [COUNTERS] PROGRAM TOTAL : 1.8266s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.6389s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.1877s for 90112 events => throughput is 4.80E+05 events/s
+
+*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
+
+OK! xsec from fortran (0.21801182648615872) and cpp (0.21801182143140752) differ by less than 4E-4 (2.3185674269399215e-08)
+
+*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
+
+OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
+
+*** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
+Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 4.948471e+05 ) sec^-1
+
+*** EXECUTE CHECK(8192) -p 256 32 1 ***
+Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 4.943070e+05 ) sec^-1
 
 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) ***
 --------------------
@@ -357,15 +505,98 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
 0 ! Helicity Sum/event 0=exact
 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
 --------------------
-Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/valassia/input_gqttq_x1_cudacpp > /tmp/valassia/output_gqttq_x1_cudacpp'
-ERROR! ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/valassia/input_gqttq_x1_cudacpp > /tmp/valassia/output_gqttq_x1_cudacpp' failed
- PDF set = nn23lo1
- alpha_s(Mz)= 0.1300 running at 2 loops.
- alpha_s(Mz)= 0.1300 running at 2 loops.
- Renormalization scale set on event-by-event basis
- Factorization scale set on event-by-event basis
+Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/4
+ [NGOODHEL] ngoodhel/ncomb = 16/32
+ [XSECTION] VECSIZE_USED = 8192
+ [XSECTION] MultiChannel = TRUE
+ [XSECTION] Configuration = 1
+ [XSECTION] ChannelId = 1
+ [XSECTION] Cross section = 0.2605 [0.26050319131407651] fbridge_mode=1
+ [UNWEIGHT] Wrote 81 events (found 540 events)
+ [COUNTERS] PROGRAM TOTAL : 0.6865s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.6860s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0005s for 8192 events => throughput is 1.55E+07 events/s
+
+*** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec ***
+
+OK! xsec from fortran (0.26050333309703716) and cpp (0.26050319131407651) differ by less than 4E-4 (5.442654378295941e-07)
+
+*** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
+
+OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical
+
+*** (3) EXECUTE MADEVENT_CUDA x10 (create events.lhe) ***
+--------------------
+CUDACPP_RUNTIME_FBRIDGEMODE = (not set)
+CUDACPP_RUNTIME_VECSIZEUSED = 8192
+--------------------
+81920 1 1 ! Number of events and max and min iterations
+0.000001 ! Accuracy (ignored because max iterations = min iterations)
+0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present)
+1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement)
+0 ! Helicity Sum/event 0=exact
+1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
+--------------------
+Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/4
+ [NGOODHEL] ngoodhel/ncomb = 16/32
+ [XSECTION] VECSIZE_USED = 8192
+ [XSECTION] MultiChannel = TRUE
+ [XSECTION] Configuration = 1
+ [XSECTION] ChannelId = 1
+ [XSECTION] Cross section = 0.218 [0.21801186038252196] fbridge_mode=1
+ [UNWEIGHT] Wrote 853 events (found 1849 events)
+ [COUNTERS] PROGRAM TOTAL : 2.0580s
+ [COUNTERS] Fortran Overhead ( 0 ) : 2.0515s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0065s for 90112 events => throughput is 1.38E+07 events/s
+
+*** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec ***
+
+OK! xsec from fortran (0.21801182648615872) and cpp (0.21801186038252196) differ by less than 4E-4 (1.5547946996541384e-07)
+
+*** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
+
+OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical
+
+*** EXECUTE GCHECK(8192) -p 256 32 1 --bridge ***
+Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 2.646730e+07 ) sec^-1
+
+*** EXECUTE GCHECK(8192) -p 256 32 1 ***
+Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 6.486298e+07 ) sec^-1
+
+*** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge ***
+Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 4.789487e+07 ) sec^-1
+
+*** EXECUTE GCHECK(MAX) -p 16384 32 1 ***
+Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 1.699254e+08 ) sec^-1
+
+*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge ***
+Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 4.776807e+07 ) sec^-1
+
+*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 ***
+Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 1.780761e+08 ) sec^-1
+
+*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge ***
+Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 4.361160e+07 ) sec^-1
+*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 ***
+Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 5.995376e+07 ) sec^-1
- getting user params
-Enter number of events and max and min iterations:
- Number of events and iterations 8192 1 1
+TEST COMPLETED
diff --git a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt
index d73dc7c80a..44df8a9e3d 100644
--- a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt
+++ b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt
@@ -1,42 +1,42 @@
-Working directory (build): /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu
+Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu
 CUDACPP_BUILDDIR='.'
+
 make USEBUILDDIR=1 AVX=none
 make USEBUILDDIR=1 AVX=sse4
-make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu'
-make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu'
 make USEBUILDDIR=1 AVX=avx2
-
 make USEBUILDDIR=1 AVX=512y
-make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu'
-make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu'
 make USEBUILDDIR=1 AVX=512z
-make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu'
-CUDACPP_BUILDDIR='build.none_m_inl0_hrd0'
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu'
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu'
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu'
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu'
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu'
 CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0'
-CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0'
 CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0'
 CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0'
+CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0'
+CUDACPP_BUILDDIR='build.none_m_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu'
+make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu'
+make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu'
+make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu'
+make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu'
+make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu'
 
 OMP_NUM_THREADS=
 
-DATE: 2024-01-28_14:29:33
+DATE: 2024-01-30_06:27:27
 
-On uan03 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]:
-Working directory (run): /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu
+On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
+Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu
 
 *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) ***
 --------------------
@@ -50,8 +50,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
 0 ! Helicity Sum/event 0=exact
 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
 --------------------
-Executing ' ./madevent_fortran < /tmp/valassia/input_gqttq_x1_fortran > /tmp/valassia/output_gqttq_x1_fortran'
- [OPENMPTH] omp_get_max_threads/nproc = 1/128
+Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/avalassi/output_gqttq_x1_fortran'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/4
  [NGOODHEL] ngoodhel/ncomb = 16/32
  [XSECTION] VECSIZE_USED = 8192
  [XSECTION] MultiChannel = TRUE
@@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/valassia/input_gqttq_x1_fortran > /tmp/val
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 0.2605 [0.26050333309703716] fbridge_mode=0
  [UNWEIGHT] Wrote 78 events (found 561 events)
- [COUNTERS] PROGRAM TOTAL : 0.2655s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.2046s
- [COUNTERS] Fortran MEs ( 1 ) : 0.0608s for 8192 events => throughput is 1.35E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.3260s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.2519s
+ [COUNTERS] Fortran MEs ( 1 ) : 0.0740s for 8192 events => throughput is 1.11E+05 events/s
 
 *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) ***
 --------------------
@@ -75,8 +75,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
 0 ! Helicity Sum/event 0=exact
 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
 --------------------
-Executing ' ./madevent_fortran < /tmp/valassia/input_gqttq_x1_fortran > /tmp/valassia/output_gqttq_x1_fortran'
- [OPENMPTH] omp_get_max_threads/nproc = 1/128
+Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/avalassi/output_gqttq_x1_fortran'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/4
  [NGOODHEL] ngoodhel/ncomb = 16/32
  [XSECTION] VECSIZE_USED = 8192
  [XSECTION] MultiChannel = TRUE
@@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/valassia/input_gqttq_x1_fortran > /tmp/val
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 0.2605 [0.26050333309703716] fbridge_mode=0
  [UNWEIGHT] Wrote 81 events (found 540 events)
- [COUNTERS] PROGRAM TOTAL : 0.2621s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.2013s
- [COUNTERS] Fortran MEs ( 1 ) : 0.0608s for 8192 events => throughput is 1.35E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.3236s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.2490s
+ [COUNTERS] Fortran MEs ( 1 ) : 0.0746s for 8192 events => throughput is 1.10E+05 events/s
 
 *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) ***
 --------------------
@@ -100,8 +100,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
 0 ! Helicity Sum/event 0=exact
 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
 --------------------
-Executing ' ./madevent_fortran < /tmp/valassia/input_gqttq_x10_fortran > /tmp/valassia/output_gqttq_x10_fortran'
- [OPENMPTH] omp_get_max_threads/nproc = 1/128
+Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x10_fortran > /tmp/avalassi/output_gqttq_x10_fortran'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/4
  [NGOODHEL] ngoodhel/ncomb = 16/32
  [XSECTION] VECSIZE_USED = 8192
  [XSECTION] MultiChannel = TRUE
@@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/valassia/input_gqttq_x10_fortran > /tmp/va
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 0.218 [0.21801182648615872] fbridge_mode=0
  [UNWEIGHT] Wrote 853 events (found 1849 events)
- [COUNTERS] PROGRAM TOTAL : 1.8381s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.1735s
- [COUNTERS] Fortran MEs ( 1 ) : 0.6646s for 90112 events => throughput is 1.36E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 2.4069s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.5940s
+ [COUNTERS] Fortran MEs ( 1 ) : 0.8130s for 90112 events => throughput is 1.11E+05 events/s
 
 *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -125,8 +125,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
 0 ! Helicity Sum/event 0=exact
 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
 --------------------
-Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x1_cudacpp > /tmp/valassia/output_gqttq_x1_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/128
+Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/4
  [NGOODHEL] ngoodhel/ncomb = 16/32
  [XSECTION] VECSIZE_USED = 8192
  [XSECTION] MultiChannel = TRUE
@@ -134,9 +134,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x1
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 0.2605 [0.26050333282657206] fbridge_mode=1
  [UNWEIGHT] Wrote 81 events (found 540 events)
- [COUNTERS] PROGRAM TOTAL : 0.3436s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.2742s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0694s for 8192 events => throughput is 1.18E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.4196s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.3367s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0829s for 8192 events => throughput is 9.88E+04 events/s
 
 *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -158,8 +158,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
 0 ! Helicity Sum/event 0=exact
 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
 --------------------
-Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x10_cudacpp > /tmp/valassia/output_gqttq_x10_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/128
+Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/4
  [NGOODHEL] ngoodhel/ncomb = 16/32
  [XSECTION] VECSIZE_USED = 8192
  [XSECTION] MultiChannel = TRUE
@@ -167,9 +167,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x1
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 0.218 [0.21801182636608801] fbridge_mode=1
  [UNWEIGHT] Wrote 853 events (found 1849 events)
- [COUNTERS] PROGRAM TOTAL : 2.0075s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.2444s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.7631s for 90112 events => throughput is 1.18E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 2.6225s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.7110s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.9116s for 90112 events => throughput is 9.89E+04 events/s
 
 *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -180,14 +180,14 @@ OK! xsec from fortran (0.21801182648615872) and cpp (0.21801182636608801) differ
 OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.196578e+05 ) sec^-1
+Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 1.000214e+05 ) sec^-1
 
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.195632e+05 ) sec^-1
+Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 9.992747e+04 ) sec^-1
 
 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -201,8 +201,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
 0 ! Helicity Sum/event 0=exact
 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
 --------------------
-Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x1_cudacpp > /tmp/valassia/output_gqttq_x1_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/128
+Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/4
  [NGOODHEL] ngoodhel/ncomb = 16/32
  [XSECTION] VECSIZE_USED = 8192
  [XSECTION] MultiChannel = TRUE
@@ -210,9 +210,9 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x1
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 0.2605 [0.26050333282657212] fbridge_mode=1
  [UNWEIGHT] Wrote 81 events (found 540 events)
- [COUNTERS] PROGRAM TOTAL : 0.2696s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.2368s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0328s for 8192 events => throughput is 2.50E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.3393s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.2968s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0425s for 8192 events => throughput is 1.93E+05 events/s
 
 *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -234,8 +234,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
 0 ! Helicity Sum/event 0=exact
 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
 --------------------
-Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x10_cudacpp > /tmp/valassia/output_gqttq_x10_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/128
+Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/4
  [NGOODHEL] ngoodhel/ncomb = 16/32
  [XSECTION] VECSIZE_USED = 8192
  [XSECTION] MultiChannel = TRUE
@@ -243,9 +243,9 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x1
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 0.218 [0.21801182636608804] fbridge_mode=1
  [UNWEIGHT] Wrote 853 events (found 1849 events)
- [COUNTERS] PROGRAM TOTAL : 1.5752s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.2150s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.3602s for 90112 events => throughput is 2.50E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 2.1463s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.6727s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.4735s for 90112 events => throughput is 1.90E+05 events/s
 
 *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -256,14 +256,14 @@ OK! xsec from fortran (0.21801182648615872) and cpp (0.21801182636608804) differ
 OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.505829e+05 ) sec^-1
+Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 1.936317e+05 ) sec^-1
 
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.512763e+05 ) sec^-1
+Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 1.928004e+05 ) sec^-1
 
 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -277,8 +277,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
 0 ! Helicity Sum/event 0=exact
 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
 --------------------
-Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x1_cudacpp > /tmp/valassia/output_gqttq_x1_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/128
+Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/4
  [NGOODHEL] ngoodhel/ncomb = 16/32
  [XSECTION] VECSIZE_USED = 8192
  [XSECTION] MultiChannel = TRUE
@@ -286,9 +286,9 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x1
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 0.2605 [0.26050333291481387] fbridge_mode=1
  [UNWEIGHT] Wrote 81 events (found 540 events)
- [COUNTERS] PROGRAM TOTAL : 0.2378s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.2205s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0172s for 8192 events => throughput is 4.75E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.3040s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.2793s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0246s for 8192 events => throughput is 3.33E+05 events/s
 
 *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -310,8 +310,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
 0 ! Helicity Sum/event 0=exact
 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
 --------------------
-Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x10_cudacpp > /tmp/valassia/output_gqttq_x10_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/128
+Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/4
  [NGOODHEL] ngoodhel/ncomb = 16/32
  [XSECTION] VECSIZE_USED = 8192
  [XSECTION] MultiChannel = TRUE
@@ -319,9 +319,9 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x1
  [XSECTION] ChannelId = 1
  [XSECTION] Cross section = 0.218 [0.21801182638680733] fbridge_mode=1
  [UNWEIGHT] Wrote 853 events (found 1849 events)
- [COUNTERS] PROGRAM TOTAL : 1.3839s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.1946s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.1893s for 90112 events => throughput is 4.76E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 1.9238s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.6513s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.2724s for 90112 events => throughput is 3.31E+05 events/s
 
 *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
 
@@ -332,18 +332,166 @@ OK! xsec from fortran (0.21801182648615872) and cpp (0.21801182638680733) differ
 OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 4.829459e+05 ) sec^-1
+Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 3.358469e+05 ) sec^-1
 
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 4.842597e+05 ) sec^-1
+Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 3.409112e+05 ) sec^-1
+
+*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
+--------------------
+CUDACPP_RUNTIME_FBRIDGEMODE = (not set)
+CUDACPP_RUNTIME_VECSIZEUSED = 8192
+--------------------
+8192 1 1 ! Number of events and max and min iterations
+0.000001 ! Accuracy (ignored because max iterations = min iterations)
+0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present)
+1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement)
+0 ! Helicity Sum/event 0=exact
+1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
+--------------------
+Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/4
+ [NGOODHEL] ngoodhel/ncomb = 16/32
+ [XSECTION] VECSIZE_USED = 8192
+ [XSECTION] MultiChannel = TRUE
+ [XSECTION] Configuration = 1
+ [XSECTION] ChannelId = 1
+ [XSECTION] Cross section = 0.2605 [0.26050333291481387] fbridge_mode=1
+ [UNWEIGHT] Wrote 81 events (found 540 events)
+ [COUNTERS] PROGRAM TOTAL : 0.2986s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.2774s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0212s for 8192 events => throughput is 3.86E+05 events/s
+
+*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
+
+OK! xsec from fortran (0.26050333309703716) and cpp (0.26050333291481387) differ by less than 2E-4 (6.99504676404672e-10)
+
+*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
+
+OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical
+
+*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) ***
+--------------------
+CUDACPP_RUNTIME_FBRIDGEMODE = (not set)
+CUDACPP_RUNTIME_VECSIZEUSED = 8192
+--------------------
+81920 1 1 ! Number of events and max and min iterations
+0.000001 ! Accuracy (ignored because max iterations = min iterations)
+0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present)
+1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement)
+0 ! Helicity Sum/event 0=exact
+1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
+--------------------
+Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/4
+ [NGOODHEL] ngoodhel/ncomb = 16/32
+ [XSECTION] VECSIZE_USED = 8192
+ [XSECTION] MultiChannel = TRUE
+ [XSECTION] Configuration = 1
+ [XSECTION] ChannelId = 1
+ [XSECTION] Cross section = 0.218 [0.21801182638680733] fbridge_mode=1
+ [UNWEIGHT] Wrote 853 events (found 1849 events)
+ [COUNTERS] PROGRAM TOTAL : 1.8897s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.6536s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.2361s for 90112 events => throughput is 3.82E+05 events/s
+
+*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
+
+OK! xsec from fortran (0.21801182648615872) and cpp (0.21801182638680733) differ by less than 2E-4 (4.557155763862397e-10)
-*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) ***
+*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
+
+OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
+
+*** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
+Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 3.861136e+05 ) sec^-1
-*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) ***
+*** EXECUTE CHECK(8192) -p 256 32 1 ***
+Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 3.873456e+05 ) sec^-1
+
+*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
+--------------------
+CUDACPP_RUNTIME_FBRIDGEMODE = (not set)
+CUDACPP_RUNTIME_VECSIZEUSED = 8192
+--------------------
+8192 1 1 ! Number of events and max and min iterations
+0.000001 ! Accuracy (ignored because max iterations = min iterations)
+0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present)
+1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement)
+0 ! Helicity Sum/event 0=exact
+1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
+-------------------- +Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/32 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.2605 [0.26050333291481387] fbridge_mode=1 + [UNWEIGHT] Wrote 81 events (found 540 events) + [COUNTERS] PROGRAM TOTAL : 0.3231s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2893s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0338s for 8192 events => throughput is 2.42E+05 events/s + +*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (0.26050333309703716) and cpp (0.26050333291481387) differ by less than 2E-4 (6.99504676404672e-10) + +*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/32 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.218 [0.21801182638680733] fbridge_mode=1 + [UNWEIGHT] Wrote 853 events (found 1849 events) + [COUNTERS] PROGRAM TOTAL : 2.0365s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6643s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3722s for 90112 events => throughput is 2.42E+05 events/s + +*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (0.21801182648615872) and cpp (0.21801182638680733) differ by less than 2E-4 (4.557155763862397e-10) + +*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.448669e+05 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.422091e+05 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -357,15 +505,98 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/valassia/input_gqttq_x1_cudacpp > /tmp/valassia/output_gqttq_x1_cudacpp' -ERROR! ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/valassia/input_gqttq_x1_cudacpp > /tmp/valassia/output_gqttq_x1_cudacpp' failed - PDF set = nn23lo1 - alpha_s(Mz)= 0.1300 running at 2 loops. - alpha_s(Mz)= 0.1300 running at 2 loops. - Renormalization scale set on event-by-event basis - Factorization scale set on event-by-event basis +Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/32 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.2605 [0.26050333301029699] fbridge_mode=1 + [UNWEIGHT] Wrote 81 events (found 540 events) + [COUNTERS] PROGRAM TOTAL : 0.6889s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6882s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.21E+07 events/s + +*** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (0.26050333309703716) and cpp (0.26050333301029699) differ by less than 2E-4 (3.329714282074292e-10) + +*** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical + +*** (3) EXECUTE MADEVENT_CUDA x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/32 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.218 [0.21801182637219937] fbridge_mode=1 + [UNWEIGHT] Wrote 853 events (found 1849 events) + [COUNTERS] PROGRAM TOTAL : 2.0663s + [COUNTERS] Fortran Overhead ( 0 ) : 2.0581s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0082s for 90112 events => throughput is 1.10E+07 events/s + +*** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (0.21801182648615872) and cpp (0.21801182637219937) differ by less than 2E-4 (5.227208665914418e-10) + +*** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical + +*** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.534715e+07 ) sec^-1 + +*** EXECUTE GCHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.123919e+07 ) sec^-1 + +*** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.382422e+07 ) sec^-1 + +*** EXECUTE GCHECK(MAX) -p 16384 32 1 *** +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.503129e+07 ) sec^-1 + +*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.385930e+07 ) sec^-1 + +*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.826918e+07 ) sec^-1 + +*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.379565e+07 ) sec^-1 +*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.789199e+07 ) sec^-1 - getting user params -Enter number of events and max and min iterations: - Number of events and iterations 8192 1 1 +TEST COMPLETED From fc19f843319cd281459e01fb5ae9f43dcffa566b Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Wed, 31 Jan 2024 16:15:26 +0100 Subject: [PATCH 93/96] [jt774] regenerate all code with Olivier's patch - only launch_plugin.py changes, while c++/cuda/hip is unchanged --- .../ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt | 18 +++--- .../ee_mumu.mad/bin/internal/launch_plugin.py | 2 +- .../CODEGEN_cudacpp_ee_mumu_log.txt | 12 ++-- .../gg_tt.mad/CODEGEN_mad_gg_tt_log.txt | 18 +++--- .../gg_tt.mad/bin/internal/launch_plugin.py | 2 +- .../gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt | 12 ++-- .../gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt | 24 ++++---- .../bin/internal/launch_plugin.py | 2 +- .../gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt | 20 +++---- .../gg_ttg.mad/bin/internal/launch_plugin.py | 2 +- .../gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt | 14 ++--- .../gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt | 22 +++---- .../gg_ttgg.mad/bin/internal/launch_plugin.py | 2 +- .../CODEGEN_cudacpp_gg_ttgg_log.txt | 14 ++--- .../gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt | 22 +++---- .../bin/internal/launch_plugin.py | 2 +- .../CODEGEN_cudacpp_gg_ttggg_log.txt | 16 ++--- .../gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt | 22 +++---- 
.../gq_ttq.mad/bin/internal/launch_plugin.py | 2 +- .../gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt | 14 ++--- .../CODEGEN_cudacpp_heft_gg_h_log.txt | 14 +++-- .../CODEGEN_mad_pp_tt012j_log.txt | 58 +++++++++---------- .../bin/internal/launch_plugin.py | 2 +- 23 files changed, 161 insertions(+), 155 deletions(-) diff --git a/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt b/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt index b79a051c06..a484a3ce73 100644 --- a/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt +++ b/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt @@ -62,7 +62,7 @@ generate e+ e- > mu+ mu- No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.00582122802734375  +DEBUG: model prefixing takes 0.0055043697357177734  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -154,7 +154,7 @@ INFO: Checking for minimal orders which gives processes. INFO: Please specify coupling orders to bypass this step. INFO: Trying process: e+ e- > mu+ mu- WEIGHTED<=4 @1 INFO: Process has 2 diagrams -1 processes with 2 diagrams generated in 0.005 s +1 processes with 2 diagrams generated in 0.004 s Total: 1 processes with 2 diagrams output madevent ../TMPOUT/CODEGEN_mad_ee_mumu --hel_recycling=False --vector_size=32 --me_exporter=standalone_cudacpp Load PLUGIN.CUDACPP_OUTPUT @@ -174,7 +174,7 @@ INFO: Generating Helas calls for process: e+ e- > mu+ mu- WEIGHTED<=4 @1 INFO: Processing color information for process: e+ e- > mu+ mu- @1 INFO: Creating files in directory P1_epem_mupmum DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -191,19 +191,19 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. INFO: Generating Feynman diagrams for Process: e+ e- > mu+ mu- WEIGHTED<=4 @1 INFO: Finding symmetric diagrams for subprocess group epem_mupmum Generated helas calls for 1 subprocesses (2 diagrams) in 0.004 s -Wrote files for 8 helas calls in 0.104 s +Wrote files for 8 helas calls in 0.098 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFV2 routines ALOHA: aloha creates FFV4 routines -ALOHA: aloha creates 3 routines in 0.214 s +ALOHA: aloha creates 3 routines in 0.198 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFV2 routines ALOHA: aloha creates FFV4 routines ALOHA: aloha creates FFV2_4 routines -ALOHA: aloha creates 7 routines in 0.272 s +ALOHA: aloha creates 7 routines in 0.252 s FFV1 FFV1 FFV2 @@ -248,9 +248,9 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. 
quit -real 0m1.986s -user 0m1.756s -sys 0m0.219s +real 0m1.882s +user 0m1.658s +sys 0m0.191s Code generation completed in 2 seconds ************************************************************ * * diff --git a/epochX/cudacpp/ee_mumu.mad/bin/internal/launch_plugin.py b/epochX/cudacpp/ee_mumu.mad/bin/internal/launch_plugin.py index 3b09713e12..90af6b7053 100644 --- a/epochX/cudacpp/ee_mumu.mad/bin/internal/launch_plugin.py +++ b/epochX/cudacpp/ee_mumu.mad/bin/internal/launch_plugin.py @@ -103,7 +103,7 @@ def default_setup(self): fct_mod=(self.reset_makeopts,(),{}), allowed=['auto', 'none', 'sse4', 'avx2','512y','512z']) self.add_param('cudacpp_backend', 'CPP', include=False, hidden=False, - allowed=['Fortan', 'CPP', 'CUDA']) + allowed=['Fortran', 'CPP', 'CUDA']) self['vector_size'] = 16 # already setup in default class (just change value) self['aloha_flag'] = '--fast-math' self['matrix_flag'] = '-O3' diff --git a/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt b/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt index fe21d36197..2764fbfcfb 100644 --- a/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt +++ b/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt @@ -62,7 +62,7 @@ generate e+ e- > mu+ mu- No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005848884582519531  +DEBUG: model prefixing takes 0.00559234619140625  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -154,7 +154,7 @@ INFO: Checking for minimal orders which gives processes. INFO: Please specify coupling orders to bypass this step. INFO: Trying process: e+ e- > mu+ mu- WEIGHTED<=4 @1 INFO: Process has 2 diagrams -1 processes with 2 diagrams generated in 0.005 s +1 processes with 2 diagrams generated in 0.004 s Total: 1 processes with 2 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_ee_mumu Load PLUGIN.CUDACPP_OUTPUT @@ -181,7 +181,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFV2 routines ALOHA: aloha creates FFV4 routines ALOHA: aloha creates FFV2_4 routines -ALOHA: aloha creates 4 routines in 0.282 s +ALOHA: aloha creates 4 routines in 0.266 s FFV1 FFV1 FFV2 @@ -200,7 +200,7 @@ INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/. quit -real 0m0.695s -user 0m0.628s -sys 0m0.057s +real 0m0.654s +user 0m0.591s +sys 0m0.054s Code generation completed in 1 seconds diff --git a/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt b/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt index f6226e7392..5782086b56 100644 --- a/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt +++ b/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005857229232788086  +DEBUG: model prefixing takes 0.005559206008911133  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,7 +155,7 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=2: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ WEIGHTED<=2 @1 INFO: Process has 3 diagrams -1 processes with 3 diagrams generated in 0.009 s +1 processes with 3 diagrams generated in 0.008 s Total: 1 processes with 3 diagrams output madevent ../TMPOUT/CODEGEN_mad_gg_tt --hel_recycling=False --vector_size=32 --me_exporter=standalone_cudacpp Load PLUGIN.CUDACPP_OUTPUT @@ -175,7 +175,7 @@ INFO: Generating Helas calls for process: g g > t t~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t t~ @1 INFO: Creating files in directory P1_gg_ttx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -191,16 +191,16 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. INFO: Generating Feynman diagrams for Process: g g > t t~ WEIGHTED<=2 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttx Generated helas calls for 1 subprocesses (3 diagrams) in 0.006 s -Wrote files for 10 helas calls in 0.108 s +Wrote files for 10 helas calls in 0.100 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 2 routines in 0.154 s +ALOHA: aloha creates 2 routines in 0.146 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 4 routines in 0.139 s +ALOHA: aloha creates 4 routines in 0.132 s VVV1 FFV1 FFV1 @@ -237,9 +237,9 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. quit -real 0m1.820s -user 0m1.561s -sys 0m0.253s +real 0m1.699s +user 0m1.467s +sys 0m0.225s Code generation completed in 2 seconds ************************************************************ * * diff --git a/epochX/cudacpp/gg_tt.mad/bin/internal/launch_plugin.py b/epochX/cudacpp/gg_tt.mad/bin/internal/launch_plugin.py index 3b09713e12..90af6b7053 100644 --- a/epochX/cudacpp/gg_tt.mad/bin/internal/launch_plugin.py +++ b/epochX/cudacpp/gg_tt.mad/bin/internal/launch_plugin.py @@ -103,7 +103,7 @@ def default_setup(self): fct_mod=(self.reset_makeopts,(),{}), allowed=['auto', 'none', 'sse4', 'avx2','512y','512z']) self.add_param('cudacpp_backend', 'CPP', include=False, hidden=False, - allowed=['Fortan', 'CPP', 'CUDA']) + allowed=['Fortran', 'CPP', 'CUDA']) self['vector_size'] = 16 # already setup in default class (just change value) self['aloha_flag'] = '--fast-math' self['matrix_flag'] = '-O3' diff --git a/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt b/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt index 443c1e7506..9d7cc87630 100644 --- a/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt +++ b/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005812883377075195  +DEBUG: model prefixing takes 0.005457878112792969  INFO: Restrict model sm with file models/sm/restrict_default.dat . 
DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,7 +155,7 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=2: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ WEIGHTED<=2 @1 INFO: Process has 3 diagrams -1 processes with 3 diagrams generated in 0.009 s +1 processes with 3 diagrams generated in 0.008 s Total: 1 processes with 3 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gg_tt Load PLUGIN.CUDACPP_OUTPUT @@ -180,7 +180,7 @@ Generated helas calls for 1 subprocesses (3 diagrams) in 0.006 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 2 routines in 0.152 s +ALOHA: aloha creates 2 routines in 0.142 s VVV1 FFV1 FFV1 @@ -195,7 +195,7 @@ INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/. quit -real 0m0.561s -user 0m0.502s -sys 0m0.055s +real 0m0.546s +user 0m0.467s +sys 0m0.054s Code generation completed in 1 seconds diff --git a/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt b/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt index c37391aef7..f5287cc1ca 100644 --- a/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt +++ b/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005814790725708008  +DEBUG: model prefixing takes 0.005505561828613281  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -163,7 +163,7 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=3: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g WEIGHTED<=3 @2 INFO: Process has 16 diagrams -1 processes with 16 diagrams generated in 0.021 s +1 processes with 16 diagrams generated in 0.019 s Total: 2 processes with 19 diagrams output madevent ../TMPOUT/CODEGEN_mad_gg_tt01g --hel_recycling=False --vector_size=32 --me_exporter=standalone_cudacpp Load PLUGIN.CUDACPP_OUTPUT @@ -185,7 +185,7 @@ INFO: Generating Helas calls for process: g g > t t~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t t~ @1 INFO: Creating files in directory P2_gg_ttxg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -202,7 +202,7 @@ INFO: Generating Feynman diagrams for Process: g g > t t~ g WEIGHTED<=3 @2 INFO: Finding symmetric diagrams for subprocess group gg_ttxg INFO: Creating files in directory P1_gg_ttx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -217,15 +217,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: g g > t t~ WEIGHTED<=2 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttx -Generated helas calls for 2 subprocesses (19 diagrams) in 0.045 s -Wrote files for 46 helas calls in 0.257 s +Generated helas calls for 2 subprocesses (19 diagrams) in 0.042 s +Wrote files for 46 helas calls in 0.243 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 5 routines in 0.346 s +ALOHA: aloha creates 5 routines in 0.330 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines @@ -233,7 +233,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 10 routines in 0.329 s +ALOHA: aloha creates 10 routines in 0.311 s VVV1 VVV1 FFV1 @@ -283,10 +283,10 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. quit -real 0m2.425s -user 0m2.147s -sys 0m0.279s -Code generation completed in 2 seconds +real 0m2.369s +user 0m2.050s +sys 0m0.252s +Code generation completed in 3 seconds ************************************************************ * * * W E L C O M E to * diff --git a/epochX/cudacpp/gg_tt01g.mad/bin/internal/launch_plugin.py b/epochX/cudacpp/gg_tt01g.mad/bin/internal/launch_plugin.py index 3b09713e12..90af6b7053 100644 --- a/epochX/cudacpp/gg_tt01g.mad/bin/internal/launch_plugin.py +++ b/epochX/cudacpp/gg_tt01g.mad/bin/internal/launch_plugin.py @@ -103,7 +103,7 @@ def default_setup(self): fct_mod=(self.reset_makeopts,(),{}), allowed=['auto', 'none', 'sse4', 'avx2','512y','512z']) self.add_param('cudacpp_backend', 'CPP', include=False, hidden=False, - allowed=['Fortan', 'CPP', 'CUDA']) + allowed=['Fortran', 'CPP', 'CUDA']) self['vector_size'] = 16 # already setup in default class (just change value) self['aloha_flag'] = '--fast-math' self['matrix_flag'] = '-O3' diff --git a/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt b/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt index 7df8225f38..ffc3d1d3ef 100644 --- a/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt +++ b/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005897045135498047  +DEBUG: model prefixing takes 0.005362510681152344  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,7 +155,7 @@ INFO: Please specify coupling orders to bypass this step. 
INFO: Trying coupling order WEIGHTED<=3: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g WEIGHTED<=3 @1 INFO: Process has 16 diagrams -1 processes with 16 diagrams generated in 0.023 s +1 processes with 16 diagrams generated in 0.021 s Total: 1 processes with 16 diagrams output madevent ../TMPOUT/CODEGEN_mad_gg_ttg --hel_recycling=False --vector_size=32 --me_exporter=standalone_cudacpp Load PLUGIN.CUDACPP_OUTPUT @@ -175,7 +175,7 @@ INFO: Generating Helas calls for process: g g > t t~ g WEIGHTED<=3 @1 INFO: Processing color information for process: g g > t t~ g @1 INFO: Creating files in directory P1_gg_ttxg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -190,15 +190,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: g g > t t~ g WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxg -Generated helas calls for 1 subprocesses (16 diagrams) in 0.040 s -Wrote files for 36 helas calls in 0.157 s +Generated helas calls for 1 subprocesses (16 diagrams) in 0.037 s +Wrote files for 36 helas calls in 0.149 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 5 routines in 0.344 s +ALOHA: aloha creates 5 routines in 0.331 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines @@ -206,7 +206,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 10 routines in 0.327 s +ALOHA: aloha creates 10 routines in 0.308 s VVV1 VVV1 FFV1 @@ -252,9 +252,9 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. 
quit -real 0m2.290s -user 0m2.041s -sys 0m0.249s +real 0m2.177s +user 0m1.931s +sys 0m0.233s Code generation completed in 3 seconds ************************************************************ * * diff --git a/epochX/cudacpp/gg_ttg.mad/bin/internal/launch_plugin.py b/epochX/cudacpp/gg_ttg.mad/bin/internal/launch_plugin.py index 3b09713e12..90af6b7053 100644 --- a/epochX/cudacpp/gg_ttg.mad/bin/internal/launch_plugin.py +++ b/epochX/cudacpp/gg_ttg.mad/bin/internal/launch_plugin.py @@ -103,7 +103,7 @@ def default_setup(self): fct_mod=(self.reset_makeopts,(),{}), allowed=['auto', 'none', 'sse4', 'avx2','512y','512z']) self.add_param('cudacpp_backend', 'CPP', include=False, hidden=False, - allowed=['Fortan', 'CPP', 'CUDA']) + allowed=['Fortran', 'CPP', 'CUDA']) self['vector_size'] = 16 # already setup in default class (just change value) self['aloha_flag'] = '--fast-math' self['matrix_flag'] = '-O3' diff --git a/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt b/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt index 74be077c7e..f034db4427 100644 --- a/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt +++ b/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005811929702758789  +DEBUG: model prefixing takes 0.00534820556640625  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,7 +155,7 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=3: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g WEIGHTED<=3 @1 INFO: Process has 16 diagrams -1 processes with 16 diagrams generated in 0.023 s +1 processes with 16 diagrams generated in 0.022 s Total: 1 processes with 16 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gg_ttg Load PLUGIN.CUDACPP_OUTPUT @@ -175,7 +175,7 @@ INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TM FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg/./CPPProcess.h FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg/./CPPProcess.cc INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg/. -Generated helas calls for 1 subprocesses (16 diagrams) in 0.039 s +Generated helas calls for 1 subprocesses (16 diagrams) in 0.036 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines @@ -183,7 +183,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 5 routines in 0.346 s +ALOHA: aloha creates 5 routines in 0.325 s VVV1 VVV1 FFV1 @@ -203,7 +203,7 @@ INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/. 
quit -real 0m0.831s -user 0m0.769s -sys 0m0.052s +real 0m0.783s +user 0m0.718s +sys 0m0.051s Code generation completed in 1 seconds diff --git a/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt b/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt index 8574f56894..0da89f1729 100644 --- a/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt +++ b/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005876302719116211  +DEBUG: model prefixing takes 0.005736112594604492  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,7 +155,7 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=4: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Process has 123 diagrams -1 processes with 123 diagrams generated in 0.169 s +1 processes with 123 diagrams generated in 0.160 s Total: 1 processes with 123 diagrams output madevent ../TMPOUT/CODEGEN_mad_gg_ttgg --hel_recycling=False --vector_size=32 --me_exporter=standalone_cudacpp Load PLUGIN.CUDACPP_OUTPUT @@ -175,7 +175,7 @@ INFO: Generating Helas calls for process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Processing color information for process: g g > t t~ g g @1 INFO: Creating files in directory P1_gg_ttxgg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -190,15 +190,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxgg -Generated helas calls for 1 subprocesses (123 diagrams) in 0.451 s -Wrote files for 222 helas calls in 0.735 s +Generated helas calls for 1 subprocesses (123 diagrams) in 0.423 s +Wrote files for 222 helas calls in 0.683 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.359 s +ALOHA: aloha creates 5 routines in 0.328 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines @@ -206,7 +206,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 10 routines in 0.338 s +ALOHA: aloha creates 10 routines in 0.310 s VVV1 VVV1 FFV1 @@ -255,10 +255,10 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. 
quit -real 0m3.480s -user 0m3.191s -sys 0m0.282s -Code generation completed in 3 seconds +real 0m3.254s +user 0m3.011s +sys 0m0.236s +Code generation completed in 4 seconds ************************************************************ * * * W E L C O M E to * diff --git a/epochX/cudacpp/gg_ttgg.mad/bin/internal/launch_plugin.py b/epochX/cudacpp/gg_ttgg.mad/bin/internal/launch_plugin.py index 3b09713e12..90af6b7053 100644 --- a/epochX/cudacpp/gg_ttgg.mad/bin/internal/launch_plugin.py +++ b/epochX/cudacpp/gg_ttgg.mad/bin/internal/launch_plugin.py @@ -103,7 +103,7 @@ def default_setup(self): fct_mod=(self.reset_makeopts,(),{}), allowed=['auto', 'none', 'sse4', 'avx2','512y','512z']) self.add_param('cudacpp_backend', 'CPP', include=False, hidden=False, - allowed=['Fortan', 'CPP', 'CUDA']) + allowed=['Fortran', 'CPP', 'CUDA']) self['vector_size'] = 16 # already setup in default class (just change value) self['aloha_flag'] = '--fast-math' self['matrix_flag'] = '-O3' diff --git a/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt b/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt index dcf971696c..9ebee16fdf 100644 --- a/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt +++ b/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005830526351928711  +DEBUG: model prefixing takes 0.005690574645996094  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,7 +155,7 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=4: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Process has 123 diagrams -1 processes with 123 diagrams generated in 0.168 s +1 processes with 123 diagrams generated in 0.157 s Total: 1 processes with 123 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gg_ttgg Load PLUGIN.CUDACPP_OUTPUT @@ -175,7 +175,7 @@ INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TM FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg/./CPPProcess.h FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg/./CPPProcess.cc INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg/. -Generated helas calls for 1 subprocesses (123 diagrams) in 0.452 s +Generated helas calls for 1 subprocesses (123 diagrams) in 0.432 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines @@ -183,7 +183,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.340 s +ALOHA: aloha creates 5 routines in 0.323 s VVV1 VVV1 FFV1 @@ -206,7 +206,7 @@ INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/. 
quit -real 0m1.536s -user 0m1.455s -sys 0m0.066s +real 0m1.851s +user 0m1.398s +sys 0m0.060s Code generation completed in 2 seconds diff --git a/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt b/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt index bc4cb5e760..37ad313b62 100644 --- a/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt +++ b/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ g g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005844831466674805  +DEBUG: model prefixing takes 0.005320072174072266  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,7 +155,7 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=5: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Process has 1240 diagrams -1 processes with 1240 diagrams generated in 2.003 s +1 processes with 1240 diagrams generated in 1.852 s Total: 1 processes with 1240 diagrams output madevent ../TMPOUT/CODEGEN_mad_gg_ttggg --hel_recycling=False --vector_size=32 --me_exporter=standalone_cudacpp Load PLUGIN.CUDACPP_OUTPUT @@ -177,7 +177,7 @@ INFO: Creating files in directory P1_gg_ttxggg INFO: Computing Color-Flow optimization [15120 term] INFO: Color-Flow passed to 1630 term in 8s. Introduce 3030 contraction DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -192,15 +192,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxggg -Generated helas calls for 1 subprocesses (1240 diagrams) in 7.023 s -Wrote files for 2281 helas calls in 19.759 s +Generated helas calls for 1 subprocesses (1240 diagrams) in 6.532 s +Wrote files for 2281 helas calls in 18.428 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.339 s +ALOHA: aloha creates 5 routines in 0.343 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines @@ -208,7 +208,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 10 routines in 0.331 s +ALOHA: aloha creates 10 routines in 0.327 s VVV1 VVV1 FFV1 @@ -257,10 +257,10 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. 
quit -real 0m31.161s -user 0m30.606s -sys 0m0.431s -Code generation completed in 31 seconds +real 0m29.153s +user 0m28.584s +sys 0m0.440s +Code generation completed in 29 seconds ************************************************************ * * * W E L C O M E to * diff --git a/epochX/cudacpp/gg_ttggg.mad/bin/internal/launch_plugin.py b/epochX/cudacpp/gg_ttggg.mad/bin/internal/launch_plugin.py index 3b09713e12..90af6b7053 100644 --- a/epochX/cudacpp/gg_ttggg.mad/bin/internal/launch_plugin.py +++ b/epochX/cudacpp/gg_ttggg.mad/bin/internal/launch_plugin.py @@ -103,7 +103,7 @@ def default_setup(self): fct_mod=(self.reset_makeopts,(),{}), allowed=['auto', 'none', 'sse4', 'avx2','512y','512z']) self.add_param('cudacpp_backend', 'CPP', include=False, hidden=False, - allowed=['Fortan', 'CPP', 'CUDA']) + allowed=['Fortran', 'CPP', 'CUDA']) self['vector_size'] = 16 # already setup in default class (just change value) self['aloha_flag'] = '--fast-math' self['matrix_flag'] = '-O3' diff --git a/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt b/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt index 0ee2c63a79..382962d284 100644 --- a/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt +++ b/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt @@ -62,7 +62,7 @@ generate g g > t t~ g g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005843639373779297  +DEBUG: model prefixing takes 0.005497932434082031  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,7 +155,7 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=5: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Process has 1240 diagrams -1 processes with 1240 diagrams generated in 1.996 s +1 processes with 1240 diagrams generated in 1.865 s Total: 1 processes with 1240 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gg_ttggg Load PLUGIN.CUDACPP_OUTPUT @@ -175,7 +175,7 @@ INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TM FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/./CPPProcess.h FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/./CPPProcess.cc INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/. -Generated helas calls for 1 subprocesses (1240 diagrams) in 7.005 s +Generated helas calls for 1 subprocesses (1240 diagrams) in 6.712 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines @@ -183,7 +183,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.376 s +ALOHA: aloha creates 5 routines in 0.345 s VVV1 VVV1 FFV1 @@ -206,7 +206,7 @@ INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/. 
and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/. quit -real 0m13.898s -user 0m13.688s -sys 0m0.137s -Code generation completed in 14 seconds +real 0m13.123s +user 0m12.875s +sys 0m0.149s +Code generation completed in 13 seconds diff --git a/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt b/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt index 59ac167900..cfbc521449 100644 --- a/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt +++ b/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt @@ -61,7 +61,7 @@ set zerowidth_tchannel F define q = u c d s u~ c~ d~ s~ INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.00553131103515625  +DEBUG: model prefixing takes 0.0053598880767822266  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -170,7 +170,7 @@ INFO: Crossed process found for g u~ > t t~ u~, reuse diagrams. INFO: Crossed process found for g c~ > t t~ c~, reuse diagrams. INFO: Crossed process found for g d~ > t t~ d~, reuse diagrams. INFO: Crossed process found for g s~ > t t~ s~, reuse diagrams. -8 processes with 40 diagrams generated in 0.083 s +8 processes with 40 diagrams generated in 0.079 s Total: 8 processes with 40 diagrams output madevent ../TMPOUT/CODEGEN_mad_gq_ttq --hel_recycling=False --vector_size=32 --me_exporter=standalone_cudacpp Load PLUGIN.CUDACPP_OUTPUT @@ -198,7 +198,7 @@ INFO: Combined process g d~ > t t~ d~ WEIGHTED<=3 @1 with process g u~ > t t~ u~ INFO: Combined process g s~ > t t~ s~ WEIGHTED<=3 @1 with process g u~ > t t~ u~ WEIGHTED<=3 @1 INFO: Creating files in directory P1_gu_ttxu DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -215,7 +215,7 @@ INFO: Generating Feynman diagrams for Process: g u > t t~ u WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gu_ttxu INFO: Creating files in directory P1_gux_ttxux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -230,17 +230,17 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: g u~ > t t~ u~ WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gux_ttxux -Generated helas calls for 2 subprocesses (10 diagrams) in 0.033 s -Wrote files for 32 helas calls in 0.241 s +Generated helas calls for 2 subprocesses (10 diagrams) in 0.043 s +Wrote files for 32 helas calls in 0.217 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVV1 routines -ALOHA: aloha creates 2 routines in 0.156 s +ALOHA: aloha creates 2 routines in 0.144 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVV1 routines -ALOHA: aloha creates 4 routines in 0.141 s +ALOHA: aloha creates 4 routines in 0.130 s FFV1 FFV1 FFV1 @@ -294,9 +294,9 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. quit -real 0m2.049s -user 0m1.786s -sys 0m0.254s +real 0m1.946s +user 0m1.693s +sys 0m0.230s Code generation completed in 2 seconds ************************************************************ * * diff --git a/epochX/cudacpp/gq_ttq.mad/bin/internal/launch_plugin.py b/epochX/cudacpp/gq_ttq.mad/bin/internal/launch_plugin.py index 3b09713e12..90af6b7053 100644 --- a/epochX/cudacpp/gq_ttq.mad/bin/internal/launch_plugin.py +++ b/epochX/cudacpp/gq_ttq.mad/bin/internal/launch_plugin.py @@ -103,7 +103,7 @@ def default_setup(self): fct_mod=(self.reset_makeopts,(),{}), allowed=['auto', 'none', 'sse4', 'avx2','512y','512z']) self.add_param('cudacpp_backend', 'CPP', include=False, hidden=False, - allowed=['Fortan', 'CPP', 'CUDA']) + allowed=['Fortran', 'CPP', 'CUDA']) self['vector_size'] = 16 # already setup in default class (just change value) self['aloha_flag'] = '--fast-math' self['matrix_flag'] = '-O3' diff --git a/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt b/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt index 5ad4ee5be1..fe303ed372 100644 --- a/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt +++ b/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt @@ -61,7 +61,7 @@ set zerowidth_tchannel F define q = u c d s u~ c~ d~ s~ INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.00584101676940918  +DEBUG: model prefixing takes 0.0057065486907958984  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -170,7 +170,7 @@ INFO: Crossed process found for g u~ > t t~ u~, reuse diagrams. INFO: Crossed process found for g c~ > t t~ c~, reuse diagrams. INFO: Crossed process found for g d~ > t t~ d~, reuse diagrams. INFO: Crossed process found for g s~ > t t~ s~, reuse diagrams. 
-8 processes with 40 diagrams generated in 0.083 s +8 processes with 40 diagrams generated in 0.077 s Total: 8 processes with 40 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gq_ttq Load PLUGIN.CUDACPP_OUTPUT @@ -206,12 +206,12 @@ INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TM FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux/./CPPProcess.h FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux/./CPPProcess.cc INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux/. -Generated helas calls for 2 subprocesses (10 diagrams) in 0.032 s +Generated helas calls for 2 subprocesses (10 diagrams) in 0.031 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVV1 routines -ALOHA: aloha creates 2 routines in 0.157 s +ALOHA: aloha creates 2 routines in 0.141 s FFV1 FFV1 FFV1 @@ -227,7 +227,7 @@ INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/. quit -real 0m0.692s -user 0m0.624s -sys 0m0.058s +real 0m0.652s +user 0m0.583s +sys 0m0.057s Code generation completed in 1 seconds diff --git a/epochX/cudacpp/heft_gg_h.sa/CODEGEN_cudacpp_heft_gg_h_log.txt b/epochX/cudacpp/heft_gg_h.sa/CODEGEN_cudacpp_heft_gg_h_log.txt index 17b7cd7789..1054438636 100644 --- a/epochX/cudacpp/heft_gg_h.sa/CODEGEN_cudacpp_heft_gg_h_log.txt +++ b/epochX/cudacpp/heft_gg_h.sa/CODEGEN_cudacpp_heft_gg_h_log.txt @@ -62,6 +62,12 @@ set auto_convert_model T save options auto_convert_model save configuration file to /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo/input/mg5_configuration.txt import model heft +INFO: reload from .py file +INFO: load particles +INFO: load vertices +WARNING: coupling GC_13=-(complex(0,1)*GH) has direct dependence in aS but has QCD order set to 0. Automatic computation of scale uncertainty can be wrong for such model.  +WARNING: coupling GC_16=(complex(0,1)*Gphi)/8. has direct dependence in aS but has QCD order set to 0. Automatic computation of scale uncertainty can be wrong for such model.  +DEBUG: model prefixing takes 0.005749940872192383  INFO: Restrict model heft with file models/heft/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: s u w+ at order: QED=1  @@ -153,7 +159,7 @@ Generated helas calls for 1 subprocesses (1 diagrams) in 0.002 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVS3 routines -ALOHA: aloha creates 1 routines in 0.065 s +ALOHA: aloha creates 1 routines in 0.069 s VVS3 FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_h/src/./HelAmps_heft.h INFO: Created file HelAmps_heft.h in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_h/src/. 
@@ -165,7 +171,7 @@ INFO: Created files Parameters_heft.h and Parameters_heft.cc in directory INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_h/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_h/src/. quit -real 0m0.448s -user 0m0.390s -sys 0m0.053s +real 0m0.616s +user 0m0.387s +sys 0m0.049s Code generation completed in 1 seconds diff --git a/epochX/cudacpp/pp_tt012j.mad/CODEGEN_mad_pp_tt012j_log.txt b/epochX/cudacpp/pp_tt012j.mad/CODEGEN_mad_pp_tt012j_log.txt index ca1a8be0ce..e01d29e02f 100644 --- a/epochX/cudacpp/pp_tt012j.mad/CODEGEN_mad_pp_tt012j_log.txt +++ b/epochX/cudacpp/pp_tt012j.mad/CODEGEN_mad_pp_tt012j_log.txt @@ -61,7 +61,7 @@ set zerowidth_tchannel F define j = p INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005836963653564453  +DEBUG: model prefixing takes 0.005494594573974609  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -172,7 +172,7 @@ INFO: Process u~ u > t t~ added to mirror process u u~ > t t~ INFO: Process c~ c > t t~ added to mirror process c c~ > t t~ INFO: Process d~ d > t t~ added to mirror process d d~ > t t~ INFO: Process s~ s > t t~ added to mirror process s s~ > t t~ -5 processes with 7 diagrams generated in 0.031 s +5 processes with 7 diagrams generated in 0.029 s Total: 5 processes with 7 diagrams add process p p > t t~ j @1 INFO: Checking for minimal orders which gives processes. @@ -212,7 +212,7 @@ INFO: Process d~ g > t t~ d~ added to mirror process g d~ > t t~ d~ INFO: Process d~ d > t t~ g added to mirror process d d~ > t t~ g INFO: Process s~ g > t t~ s~ added to mirror process g s~ > t t~ s~ INFO: Process s~ s > t t~ g added to mirror process s s~ > t t~ g -13 processes with 76 diagrams generated in 0.145 s +13 processes with 76 diagrams generated in 0.136 s Total: 18 processes with 83 diagrams add process p p > t t~ j j @2 INFO: Checking for minimal orders which gives processes. @@ -378,7 +378,7 @@ INFO: Process s~ u~ > t t~ u~ s~ added to mirror process u~ s~ > t t~ u~ s~ INFO: Process s~ c~ > t t~ c~ s~ added to mirror process c~ s~ > t t~ c~ s~ INFO: Process s~ d~ > t t~ d~ s~ added to mirror process d~ s~ > t t~ d~ s~ INFO: Crossed process found for s~ s~ > t t~ s~ s~, reuse diagrams. -65 processes with 1119 diagrams generated in 1.949 s +65 processes with 1119 diagrams generated in 1.826 s Total: 83 processes with 1202 diagrams output madevent ../TMPOUT/CODEGEN_mad_pp_tt012j --hel_recycling=False --vector_size=32 --me_exporter=standalone_cudacpp Load PLUGIN.CUDACPP_OUTPUT @@ -497,7 +497,7 @@ INFO: Combined process d d~ > t t~ WEIGHTED<=2 with process u u~ > t t~ WEIGHTED INFO: Combined process s s~ > t t~ WEIGHTED<=2 with process u u~ > t t~ WEIGHTED<=2 INFO: Creating files in directory P2_gg_ttxgg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . 
FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -514,7 +514,7 @@ INFO: Generating Feynman diagrams for Process: g g > t t~ g g WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group gg_ttxgg INFO: Creating files in directory P2_gg_ttxuux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -531,7 +531,7 @@ INFO: Generating Feynman diagrams for Process: g g > t t~ u u~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group gg_ttxuux INFO: Creating files in directory P2_gu_ttxgu DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -548,7 +548,7 @@ INFO: Generating Feynman diagrams for Process: g u > t t~ g u WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group gu_ttxgu INFO: Creating files in directory P2_gux_ttxgux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -565,7 +565,7 @@ INFO: Generating Feynman diagrams for Process: g u~ > t t~ g u~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group gux_ttxgux INFO: Creating files in directory P2_uux_ttxgg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -582,7 +582,7 @@ INFO: Generating Feynman diagrams for Process: u u~ > t t~ g g WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uux_ttxgg INFO: Creating files in directory P1_gg_ttxg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -599,7 +599,7 @@ INFO: Generating Feynman diagrams for Process: g g > t t~ g WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxg INFO: Creating files in directory P2_uu_ttxuu DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -616,7 +616,7 @@ INFO: Generating Feynman diagrams for Process: u u > t t~ u u WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uu_ttxuu INFO: Creating files in directory P2_uux_ttxuux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . 
FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -633,7 +633,7 @@ INFO: Generating Feynman diagrams for Process: u u~ > t t~ u u~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uux_ttxuux INFO: Creating files in directory P2_uxux_ttxuxux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -650,7 +650,7 @@ INFO: Generating Feynman diagrams for Process: u~ u~ > t t~ u~ u~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uxux_ttxuxux INFO: Creating files in directory P2_uc_ttxuc DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -667,7 +667,7 @@ INFO: Generating Feynman diagrams for Process: u c > t t~ u c WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uc_ttxuc INFO: Creating files in directory P2_uux_ttxccx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -684,7 +684,7 @@ INFO: Generating Feynman diagrams for Process: u u~ > t t~ c c~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uux_ttxccx INFO: Creating files in directory P2_ucx_ttxucx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -701,7 +701,7 @@ INFO: Generating Feynman diagrams for Process: u c~ > t t~ u c~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group ucx_ttxucx INFO: Creating files in directory P2_uxcx_ttxuxcx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -718,7 +718,7 @@ INFO: Generating Feynman diagrams for Process: u~ c~ > t t~ u~ c~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uxcx_ttxuxcx INFO: Creating files in directory P1_gu_ttxu DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -735,7 +735,7 @@ INFO: Generating Feynman diagrams for Process: g u > t t~ u WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gu_ttxu INFO: Creating files in directory P1_gux_ttxux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . 
FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -752,7 +752,7 @@ INFO: Generating Feynman diagrams for Process: g u~ > t t~ u~ WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gux_ttxux INFO: Creating files in directory P1_uux_ttxg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -769,7 +769,7 @@ INFO: Generating Feynman diagrams for Process: u u~ > t t~ g WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group uux_ttxg INFO: Creating files in directory P0_gg_ttx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -786,7 +786,7 @@ INFO: Generating Feynman diagrams for Process: g g > t t~ WEIGHTED<=2 INFO: Finding symmetric diagrams for subprocess group gg_ttx INFO: Creating files in directory P0_uux_ttx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -801,15 +801,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: u u~ > t t~ WEIGHTED<=2 INFO: Finding symmetric diagrams for subprocess group uux_ttx -Generated helas calls for 18 subprocesses (372 diagrams) in 1.379 s -Wrote files for 810 helas calls in 3.511 s +Generated helas calls for 18 subprocesses (372 diagrams) in 1.285 s +Wrote files for 810 helas calls in 3.231 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.358 s +ALOHA: aloha creates 5 routines in 0.334 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines @@ -817,7 +817,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 10 routines in 0.332 s +ALOHA: aloha creates 10 routines in 0.314 s VVV1 VVV1 FFV1 @@ -1028,9 +1028,9 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. 
quit -real 0m9.505s -user 0m8.862s -sys 0m0.556s +real 0m8.840s +user 0m8.307s +sys 0m0.499s Code generation completed in 9 seconds ************************************************************ * * diff --git a/epochX/cudacpp/pp_tt012j.mad/bin/internal/launch_plugin.py b/epochX/cudacpp/pp_tt012j.mad/bin/internal/launch_plugin.py index 3b09713e12..90af6b7053 100644 --- a/epochX/cudacpp/pp_tt012j.mad/bin/internal/launch_plugin.py +++ b/epochX/cudacpp/pp_tt012j.mad/bin/internal/launch_plugin.py @@ -103,7 +103,7 @@ def default_setup(self): fct_mod=(self.reset_makeopts,(),{}), allowed=['auto', 'none', 'sse4', 'avx2','512y','512z']) self.add_param('cudacpp_backend', 'CPP', include=False, hidden=False, - allowed=['Fortan', 'CPP', 'CUDA']) + allowed=['Fortran', 'CPP', 'CUDA']) self['vector_size'] = 16 # already setup in default class (just change value) self['aloha_flag'] = '--fast-math' self['matrix_flag'] = '-O3' From eeee118949d113d082b130cbafff70fa2cccc04b Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Wed, 31 Jan 2024 13:15:18 +0200 Subject: [PATCH 94/96] [jt774] rerun all 78 tput tests on LUMI - same issues as before for gqttq (#806) (1) Step 1 - build on the login node (almost 24 hours!) STARTED AT Tue 30 Jan 2024 02:27:18 AM EET ./tput/teeThroughputX.sh -mix -hrd -makej -eemumu -ggtt -ggttg -ggttgg -gqttq -ggttggg -makeclean -makeonly ENDED(1) AT Wed 31 Jan 2024 12:32:21 AM EET [Status=0] ./tput/teeThroughputX.sh -flt -hrd -makej -eemumu -ggtt -ggttgg -inlonly -makeclean -makeonly ENDED(2) AT Wed 31 Jan 2024 01:01:06 AM EET [Status=0] ./tput/teeThroughputX.sh -makej -eemumu -ggtt -ggttg -gqttq -ggttgg -ggttggg -flt -bridge -makeclean -makeonly ENDED(3) AT Wed 31 Jan 2024 01:13:56 AM EET [Status=0] ./tput/teeThroughputX.sh -eemumu -ggtt -ggttgg -flt -rmbhst -makeonly ENDED(4) AT Wed 31 Jan 2024 01:16:06 AM EET [Status=0] ./tput/teeThroughputX.sh -eemumu -ggtt -ggttgg -flt -rorhst -makeonly ENDED(5) AT Wed 31 Jan 2024 01:18:44 AM EET [Status=0] (2) Step 2 - run tests on the worker node (less than 2 hours) NB this is "./tput/allTees.sh" WITHOUT the -hip flag (no "-rorhst" added) STARTED AT Wed 31 Jan 2024 01:16:39 PM EET ./tput/teeThroughputX.sh -mix -hrd -makej -eemumu -ggtt -ggttg -ggttgg -gqttq -ggttggg -makeclean ENDED(1) AT Wed 31 Jan 2024 02:09:05 PM EET [Status=2] ./tput/teeThroughputX.sh -flt -hrd -makej -eemumu -ggtt -ggttgg -inlonly -makeclean ENDED(2) AT Wed 31 Jan 2024 02:26:12 PM EET [Status=0] ./tput/teeThroughputX.sh -makej -eemumu -ggtt -ggttg -gqttq -ggttgg -ggttggg -flt -bridge -makeclean ENDED(3) AT Wed 31 Jan 2024 02:45:10 PM EET [Status=2] ./tput/teeThroughputX.sh -eemumu -ggtt -ggttgg -flt -rmbhst ENDED(4) AT Wed 31 Jan 2024 02:48:54 PM EET [Status=0] ./tput/teeThroughputX.sh -eemumu -ggtt -ggttgg -flt -curhst ENDED(5) AT Wed 31 Jan 2024 02:51:15 PM EET [Status=0] ./tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0_bridge.txt:Backtrace for this error: ./tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0_bridge.txt:ERROR! Fortran calculation (F77/CUDA) crashed ./tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd1.txt:Backtrace for this error: ./tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd1.txt:ERROR! Fortran calculation (F77/CUDA) crashed ./tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd1.txt:Backtrace for this error: ./tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd1.txt:ERROR! Fortran calculation (F77/CUDA) crashed ./tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt:Backtrace for this error: ./tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt:ERROR! 
Fortran calculation (F77/CUDA) crashed ./tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt:Backtrace for this error: ./tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt:ERROR! Fortran calculation (F77/CUDA) crashed ./tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd1.txt:Backtrace for this error: ./tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd1.txt:ERROR! Fortran calculation (F77/CUDA) crashed ./tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt:Backtrace for this error: ./tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt:ERROR! Fortran calculation (F77/CUDA) crashed ./tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0_bridge.txt:Backtrace for this error: ./tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0_bridge.txt:ERROR! Fortran calculation (F77/CUDA) crashed --- .../log_eemumu_mad_d_inl0_hrd0.txt | 227 ++++++--------- .../log_eemumu_mad_d_inl0_hrd0_bridge.txt | 234 ++++++--------- .../log_eemumu_mad_d_inl0_hrd0_common.txt | 213 ++++++-------- .../log_eemumu_mad_d_inl0_hrd0_curhst.txt | 210 +++++--------- .../log_eemumu_mad_d_inl0_hrd0_rmbhst.txt | 229 ++++++--------- .../log_eemumu_mad_d_inl0_hrd1.txt | 227 ++++++--------- .../log_eemumu_mad_d_inl1_hrd0.txt | 225 ++++++-------- .../log_eemumu_mad_d_inl1_hrd1.txt | 225 ++++++-------- .../log_eemumu_mad_f_inl0_hrd0.txt | 239 +++++++-------- .../log_eemumu_mad_f_inl0_hrd0_bridge.txt | 246 +++++++--------- .../log_eemumu_mad_f_inl0_hrd0_common.txt | 227 ++++++--------- .../log_eemumu_mad_f_inl0_hrd0_curhst.txt | 222 +++++--------- .../log_eemumu_mad_f_inl0_hrd0_rmbhst.txt | 241 +++++++-------- .../log_eemumu_mad_f_inl0_hrd1.txt | 239 +++++++-------- .../log_eemumu_mad_f_inl1_hrd0.txt | 237 ++++++--------- .../log_eemumu_mad_f_inl1_hrd1.txt | 237 ++++++--------- .../log_eemumu_mad_m_inl0_hrd0.txt | 227 ++++++--------- .../log_eemumu_mad_m_inl0_hrd1.txt | 227 ++++++--------- .../log_ggtt_mad_d_inl0_hrd0.txt | 227 ++++++--------- .../log_ggtt_mad_d_inl0_hrd0_bridge.txt | 234 ++++++--------- .../log_ggtt_mad_d_inl0_hrd0_common.txt | 213 ++++++-------- .../log_ggtt_mad_d_inl0_hrd0_curhst.txt | 210 +++++--------- .../log_ggtt_mad_d_inl0_hrd0_rmbhst.txt | 229 ++++++--------- .../log_ggtt_mad_d_inl0_hrd1.txt | 227 ++++++--------- .../log_ggtt_mad_d_inl1_hrd0.txt | 225 ++++++-------- .../log_ggtt_mad_d_inl1_hrd1.txt | 225 ++++++-------- .../log_ggtt_mad_f_inl0_hrd0.txt | 245 +++++++--------- .../log_ggtt_mad_f_inl0_hrd0_bridge.txt | 252 +++++++--------- .../log_ggtt_mad_f_inl0_hrd0_common.txt | 239 +++++++-------- .../log_ggtt_mad_f_inl0_hrd0_curhst.txt | 228 +++++---------- .../log_ggtt_mad_f_inl0_hrd0_rmbhst.txt | 247 +++++++--------- .../log_ggtt_mad_f_inl0_hrd1.txt | 245 +++++++--------- .../log_ggtt_mad_f_inl1_hrd0.txt | 239 +++++++-------- .../log_ggtt_mad_f_inl1_hrd1.txt | 239 +++++++-------- .../log_ggtt_mad_m_inl0_hrd0.txt | 225 ++++++-------- .../log_ggtt_mad_m_inl0_hrd1.txt | 225 ++++++-------- .../log_ggttg_mad_d_inl0_hrd0.txt | 250 +++++++--------- .../log_ggttg_mad_d_inl0_hrd0_bridge.txt | 258 +++++++---------- .../log_ggttg_mad_d_inl0_hrd1.txt | 250 +++++++--------- .../log_ggttg_mad_f_inl0_hrd0.txt | 264 +++++++---------- .../log_ggttg_mad_f_inl0_hrd0_bridge.txt | 272 +++++++---------- .../log_ggttg_mad_f_inl0_hrd1.txt | 264 +++++++---------- .../log_ggttg_mad_m_inl0_hrd0.txt | 250 +++++++--------- .../log_ggttg_mad_m_inl0_hrd1.txt | 250 +++++++--------- .../log_ggttgg_mad_d_inl0_hrd0.txt | 250 +++++++--------- .../log_ggttgg_mad_d_inl0_hrd0_bridge.txt | 258 +++++++---------- .../log_ggttgg_mad_d_inl0_hrd0_common.txt | 234 ++++++--------- 
.../log_ggttgg_mad_d_inl0_hrd0_curhst.txt | 228 +++++---------- .../log_ggttgg_mad_d_inl0_hrd0_rmbhst.txt | 253 +++++++--------- .../log_ggttgg_mad_d_inl0_hrd1.txt | 250 +++++++--------- .../log_ggttgg_mad_d_inl1_hrd0.txt | 252 +++++++--------- .../log_ggttgg_mad_d_inl1_hrd1.txt | 252 +++++++--------- .../log_ggttgg_mad_f_inl0_hrd0.txt | 266 +++++++---------- .../log_ggttgg_mad_f_inl0_hrd0_bridge.txt | 274 +++++++----------- .../log_ggttgg_mad_f_inl0_hrd0_common.txt | 258 +++++++---------- .../log_ggttgg_mad_f_inl0_hrd0_curhst.txt | 244 ++++++---------- .../log_ggttgg_mad_f_inl0_hrd0_rmbhst.txt | 269 +++++++---------- .../log_ggttgg_mad_f_inl0_hrd1.txt | 266 +++++++---------- .../log_ggttgg_mad_f_inl1_hrd0.txt | 270 ++++++++--------- .../log_ggttgg_mad_f_inl1_hrd1.txt | 270 ++++++++--------- .../log_ggttgg_mad_m_inl0_hrd0.txt | 246 +++++++--------- .../log_ggttgg_mad_m_inl0_hrd1.txt | 246 +++++++--------- .../log_ggttggg_mad_d_inl0_hrd0.txt | 250 +++++++--------- .../log_ggttggg_mad_d_inl0_hrd0_bridge.txt | 258 +++++++---------- .../log_ggttggg_mad_d_inl0_hrd1.txt | 250 +++++++--------- .../log_ggttggg_mad_f_inl0_hrd0.txt | 266 +++++++---------- .../log_ggttggg_mad_f_inl0_hrd0_bridge.txt | 274 +++++++----------- .../log_ggttggg_mad_f_inl0_hrd1.txt | 266 +++++++---------- .../log_ggttggg_mad_m_inl0_hrd0.txt | 250 +++++++--------- .../log_ggttggg_mad_m_inl0_hrd1.txt | 250 +++++++--------- .../log_gqttq_mad_d_inl0_hrd0.txt | 253 +++++----------- .../log_gqttq_mad_d_inl0_hrd0_bridge.txt | 263 +++++------------ .../log_gqttq_mad_d_inl0_hrd1.txt | 253 +++++----------- .../log_gqttq_mad_f_inl0_hrd0.txt | 253 +++++----------- .../log_gqttq_mad_f_inl0_hrd0_bridge.txt | 263 +++++------------ .../log_gqttq_mad_f_inl0_hrd1.txt | 253 +++++----------- .../log_gqttq_mad_m_inl0_hrd0.txt | 253 +++++----------- .../log_gqttq_mad_m_inl0_hrd1.txt | 253 +++++----------- 78 files changed, 7390 insertions(+), 11668 deletions(-) diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt index 15dbd5f8d1..d8bb554a39 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt @@ -1,209 +1,164 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasCurand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) +RNDGEN=hasNoCurand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
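The gqttq crash list in the commit message above was produced with grep-style scans of the tput logs for two markers, 'Backtrace for this error:' and 'ERROR! Fortran calculation (F77/CUDA) crashed'. A hypothetical Python equivalent of that scan (the helper name and logic are illustrative, not part of the repository):

    import glob

    def summarize_crashes(logdir='tput/logs_gqttq_mad'):
        # scan every tput log for the two crash markers quoted above
        markers = ('Backtrace for this error:',
                   'ERROR! Fortran calculation (F77/CUDA) crashed')
        for path in sorted(glob.glob(f'{logdir}/log_*.txt')):
            with open(path) as f:
                text = f.read()
            for marker in markers:
                if marker in text:
                    print(f'{path}:{marker}')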
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-01-30_04:51:46 +DATE: 2024-01-31_13:48:01 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 12 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.572573e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.281942e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.116391e+08 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 0.839714 sec - 2,719,217,340 cycles # 2.832 GHz - 4,277,615,433 instructions # 1.57 insn per cycle - 1.175143775 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 166 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 5.295982e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.113400e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.341424e+07 ) sec^-1 +MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 +TOTAL : 4.999466 sec + 15,384,344,417 cycles:u # 2.941 GHz (74.97%) + 53,752,468 stalled-cycles-frontend:u # 0.35% frontend cycles idle (75.02%) + 6,944,996,178 stalled-cycles-backend:u # 45.14% backend cycles idle (75.06%) + 11,608,343,150 instructions:u # 0.75 insn per cycle + # 0.60 stalled cycles per insn (74.87%) + 5.541765598 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.282804e-02 -Avg ME (F77/CUDA) = 1.2828039868165201E-002 -Relative difference = 1.0277080522138477e-08 +Avg ME (F77/CUDA) = 1.2828039868165208E-002 +Relative difference = 1.0277079981222336e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe -p 2048 256 12 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.879157e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.147243e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.147243e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 6.790847 sec - 19,539,640,504 cycles # 2.876 GHz - 46,935,351,432 instructions # 2.40 insn per cycle - 6.804517518 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 472) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.249676e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.429139e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.429139e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 +TOTAL : 5.771162 sec + 19,518,544,594 cycles:u # 3.364 GHz (74.91%) + 49,953,673 stalled-cycles-frontend:u # 0.26% frontend cycles idle (74.97%) + 62,395,027 stalled-cycles-backend:u # 0.32% backend cycles idle (75.04%) + 46,990,078,774 instructions:u # 2.41 insn per cycle + # 0.00 stalled cycles per insn (75.05%) + 5.805858662 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 471) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 12 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.545376e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.021398e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.021398e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.488904 sec - 12,869,370,410 cycles # 2.864 GHz - 31,186,180,279 instructions # 2.42 insn per cycle - 4.505888529 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 1.926426e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.430773e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.430773e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 +TOTAL : 4.000237 sec + 13,293,955,872 cycles:u # 3.297 GHz (75.00%) + 49,498,965 stalled-cycles-frontend:u # 0.37% frontend cycles idle (75.00%) + 995,469,090 stalled-cycles-backend:u # 7.49% backend cycles idle (75.01%) + 31,161,260,421 instructions:u # 2.34 insn per cycle + # 0.03 stalled cycles per insn (75.02%) + 4.036423376 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1626) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 12 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.955981e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.735873e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.735873e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.633222 sec - 10,032,348,170 cycles # 2.758 GHz - 19,481,701,848 instructions # 1.94 insn per cycle - 3.651370321 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1964) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.653277e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.528834e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.528834e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 +TOTAL : 3.105945 sec + 10,166,660,682 cycles:u # 3.240 GHz (74.93%) + 48,671,590 stalled-cycles-frontend:u # 0.48% frontend cycles idle (75.02%) + 439,852,148 stalled-cycles-backend:u # 4.33% backend cycles idle (75.02%) + 19,408,273,106 instructions:u # 1.91 insn per cycle + # 0.02 stalled cycles per insn (75.02%) + 3.142037322 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1946) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165090E-002 Relative difference = 1.0277089176796747e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe -p 2048 256 12 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.070263e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.978600e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.978600e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.453661 sec - 9,572,367,477 cycles # 2.767 GHz - 18,943,715,958 instructions # 1.98 insn per cycle - 3.473553059 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1655) (512y: 161) (512z: 0) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039868165090E-002 -Relative difference = 1.0277089176796747e-08 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe -p 2048 256 12 OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.819162e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.469996e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.469996e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.879359 sec - 8,193,098,191 cycles # 2.110 GHz - 15,513,331,501 instructions # 1.89 insn per cycle - 3.898953032 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 920) (512y: 59) (512z: 1220) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. 
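Every section of these logs ends with the same acceptance test: the average matrix element of the C++ (or CUDA/HIP) run is compared with the Fortran one, and the run passes if the relative difference is at most 5E-3. A minimal sketch of that check, using values from the log above (hypothetical helper; the actual comparison is done by the cmpExe scripts):

    def me_cross_check(avg_me_cpp, avg_me_f77, tol=5e-3):
        # e.g. 1.282804e-02 vs 1.2828039868164916e-02 -> ~1.03e-08, well within tolerance
        rel_diff = abs(avg_me_cpp - avg_me_f77) / abs(avg_me_cpp)
        return rel_diff <= tol

    assert me_cross_check(1.282804e-02, 1.2828039868164916e-02)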
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039868165090E-002 -Relative difference = 1.0277089176796747e-08 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_bridge.txt index f78ea7251e..eca66f0c00 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_bridge.txt @@ -1,222 +1,170 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasCurand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) +RNDGEN=hasNoCurand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
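The build preamble just above encodes the full configuration into a tag: SIMD level (AVX), floating-point precision (FPTYPE), helicity inlining (HELINL), hardcoded parameters (HRDCOD) and the random-number backend (RNDGEN). A minimal sketch of how such a tag is composed (illustrative Python, not the actual makefile logic):

    def build_tag(avx='avx2', fptype='d', helinl=0, hrdcod=0, rndgen='hasNoCurand'):
        # BUILDDIR is 'build.' plus the same tag without the RNDGEN suffix
        return f'{avx}_{fptype}_inl{helinl}_hrd{hrdcod}_{rndgen}'

    assert build_tag() == 'avx2_d_inl0_hrd0_hasNoCurand'  # as printed in the log above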
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-01-30_05:45:26 +DATE: 2024-01-31_14:38:29 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 12 --bridge OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 12 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.460171e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.485962e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.485962e+07 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 2.319187 sec - 7,341,770,811 cycles # 2.857 GHz - 13,101,723,847 instructions # 1.78 insn per cycle - 2.628471382 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -==PROF== Profiling "sigmaKin": launch__registers_per_thread 166 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 6.488604e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.339655e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.339655e+07 ) sec^-1 +MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 +TOTAL : 5.547931 sec + 18,342,913,114 cycles:u # 3.284 GHz (74.95%) + 120,415,042 stalled-cycles-frontend:u # 0.66% frontend cycles idle (74.96%) + 6,982,028,143 stalled-cycles-backend:u # 38.06% backend cycles idle (75.02%) + 17,140,288,864 instructions:u # 0.93 insn per cycle + # 0.41 stalled cycles per insn (75.08%) + 5.611856863 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.282804e-02 -Avg ME (F77/CUDA) = 1.2828039868165201E-002 -Relative difference = 1.0277080522138477e-08 +Avg ME (F77/CUDA) = 1.2828039868165208E-002 +Relative difference = 1.0277079981222336e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.576223e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.107198e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.107198e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 7.190455 sec - 20,703,597,440 cycles # 2.877 GHz - 47,160,901,733 instructions # 2.28 insn per cycle - 7.198222207 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 472) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.233619e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.408095e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.408095e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 +TOTAL : 5.950601 sec + 19,929,648,484 cycles:u # 3.325 GHz (74.91%) + 51,037,859 stalled-cycles-frontend:u # 0.26% frontend cycles idle (74.95%) + 111,782,691 stalled-cycles-backend:u # 0.56% backend cycles idle (75.01%) + 47,317,725,455 instructions:u # 2.37 insn per cycle + # 0.00 stalled cycles per insn (75.05%) + 5.995864091 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 471) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.473769e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.897978e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.897978e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.898106 sec - 14,084,591,919 cycles # 2.873 GHz - 32,028,151,491 instructions # 2.27 insn per cycle - 4.906157596 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 1.869740e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.339491e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.339491e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 +TOTAL : 4.242404 sec + 13,932,338,033 cycles:u # 3.251 GHz (75.02%) + 51,648,161 stalled-cycles-frontend:u # 0.37% frontend cycles idle (75.00%) + 1,023,128,866 stalled-cycles-backend:u # 7.34% backend cycles idle (74.99%) + 31,992,864,934 instructions:u # 2.30 insn per cycle + # 0.03 stalled cycles per insn (74.99%) + 4.289164967 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1626) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.834615e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.502061e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.502061e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.065584 sec - 11,264,443,170 cycles # 2.767 GHz - 20,844,723,129 instructions # 1.85 insn per cycle - 4.073296839 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1964) (512y: 0) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039868165090E-002 -Relative difference = 1.0277089176796747e-08 -OK (relative difference <= 5E-3) +EvtsPerSec[Rmb+ME] (23) = ( 2.545589e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.346989e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.346989e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 +TOTAL : 3.357207 sec + 10,846,967,886 cycles:u # 3.190 GHz (74.86%) + 50,608,240 stalled-cycles-frontend:u # 0.47% frontend cycles idle (74.97%) + 521,276,562 stalled-cycles-backend:u # 4.81% backend cycles idle (75.07%) + 20,691,777,146 instructions:u # 1.91 insn per cycle + # 0.03 stalled cycles per insn (75.08%) + 3.403927192 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1946) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.930005e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.695920e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.695920e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.900573 sec - 10,821,072,419 cycles # 2.771 GHz - 20,305,054,668 instructions # 1.88 insn per cycle - 3.908355042 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1655) (512y: 161) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165090E-002 Relative difference = 1.0277089176796747e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.707724e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.274502e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.274502e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.333313 sec - 9,497,951,325 cycles # 2.189 GHz - 16,666,820,850 instructions # 1.75 insn per cycle - 4.341233179 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 920) (512y: 59) (512z: 1220) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. 
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039868165090E-002 -Relative difference = 1.0277089176796747e-08 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_common.txt index f072467bfa..c2faab2d60 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_common.txt @@ -1,209 +1,164 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasCurand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) +RNDGEN=hasNoCurand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-01-30_05:59:18 +DATE: 2024-01-31_14:52:23 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 12 --common OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 12 --common OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.483909e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.562012e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.071690e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.261681e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.110113e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.338630e+07 ) sec^-1 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 1.371489 sec - 4,620,404,364 cycles # 2.861 GHz - 7,153,271,516 instructions # 1.55 insn per cycle - 1.672602435 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --common -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 166 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 4.688984 sec + 15,405,560,202 cycles:u # 3.276 GHz (74.94%) + 53,825,141 stalled-cycles-frontend:u # 0.35% frontend cycles idle (74.99%) + 6,937,223,184 stalled-cycles-backend:u # 45.03% backend cycles idle (75.00%) + 11,638,228,833 instructions:u # 0.76 insn per cycle + # 0.60 stalled cycles per insn (75.00%) + 4.746802261 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.282804e-02 -Avg ME (F77/CUDA) = 1.2828039868165201E-002 -Relative difference = 1.0277080522138477e-08 +Avg ME (F77/CUDA) = 1.2828039868165208E-002 +Relative difference = 1.0277079981222336e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.952512e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.155636e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.155636e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.249339e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.428218e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.428218e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 7.107803 sec - 20,592,800,911 cycles # 2.895 GHz - 47,037,031,319 instructions # 2.28 insn per cycle - 7.114495241 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 472) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 5.773449 sec + 19,539,350,931 cycles:u # 3.366 GHz (74.92%) + 50,052,041 stalled-cycles-frontend:u # 0.26% frontend cycles idle (74.97%) + 54,884,087 stalled-cycles-backend:u # 0.28% backend cycles idle (75.04%) + 46,993,014,397 instructions:u # 2.41 insn per cycle + # 0.00 stalled cycles per insn (75.06%) + 5.807755800 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 471) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.558277e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.038534e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.038534e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.921355e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.420017e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.420017e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 4.822482 sec - 13,870,774,877 cycles # 2.874 GHz - 31,186,249,487 instructions # 2.25 insn per cycle - 4.828845646 seconds time elapsed +TOTAL : 4.007344 sec + 13,356,890,506 cycles:u # 3.307 GHz (74.88%) + 49,517,641 stalled-cycles-frontend:u # 0.37% frontend cycles idle (74.98%) + 1,103,447,662 stalled-cycles-backend:u # 8.26% backend cycles idle (75.04%) + 31,130,342,637 instructions:u # 2.33 insn per cycle + # 0.04 stalled cycles per insn (75.05%) + 4.041464440 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1626) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.951724e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.730389e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.730389e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.656218e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.533268e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.533268e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 4.015384 sec - 11,119,337,735 cycles # 2.766 GHz - 19,381,852,554 instructions # 1.74 insn per cycle - 4.022009475 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1964) (512y: 0) (512z: 0) +TOTAL : 3.104627 sec + 10,149,444,998 cycles:u # 3.236 GHz (75.01%) + 48,969,969 stalled-cycles-frontend:u # 0.48% frontend cycles idle (75.01%) + 452,522,891 stalled-cycles-backend:u # 4.46% backend cycles idle (75.01%) + 19,371,433,086 instructions:u # 1.91 insn per cycle + # 0.02 stalled cycles per insn (75.03%) + 3.138941761 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1946) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165090E-002 Relative difference = 1.0277089176796747e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.063314e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.951443e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.951443e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 3.845408 sec - 10,662,597,452 cycles # 2.769 GHz - 18,643,141,459 instructions # 1.75 insn per cycle - 3.852109381 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1655) (512y: 161) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039868165090E-002 -Relative difference = 1.0277089176796747e-08 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.811483e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.460421e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.460421e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 4.272939 sec - 9,279,488,955 cycles # 2.169 GHz - 15,212,537,826 instructions # 1.64 insn per cycle - 4.279485071 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 920) (512y: 59) (512z: 1220) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. 
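(The cmpExe lines in these logs compare the average matrix element from the C++/GPU executables against the Fortran bridge and accept a run when the relative difference stays below the quoted 5E-3 tolerance. A minimal standalone sketch of that acceptance test, using the exact values printed above — illustrative names only, not the repository's actual comparison code:

#include <cmath>
#include <cstdio>

int main()
{
  // Values as printed by the log above ("Avg ME (C++/C++)" and "Avg ME (F77/C++)")
  const double avgMeCpp = 1.282804e-02;
  const double avgMeF77 = 1.2828039868164916e-02;
  // Relative difference normalised to the Fortran value: with this definition
  // the log's 1.0277102699700292e-08 is reproduced (assumption, not repo code)
  const double relDiff = std::fabs( avgMeCpp - avgMeF77 ) / std::fabs( avgMeF77 );
  const bool ok = ( relDiff <= 5e-3 ); // tolerance quoted as "5E-3" in the log
  printf( "Relative difference = %.16e\n", relDiff );
  printf( ok ? "OK (relative difference <= 5E-3)\n" : "FAIL\n" );
  return ok ? 0 : 1;
}
)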
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039868165090E-002 -Relative difference = 1.0277089176796747e-08 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_curhst.txt index a6db5de426..280278479d 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_curhst.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_curhst.txt @@ -1,209 +1,133 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasCurand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) +RNDGEN=hasNoCurand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-01-30_05:55:54 +DATE: 2024-01-31_14:50:03 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 12 --curhst OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.492089e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.565509e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.085712e+08 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 0.999229 sec - 3,503,665,967 cycles # 2.851 GHz - 7,040,796,455 instructions # 2.01 insn per cycle - 1.289089254 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --curhst +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 12 --curhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 166 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe: Aborted + 53,192,877 cycles:u # 2.433 GHz (63.44%) + 38,671 stalled-cycles-frontend:u # 0.07% frontend cycles idle (63.44%) + 642,394 stalled-cycles-backend:u # 1.21% backend cycles idle (63.44%) + 41,176,822 instructions:u # 0.77 insn per cycle + # 0.02 stalled cycles per insn (65.67%) + 0.022859651 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.282804e-02 -Avg ME (F77/CUDA) = 1.2828039868165201E-002 -Relative difference = 1.0277080522138477e-08 +Avg ME (F77/CUDA) = 1.2828039868165208E-002 +Relative difference = 1.0277079981222336e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.897604e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.152411e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.152411e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 6.777699 sec - 19,525,012,140 cycles # 2.879 GHz - 46,935,602,227 instructions # 2.40 insn per cycle - 6.784496054 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 472) (avx2: 0) (512y: 0) (512z: 0) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe: Aborted + 56,283,909 cycles:u # 2.613 GHz (62.89%) + 41,850 stalled-cycles-frontend:u # 0.07% frontend cycles idle (62.90%) + 601,698 stalled-cycles-backend:u # 1.07% backend cycles idle (62.90%) + 42,399,243 instructions:u # 0.75 insn per cycle + # 0.01 stalled cycles per insn (59.04%) + 0.022848500 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 471) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.565929e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.046315e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.046315e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.431371 sec - 12,844,580,525 cycles # 2.895 GHz - 31,183,505,413 instructions # 2.43 insn per cycle - 4.438022505 seconds time elapsed +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe: Aborted + 56,518,310 cycles:u # 2.612 GHz (63.07%) + 43,325 stalled-cycles-frontend:u # 0.08% frontend cycles idle (63.07%) + 585,678 stalled-cycles-backend:u # 1.04% backend cycles idle (63.07%) + 42,553,804 instructions:u # 0.75 insn per cycle + # 0.01 stalled cycles per insn (58.39%) + 0.023052274 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1626) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.956069e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.738681e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.738681e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.632695 sec - 10,040,197,478 cycles # 2.761 GHz - 19,480,754,402 instructions # 1.94 insn per cycle - 3.639336589 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1964) (512y: 0) (512z: 0) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe: Aborted + 51,166,849 cycles:u # 2.361 GHz (63.11%) + 45,358 stalled-cycles-frontend:u # 0.09% frontend cycles idle (63.11%) + 595,939 stalled-cycles-backend:u # 1.16% backend cycles idle (63.11%) + 43,196,692 instructions:u # 0.84 insn per cycle + # 0.01 stalled cycles per insn (64.81%) + 0.023009761 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1946) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165090E-002 Relative difference = 1.0277089176796747e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.068909e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.973543e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.973543e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.456026 sec - 9,583,252,780 cycles # 2.770 GHz - 18,943,299,087 instructions # 1.98 insn per cycle - 3.462550493 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1655) (512y: 161) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039868165090E-002 -Relative difference = 1.0277089176796747e-08 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.820163e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.473451e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.473451e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.874228 sec - 8,184,248,497 cycles # 2.110 GHz - 15,512,168,002 instructions # 1.90 insn per cycle - 3.880483923 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 920) (512y: 59) (512z: 1220) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. 
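(Several 512y/512z runs in these logs are skipped with "is not supported (no avx512vl in /proc/cpuinfo)": on this AMD EPYC host the AVX512 builds detect the missing ISA at startup and exit cleanly instead of hitting an illegal instruction. A hedged sketch of such a guard — reading /proc/cpuinfo is the mechanism the message suggests; the helper name is illustrative, not the repository's code:

#include <fstream>
#include <iostream>
#include <string>

// Illustrative helper: true if a "flags" line of /proc/cpuinfo advertises
// the given ISA extension (e.g. "avx512vl")
bool cpuHasFlag( const std::string& flag )
{
  std::ifstream cpuinfo( "/proc/cpuinfo" );
  std::string line;
  while( std::getline( cpuinfo, line ) )
    if( line.rfind( "flags", 0 ) == 0 && line.find( flag ) != std::string::npos )
      return true;
  return false;
}

int main()
{
  if( !cpuHasFlag( "avx512vl" ) )
  {
    std::cout << "check.exe is not supported (no avx512vl in /proc/cpuinfo)" << std::endl;
    return 1;
  }
  // ... the 512y/512z event loop would only be reached on AVX512VL-capable hosts ...
  return 0;
}
)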
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039868165090E-002 -Relative difference = 1.0277089176796747e-08 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_rmbhst.txt index 4dded3e862..716313b078 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_rmbhst.txt @@ -1,211 +1,164 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasCurand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) +RNDGEN=hasNoCurand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-01-30_05:52:26 +DATE: 2024-01-31_14:46:17 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 12 --rmbhst OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 12 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+MESDEV/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.831383e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.529080e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.990768e+08 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 1.936415 sec - 6,196,996,673 cycles # 2.858 GHz - 11,355,646,527 instructions # 1.83 insn per cycle - 2.226164304 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --rmbhst -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost -==PROF== Profiling "sigmaKin": launch__registers_per_thread 166 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 7.521065e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.087989e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.316527e+07 ) sec^-1 +MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 +TOTAL : 5.382298 sec + 17,842,680,178 cycles:u # 3.292 GHz (75.07%) + 118,997,598 stalled-cycles-frontend:u # 0.67% frontend cycles idle (75.07%) + 6,884,127,997 stalled-cycles-backend:u # 38.58% backend cycles idle (75.06%) + 16,790,757,485 instructions:u # 0.94 insn per cycle + # 0.41 stalled cycles per insn (75.03%) + 5.439280481 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.282804e-02 -Avg ME (F77/CUDA) = 1.2828039868165201E-002 -Relative difference = 1.0277080522138477e-08 +Avg ME (F77/CUDA) = 1.2828039868165208E-002 +Relative difference = 1.0277079981222336e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.923680e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.152570e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.152570e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 6.755774 sec - 19,508,468,124 cycles # 2.886 GHz - 46,934,079,079 instructions # 2.41 insn per cycle - 6.762162730 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 472) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.250417e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.429575e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.429575e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 +TOTAL : 5.768884 sec + 19,515,269,263 cycles:u # 3.365 GHz (74.95%) + 50,331,013 stalled-cycles-frontend:u # 0.26% frontend cycles idle (75.02%) + 60,390,410 stalled-cycles-backend:u # 0.31% backend cycles idle (75.04%) + 47,000,896,839 instructions:u # 2.41 insn per cycle + # 0.00 stalled cycles per insn (75.04%) + 5.803374749 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 471) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.560350e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.041132e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.041132e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.445978 sec - 12,824,682,223 cycles # 2.881 GHz - 31,183,984,467 instructions # 2.43 insn per cycle - 4.452647644 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 1.919522e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.420724e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.420724e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 +TOTAL : 4.012332 sec + 13,381,132,630 cycles:u # 3.309 GHz (74.88%) + 49,972,164 stalled-cycles-frontend:u # 0.37% frontend cycles idle (74.89%) + 1,038,416,264 stalled-cycles-backend:u # 7.76% backend cycles idle (74.99%) + 31,108,050,469 instructions:u # 2.32 insn per cycle + # 0.03 stalled cycles per insn (75.08%) + 4.046628929 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1626) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.945035e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.719021e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.719021e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.651562 sec - 10,054,417,482 cycles # 2.750 GHz - 19,480,651,159 instructions # 1.94 insn per cycle - 3.658175830 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1964) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.656353e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.534875e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.534875e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 +TOTAL : 3.103048 sec + 10,154,943,843 cycles:u # 3.239 GHz (74.99%) + 49,102,982 stalled-cycles-frontend:u # 0.48% frontend cycles idle (74.99%) + 434,988,979 stalled-cycles-backend:u # 4.28% backend cycles idle (74.99%) + 19,375,707,710 instructions:u # 1.91 insn per cycle + # 0.02 stalled cycles per insn (75.02%) + 3.137538734 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1946) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165090E-002 Relative difference = 1.0277089176796747e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.065244e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.964476e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.964476e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.463334 sec - 9,575,609,591 cycles # 2.761 GHz - 18,944,249,093 instructions # 1.98 insn per cycle - 3.469928809 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1655) (512y: 161) (512z: 0) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039868165090E-002 -Relative difference = 1.0277089176796747e-08 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.819790e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.476564e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.476564e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.875473 sec - 8,194,000,405 cycles # 2.112 GHz - 15,512,267,676 instructions # 1.89 insn per cycle - 3.882168596 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 920) (512y: 59) (512z: 1220) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. 
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039868165090E-002 -Relative difference = 1.0277089176796747e-08 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd1.txt index 9238de7bbb..78355813e9 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd1.txt @@ -1,209 +1,164 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasCurand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) +RNDGEN=hasNoCurand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.512y_d_inl0_hrd1' +CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-01-30_04:52:22 +DATE: 2024-01-31_13:48:32 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/gcheck.exe -p 2048 256 12 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/gcheck.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.433269e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.304294e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.211626e+08 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 0.708580 sec - 2,678,035,833 cycles # 2.828 GHz - 4,219,258,618 instructions # 1.58 insn per cycle - 1.025396427 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/gcheck.exe -p 2048 256 1 -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 154 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 5.593715e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.569869e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.888804e+07 ) sec^-1 +MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 +TOTAL : 4.652863 sec + 15,320,835,933 cycles:u # 3.274 GHz (74.95%) + 53,752,772 stalled-cycles-frontend:u # 0.35% frontend cycles idle (74.99%) + 6,932,962,718 stalled-cycles-backend:u # 45.25% backend cycles idle (75.06%) + 11,508,742,478 instructions:u # 0.75 insn per cycle + # 0.60 stalled cycles per insn (75.06%) + 4.711242016 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/fgcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.282804e-02 -Avg ME (F77/CUDA) = 1.2828039868165206E-002 -Relative difference = 1.027708011645137e-08 +Avg ME (F77/CUDA) = 1.2828039868165216E-002 +Relative difference = 1.0277079305077159e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/check.exe -p 2048 256 12 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.057712e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.240764e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.240764e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 6.363915 sec - 18,420,155,453 cycles # 2.892 GHz - 44,716,833,361 instructions # 2.43 insn per cycle - 6.376789264 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 486) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.320903e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.522950e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.522950e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 +TOTAL : 5.497632 sec + 18,560,081,042 cycles:u # 3.357 GHz (74.97%) + 51,404,905 stalled-cycles-frontend:u # 0.28% frontend cycles idle (74.98%) + 64,148,698 stalled-cycles-backend:u # 0.35% backend cycles idle (74.98%) + 44,859,448,593 instructions:u # 2.42 insn per cycle + # 0.00 stalled cycles per insn (74.90%) + 5.532302089 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 485) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164921E-002 Relative difference = 1.0277102294013186e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd1/check.exe -p 2048 256 12 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd1/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.624136e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.147437e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.147437e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.286124 sec - 12,429,118,549 cycles # 2.897 GHz - 30,107,231,858 instructions # 2.42 insn per cycle - 4.302706533 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 2.012252e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.561147e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.561147e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 +TOTAL : 3.861847 sec + 12,799,524,672 cycles:u # 3.287 GHz (74.95%) + 48,923,305 stalled-cycles-frontend:u # 0.38% frontend cycles idle (74.95%) + 107,403,190 stalled-cycles-backend:u # 0.84% backend cycles idle (74.94%) + 30,132,025,093 instructions:u # 2.35 insn per cycle + # 0.00 stalled cycles per insn (74.98%) + 3.898000257 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1569) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164921E-002 Relative difference = 1.0277102294013186e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd1/check.exe -p 2048 256 12 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd1/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.942189e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.705004e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.705004e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.656079 sec - 10,127,428,804 cycles # 2.766 GHz - 19,115,519,637 instructions # 1.89 insn per cycle - 3.673885868 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1902) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.591787e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.432648e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.432648e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 +TOTAL : 3.163229 sec + 10,369,794,084 cycles:u # 3.245 GHz (74.99%) + 49,953,629 stalled-cycles-frontend:u # 0.48% frontend cycles idle (74.97%) + 285,363,281 stalled-cycles-backend:u # 2.75% backend cycles idle (74.98%) + 19,016,184,956 instructions:u # 1.83 insn per cycle + # 0.02 stalled cycles per insn (74.98%) + 3.199312894 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1884) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165093E-002 Relative difference = 1.0277088906338675e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd1/check.exe -p 2048 256 12 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.094903e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.039710e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.039710e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.417483 sec - 9,477,381,758 cycles # 2.768 GHz - 18,489,351,216 instructions # 1.95 insn per cycle - 3.434681568 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1576) (512y: 159) (512z: 0) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039868165093E-002 -Relative difference = 1.0277088906338675e-08 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd1/check.exe -p 2048 256 12 OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.183418e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.193735e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.193735e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.298580 sec - 7,210,521,695 cycles # 2.182 GHz - 13,864,693,183 instructions # 1.92 insn per cycle - 3.315590461 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 818) (512y: 57) (512z: 898) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. 
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039868165093E-002 -Relative difference = 1.0277088906338675e-08 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd0.txt index 09e3552971..f09c7ac494 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd0.txt @@ -1,209 +1,164 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasCurand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) +RNDGEN=hasNoCurand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.512y_d_inl1_hrd0' +CUDACPP_BUILDDIR='build.avx2_d_inl1_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-01-30_05:33:56 +DATE: 2024-01-31_14:19:13 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/gcheck.exe -p 2048 256 12 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/gcheck.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.454720e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.590982e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.126095e+08 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 0.682889 sec - 2,611,559,388 cycles # 2.831 GHz - 3,986,840,129 instructions # 1.53 insn per cycle - 0.986209294 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/gcheck.exe -p 2048 256 1 -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 166 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 5.293150e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.104547e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.333336e+07 ) sec^-1 +MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 +TOTAL : 4.667156 sec + 15,394,349,846 cycles:u # 3.274 GHz (75.07%) + 53,848,125 stalled-cycles-frontend:u # 0.35% frontend cycles idle (74.95%) + 6,942,774,491 stalled-cycles-backend:u # 45.10% backend cycles idle (74.94%) + 11,571,980,671 instructions:u # 0.75 insn per cycle + # 0.60 stalled cycles per insn (74.99%) + 4.724485567 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/fgcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.282804e-02 -Avg ME (F77/CUDA) = 1.2828039868165201E-002 -Relative difference = 1.0277080522138477e-08 +Avg ME (F77/CUDA) = 1.2828039868165208E-002 +Relative difference = 1.0277079981222336e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/check.exe -p 2048 256 12 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.350945e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.669369e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.669369e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 5.075739 sec - 14,632,134,397 cycles # 2.880 GHz - 36,697,212,873 instructions # 2.51 insn per cycle - 5.082665504 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 1.776160e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.164284e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.164284e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 +TOTAL : 4.280355 sec + 14,197,543,075 cycles:u # 3.292 GHz (74.96%) + 45,822,161 stalled-cycles-frontend:u # 0.32% frontend cycles idle (74.97%) + 534,603,013 stalled-cycles-backend:u # 3.77% backend cycles idle (74.96%) + 36,933,424,641 instructions:u # 2.60 insn per cycle + # 0.01 stalled cycles per insn (74.96%) + 4.314915144 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 707) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd0/check.exe -p 2048 256 12 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd0/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.975416e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.812212e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.812212e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.599579 sec - 10,391,716,980 cycles # 2.883 GHz - 24,753,509,930 instructions # 2.38 insn per cycle - 3.606361950 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 2.400347e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.238738e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.238738e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 +TOTAL : 3.352610 sec + 11,041,411,662 cycles:u # 3.262 GHz (74.85%) + 49,430,935 stalled-cycles-frontend:u # 0.45% frontend cycles idle (74.85%) + 65,289,997 stalled-cycles-backend:u # 0.59% backend cycles idle (74.95%) + 24,715,980,656 instructions:u # 2.24 insn per cycle + # 0.00 stalled cycles per insn (75.03%) + 3.388700835 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2334) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/check.exe -p 2048 256 12 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.206864e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.274609e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.274609e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.273737 sec - 8,884,033,270 cycles # 2.722 GHz - 16,960,441,009 instructions # 1.91 insn per cycle - 3.280558312 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1604) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.998612e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.177017e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.177017e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 +TOTAL : 2.834155 sec + 9,225,543,059 cycles:u # 3.218 GHz (74.93%) + 49,920,264 stalled-cycles-frontend:u # 0.54% frontend cycles idle (74.89%) + 523,058,244 stalled-cycles-backend:u # 5.67% backend cycles idle (74.88%) + 16,856,415,715 instructions:u # 1.83 insn per cycle + # 0.03 stalled cycles per insn (74.93%) + 2.870336452 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1586) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165090E-002 Relative difference = 1.0277089176796747e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd0/check.exe -p 2048 256 12 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.436675e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.780065e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.780065e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 2.997375 sec - 8,315,936,313 cycles # 2.769 GHz - 16,298,181,743 instructions # 1.96 insn per cycle - 3.004046425 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2403) (512y: 292) (512z: 0) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd0/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039868165090E-002 -Relative difference = 1.0277089176796747e-08 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd0/check.exe -p 2048 256 12 OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.987391e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.794180e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.794180e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.583817 sec - 7,670,874,044 cycles # 2.137 GHz - 14,352,448,248 instructions # 1.87 insn per cycle - 3.590538974 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 892) (512y: 63) (512z: 975) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd0/runTest.exe -[ PASSED ] 6 tests. 
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039868165090E-002 -Relative difference = 1.0277089176796747e-08 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd1.txt index 508008a0c5..25c71260e9 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd1.txt @@ -1,209 +1,164 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasCurand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) +RNDGEN=hasNoCurand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.512y_d_inl1_hrd1' +CUDACPP_BUILDDIR='build.avx2_d_inl1_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-01-30_05:34:28 +DATE: 2024-01-31_14:19:42 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/gcheck.exe -p 2048 256 12 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/gcheck.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.464301e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.594213e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.177261e+08 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 0.680513 sec - 2,594,214,158 cycles # 2.833 GHz - 3,992,420,158 instructions # 1.54 insn per cycle - 0.978034885 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/gcheck.exe -p 2048 256 1 -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 154 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 5.862243e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.571665e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.890778e+07 ) sec^-1 +MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 +TOTAL : 4.656807 sec + 15,360,030,989 cycles:u # 3.275 GHz (74.94%) + 53,784,472 stalled-cycles-frontend:u # 0.35% frontend cycles idle (74.92%) + 6,957,453,969 stalled-cycles-backend:u # 45.30% backend cycles idle (74.93%) + 11,504,128,443 instructions:u # 0.75 insn per cycle + # 0.60 stalled cycles per insn (74.93%) + 4.716015553 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/fgcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.282804e-02 -Avg ME (F77/CUDA) = 1.2828039868165206E-002 -Relative difference = 1.027708011645137e-08 +Avg ME (F77/CUDA) = 1.2828039868165216E-002 +Relative difference = 1.0277079305077159e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/check.exe -p 2048 256 12 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.895468e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.581482e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.581482e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.738704 sec - 10,794,188,443 cycles # 2.885 GHz - 28,356,720,092 instructions # 2.63 insn per cycle - 3.745371478 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 2.430798e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.218429e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.218429e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 +TOTAL : 3.325483 sec + 10,899,459,723 cycles:u # 3.247 GHz (74.98%) + 51,006,099 stalled-cycles-frontend:u # 0.47% frontend cycles idle (75.00%) + 55,681,565 stalled-cycles-backend:u # 0.51% backend cycles idle (74.88%) + 28,438,633,768 instructions:u # 2.61 insn per cycle + # 0.00 stalled cycles per insn (74.86%) + 3.359880853 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 600) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164921E-002 Relative difference = 1.0277102294013186e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd1/check.exe -p 2048 256 12 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd1/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.231818e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.360148e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.360148e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.232648 sec - 9,331,358,518 cycles # 2.882 GHz - 21,587,159,141 instructions # 2.31 insn per cycle - 3.239331570 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 2.619196e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.647337e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.647337e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 +TOTAL : 3.139951 sec + 10,274,909,594 cycles:u # 3.238 GHz (75.04%) + 49,997,017 stalled-cycles-frontend:u # 0.49% frontend cycles idle (75.04%) + 73,156,824 stalled-cycles-backend:u # 0.71% backend cycles idle (75.04%) + 21,493,261,529 instructions:u # 2.09 insn per cycle + # 0.00 stalled cycles per insn (74.94%) + 3.176692165 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2117) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164921E-002 Relative difference = 1.0277102294013186e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd1/check.exe -p 2048 256 12 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd1/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.406271e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.696326e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.696326e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.030114 sec - 8,381,289,955 cycles # 2.761 GHz - 15,943,872,727 instructions # 1.90 insn per cycle - 3.036686774 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1497) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 3.291532e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.758064e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.758064e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 +TOTAL : 2.645509 sec + 8,558,488,971 cycles:u # 3.196 GHz (74.94%) + 49,337,655 stalled-cycles-frontend:u # 0.58% frontend cycles idle (74.93%) + 143,499,117 stalled-cycles-backend:u # 1.68% backend cycles idle (74.93%) + 15,863,646,395 instructions:u # 1.85 insn per cycle + # 0.01 stalled cycles per insn (74.91%) + 2.681607729 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1479) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165093E-002 Relative difference = 1.0277088906338675e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd1/check.exe -p 2048 256 12 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.611770e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.211566e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.211566e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 2.823652 sec - 7,834,743,570 cycles # 2.770 GHz - 15,370,444,400 instructions # 1.96 insn per cycle - 2.830226684 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2179) (512y: 307) (512z: 0) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd1/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039868165093E-002 -Relative difference = 1.0277088906338675e-08 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd1/check.exe -p 2048 256 12 OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.110110e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.044152e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.044152e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.399029 sec - 7,342,854,469 cycles # 2.157 GHz - 13,880,932,107 instructions # 1.89 insn per cycle - 3.405583219 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 853) (512y: 69) (512z: 905) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd1/runTest.exe -[ PASSED ] 6 tests. 
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039868165093E-002 -Relative difference = 1.0277088906338675e-08 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt index 30054d0a8f..9d85c8125b 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt @@ -1,209 +1,164 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasCurand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) +RNDGEN=hasNoCurand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-01-30_04:52:57 +DATE: 2024-01-31_13:49:03 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 12 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.089125e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.083340e+09 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.291553e+09 ) sec^-1 -MeanMatrixElemValue = ( 1.371687e-02 +- 3.270220e-06 ) GeV^0 -TOTAL : 0.592260 sec - 2,336,196,912 cycles # 2.833 GHz - 3,633,132,034 instructions # 1.56 insn per cycle - 0.902800684 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 117 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 1.822503e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.886194e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.564933e+08 ) sec^-1 +MeanMatrixElemValue = ( 1.371895e-02 +- 3.272985e-06 ) GeV^0 +TOTAL : 4.540991 sec + 14,964,746,430 cycles:u # 3.275 GHz (75.08%) + 53,367,635 stalled-cycles-frontend:u # 0.36% frontend cycles idle (75.02%) + 6,902,312,333 stalled-cycles-backend:u # 46.12% backend cycles idle (74.96%) + 11,491,236,018 instructions:u # 0.77 insn per cycle + # 0.60 stalled cycles per insn (74.95%) + 4.596843421 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.282802e-02 -Avg ME (F77/CUDA) = 1.2828112125134794E-002 -Relative difference = 7.1815552823662555e-06 +Avg ME (F77/CUDA) = 1.2828036033170065E-002 +Relative difference = 1.2498553996774023e-06 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe -p 2048 256 12 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.035118e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.220346e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.220346e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 6.458158 sec - 18,623,778,658 cycles # 2.882 GHz - 47,047,597,520 instructions # 2.53 insn per cycle - 6.468376899 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 542) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.418440e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.645579e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.645579e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371887e-02 +- 3.270267e-06 ) GeV^0 +TOTAL : 5.112835 sec + 17,308,897,976 cycles:u # 3.368 GHz (74.94%) + 39,897,530 stalled-cycles-frontend:u # 0.23% frontend cycles idle (74.95%) + 35,730,790 stalled-cycles-backend:u # 0.21% backend cycles idle (74.95%) + 47,288,452,597 instructions:u # 2.73 insn per cycle + # 0.00 stalled cycles per insn (74.95%) + 5.142750348 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 541) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039441956207E-002 -Relative difference = 4.35018750695023e-08 +Avg ME (F77/C++) = 1.2828039569285465E-002 +Relative difference = 3.357602059382168e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 12 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.220597e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.402817e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.402817e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 3.207438 sec - 9,259,856,985 cycles # 2.882 GHz - 22,093,069,841 instructions # 2.39 insn per cycle - 3.223491423 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 2.920622e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.133455e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.133455e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371887e-02 +- 3.270266e-06 ) GeV^0 +TOTAL : 2.831499 sec + 9,329,735,116 cycles:u # 3.263 GHz (74.87%) + 41,446,131 stalled-cycles-frontend:u # 0.44% frontend cycles idle (74.84%) + 634,040,066 stalled-cycles-backend:u # 6.80% backend cycles idle (74.96%) + 22,145,927,626 instructions:u # 2.37 insn per cycle + # 0.03 stalled cycles per insn (75.09%) + 2.863019748 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1883) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039280066150E-002 -Relative difference = 5.612189004572479e-08 +Avg ME (F77/C++) = 1.2828039385567536E-002 +Relative difference = 4.7897610623017996e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 12 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.440699e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.781387e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.781387e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.957121 sec - 8,193,990,799 cycles # 2.766 GHz - 15,625,791,555 instructions # 1.91 insn per cycle - 2.973833384 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2619) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 3.417312e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.003968e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.003968e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 +TOTAL : 2.517278 sec + 8,221,686,635 cycles:u # 3.231 GHz (74.73%) + 42,237,080 stalled-cycles-frontend:u # 0.51% frontend cycles idle (74.72%) + 1,468,343,547 stalled-cycles-backend:u # 17.86% backend cycles idle (75.01%) + 15,538,735,402 instructions:u # 1.89 insn per cycle + # 0.09 stalled cycles per insn (75.17%) + 2.548646629 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2601) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828053255361738E-002 -Relative difference = 2.5376902468575066e-07 +Avg ME (F77/C++) = 1.2828053369958070E-002 +Relative difference = 2.627022867500074e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe -p 2048 256 12 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.532783e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.026282e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.026282e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.864857 sec - 7,877,312,491 cycles # 2.746 GHz - 15,298,553,606 instructions # 1.94 insn per cycle - 2.880238416 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2414) (512y: 13) (512z: 0) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828053255361738E-002 -Relative difference = 2.5376902468575066e-07 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe -p 2048 256 12 OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.515538e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.925634e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.925634e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 -TOTAL : 2.878384 sec - 6,411,016,127 cycles # 2.223 GHz - 12,624,518,195 instructions # 1.97 insn per cycle - 2.897065980 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1615) (512y: 12) (512z: 1404) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. 
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828052589611616E-002 -Relative difference = 2.0187102602673518e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_bridge.txt index cb0960cef7..1e7b5259fb 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_bridge.txt @@ -1,222 +1,170 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasCurand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) +RNDGEN=hasNoCurand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-01-30_05:46:05 +DATE: 2024-01-31_14:39:01 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 12 --bridge OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 12 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.896245e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.389243e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.389243e+07 ) sec^-1 -MeanMatrixElemValue = ( 1.371710e-02 +- 3.270389e-06 ) GeV^0 -TOTAL : 1.734031 sec - 5,668,072,364 cycles # 2.868 GHz - 10,146,395,921 instructions # 1.79 insn per cycle - 2.033339529 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -==PROF== Profiling "sigmaKin": launch__registers_per_thread 117 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 7.591289e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.292543e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.292543e+08 ) sec^-1 +MeanMatrixElemValue = ( 1.371886e-02 +- 3.270260e-06 ) GeV^0 +TOTAL : 5.354674 sec + 17,801,711,429 cycles:u # 3.305 GHz (74.96%) + 119,136,937 stalled-cycles-frontend:u # 0.67% frontend cycles idle (74.96%) + 6,965,849,221 stalled-cycles-backend:u # 39.13% backend cycles idle (74.94%) + 17,030,804,835 instructions:u # 0.96 insn per cycle + # 0.41 stalled cycles per insn (74.98%) + 5.411875647 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.282802e-02 -Avg ME (F77/CUDA) = 1.2828112125134794E-002 -Relative difference = 7.1815552823662555e-06 +Avg ME (F77/CUDA) = 1.2828036033170065E-002 +Relative difference = 1.2498553996774023e-06 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.023723e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.199962e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.199962e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 6.631109 sec - 19,198,970,802 cycles # 2.893 GHz - 47,195,604,267 instructions # 2.46 insn per cycle - 6.638520301 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 542) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.406552e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.630049e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.630049e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371887e-02 +- 3.270267e-06 ) GeV^0 +TOTAL : 5.208125 sec + 17,519,950,376 cycles:u # 3.343 GHz (74.98%) + 39,726,529 stalled-cycles-frontend:u # 0.23% frontend cycles idle (74.91%) + 62,585,991 stalled-cycles-backend:u # 0.36% backend cycles idle (74.90%) + 47,398,710,400 instructions:u # 2.71 insn per cycle + # 0.00 stalled cycles per insn (74.97%) + 5.243207445 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 541) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039441956207E-002 -Relative difference = 4.35018750695023e-08 +Avg ME (F77/C++) = 1.2828039569285465E-002 +Relative difference = 3.357602059382168e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.130711e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.183569e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.183569e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 3.452422 sec - 9,989,387,225 cycles # 2.889 GHz - 23,431,077,272 instructions # 2.35 insn per cycle - 3.459894158 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 2.841478e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.976405e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.976405e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371887e-02 +- 3.270266e-06 ) GeV^0 +TOTAL : 2.979950 sec + 9,712,189,466 cycles:u # 3.223 GHz (75.00%) + 42,454,538 stalled-cycles-frontend:u # 0.44% frontend cycles idle (75.05%) + 673,634,001 stalled-cycles-backend:u # 6.94% backend cycles idle (75.05%) + 23,422,171,384 instructions:u # 2.41 insn per cycle + # 0.03 stalled cycles per insn (75.05%) + 3.016536490 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1883) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039280066150E-002 -Relative difference = 5.612189004572479e-08 +Avg ME (F77/C++) = 1.2828039385567536E-002 +Relative difference = 4.7897610623017996e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.341081e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.547294e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.547294e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 3.196012 sec - 8,906,176,925 cycles # 2.782 GHz - 16,751,991,837 instructions # 1.88 insn per cycle - 3.203321936 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2619) (512y: 0) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828053255361738E-002 -Relative difference = 2.5376902468575066e-07 -OK (relative difference <= 5E-3) +EvtsPerSec[Rmb+ME] (23) = ( 3.323002e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.808846e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.808846e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 +TOTAL : 2.650715 sec + 8,541,815,755 cycles:u # 3.182 GHz (75.00%) + 42,861,880 stalled-cycles-frontend:u # 0.50% frontend cycles idle (74.97%) + 1,469,731,134 stalled-cycles-backend:u # 17.21% backend cycles idle (74.99%) + 16,642,700,791 instructions:u # 1.95 insn per cycle + # 0.09 stalled cycles per insn (74.99%) + 2.687678893 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2601) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.434021e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.786427e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.786427e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 3.093664 sec - 8,635,370,178 cycles # 2.786 GHz - 16,424,138,356 instructions # 1.90 insn per cycle - 3.101132741 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2414) (512y: 13) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828053255361738E-002 -Relative difference = 2.5376902468575066e-07 +Avg ME (F77/C++) = 1.2828053369958070E-002 +Relative difference = 2.627022867500074e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.383314e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.611676e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.611676e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 -TOTAL : 3.145258 sec - 7,151,980,153 cycles # 2.270 GHz - 13,850,467,115 instructions # 1.94 insn per cycle - 3.152590479 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1615) (512y: 12) (512z: 1404) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. 
+/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828052589611616E-002 -Relative difference = 2.0187102602673518e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_common.txt index 26c818590d..f932e39c83 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_common.txt @@ -1,209 +1,164 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasCurand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) +RNDGEN=hasNoCurand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-01-30_05:59:57 +DATE: 2024-01-31_14:52:53 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 12 --common OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 12 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.303596e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.175288e+09 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.243996e+09 ) sec^-1 -MeanMatrixElemValue = ( 1.371863e-02 +- 3.269951e-06 ) GeV^0 -TOTAL : 1.207260 sec - 4,082,591,214 cycles # 2.858 GHz - 6,515,356,659 instructions # 1.60 insn per cycle - 1.486873600 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --common -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 117 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 1.826922e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.880208e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.555099e+08 ) sec^-1 +MeanMatrixElemValue = ( 1.371895e-02 +- 3.272985e-06 ) GeV^0 +TOTAL : 4.540773 sec + 14,945,912,811 cycles:u # 3.273 GHz (75.06%) + 53,537,566 stalled-cycles-frontend:u # 0.36% frontend cycles idle (74.99%) + 14,415,846 stalled-cycles-backend:u # 0.10% backend cycles idle (74.90%) + 11,535,467,523 instructions:u # 0.77 insn per cycle + # 0.00 stalled cycles per insn (74.91%) + 4.590921512 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.282802e-02 -Avg ME (F77/CUDA) = 1.2828112125134794E-002 -Relative difference = 7.1815552823662555e-06 +Avg ME (F77/CUDA) = 1.2828036033170065E-002 +Relative difference = 1.2498553996774023e-06 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.039099e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.222240e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.222240e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.416115e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.645493e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.645493e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371887e-02 +- 3.270267e-06 ) GeV^0 -TOTAL : 6.777770 sec - 19,569,392,860 cycles # 2.885 GHz - 47,229,099,277 instructions # 2.41 insn per cycle - 6.784024049 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 542) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 5.118619 sec + 17,328,202,172 cycles:u # 3.368 GHz (74.97%) + 39,972,803 stalled-cycles-frontend:u # 0.23% frontend cycles idle (74.97%) + 37,213,488 stalled-cycles-backend:u # 0.21% backend cycles idle (74.97%) + 47,246,133,917 instructions:u # 2.73 insn per cycle + # 0.00 stalled cycles per insn (74.98%) + 5.148163373 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 541) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039441956207E-002 -Relative difference = 4.35018750695023e-08 +Avg ME (F77/C++) = 1.2828039569285465E-002 +Relative difference = 3.357602059382168e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.224011e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.394362e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.394362e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.922683e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.133650e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.133650e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371887e-02 +- 3.270266e-06 ) GeV^0 -TOTAL : 3.543713 sec - 10,250,573,649 cycles # 2.890 GHz - 22,173,775,935 instructions # 2.16 insn per cycle - 3.550219999 seconds time elapsed +TOTAL : 2.826629 sec + 9,320,790,539 cycles:u # 3.266 GHz (74.87%) + 41,188,075 stalled-cycles-frontend:u # 0.44% frontend cycles idle (75.01%) + 645,773,111 stalled-cycles-backend:u # 6.93% backend cycles idle (75.05%) + 22,128,687,314 instructions:u # 2.37 insn per cycle + # 0.03 stalled cycles per insn (75.06%) + 2.856154656 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1883) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039280066150E-002 -Relative difference = 5.612189004572479e-08 +Avg ME (F77/C++) = 1.2828039385567536E-002 +Relative difference = 4.7897610623017996e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.458663e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.813529e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.813529e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.412141e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.996034e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.996034e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 -TOTAL : 3.280080 sec - 9,161,776,432 cycles # 2.789 GHz - 15,536,168,479 instructions # 1.70 insn per cycle - 3.286291256 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2619) (512y: 0) (512z: 0) +TOTAL : 2.520340 sec + 8,200,553,276 cycles:u # 3.219 GHz (74.88%) + 41,966,268 stalled-cycles-frontend:u # 0.51% frontend cycles idle (74.90%) + 1,471,137,468 stalled-cycles-backend:u # 17.94% backend cycles idle (74.90%) + 15,573,562,618 instructions:u # 1.90 insn per cycle + # 0.09 stalled cycles per insn (74.96%) + 2.549956102 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2601) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828053255361738E-002 -Relative difference = 2.5376902468575066e-07 +Avg ME (F77/C++) = 1.2828053369958070E-002 +Relative difference = 2.627022867500074e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.554649e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.077981e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.077981e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 -TOTAL : 3.189150 sec - 8,891,496,493 cycles # 2.784 GHz - 15,006,164,122 instructions # 1.69 insn per cycle - 3.195486341 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2414) (512y: 13) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828053255361738E-002 -Relative difference = 2.5376902468575066e-07 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.516232e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.934012e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.934012e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 -TOTAL : 3.229540 sec - 7,432,998,054 cycles # 2.298 GHz - 12,333,053,960 instructions # 1.66 insn per cycle - 3.235962697 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1615) (512y: 12) (512z: 1404) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. 
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828052589611616E-002 -Relative difference = 2.0187102602673518e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_curhst.txt index 90d7f62db4..1c74b1aeb4 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_curhst.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_curhst.txt @@ -1,209 +1,133 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasCurand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) +RNDGEN=hasNoCurand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-01-30_05:56:29 +DATE: 2024-01-31_14:50:16 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 12 --curhst OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 12 --curhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.305141e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.181296e+09 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.274552e+09 ) sec^-1 -MeanMatrixElemValue = ( 1.371687e-02 +- 3.270220e-06 ) GeV^0 -TOTAL : 0.867173 sec - 3,085,877,327 cycles # 2.830 GHz - 6,333,420,740 instructions # 2.05 insn per cycle - 1.147827940 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --curhst -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 117 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe: Aborted + 51,917,821 cycles:u # 2.385 GHz (63.27%) + 38,761 stalled-cycles-frontend:u # 0.07% frontend cycles idle (63.27%) + 606,072 stalled-cycles-backend:u # 1.17% backend cycles idle (63.27%) + 42,794,904 instructions:u # 0.82 insn per cycle + # 0.01 stalled cycles per insn (65.21%) + 0.022695324 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.282802e-02 -Avg ME (F77/CUDA) = 1.2828112125134794E-002 -Relative difference = 7.1815552823662555e-06 +Avg ME (F77/CUDA) = 1.2828036033170065E-002 +Relative difference = 1.2498553996774023e-06 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.039832e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.222763e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.222763e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 6.429819 sec - 18,561,263,651 cycles # 2.885 GHz - 47,048,334,209 instructions # 2.53 insn per cycle - 6.436326918 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 542) (avx2: 0) (512y: 0) (512z: 0) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe: Aborted + 56,020,284 cycles:u # 2.606 GHz (62.82%) + 42,671 stalled-cycles-frontend:u # 0.08% frontend cycles idle (62.82%) + 614,725 stalled-cycles-backend:u # 1.10% backend cycles idle (62.82%) + 39,130,829 instructions:u # 0.70 insn per cycle + # 0.02 stalled cycles per insn (64.46%) + 0.022799927 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 541) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039441956207E-002 -Relative difference = 4.35018750695023e-08 +Avg ME (F77/C++) = 1.2828039569285465E-002 +Relative difference = 3.357602059382168e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.222730e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.393980e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.393980e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 3.203768 sec - 9,238,443,218 cycles # 2.879 GHz - 22,092,244,938 instructions # 2.39 insn per cycle - 3.210105048 seconds time elapsed +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe: Aborted + 52,602,539 cycles:u # 2.430 GHz (63.07%) + 44,114 stalled-cycles-frontend:u # 0.08% frontend cycles idle (63.07%) + 591,873 stalled-cycles-backend:u # 1.13% backend cycles idle (63.07%) + 41,944,859 instructions:u # 0.80 insn per cycle + # 0.01 stalled cycles per insn (64.69%) + 0.022939957 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1883) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039280066150E-002 -Relative difference = 5.612189004572479e-08 +Avg ME (F77/C++) = 1.2828039385567536E-002 +Relative difference = 4.7897610623017996e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.418509e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.733909e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.733909e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.982066 sec - 8,185,679,734 cycles # 2.740 GHz - 15,625,107,028 instructions # 1.91 insn per cycle - 2.988278371 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2619) (512y: 0) (512z: 0) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe: Aborted + 56,603,806 cycles:u # 2.608 GHz (63.17%) + 34,889 stalled-cycles-frontend:u # 0.06% frontend cycles idle (63.17%) + 570,698 stalled-cycles-backend:u # 1.01% backend cycles idle (63.17%) + 42,471,825 instructions:u # 0.75 insn per cycle + # 0.01 stalled cycles per insn (58.35%) + 0.023997783 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2601) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828053255361738E-002 -Relative difference = 2.5376902468575066e-07 +Avg ME (F77/C++) = 1.2828053369958070E-002 +Relative difference = 2.627022867500074e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.558846e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.085053e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.085053e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.831486 sec - 7,894,514,850 cycles # 2.783 GHz - 15,296,644,493 instructions # 1.94 insn per cycle - 2.837958999 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2414) (512y: 13) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828053255361738E-002 -Relative difference = 2.5376902468575066e-07 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.525394e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.942507e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.942507e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 -TOTAL : 2.867904 sec - 6,407,267,092 cycles # 2.230 GHz - 12,623,570,741 instructions # 1.97 insn per cycle - 2.874115235 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1615) (512y: 12) (512z: 1404) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. 
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828052589611616E-002 -Relative difference = 2.0187102602673518e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_rmbhst.txt index 91671fa84d..dd80ca1417 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_rmbhst.txt @@ -1,211 +1,164 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasCurand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) +RNDGEN=hasNoCurand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-01-30_05:53:04 +DATE: 2024-01-31_14:46:48 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 12 --rmbhst OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 12 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+MESDEV/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.674927e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.142204e+09 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.126513e+09 ) sec^-1 -MeanMatrixElemValue = ( 1.371710e-02 +- 3.270389e-06 ) GeV^0 -TOTAL : 1.522177 sec - 5,014,296,377 cycles # 2.858 GHz - 9,135,258,914 instructions # 1.82 insn per cycle - 1.813578794 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --rmbhst -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost -==PROF== Profiling "sigmaKin": launch__registers_per_thread 117 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 8.353204e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.699417e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.359789e+08 ) sec^-1 +MeanMatrixElemValue = ( 1.371886e-02 +- 3.270260e-06 ) GeV^0 +TOTAL : 5.254917 sec + 17,475,736,169 cycles:u # 3.306 GHz (75.03%) + 118,130,291 stalled-cycles-frontend:u # 0.68% frontend cycles idle (75.05%) + 6,884,676,285 stalled-cycles-backend:u # 39.40% backend cycles idle (75.05%) + 16,715,227,456 instructions:u # 0.96 insn per cycle + # 0.41 stalled cycles per insn (75.04%) + 5.304271267 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.282802e-02 -Avg ME (F77/CUDA) = 1.2828112125134794E-002 -Relative difference = 7.1815552823662555e-06 +Avg ME (F77/CUDA) = 1.2828036033170065E-002 +Relative difference = 1.2498553996774023e-06 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.043183e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.226572e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.226572e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 6.408092 sec - 18,567,709,150 cycles # 2.896 GHz - 47,047,255,730 instructions # 2.53 insn per cycle - 6.414419955 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 542) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.415589e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.645535e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.645535e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371887e-02 +- 3.270267e-06 ) GeV^0 +TOTAL : 5.119285 sec + 17,331,666,900 cycles:u # 3.368 GHz (74.97%) + 40,669,852 stalled-cycles-frontend:u # 0.23% frontend cycles idle (74.97%) + 36,759,472 stalled-cycles-backend:u # 0.21% backend cycles idle (74.97%) + 47,239,596,291 instructions:u # 2.73 insn per cycle + # 0.00 stalled cycles per insn (74.99%) + 5.148875716 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 541) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039441956207E-002 -Relative difference = 4.35018750695023e-08 +Avg ME (F77/C++) = 1.2828039569285465E-002 +Relative difference = 3.357602059382168e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.231919e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.414648e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.414648e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 3.191678 sec - 9,246,166,536 cycles # 2.894 GHz - 22,093,449,321 instructions # 2.39 insn per cycle - 3.197919261 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 2.925021e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.135088e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.135088e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371887e-02 +- 3.270266e-06 ) GeV^0 +TOTAL : 2.827945 sec + 9,319,315,337 cycles:u # 3.264 GHz (74.84%) + 41,387,005 stalled-cycles-frontend:u # 0.44% frontend cycles idle (74.98%) + 643,492,388 stalled-cycles-backend:u # 6.90% backend cycles idle (75.06%) + 22,144,122,734 instructions:u # 2.38 insn per cycle + # 0.03 stalled cycles per insn (75.07%) + 2.857459790 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1883) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039280066150E-002 -Relative difference = 5.612189004572479e-08 +Avg ME (F77/C++) = 1.2828039385567536E-002 +Relative difference = 4.7897610623017996e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.455778e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.806689e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.806689e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.938294 sec - 8,179,243,825 cycles # 2.779 GHz - 15,624,915,954 instructions # 1.91 insn per cycle - 2.944456642 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2619) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 3.418028e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.008897e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.008897e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 +TOTAL : 2.515058 sec + 8,209,426,221 cycles:u # 3.230 GHz (74.85%) + 42,192,838 stalled-cycles-frontend:u # 0.51% frontend cycles idle (74.85%) + 1,461,738,597 stalled-cycles-backend:u # 17.81% backend cycles idle (74.92%) + 15,535,734,341 instructions:u # 1.89 insn per cycle + # 0.09 stalled cycles per insn (75.08%) + 2.544365775 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2601) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828053255361738E-002 -Relative difference = 2.5376902468575066e-07 +Avg ME (F77/C++) = 1.2828053369958070E-002 +Relative difference = 2.627022867500074e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.562111e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.082808e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.082808e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.828979 sec - 7,880,998,863 cycles # 2.781 GHz - 15,296,291,599 instructions # 1.94 insn per cycle - 2.835269816 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2414) (512y: 13) (512z: 0) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828053255361738E-002 -Relative difference = 2.5376902468575066e-07 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.528595e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.951135e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.951135e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 -TOTAL : 2.864434 sec - 6,402,503,393 cycles # 2.232 GHz - 12,623,594,501 instructions # 1.97 insn per cycle - 2.870718249 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1615) (512y: 12) (512z: 1404) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. 
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828052589611616E-002 -Relative difference = 2.0187102602673518e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd1.txt index cc5700bb60..e59c139d2f 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd1.txt @@ -1,209 +1,164 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasCurand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) +RNDGEN=hasNoCurand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.512y_f_inl0_hrd1' +CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-01-30_04:53:28 +DATE: 2024-01-31_13:49:31 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/gcheck.exe -p 2048 256 12 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/gcheck.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.091291e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.093645e+09 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.338052e+09 ) sec^-1 -MeanMatrixElemValue = ( 1.371687e-02 +- 3.270220e-06 ) GeV^0 -TOTAL : 0.585723 sec - 2,310,991,948 cycles # 2.835 GHz - 3,567,792,024 instructions # 1.54 insn per cycle - 0.889438316 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/gcheck.exe -p 2048 256 1 -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 95 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 1.837530e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.912256e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.600050e+08 ) sec^-1 +MeanMatrixElemValue = ( 1.371895e-02 +- 3.272985e-06 ) GeV^0 +TOTAL : 4.542847 sec + 14,980,531,541 cycles:u # 3.276 GHz (75.06%) + 53,460,149 stalled-cycles-frontend:u # 0.36% frontend cycles idle (74.98%) + 6,912,146,752 stalled-cycles-backend:u # 46.14% backend cycles idle (74.93%) + 11,476,019,042 instructions:u # 0.77 insn per cycle + # 0.60 stalled cycles per insn (74.92%) + 4.596974255 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/fgcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.282802e-02 -Avg ME (F77/CUDA) = 1.2828112125134794E-002 -Relative difference = 7.1815552823662555e-06 +Avg ME (F77/CUDA) = 1.2828036033170065E-002 +Relative difference = 1.2498553996774023e-06 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/check.exe -p 2048 256 12 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.092050e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.295990e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.295990e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 6.138113 sec - 17,749,278,373 cycles # 2.890 GHz - 43,890,075,557 instructions # 2.47 insn per cycle - 6.149965364 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 467) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.541888e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.813728e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.813728e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371887e-02 +- 3.270267e-06 ) GeV^0 +TOTAL : 4.755661 sec + 16,087,215,170 cycles:u # 3.364 GHz (74.84%) + 39,830,326 stalled-cycles-frontend:u # 0.25% frontend cycles idle (74.85%) + 34,401,704 stalled-cycles-backend:u # 0.21% backend cycles idle (75.00%) + 44,045,394,048 instructions:u # 2.74 insn per cycle + # 0.00 stalled cycles per insn (75.08%) + 4.785452751 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 466) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039441956207E-002 -Relative difference = 4.35018750695023e-08 +Avg ME (F77/C++) = 1.2828039569285465E-002 +Relative difference = 3.357602059382168e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd1/check.exe -p 2048 256 12 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd1/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.281832e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.528866e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.528866e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 3.131918 sec - 9,063,997,030 cycles # 2.890 GHz - 21,583,444,087 instructions # 2.38 insn per cycle - 3.172631085 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 3.021652e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.329118e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.329118e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371887e-02 +- 3.270266e-06 ) GeV^0 +TOTAL : 2.759658 sec + 9,064,951,892 cycles:u # 3.252 GHz (75.04%) + 42,419,061 stalled-cycles-frontend:u # 0.47% frontend cycles idle (75.03%) + 124,707,018 stalled-cycles-backend:u # 1.38% backend cycles idle (75.03%) + 21,624,549,669 instructions:u # 2.39 insn per cycle + # 0.01 stalled cycles per insn (75.03%) + 2.790596878 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1827) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039280066150E-002 -Relative difference = 5.612189004572479e-08 +Avg ME (F77/C++) = 1.2828039385567536E-002 +Relative difference = 4.7897610623017996e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd1/check.exe -p 2048 256 12 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd1/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.471429e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.850830e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.850830e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.922404 sec - 8,130,490,307 cycles # 2.776 GHz - 15,429,884,484 instructions # 1.90 insn per cycle - 2.941222784 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2542) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 3.468848e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.115626e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.115626e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 +TOTAL : 2.491231 sec + 8,118,328,833 cycles:u # 3.223 GHz (74.94%) + 42,223,386 stalled-cycles-frontend:u # 0.52% frontend cycles idle (74.91%) + 1,786,726,359 stalled-cycles-backend:u # 22.01% backend cycles idle (74.78%) + 15,402,271,138 instructions:u # 1.90 insn per cycle + # 0.12 stalled cycles per insn (74.80%) + 2.522582463 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2524) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828053255361738E-002 -Relative difference = 2.5376902468575066e-07 +Avg ME (F77/C++) = 1.2828053369958070E-002 +Relative difference = 2.627022867500074e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd1/check.exe -p 2048 256 12 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.565898e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.093653e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.093653e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.826189 sec - 7,861,694,964 cycles # 2.776 GHz - 15,087,354,653 instructions # 1.92 insn per cycle - 2.844638276 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2323) (512y: 15) (512z: 0) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828053255361738E-002 -Relative difference = 2.5376902468575066e-07 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd1/check.exe -p 2048 256 12 OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.637184e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.244046e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.244046e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 -TOTAL : 2.766988 sec - 6,178,543,208 cycles # 2.228 GHz - 12,245,131,195 instructions # 1.98 insn per cycle - 2.787936795 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1538) (512y: 8) (512z: 1258) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. 
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828052431359538E-002 -Relative difference = 1.895346165094282e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd0.txt index df038945e7..de78a5beb5 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd0.txt @@ -1,209 +1,164 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasCurand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) +RNDGEN=hasNoCurand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.512y_f_inl1_hrd0' +CUDACPP_BUILDDIR='build.avx2_f_inl1_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-01-30_05:34:57 +DATE: 2024-01-31_14:20:08 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/gcheck.exe -p 2048 256 12 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/gcheck.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.293279e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.189438e+09 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.292426e+09 ) sec^-1 -MeanMatrixElemValue = ( 1.371687e-02 +- 3.270220e-06 ) GeV^0 -TOTAL : 0.574531 sec - 2,278,103,742 cycles # 2.838 GHz - 3,559,192,155 instructions # 1.56 insn per cycle - 0.862169679 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/gcheck.exe -p 2048 256 1 -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 117 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 1.804352e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.871683e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.544845e+08 ) sec^-1 +MeanMatrixElemValue = ( 1.371895e-02 +- 3.272985e-06 ) GeV^0 +TOTAL : 4.541034 sec + 15,015,745,431 cycles:u # 3.287 GHz (74.88%) + 53,650,626 stalled-cycles-frontend:u # 0.36% frontend cycles idle (74.87%) + 7,018,832,750 stalled-cycles-backend:u # 46.74% backend cycles idle (74.79%) + 11,152,740,261 instructions:u # 0.74 insn per cycle + # 0.63 stalled cycles per insn (74.94%) + 4.596556877 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/fgcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.282802e-02 -Avg ME (F77/CUDA) = 1.2828112125134794E-002 -Relative difference = 7.1815552823662555e-06 +Avg ME (F77/CUDA) = 1.2828036033170065E-002 +Relative difference = 1.2498553996774023e-06 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/check.exe -p 2048 256 12 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.401205e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.755017e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.755017e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 4.863284 sec - 13,757,936,316 cycles # 2.826 GHz - 37,850,126,745 instructions # 2.75 insn per cycle - 4.870249581 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 1.929309e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.375070e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.375070e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371887e-02 +- 3.270267e-06 ) GeV^0 +TOTAL : 3.936701 sec + 13,183,554,443 cycles:u # 3.326 GHz (74.97%) + 39,532,639 stalled-cycles-frontend:u # 0.30% frontend cycles idle (74.97%) + 1,219,014,067 stalled-cycles-backend:u # 9.25% backend cycles idle (74.98%) + 38,014,245,383 instructions:u # 2.88 insn per cycle + # 0.03 stalled cycles per insn (74.99%) + 3.966500451 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 833) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039414671366E-002 -Relative difference = 4.562884388571957e-08 +Avg ME (F77/C++) = 1.2828039543819614E-002 +Relative difference = 3.5561191488957804e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/check.exe -p 2048 256 12 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.651233e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.514070e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.514070e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 2.748031 sec - 7,929,384,882 cycles # 2.881 GHz - 18,604,713,730 instructions # 2.35 insn per cycle - 2.754502860 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 3.453671e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.336262e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.336262e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371887e-02 +- 3.270266e-06 ) GeV^0 +TOTAL : 2.495200 sec + 8,136,309,828 cycles:u # 3.225 GHz (74.97%) + 41,422,914 stalled-cycles-frontend:u # 0.51% frontend cycles idle (74.95%) + 225,227,499 stalled-cycles-backend:u # 2.77% backend cycles idle (74.95%) + 18,686,478,479 instructions:u # 2.30 insn per cycle + # 0.01 stalled cycles per insn (74.97%) + 2.526474400 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2808) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039280066150E-002 -Relative difference = 5.612189004572479e-08 +Avg ME (F77/C++) = 1.2828039385567536E-002 +Relative difference = 4.7897610623017996e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/check.exe -p 2048 256 12 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.730630e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.541231e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.541231e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.679636 sec - 7,420,774,430 cycles # 2.764 GHz - 14,339,383,869 instructions # 1.93 insn per cycle - 2.686088553 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2251) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 3.861407e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.019102e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.019102e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 +TOTAL : 2.304154 sec + 7,466,062,192 cycles:u # 3.202 GHz (74.99%) + 43,746,325 stalled-cycles-frontend:u # 0.59% frontend cycles idle (74.96%) + 1,067,727,519 stalled-cycles-backend:u # 14.30% backend cycles idle (74.96%) + 14,266,963,445 instructions:u # 1.91 insn per cycle + # 0.07 stalled cycles per insn (74.81%) + 2.335449030 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2233) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828053246266791E-002 -Relative difference = 2.5306003563303186e-07 +Avg ME (F77/C++) = 1.2828053337216261E-002 +Relative difference = 2.601499261602198e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd0/check.exe -p 2048 256 12 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.796396e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.739468e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.739468e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.625810 sec - 7,304,334,176 cycles # 2.778 GHz - 13,955,275,285 instructions # 1.91 insn per cycle - 2.632447793 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3875) (512y: 9) (512z: 0) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd0/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828053277189611E-002 -Relative difference = 2.5547059841227576e-07 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd0/check.exe -p 2048 256 12 OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.601296e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.146430e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.146430e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 -TOTAL : 2.796781 sec - 6,273,154,150 cycles # 2.239 GHz - 13,210,323,797 instructions # 2.11 insn per cycle - 2.803318258 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1734) (512y: 3) (512z: 1266) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd0/runTest.exe -[ PASSED ] 6 tests. 
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828052540498902E-002 -Relative difference = 1.980424851420537e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd1.txt index 784101060d..95bb38adb1 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd1.txt @@ -1,209 +1,164 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasCurand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) +RNDGEN=hasNoCurand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.512y_f_inl1_hrd1' +CUDACPP_BUILDDIR='build.avx2_f_inl1_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-01-30_05:35:26 +DATE: 2024-01-31_14:20:35 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/gcheck.exe -p 2048 256 12 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/gcheck.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.300997e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.192378e+09 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.323768e+09 ) sec^-1 -MeanMatrixElemValue = ( 1.371687e-02 +- 3.270220e-06 ) GeV^0 -TOTAL : 0.574497 sec - 2,274,789,999 cycles # 2.831 GHz - 3,565,149,005 instructions # 1.57 insn per cycle - 0.863293975 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/gcheck.exe -p 2048 256 1 -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 95 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 1.836133e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.902950e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.587351e+08 ) sec^-1 +MeanMatrixElemValue = ( 1.371895e-02 +- 3.272985e-06 ) GeV^0 +TOTAL : 4.536960 sec + 14,978,317,252 cycles:u # 3.280 GHz (75.01%) + 53,404,800 stalled-cycles-frontend:u # 0.36% frontend cycles idle (75.03%) + 6,945,859,333 stalled-cycles-backend:u # 46.37% backend cycles idle (74.97%) + 11,211,979,977 instructions:u # 0.75 insn per cycle + # 0.62 stalled cycles per insn (74.95%) + 4.593705124 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/fgcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.282802e-02 -Avg ME (F77/CUDA) = 1.2828112125134794E-002 -Relative difference = 7.1815552823662555e-06 +Avg ME (F77/CUDA) = 1.2828036033170065E-002 +Relative difference = 1.2498553996774023e-06 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/check.exe -p 2048 256 12 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.974769e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.758467e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.758467e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 3.560083 sec - 10,128,258,424 cycles # 2.841 GHz - 28,399,859,483 instructions # 2.80 insn per cycle - 3.566485849 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 2.677087e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.618912e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.618912e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371887e-02 +- 3.270267e-06 ) GeV^0 +TOTAL : 3.026740 sec + 9,998,889,028 cycles:u # 3.274 GHz (74.87%) + 38,455,022 stalled-cycles-frontend:u # 0.38% frontend cycles idle (74.90%) + 29,286,585 stalled-cycles-backend:u # 0.29% backend cycles idle (75.03%) + 28,571,948,254 instructions:u # 2.86 insn per cycle + # 0.00 stalled cycles per insn (75.12%) + 3.056524855 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 632) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039441956207E-002 -Relative difference = 4.35018750695023e-08 +Avg ME (F77/C++) = 1.2828039569285465E-002 +Relative difference = 3.357602059382168e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd1/check.exe -p 2048 256 12 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd1/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.921662e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.360866e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.360866e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 2.529327 sec - 7,292,501,410 cycles # 2.880 GHz - 16,787,289,445 instructions # 2.30 insn per cycle - 2.535811154 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 3.843963e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.258267e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.258267e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371887e-02 +- 3.270266e-06 ) GeV^0 +TOTAL : 2.313249 sec + 7,469,142,741 cycles:u # 3.191 GHz (75.00%) + 40,231,675 stalled-cycles-frontend:u # 0.54% frontend cycles idle (75.05%) + 31,915,046 stalled-cycles-backend:u # 0.43% backend cycles idle (75.05%) + 16,932,716,531 instructions:u # 2.27 insn per cycle + # 0.00 stalled cycles per insn (75.06%) + 2.344830688 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2463) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039280066150E-002 -Relative difference = 5.612189004572479e-08 +Avg ME (F77/C++) = 1.2828039385567536E-002 +Relative difference = 4.7897610623017996e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/check.exe -p 2048 256 12 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.902980e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.008268e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.008268e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.546290 sec - 7,099,294,688 cycles # 2.783 GHz - 13,729,465,706 instructions # 1.93 insn per cycle - 2.552602290 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2082) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 4.056970e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.511923e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.511923e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 +TOTAL : 2.225961 sec + 7,192,051,533 cycles:u # 3.191 GHz (74.90%) + 42,074,082 stalled-cycles-frontend:u # 0.59% frontend cycles idle (74.82%) + 370,974,550 stalled-cycles-backend:u # 5.16% backend cycles idle (74.85%) + 13,657,358,681 instructions:u # 1.90 insn per cycle + # 0.03 stalled cycles per insn (75.03%) + 2.258560302 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2064) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828053198973066E-002 -Relative difference = 2.4937329255889414e-07 +Avg ME (F77/C++) = 1.2828053331759293E-002 +Relative difference = 2.597245327285885e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd1/check.exe -p 2048 256 12 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.894124e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.023412e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.023412e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.549860 sec - 7,037,352,059 cycles # 2.755 GHz - 13,462,222,302 instructions # 1.91 insn per cycle - 2.556338558 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3649) (512y: 12) (512z: 0) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd1/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828053198973066E-002 -Relative difference = 2.4937329255889414e-07 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd1/check.exe -p 2048 256 12 OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.741921e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.505340e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.505340e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 -TOTAL : 2.671598 sec - 6,046,764,080 cycles # 2.259 GHz - 12,911,501,907 instructions # 2.14 insn per cycle - 2.677952936 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1671) (512y: 3) (512z: 1155) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd1/runTest.exe -[ PASSED ] 6 tests. 
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828052431359538E-002 -Relative difference = 1.895346165094282e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt index 7a09642823..b83f428d97 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt @@ -1,209 +1,164 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasCurand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) +RNDGEN=hasNoCurand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-01-30_04:54:00 +DATE: 2024-01-31_13:49:58 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/gcheck.exe -p 2048 256 12 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/gcheck.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.434258e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.281519e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.171049e+08 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 0.704261 sec - 2,701,570,097 cycles # 2.831 GHz - 4,244,340,283 instructions # 1.57 insn per cycle - 1.033944641 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/gcheck.exe -p 2048 256 1 -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 166 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 5.293135e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.114268e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.342215e+07 ) sec^-1 +MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 +TOTAL : 4.657555 sec + 15,365,174,038 cycles:u # 3.275 GHz (74.94%) + 53,634,970 stalled-cycles-frontend:u # 0.35% frontend cycles idle (74.94%) + 6,932,864,082 stalled-cycles-backend:u # 45.12% backend cycles idle (75.01%) + 11,485,086,301 instructions:u # 0.75 insn per cycle + # 0.60 stalled cycles per insn (75.10%) + 4.716089042 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.282804e-02 -Avg ME (F77/CUDA) = 1.2828039901590279E-002 -Relative difference = 7.671454200650844e-09 +Avg ME (F77/CUDA) = 1.2828039901590281E-002 +Relative difference = 7.67145406542181e-09 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/check.exe -p 2048 256 12 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.829628e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.139787e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.139787e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 6.819159 sec - 19,690,827,956 cycles # 2.885 GHz - 46,971,779,576 instructions # 2.39 insn per cycle - 6.832663552 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 474) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.242473e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.418692e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.418692e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 +TOTAL : 5.800763 sec + 19,618,512,463 cycles:u # 3.363 GHz (74.95%) + 48,717,247 stalled-cycles-frontend:u # 0.25% frontend cycles idle (75.02%) + 149,768,230 stalled-cycles-backend:u # 0.76% backend cycles idle (75.04%) + 47,103,320,911 instructions:u # 2.40 insn per cycle + # 0.00 stalled cycles per insn (75.04%) + 5.835627988 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 473) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039952548879E-002 Relative difference = 3.6990156841838714e-09 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/check.exe -p 2048 256 12 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.605344e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.116934e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.116934e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.334479 sec - 12,518,471,325 cycles # 2.884 GHz - 30,922,888,427 instructions # 2.47 insn per cycle - 4.354467708 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 1.978746e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.509635e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.509635e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 +TOTAL : 3.916740 sec + 13,061,630,358 cycles:u # 3.308 GHz (74.88%) + 53,677,065 stalled-cycles-frontend:u # 0.41% frontend cycles idle (74.92%) + 2,198,728,312 stalled-cycles-backend:u # 16.83% backend cycles idle (75.02%) + 30,732,838,719 instructions:u # 2.35 insn per cycle + # 0.07 stalled cycles per insn (75.09%) + 3.952822018 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1667) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039952548879E-002 Relative difference = 3.6990156841838714e-09 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/check.exe -p 2048 256 12 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.917239e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.660472e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.660472e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.702387 sec - 10,174,876,030 cycles # 2.745 GHz - 19,548,406,942 instructions # 1.92 insn per cycle - 3.720275920 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2119) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.584854e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.407963e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.407963e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 +TOTAL : 3.170189 sec + 10,407,251,362 cycles:u # 3.250 GHz (74.95%) + 50,130,427 stalled-cycles-frontend:u # 0.48% frontend cycles idle (75.02%) + 903,788,182 stalled-cycles-backend:u # 8.68% backend cycles idle (75.02%) + 19,380,592,287 instructions:u # 1.86 insn per cycle + # 0.05 stalled cycles per insn (75.02%) + 3.206228498 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2101) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039951670679E-002 Relative difference = 3.767475112924841e-09 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd0/check.exe -p 2048 256 12 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.029293e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.888276e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.888276e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.515786 sec - 9,723,051,646 cycles # 2.761 GHz - 18,859,468,530 instructions # 1.94 insn per cycle - 3.531121351 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1850) (512y: 174) (512z: 0) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039951670679E-002 -Relative difference = 3.767475112924841e-09 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd0/check.exe -p 2048 256 12 OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.839848e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.512898e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.512898e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.838759 sec - 8,110,381,366 cycles # 2.110 GHz - 14,814,382,883 instructions # 1.83 insn per cycle - 3.856049832 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1023) (512y: 64) (512z: 1327) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. 
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039951670679E-002 -Relative difference = 3.767475112924841e-09 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd1.txt index 385e9ed225..d77c48d5b0 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd1.txt @@ -1,209 +1,164 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasCurand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) +RNDGEN=hasNoCurand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.512y_m_inl0_hrd1' +CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-01-30_04:54:36 +DATE: 2024-01-31_13:50:29 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/gcheck.exe -p 2048 256 12 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/gcheck.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = HIP:MIX+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.428632e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.291557e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.197877e+08 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 0.704173 sec - 2,700,513,236 cycles # 2.833 GHz - 4,160,757,344 instructions # 1.54 insn per cycle - 1.040080983 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/gcheck.exe -p 2048 256 1 -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 154 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 5.862136e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.572230e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.892440e+07 ) sec^-1 +MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 +TOTAL : 4.648744 sec + 15,325,650,566 cycles:u # 3.273 GHz (74.92%) + 53,884,227 stalled-cycles-frontend:u # 0.35% frontend cycles idle (74.97%) + 6,927,612,166 stalled-cycles-backend:u # 45.20% backend cycles idle (75.07%) + 11,502,059,340 instructions:u # 0.75 insn per cycle + # 0.60 stalled cycles per insn (75.06%) + 4.708036945 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/fgcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.282804e-02 -Avg ME (F77/CUDA) = 1.2828039901590279E-002 -Relative difference = 7.671454200650844e-09 +Avg ME (F77/CUDA) = 1.2828039901590284E-002 +Relative difference = 7.67145379496374e-09 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/check.exe -p 2048 256 12 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.048898e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.230601e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.230601e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 6.414952 sec - 18,538,807,361 cycles # 2.888 GHz - 44,591,647,960 instructions # 2.41 insn per cycle - 6.426389730 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 498) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.312263e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.512184e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.512184e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 +TOTAL : 5.529721 sec + 18,672,385,681 cycles:u # 3.358 GHz (74.97%) + 51,499,925 stalled-cycles-frontend:u # 0.28% frontend cycles idle (74.97%) + 50,915,058 stalled-cycles-backend:u # 0.27% backend cycles idle (74.98%) + 44,718,089,446 instructions:u # 2.39 insn per cycle + # 0.00 stalled cycles per insn (74.98%) + 5.564148944 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 497) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039952548879E-002 Relative difference = 3.6990156841838714e-09 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd1/check.exe -p 2048 256 12 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd1/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.655305e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.204388e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.204388e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.214890 sec - 12,207,966,974 cycles # 2.892 GHz - 30,217,340,923 instructions # 2.48 insn per cycle - 4.236133486 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 2.023957e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.583519e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.583519e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 +TOTAL : 3.842400 sec + 12,751,330,882 cycles:u # 3.291 GHz (74.96%) + 49,158,998 stalled-cycles-frontend:u # 0.39% frontend cycles idle (75.02%) + 1,837,349,313 stalled-cycles-backend:u # 14.41% backend cycles idle (75.02%) + 30,166,434,355 instructions:u # 2.37 insn per cycle + # 0.06 stalled cycles per insn (75.02%) + 3.878606955 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1650) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039952548879E-002 Relative difference = 3.6990156841838714e-09 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd1/check.exe -p 2048 256 12 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd1/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.899712e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.627205e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.627205e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.730288 sec - 10,158,219,608 cycles # 2.719 GHz - 19,037,132,874 instructions # 1.87 insn per cycle - 3.746558078 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2072) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.607314e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.449826e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.449826e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 +TOTAL : 3.147113 sec + 10,343,092,922 cycles:u # 3.253 GHz (74.86%) + 50,008,630 stalled-cycles-frontend:u # 0.48% frontend cycles idle (74.98%) + 288,268,733 stalled-cycles-backend:u # 2.79% backend cycles idle (75.09%) + 18,718,552,499 instructions:u # 1.81 insn per cycle + # 0.02 stalled cycles per insn (75.09%) + 3.183147608 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2054) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039951670679E-002 Relative difference = 3.767475112924841e-09 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd1/check.exe -p 2048 256 12 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.048047e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.931283e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.931283e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.492411 sec - 9,571,391,969 cycles # 2.738 GHz - 18,453,150,608 instructions # 1.93 insn per cycle - 3.509341045 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1775) (512y: 174) (512z: 0) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039951670679E-002 -Relative difference = 3.767475112924841e-09 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd1/check.exe -p 2048 256 12 OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.170414e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.170487e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.170487e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.317793 sec - 7,240,072,684 cycles # 2.179 GHz - 13,244,781,040 instructions # 1.83 insn per cycle - 3.341198784 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 911) (512y: 56) (512z: 993) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. 
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039951670679E-002 -Relative difference = 3.767475112924841e-09 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt index 2453732bed..8fce5dda32 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt @@ -1,209 +1,164 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasCurand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) +RNDGEN=hasNoCurand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-01-30_04:55:10 +DATE: 2024-01-31_13:50:59 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.010275e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.133419e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.272295e+08 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.538830 sec - 2,187,358,219 cycles # 2.824 GHz - 3,139,905,445 instructions # 1.44 insn per cycle - 0.856073288 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-==PROF== Profiling "sigmaKin": launch__registers_per_thread 214
-==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
+EvtsPerSec[Rmb+ME] (23) = ( 2.775167e+07 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.954588e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.009008e+07 ) sec^-1
+MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0
+TOTAL : 1.065658 sec
+ 3,204,556,883 cycles:u # 2.916 GHz (75.27%)
+ 10,676,524 stalled-cycles-frontend:u # 0.33% frontend cycles idle (75.27%)
+ 1,170,364,335 stalled-cycles-backend:u # 36.52% backend cycles idle (75.11%)
+ 2,993,337,490 instructions:u # 0.93 insn per cycle
+ # 0.39 stalled cycles per insn (75.05%)
+ 1.123586308 seconds time elapsed
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2
 Avg ME (C++/CUDA) = 2.028807e+00
-Avg ME (F77/CUDA) = 2.0288063388516822
-Relative difference = 3.2588034143755247e-07
+Avg ME (F77/CUDA) = 2.0288063388516817
+Relative difference = 3.258803416564443e-07
 OK (relative difference <= 5E-3)
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe -p 2048 256 2 OMP=
+runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe -p 2048 256 2 OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
+Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.073581e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.135755e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.135755e+05 ) sec^-1
-MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0
-TOTAL : 5.168677 sec
- 14,980,961,047 cycles # 2.896 GHz
- 38,724,485,120 instructions # 2.58 insn per cycle
- 5.178651966 seconds time elapsed
+EvtsPerSec[Rmb+ME] (23) = ( 2.517443e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.582929e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.582929e+05 ) sec^-1
+MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0
+TOTAL : 4.347170 sec
+ 14,992,597,035 cycles:u # 3.423 GHz (74.98%)
+ 9,832,695 stalled-cycles-frontend:u # 0.07% frontend cycles idle (74.98%)
+ 755,893,575 stalled-cycles-backend:u # 5.04% backend cycles idle (74.99%)
+ 38,737,774,126 instructions:u # 2.58 insn per cycle
+ # 0.02 stalled cycles per insn (74.99%)
+ 4.382868805 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 719) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe
+runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe
 [ PASSED ] 6 tests.
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck.exe 2 64 2
 Avg ME (C++/C++) = 2.028807e+00
 Avg ME (F77/C++) = 2.0288063388515649
 Relative difference = 3.258803992249869e-07
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 2 OMP=
+runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 2 OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
+Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 3.523460e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.721558e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.721558e+05 ) sec^-1
-MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0
-TOTAL : 3.090012 sec
- 8,952,192,290 cycles # 2.893 GHz
- 24,430,503,496 instructions # 2.73 insn per cycle
- 3.108451490 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 2067) (avx2: 0) (512y: 0) (512z: 0)
+EvtsPerSec[Rmb+ME] (23) = ( 4.486268e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.711661e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.711661e+05 ) sec^-1
+MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0
+TOTAL : 2.519670 sec
+ 8,575,904,466 cycles:u # 3.359 GHz (74.93%)
+ 9,207,162 stalled-cycles-frontend:u # 0.11% frontend cycles idle (74.95%)
+ 198,649,396 stalled-cycles-backend:u # 2.32% backend cycles idle (74.95%)
+ 24,465,619,820 instructions:u # 2.85 insn per cycle
+ # 0.01 stalled cycles per insn (74.94%)
+ 2.556908642 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 2071) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe
+runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe
 [ PASSED ] 6 tests.
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2
 Avg ME (C++/C++) = 2.028807e+00
 Avg ME (F77/C++) = 2.0288063388515654
 Relative difference = 3.2588039900609506e-07
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 2 OMP=
+runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 2 OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
+Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 5.390626e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.850527e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 5.850527e+05 ) sec^-1
-MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0
-TOTAL : 2.056967 sec
- 5,535,228,908 cycles # 2.683 GHz
- 11,562,552,185 instructions # 2.09 insn per cycle
- 2.068379535 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2396) (512y: 0) (512z: 0)
+EvtsPerSec[Rmb+ME] (23) = ( 7.682012e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 8.275744e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.275744e+05 ) sec^-1
+MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0
+TOTAL : 1.547318 sec
+ 5,200,461,740 cycles:u # 3.290 GHz (74.70%)
+ 9,378,138 stalled-cycles-frontend:u # 0.18% frontend cycles idle (74.78%)
+ 1,061,960,703 stalled-cycles-backend:u # 20.42% backend cycles idle (75.02%)
+ 11,471,924,648 instructions:u # 2.21 insn per cycle
+ # 0.09 stalled cycles per insn (75.21%)
+ 1.584729817 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2383) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe
+runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe
 [ PASSED ] 6 tests.
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2
 Avg ME (C++/C++) = 2.028807e+00
 Avg ME (F77/C++) = 2.0288063388516204
 Relative difference = 3.2588037186351226e-07
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe -p 2048 256 2 OMP=
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
-FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 6.323214e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.965355e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.965355e+05 ) sec^-1
-MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0
-TOTAL : 1.769440 sec
- 4,825,692,035 cycles # 2.719 GHz
- 10,341,008,591 instructions # 2.14 insn per cycle
- 1.786949030 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1972) (512y: 131) (512z: 0)
+/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest.exe
-[ PASSED ] 6 tests.
--------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 2.028807e+00
-Avg ME (F77/C++) = 2.0288063388516204
-Relative difference = 3.2588037186351226e-07
-OK (relative difference <= 5E-3)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe -p 2048 256 2 OMP=
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
-FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 4.039053e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.289363e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.289363e+05 ) sec^-1
-MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0
-TOTAL : 2.707049 sec
- 4,944,236,176 cycles # 1.822 GHz
- 7,554,838,116 instructions # 1.53 insn per cycle
- 2.726854934 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1212) (512y: 65) (512z: 1543)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest.exe
-[ PASSED ] 6 tests.
--------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 2.028807e+00
-Avg ME (F77/C++) = 2.0288063388516204
-Relative difference = 3.2588037186351226e-07
-OK (relative difference <= 5E-3)
+/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo)
 =========================================================================
 TEST COMPLETED
diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_bridge.txt
index adcfa48462..2ef95ac563 100644
--- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_bridge.txt
+++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_bridge.txt
@@ -1,222 +1,170 @@
 export CUDACPP_RUNTIME_ENABLEFPE=on
-Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx
-OMPFLAGS=-fopenmp
-AVX=512y
+Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx
+OMPFLAGS=
+AVX=avx2
 FPTYPE=d
 HELINL=0
 HRDCOD=0
-RNDGEN=hasCurand
-Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1)
+RNDGEN=hasNoCurand
+Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1)
 make: Nothing to be done for 'gtestlibs'.
-CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0'
+CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0'
 make USEBUILDDIR=1 AVX=none
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 CUDACPP_BUILDDIR='build.none_d_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 make USEBUILDDIR=1 AVX=sse4
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 make USEBUILDDIR=1 AVX=avx2
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 make USEBUILDDIR=1 AVX=512y
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 make USEBUILDDIR=1 AVX=512z
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
-DATE: 2024-01-30_05:46:39
+DATE: 2024-01-31_14:39:30
-On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
+On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]:
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 2 --bridge OMP=
+runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 2 --bridge OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost
-WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
 WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288)
 WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288)
-Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
+Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 4.344134e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.848581e+07 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.848581e+07 ) sec^-1
-MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0
-TOTAL : 0.830257 sec
- 3,050,711,174 cycles # 2.837 GHz
- 4,744,287,151 instructions # 1.56 insn per cycle
- 1.134543078 seconds time elapsed
-runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost
-WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
-WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288)
-WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288)
-==PROF== Profiling "sigmaKin": launch__registers_per_thread 214
-==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
+EvtsPerSec[Rmb+ME] (23) = ( 5.960794e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.780103e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.780103e+07 ) sec^-1
+MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0
+TOTAL : 1.233984 sec
+ 3,742,212,800 cycles:u # 2.937 GHz (74.94%)
+ 21,590,088 stalled-cycles-frontend:u # 0.58% frontend cycles idle (74.91%)
+ 13,609,560 stalled-cycles-backend:u # 0.36% backend cycles idle (74.90%)
+ 3,915,240,315 instructions:u # 1.05 insn per cycle
+ # 0.01 stalled cycles per insn (74.83%)
+ 1.300304990 seconds time elapsed
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2
 Avg ME (C++/CUDA) = 2.028807e+00
-Avg ME (F77/CUDA) = 2.0288063388516822
-Relative difference = 3.2588034143755247e-07
+Avg ME (F77/CUDA) = 2.0288063388516817
+Relative difference = 3.258803416564443e-07
 OK (relative difference <= 5E-3)
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP=
+runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 WARNING! Instantiate host Bridge (nevt=524288)
-Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK
+Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.051768e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.112380e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.112380e+05 ) sec^-1
-MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0
-TOTAL : 5.299158 sec
- 15,311,911,023 cycles # 2.886 GHz
- 38,783,796,929 instructions # 2.53 insn per cycle
- 5.307164517 seconds time elapsed
+EvtsPerSec[Rmb+ME] (23) = ( 2.507286e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.571820e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.571820e+05 ) sec^-1
+MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0
+TOTAL : 4.444725 sec
+ 15,123,005,974 cycles:u # 3.369 GHz (74.95%)
+ 9,450,189 stalled-cycles-frontend:u # 0.06% frontend cycles idle (75.03%)
+ 1,114,462,839 stalled-cycles-backend:u # 7.37% backend cycles idle (74.97%)
+ 38,884,708,432 instructions:u # 2.57 insn per cycle
+ # 0.03 stalled cycles per insn (74.98%)
+ 4.491345952 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 719) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe
+runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe
 [ PASSED ] 6 tests.
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck.exe 2 64 2
 Avg ME (C++/C++) = 2.028807e+00
 Avg ME (F77/C++) = 2.0288063388515649
 Relative difference = 3.258803992249869e-07
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP=
+runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 WARNING! Instantiate host Bridge (nevt=524288)
-Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK
+Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 3.466739e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.657869e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.657869e+05 ) sec^-1
-MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0
-TOTAL : 3.216841 sec
- 9,297,524,138 cycles # 2.885 GHz
- 24,613,723,387 instructions # 2.65 insn per cycle
- 3.224967553 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 2067) (avx2: 0) (512y: 0) (512z: 0)
+EvtsPerSec[Rmb+ME] (23) = ( 4.456681e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.678201e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.678201e+05 ) sec^-1
+MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0
+TOTAL : 2.622127 sec
+ 8,746,750,895 cycles:u # 3.280 GHz (74.80%)
+ 10,047,928 stalled-cycles-frontend:u # 0.11% frontend cycles idle (74.93%)
+ 222,114,170 stalled-cycles-backend:u # 2.54% backend cycles idle (75.07%)
+ 24,624,407,451 instructions:u # 2.82 insn per cycle
+ # 0.01 stalled cycles per insn (75.13%)
+ 2.670262132 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 2071) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe
+runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe
 [ PASSED ] 6 tests.
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2
 Avg ME (C++/C++) = 2.028807e+00
 Avg ME (F77/C++) = 2.0288063388515654
 Relative difference = 3.2588039900609506e-07
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP=
+runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 WARNING! Instantiate host Bridge (nevt=524288)
-Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK
+Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 5.363369e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.815752e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 5.815752e+05 ) sec^-1
-MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0
-TOTAL : 2.149413 sec
- 5,860,102,645 cycles # 2.720 GHz
- 11,849,599,468 instructions # 2.02 insn per cycle
- 2.157292568 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2396) (512y: 0) (512z: 0)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe
-[ PASSED ] 6 tests.
--------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 2.028807e+00
-Avg ME (F77/C++) = 2.0288063388516204
-Relative difference = 3.2588037186351226e-07
-OK (relative difference <= 5E-3)
+EvtsPerSec[Rmb+ME] (23) = ( 7.577155e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 8.154338e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.154338e+05 ) sec^-1
+MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0
+TOTAL : 1.655053 sec
+ 5,347,323,924 cycles:u # 3.147 GHz (74.85%)
+ 10,320,638 stalled-cycles-frontend:u # 0.19% frontend cycles idle (74.96%)
+ 1,075,680,183 stalled-cycles-backend:u # 20.12% backend cycles idle (75.07%)
+ 11,838,991,239 instructions:u # 2.21 insn per cycle
+ # 0.09 stalled cycles per insn (75.09%)
+ 1.704077905 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2383) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP=
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-WARNING! Instantiate host Bridge (nevt=524288)
-Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK
-FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 6.162124e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.773170e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.773170e+05 ) sec^-1
-MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0
-TOTAL : 1.893501 sec
- 5,161,881,245 cycles # 2.717 GHz
- 10,626,023,875 instructions # 2.06 insn per cycle
- 1.901369932 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1972) (512y: 131) (512z: 0)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest.exe
+runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe
 [ PASSED ] 6 tests.
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2
 Avg ME (C++/C++) = 2.028807e+00
 Avg ME (F77/C++) = 2.0288063388516204
 Relative difference = 3.2588037186351226e-07
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP=
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-WARNING! Instantiate host Bridge (nevt=524288)
-Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK
-FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 3.945106e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.186618e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.186618e+05 ) sec^-1
-MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0
-TOTAL : 2.853446 sec
- 5,298,812,686 cycles # 1.853 GHz
- 7,800,536,018 instructions # 1.47 insn per cycle
- 2.861501356 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1212) (512y: 65) (512z: 1543)
+/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest.exe
-[ PASSED ] 6 tests.
--------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 2.028807e+00
-Avg ME (F77/C++) = 2.0288063388516204
-Relative difference = 3.2588037186351226e-07
-OK (relative difference <= 5E-3)
+/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo)
 =========================================================================
 TEST COMPLETED
diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_common.txt
index b23b4b948e..933d7c92ad 100644
--- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_common.txt
+++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_common.txt
@@ -1,209 +1,164 @@
 export CUDACPP_RUNTIME_ENABLEFPE=on
-Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx
-OMPFLAGS=-fopenmp
-AVX=512y
+Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx
+OMPFLAGS=
+AVX=avx2
 FPTYPE=d
 HELINL=0
 HRDCOD=0
-RNDGEN=hasCurand
-Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1)
+RNDGEN=hasNoCurand
+Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1)
 make: Nothing to be done for 'gtestlibs'.
-CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0'
+CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0'
 make USEBUILDDIR=1 AVX=none
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 CUDACPP_BUILDDIR='build.none_d_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 make USEBUILDDIR=1 AVX=sse4
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 make USEBUILDDIR=1 AVX=avx2
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 make USEBUILDDIR=1 AVX=512y
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 make USEBUILDDIR=1 AVX=512z
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
-DATE: 2024-01-30_06:00:31
+DATE: 2024-01-31_14:53:21
-On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
+On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]:
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 2 --common OMP=
+runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 2 --common OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:DBL+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK
+Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 4.565155e+07 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.155605e+08 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.269580e+08 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.670915e+07 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.961936e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.016085e+07 ) sec^-1
 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0
-TOTAL : 0.625620 sec
- 2,433,553,514 cycles # 2.839 GHz
- 3,531,317,325 instructions # 1.45 insn per cycle
- 0.914840106 seconds time elapsed
-runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --common
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-==PROF== Profiling "sigmaKin": launch__registers_per_thread 214
-==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
+TOTAL : 1.067979 sec
+ 3,228,626,800 cycles:u # 2.930 GHz (74.62%)
+ 10,712,797 stalled-cycles-frontend:u # 0.33% frontend cycles idle (74.91%)
+ 1,139,960,341 stalled-cycles-backend:u # 35.31% backend cycles idle (75.37%)
+ 2,973,885,711 instructions:u # 0.92 insn per cycle
+ # 0.38 stalled cycles per insn (75.32%)
+ 1.123409557 seconds time elapsed
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2
 Avg ME (C++/CUDA) = 2.028807e+00
-Avg ME (F77/CUDA) = 2.0288063388516822
-Relative difference = 3.2588034143755247e-07
+Avg ME (F77/CUDA) = 2.0288063388516817
+Relative difference = 3.258803416564443e-07
 OK (relative difference <= 5E-3)
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe -p 2048 256 2 --common OMP=
+runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe -p 2048 256 2 --common OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.073060e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.134707e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.134707e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.514410e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.579489e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.579489e+05 ) sec^-1
 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0
-TOTAL : 5.229961 sec
- 15,157,435,498 cycles # 2.896 GHz
- 38,739,723,091 instructions # 2.56 insn per cycle
- 5.236486145 seconds time elapsed
+TOTAL : 4.352579 sec
+ 15,017,071,691 cycles:u # 3.424 GHz (75.01%)
+ 10,104,870 stalled-cycles-frontend:u # 0.07% frontend cycles idle (75.01%)
+ 1,131,250,444 stalled-cycles-backend:u # 7.53% backend cycles idle (75.01%)
+ 38,711,980,403 instructions:u # 2.58 insn per cycle
+ # 0.03 stalled cycles per insn (74.93%)
+ 4.388124543 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 719) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe
+runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe
 [ PASSED ] 6 tests.
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck.exe 2 64 2
 Avg ME (C++/C++) = 2.028807e+00
 Avg ME (F77/C++) = 2.0288063388515649
 Relative difference = 3.258803992249869e-07
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 2 --common OMP=
+runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 2 --common OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 3.526696e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.723798e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.723798e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 4.486799e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.712857e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.712857e+05 ) sec^-1
 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0
-TOTAL : 3.143207 sec
- 9,122,833,846 cycles # 2.898 GHz
- 24,428,638,513 instructions # 2.68 insn per cycle
- 3.149727451 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 2067) (avx2: 0) (512y: 0) (512z: 0)
+TOTAL : 2.519538 sec
+ 8,573,730,911 cycles:u # 3.359 GHz (74.95%)
+ 9,322,072 stalled-cycles-frontend:u # 0.11% frontend cycles idle (74.96%)
+ 200,778,264 stalled-cycles-backend:u # 2.34% backend cycles idle (74.94%)
+ 24,461,372,498 instructions:u # 2.85 insn per cycle
+ # 0.01 stalled cycles per insn (74.93%)
+ 2.555255997 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 2071) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe
+runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe
 [ PASSED ] 6 tests.
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2
 Avg ME (C++/C++) = 2.028807e+00
 Avg ME (F77/C++) = 2.0288063388515654
 Relative difference = 3.2588039900609506e-07
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 2 --common OMP=
+runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 2 --common OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 5.453278e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.923487e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 5.923487e+05 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 7.681424e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 8.275449e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.275449e+05 ) sec^-1
 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0
-TOTAL : 2.094012 sec
- 5,713,399,327 cycles # 2.721 GHz
- 11,544,398,198 instructions # 2.02 insn per cycle
- 2.100575275 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2396) (512y: 0) (512z: 0)
+TOTAL : 1.547574 sec
+ 5,204,688,501 cycles:u # 3.294 GHz (74.69%)
+ 9,388,391 stalled-cycles-frontend:u # 0.18% frontend cycles idle (74.70%)
+ 1,063,036,716 stalled-cycles-backend:u # 20.42% backend cycles idle (74.91%)
+ 11,504,264,567 instructions:u # 2.21 insn per cycle
+ # 0.09 stalled cycles per insn (75.15%)
+ 1.582904825 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2383) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe
+runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe
 [ PASSED ] 6 tests.
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2
 Avg ME (C++/C++) = 2.028807e+00
 Avg ME (F77/C++) = 2.0288063388516204
 Relative difference = 3.2588037186351226e-07
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe -p 2048 256 2 --common OMP=
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK
-FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 6.340982e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.000324e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.000324e+05 ) sec^-1
-MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0
-TOTAL : 1.826424 sec
- 5,007,819,577 cycles # 2.734 GHz
- 10,288,512,439 instructions # 2.05 insn per cycle
- 1.833139039 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1972) (512y: 131) (512z: 0)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest.exe
-[ PASSED ] 6 tests.
+/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo)
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 2.028807e+00
-Avg ME (F77/C++) = 2.0288063388516204
-Relative difference = 3.2588037186351226e-07
-OK (relative difference <= 5E-3)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe -p 2048 256 2 --common OMP=
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK
-FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 4.024689e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.274198e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.274198e+05 ) sec^-1
-MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0
-TOTAL : 2.778758 sec
- 5,115,298,192 cycles # 1.837 GHz
- 7,503,411,062 instructions # 1.47 insn per cycle
- 2.785395708 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1212) (512y: 65) (512z: 1543)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest.exe
-[ PASSED ] 6 tests.
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063388516204 -Relative difference = 3.2588037186351226e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_curhst.txt index 66a621d02a..3800ac2c9e 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_curhst.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_curhst.txt @@ -1,209 +1,133 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasCurand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) +RNDGEN=hasNoCurand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-01-30_05:57:01 +DATE: 2024-01-31_14:50:29 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 2 --curhst OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.578143e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.159887e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.277521e+08 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.567703 sec - 2,256,406,335 cycles # 2.832 GHz - 3,552,290,336 instructions # 1.57 insn per cycle - 0.856591173 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --curhst +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 2 --curhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe: Aborted + 51,935,826 cycles:u # 2.391 GHz (63.20%) + 44,099 stalled-cycles-frontend:u # 0.08% frontend cycles idle (63.20%) + 647,716 stalled-cycles-backend:u # 1.25% backend cycles idle (63.20%) + 39,610,708 instructions:u # 0.76 insn per cycle + # 0.02 stalled cycles per insn (65.11%) + 0.022647752 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 2.028807e+00 -Avg ME (F77/CUDA) = 2.0288063388516822 -Relative difference = 3.2588034143755247e-07 +Avg ME (F77/CUDA) = 2.0288063388516817 +Relative difference = 3.258803416564443e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.061569e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.123242e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.123242e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 5.196625 sec - 14,980,592,489 cycles # 2.880 GHz - 38,723,298,937 instructions # 2.58 insn per cycle - 5.203366404 seconds time elapsed +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe: Aborted + 53,696,974 cycles:u # 2.491 GHz (62.92%) + 44,914 stalled-cycles-frontend:u # 0.08% frontend cycles idle (62.92%) + 548,464 stalled-cycles-backend:u # 1.02% backend cycles idle (62.92%) + 41,043,423 instructions:u # 0.76 insn per cycle + # 0.01 stalled cycles per insn (64.63%) + 0.022851105 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 719) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515649 Relative difference = 3.258803992249869e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.518700e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.715553e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.715553e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.090850 sec - 8,946,489,145 cycles # 2.890 GHz - 24,429,263,818 instructions # 2.73 insn per cycle - 3.097198356 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2067) (avx2: 0) (512y: 0) (512z: 0) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe: Aborted + 53,124,098 cycles:u # 2.450 GHz (63.14%) + 47,517 stalled-cycles-frontend:u # 0.09% frontend cycles idle (63.14%) + 599,506 stalled-cycles-backend:u # 1.13% backend cycles idle (63.14%) + 41,225,044 instructions:u # 0.78 insn per cycle + # 0.01 stalled cycles per insn (65.15%) + 0.023077322 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 2071) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
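For reference, the "Relative difference" printed by these cmpExe comparisons matches |ref - test| / |ref| checked against a 5E-3 tolerance: |2.028807 - 2.0288063388516817| / 2.028807 ≈ 3.2588e-07, exactly the figure in the log. A minimal sketch of that arithmetic (illustrative only, not the repository's comparison code):

  // Reproduce the "Relative difference = ... / OK (relative difference <= 5E-3)"
  // lines from a pair of Avg ME values taken from the log above.
  #include <cmath>
  #include <cstdio>
  int main()
  {
    const double avgMeRef  = 2.028807e+00;        // Avg ME (C++/CUDA)
    const double avgMeTest = 2.0288063388516817;  // Avg ME (F77/CUDA)
    const double relDiff = std::fabs( avgMeRef - avgMeTest ) / std::fabs( avgMeRef );
    std::printf( "Relative difference = %.16e\n", relDiff );
    std::printf( relDiff <= 5e-3 ? "OK (relative difference <= 5E-3)\n"
                                 : "ERROR (relative difference > 5E-3)\n" );
    return 0;
  }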
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515654 Relative difference = 3.2588039900609506e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.476437e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.948509e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.948509e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.025257 sec - 5,523,468,825 cycles # 2.720 GHz - 11,561,737,650 instructions # 2.09 insn per cycle - 2.031752517 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2396) (512y: 0) (512z: 0) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe: Aborted + 54,938,080 cycles:u # 2.544 GHz (62.98%) + 43,710 stalled-cycles-frontend:u # 0.08% frontend cycles idle (62.98%) + 616,158 stalled-cycles-backend:u # 1.12% backend cycles idle (62.98%) + 39,751,973 instructions:u # 0.72 insn per cycle + # 0.02 stalled cycles per insn (64.78%) + 0.022926913 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2383) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.358069e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.007551e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.007551e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.759129 sec - 4,801,841,802 cycles # 2.722 GHz - 10,338,992,386 instructions # 2.15 insn per cycle - 1.765685267 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1972) (512y: 131) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063388516204 -Relative difference = 3.2588037186351226e-07 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.036811e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.287808e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.287808e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.707972 sec - 4,942,835,417 cycles # 1.822 GHz - 7,554,452,946 instructions # 1.53 insn per cycle - 2.714536601 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1212) (512y: 65) (512z: 1543) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063388516204 -Relative difference = 3.2588037186351226e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_rmbhst.txt index defb46a739..736354f8c0 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_rmbhst.txt @@ -1,211 +1,164 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasCurand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) +RNDGEN=hasNoCurand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-01-30_05:53:36 +DATE: 2024-01-31_14:47:17 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 2 --rmbhst OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 2 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.688012e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.154108e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.269539e+08 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.720498 sec - 2,707,766,521 cycles # 2.848 GHz - 4,278,662,893 instructions # 1.58 insn per cycle - 1.009865256 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --rmbhst -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 6.832332e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.949244e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.003292e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 1.180540 sec + 3,633,951,217 cycles:u # 2.981 GHz (75.06%) + 21,429,347 stalled-cycles-frontend:u # 0.59% frontend cycles idle (75.09%) + 1,154,914,290 stalled-cycles-backend:u # 31.78% backend cycles idle (75.12%) + 3,842,620,949 instructions:u # 1.06 insn per cycle + # 0.30 stalled cycles per insn (75.09%) + 1.235886270 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 2.028807e+00 -Avg ME (F77/CUDA) = 2.0288063388516822 -Relative difference = 3.2588034143755247e-07 +Avg ME (F77/CUDA) = 2.0288063388516817 +Relative difference = 3.258803416564443e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.066193e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.127901e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.127901e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 5.184582 sec - 14,984,631,159 cycles # 2.888 GHz - 38,723,388,390 instructions # 2.58 insn per cycle - 5.191155299 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 2.514363e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.579371e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.579371e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 4.352130 sec + 15,020,504,376 cycles:u # 3.425 GHz (74.96%) + 10,169,638 stalled-cycles-frontend:u # 0.07% frontend cycles idle (75.01%) + 1,098,660,659 stalled-cycles-backend:u # 7.31% backend cycles idle (75.01%) + 38,671,250,350 instructions:u # 2.57 insn per cycle + # 0.03 stalled cycles per insn (75.02%) + 4.388382110 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 719) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515649 Relative difference = 3.258803992249869e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.511860e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.708377e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.708377e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.095761 sec - 8,950,231,816 cycles # 2.886 GHz - 24,430,052,071 instructions # 2.73 insn per cycle - 3.102564983 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2067) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 4.492938e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.721485e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.721485e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 2.515981 sec + 8,578,848,652 cycles:u # 3.365 GHz (74.92%) + 9,425,944 stalled-cycles-frontend:u # 0.11% frontend cycles idle (74.91%) + 196,079,645 stalled-cycles-backend:u # 2.29% backend cycles idle (74.90%) + 24,441,818,243 instructions:u # 2.85 insn per cycle + # 0.01 stalled cycles per insn (74.97%) + 2.551686085 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 2071) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515654 Relative difference = 3.2588039900609506e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.454029e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.925499e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.925499e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.033615 sec - 5,531,582,240 cycles # 2.713 GHz - 11,562,288,179 instructions # 2.09 insn per cycle - 2.040383969 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2396) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 7.684584e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.279871e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.279871e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 1.547039 sec + 5,203,617,968 cycles:u # 3.293 GHz (74.69%) + 9,371,257 stalled-cycles-frontend:u # 0.18% frontend cycles idle (74.69%) + 1,063,000,473 stalled-cycles-backend:u # 20.43% backend cycles idle (74.93%) + 11,490,848,219 instructions:u # 2.21 insn per cycle + # 0.09 stalled cycles per insn (75.19%) + 1.582473684 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2383) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.327959e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.977069e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.977069e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.768254 sec - 4,816,907,251 cycles # 2.716 GHz - 10,339,308,595 instructions # 2.15 insn per cycle - 1.774968996 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1972) (512y: 131) (512z: 0) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063388516204 -Relative difference = 3.2588037186351226e-07 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.992436e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.241387e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.241387e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.737995 sec - 4,943,973,305 cycles # 1.803 GHz - 7,555,690,658 instructions # 1.53 insn per cycle - 2.744582139 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1212) (512y: 65) (512z: 1543) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. 
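The "... is not supported (no avx512vl in /proc/cpuinfo)" lines replacing the old 512y/512z results reflect a runtime CPU-capability guard: the AMD EPYC host lacks AVX512, so those binaries print a message and exit instead of crashing on an illegal instruction. A minimal sketch of such a guard, assuming the flag is looked up in the "flags" line of /proc/cpuinfo (function names are hypothetical):

  // Illustrative sketch: skip an AVX512 build on a CPU without avx512vl,
  // printing a message like the one in the log instead of executing
  // unsupported instructions.
  #include <fstream>
  #include <iostream>
  #include <string>
  bool cpuHasFlag( const std::string& flag )
  {
    std::ifstream cpuinfo( "/proc/cpuinfo" );
    std::string line;
    while( std::getline( cpuinfo, line ) )
      if( line.rfind( "flags", 0 ) == 0 && line.find( " " + flag ) != std::string::npos )
        return true; // flag found in the space-separated flags list
    return false;
  }
  int main()
  {
    if( !cpuHasFlag( "avx512vl" ) )
    {
      std::cout << "check.exe is not supported (no avx512vl in /proc/cpuinfo)" << std::endl;
      return 0;
    }
    // ... run the AVX512 workload ...
  }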
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063388516204 -Relative difference = 3.2588037186351226e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd1.txt index fe6f195aa6..eeb8545967 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd1.txt @@ -1,209 +1,164 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasCurand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) +RNDGEN=hasNoCurand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.512y_d_inl0_hrd1' +CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-01-30_04:55:39 +DATE: 2024-01-31_13:51:21 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/gcheck.exe -p 2048 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/gcheck.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.125481e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.158117e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.273663e+08 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.534485 sec - 2,191,778,134 cycles # 2.834 GHz - 3,140,951,827 instructions # 1.43 insn per cycle - 0.850685752 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/gcheck.exe -p 2048 256 1 -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 208 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 2.811456e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.916789e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.969732e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 1.063480 sec + 3,238,472,233 cycles:u # 2.948 GHz (74.73%) + 10,776,735 stalled-cycles-frontend:u # 0.33% frontend cycles idle (74.74%) + 1,146,001,929 stalled-cycles-backend:u # 35.39% backend cycles idle (74.94%) + 2,974,660,974 instructions:u # 0.92 insn per cycle + # 0.39 stalled cycles per insn (74.89%) + 1.119704313 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/fgcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 2.028807e+00 -Avg ME (F77/CUDA) = 2.0288063388516822 -Relative difference = 3.2588034143755247e-07 +Avg ME (F77/CUDA) = 2.0288063388516817 +Relative difference = 3.258803416564443e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/check.exe -p 2048 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.109309e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.173415e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.173415e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 5.081724 sec - 14,685,294,357 cycles # 2.887 GHz - 39,544,026,748 instructions # 2.69 insn per cycle - 5.093038112 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 2.435903e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.497119e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.497119e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 4.487080 sec + 15,515,065,184 cycles:u # 3.433 GHz (74.87%) + 10,157,980 stalled-cycles-frontend:u # 0.07% frontend cycles idle (74.96%) + 13,998,426 stalled-cycles-backend:u # 0.09% backend cycles idle (75.05%) + 39,485,501,686 instructions:u # 2.54 insn per cycle + # 0.00 stalled cycles per insn (75.05%) + 4.522693165 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 596) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515649 Relative difference = 3.258803992249869e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/check.exe -p 2048 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.661768e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.875473e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.875473e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.973090 sec - 8,600,238,365 cycles # 2.886 GHz - 23,576,508,735 instructions # 2.74 insn per cycle - 2.991032269 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1948) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 4.393621e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.610679e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.610679e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 2.568675 sec + 8,779,562,714 cycles:u # 3.375 GHz (74.79%) + 10,583,458 stalled-cycles-frontend:u # 0.12% frontend cycles idle (74.94%) + 1,218,655,804 stalled-cycles-backend:u # 13.88% backend cycles idle (75.09%) + 23,482,387,475 instructions:u # 2.67 insn per cycle + # 0.05 stalled cycles per insn (75.10%) + 2.605973588 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 1952) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515654 Relative difference = 3.2588039900609506e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/check.exe -p 2048 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.966204e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.352181e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.352181e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.222703 sec - 5,964,350,122 cycles # 2.676 GHz - 13,193,903,385 instructions # 2.21 insn per cycle - 2.290428549 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2560) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 6.904717e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.380877e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.380877e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 1.700922 sec + 5,712,275,197 cycles:u # 3.294 GHz (74.88%) + 9,340,024 stalled-cycles-frontend:u # 0.16% frontend cycles idle (75.10%) + 997,171,641 stalled-cycles-backend:u # 17.46% backend cycles idle (75.10%) + 13,125,301,039 instructions:u # 2.30 insn per cycle + # 0.08 stalled cycles per insn (75.10%) + 1.738237090 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2547) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/check.exe -p 2048 256 2 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.425705e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.897406e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.897406e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.043603 sec - 5,539,021,528 cycles # 2.702 GHz - 12,103,311,893 instructions # 2.19 insn per cycle - 2.060365335 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2030) (512y: 278) (512z: 0) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063388516204 -Relative difference = 3.2588037186351226e-07 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/check.exe -p 2048 256 2 OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.662802e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.870728e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.870728e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.974749 sec - 5,366,303,915 cycles # 1.800 GHz - 9,381,926,109 instructions # 1.75 insn per cycle - 2.994553633 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1350) (512y: 88) (512z: 1989) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. 
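As a sanity check on the throughput figures in these logs: "-p 2048 256 2" means 2048 blocks x 256 threads x 2 iterations = 1,048,576 events, and each EvtsPerSec value is that event count divided by the time spent in the corresponding phase (a subset of TOTAL, which is why e.g. 2.5e+05 events/s exceeds 1048576 / 4.35 s ~ 2.4e+05 for the scalar CPU run). A minimal sketch of the arithmetic, with illustrative variable names:

  // Illustrative: how an EvtsPerSec figure relates to the "-p b t n" arguments.
  #include <cstdio>
  int main()
  {
    const int gpublocks = 2048, gputhreads = 256, niter = 2; // "-p 2048 256 2"
    const double nevt = double( gpublocks ) * gputhreads * niter; // 1048576 events
    const double phaseSecs = 4.170; // time in the Rmb+ME phase (< TOTAL 4.352130 s)
    std::printf( "nevt = %.0f, EvtsPerSec ~ %.6e\n", nevt, nevt / phaseSecs );
    return 0;
  }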
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063388516204 -Relative difference = 3.2588037186351226e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd0.txt index 8cd37966a9..087aae64d6 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd0.txt @@ -1,209 +1,164 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasCurand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) +RNDGEN=hasNoCurand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.512y_d_inl1_hrd0' +CUDACPP_BUILDDIR='build.avx2_d_inl1_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-01-30_05:35:53 +DATE: 2024-01-31_14:21:00 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/gcheck.exe -p 2048 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/gcheck.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.561376e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.154966e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.270589e+08 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.529335 sec - 2,159,762,911 cycles # 2.829 GHz - 3,107,803,545 instructions # 1.44 insn per cycle - 0.822533200 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/gcheck.exe -p 2048 256 1 -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 2.761044e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.954770e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.008707e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 1.068074 sec + 3,269,729,629 cycles:u # 2.969 GHz (75.15%) + 10,826,176 stalled-cycles-frontend:u # 0.33% frontend cycles idle (74.86%) + 1,132,096,656 stalled-cycles-backend:u # 34.62% backend cycles idle (74.78%) + 3,039,323,944 instructions:u # 0.93 insn per cycle + # 0.37 stalled cycles per insn (74.62%) + 1.127599788 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/fgcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 2.028807e+00 -Avg ME (F77/CUDA) = 2.0288063388516822 -Relative difference = 3.2588034143755247e-07 +Avg ME (F77/CUDA) = 2.0288063388516817 +Relative difference = 3.258803416564443e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/check.exe -p 2048 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.227004e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.298943e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.298943e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 4.818572 sec - 13,907,927,893 cycles # 2.883 GHz - 35,849,684,316 instructions # 2.58 insn per cycle - 4.825096940 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 2.851563e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.936112e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.936112e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 3.858759 sec + 13,289,116,499 cycles:u # 3.415 GHz (74.94%) + 9,877,100 stalled-cycles-frontend:u # 0.07% frontend cycles idle (74.94%) + 561,229,808 stalled-cycles-backend:u # 4.22% backend cycles idle (74.92%) + 35,881,093,632 instructions:u # 2.70 insn per cycle + # 0.02 stalled cycles per insn (74.93%) + 3.894497837 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1078) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515649 Relative difference = 3.258803992249869e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd0/check.exe -p 2048 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd0/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.848483e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.087109e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.087109e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.835129 sec - 8,213,185,511 cycles # 2.892 GHz - 21,908,282,308 instructions # 2.67 insn per cycle - 2.841971377 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 4.420765e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.642646e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.642646e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 2.552803 sec + 8,706,839,853 cycles:u # 3.367 GHz (74.99%) + 10,160,636 stalled-cycles-frontend:u # 0.12% frontend cycles idle (74.95%) + 2,349,532,908 stalled-cycles-backend:u # 26.98% backend cycles idle (74.97%) + 21,909,575,708 instructions:u # 2.52 insn per cycle + # 0.11 stalled cycles per insn (74.96%) + 2.590046894 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2334) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515654 Relative difference = 3.2588039900609506e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/check.exe -p 2048 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.473983e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.948336e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.948336e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.025782 sec - 5,530,364,572 cycles # 2.723 GHz - 12,076,349,288 instructions # 2.18 insn per cycle - 2.032542267 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3062) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 6.666808e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.109706e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.109706e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 1.755370 sec + 5,890,439,803 cycles:u # 3.293 GHz (74.96%) + 9,331,242 stalled-cycles-frontend:u # 0.16% frontend cycles idle (74.97%) + 2,249,642,175 stalled-cycles-backend:u # 38.19% backend cycles idle (75.00%) + 12,103,314,118 instructions:u # 2.05 insn per cycle + # 0.19 stalled cycles per insn (74.99%) + 1.792746465 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3046) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd0/check.exe -p 2048 256 2 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.936500e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.499652e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.499652e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.876359 sec - 5,112,015,535 cycles # 2.716 GHz - 11,141,551,976 instructions # 2.18 insn per cycle - 1.883163972 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2527) (512y: 224) (512z: 0) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd0/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063388516204 -Relative difference = 3.2588037186351226e-07 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd0/check.exe -p 2048 256 2 OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.149105e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.416003e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.416003e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.637284 sec - 4,829,728,502 cycles # 1.827 GHz - 8,842,382,666 instructions # 1.83 insn per cycle - 2.644418009 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1821) (512y: 97) (512z: 2034) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd0/runTest.exe -[ PASSED ] 6 tests. 
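Note on the skipped 512y/512z runs above: on the new AMD EPYC 7A53 host these executables now exit early with "is not supported (no avx512vl in /proc/cpuinfo)" instead of running the AVX512 builds. A minimal sketch of such a runtime capability guard is shown below, using GCC/Clang's __builtin_cpu_supports; this is an illustrative assumption, not the actual logic inside check.exe, which may probe /proc/cpuinfo directly as the message suggests.

// Minimal sketch (assumption, for illustration only) of an avx512vl guard
// matching the log message "is not supported (no avx512vl in /proc/cpuinfo)".
#include <cstdio>
#include <cstdlib>

int main()
{
#if defined( __GNUC__ ) && defined( __x86_64__ )
  if( !__builtin_cpu_supports( "avx512vl" ) )
  {
    std::printf( "not supported (no avx512vl on this CPU)\n" );
    return EXIT_SUCCESS; // skip gracefully, as the new logs do, instead of dying on SIGILL
  }
#endif
  std::printf( "avx512vl available: the 512y/512z builds can run here\n" );
  return EXIT_SUCCESS;
}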
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063388516204 -Relative difference = 3.2588037186351226e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd1.txt index 8eec31c0d3..b62288ac7e 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd1.txt @@ -1,209 +1,164 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasCurand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) +RNDGEN=hasNoCurand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.512y_d_inl1_hrd1' +CUDACPP_BUILDDIR='build.avx2_d_inl1_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-01-30_05:36:21 +DATE: 2024-01-31_14:21:22 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/gcheck.exe -p 2048 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/gcheck.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.565410e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.157958e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.274503e+08 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.528794 sec - 2,178,979,969 cycles # 2.840 GHz - 3,111,172,536 instructions # 1.43 insn per cycle - 0.825662442 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/gcheck.exe -p 2048 256 1 -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 208 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 2.783075e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.914487e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.967342e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 1.064985 sec + 3,240,436,157 cycles:u # 2.942 GHz (74.55%) + 10,749,114 stalled-cycles-frontend:u # 0.33% frontend cycles idle (75.20%) + 1,139,603,683 stalled-cycles-backend:u # 35.17% backend cycles idle (75.38%) + 3,002,755,196 instructions:u # 0.93 insn per cycle + # 0.38 stalled cycles per insn (75.37%) + 1.124308464 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/fgcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 2.028807e+00 -Avg ME (F77/CUDA) = 2.0288063388516822 -Relative difference = 3.2588034143755247e-07 +Avg ME (F77/CUDA) = 2.0288063388516817 +Relative difference = 3.258803416564443e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/check.exe -p 2048 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.483554e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.573797e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.573797e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 4.331634 sec - 12,513,147,299 cycles # 2.885 GHz - 35,729,824,625 instructions # 2.86 insn per cycle - 4.338115382 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 3.211326e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.318376e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.318376e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 3.447382 sec + 11,836,561,603 cycles:u # 3.401 GHz (74.95%) + 9,684,924 stalled-cycles-frontend:u # 0.08% frontend cycles idle (74.97%) + 23,063,598 stalled-cycles-backend:u # 0.19% backend cycles idle (74.97%) + 35,762,758,230 instructions:u # 3.02 insn per cycle + # 0.00 stalled cycles per insn (74.95%) + 3.483054665 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 469) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515649 Relative difference = 3.258803992249869e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd1/check.exe -p 2048 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd1/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.944859e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.193242e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.193242e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.766913 sec - 8,026,265,535 cycles # 2.895 GHz - 21,260,291,484 instructions # 2.65 insn per cycle - 2.773559046 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 4.808666e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.070704e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.070704e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 2.362709 sec + 8,036,718,287 cycles:u # 3.354 GHz (75.01%) + 10,456,947 stalled-cycles-frontend:u # 0.13% frontend cycles idle (74.96%) + 1,748,701,910 stalled-cycles-backend:u # 21.76% backend cycles idle (74.98%) + 21,246,867,161 instructions:u # 2.64 insn per cycle + # 0.08 stalled cycles per insn (74.99%) + 2.400320396 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2088) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515654 Relative difference = 3.2588039900609506e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd1/check.exe -p 2048 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd1/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.719292e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.240372e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.240372e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.943097 sec - 5,300,809,350 cycles # 2.722 GHz - 11,405,959,044 instructions # 2.15 insn per cycle - 1.950186269 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2370) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 7.914365e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.546212e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.546212e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 1.507943 sec + 5,027,160,241 cycles:u # 3.261 GHz (74.92%) + 9,342,894 stalled-cycles-frontend:u # 0.19% frontend cycles idle (75.09%) + 303,402,377 stalled-cycles-backend:u # 6.04% backend cycles idle (75.09%) + 11,346,307,110 instructions:u # 2.26 insn per cycle + # 0.03 stalled cycles per insn (75.10%) + 1.545294997 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2354) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd1/check.exe -p 2048 256 2 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.116224e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.720108e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.720108e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.828206 sec - 4,977,318,735 cycles # 2.718 GHz - 10,599,506,112 instructions # 2.13 insn per cycle - 1.834822870 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1970) (512y: 162) (512z: 0) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd1/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063388516204 -Relative difference = 3.2588037186351226e-07 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd1/check.exe -p 2048 256 2 OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.275159e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.556705e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.556705e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.563497 sec - 4,703,376,134 cycles # 1.831 GHz - 8,567,908,292 instructions # 1.82 insn per cycle - 2.570320519 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1392) (512y: 70) (512z: 1630) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd1/runTest.exe -[ PASSED ] 6 tests. 
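Note on the "Relative difference" lines above: cmpExe compares the average matrix element of the C++/CUDA/HIP run against the Fortran (F77) reference and accepts when the relative difference is at most 5E-3. The self-contained sketch below recomputes that check; the helper name avgMeWithinTolerance is hypothetical, and the two numbers in main() are copied verbatim from the avx2_d_inl1_hrd1 comparison just above.

// Illustrative re-computation of the "Relative difference" acceptance test
// printed by cmpExe (threshold 5E-3 as in these logs); helper name is hypothetical.
#include <cmath>
#include <cstdio>

bool avgMeWithinTolerance( double avgMe1, double avgMe2, double tol = 5e-3 )
{
  const double relDiff = std::fabs( avgMe2 / avgMe1 - 1. );
  std::printf( "Relative difference = %.16e\n", relDiff );
  return relDiff <= tol;
}

int main()
{
  // Avg ME (C++/C++) = 2.028807e+00, Avg ME (F77/C++) = 2.0288063388516204
  // (taken verbatim from the log above; expected relDiff ~ 3.2588e-07)
  const bool ok = avgMeWithinTolerance( 2.028807e+00, 2.0288063388516204 );
  std::puts( ok ? "OK (relative difference <= 5E-3)" : "ERROR! relative difference > 5E-3" );
  return 0;
}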
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063388516204 -Relative difference = 3.2588037186351226e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt index 03334a40e8..9d33924327 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt @@ -1,209 +1,164 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasCurand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) +RNDGEN=hasNoCurand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-01-30_04:56:08 +DATE: 2024-01-31_13:51:44 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.266078e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.583524e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.962786e+08 ) sec^-1 -MeanMatrixElemValue = ( 2.086718e+00 +- 3.413389e-03 ) GeV^0 -TOTAL : 0.486964 sec - 2,022,378,491 cycles # 2.826 GHz - 2,872,554,108 instructions # 1.42 insn per cycle - 0.794836465 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 128 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 8.865799e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.872508e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.028078e+08 ) sec^-1 +MeanMatrixElemValue = ( 2.080169e+00 +- 3.463853e-03 ) GeV^0 +TOTAL : 1.013471 sec + 3,120,326,875 cycles:u # 2.992 GHz (74.66%) + 10,877,225 stalled-cycles-frontend:u # 0.35% frontend cycles idle (74.66%) + 1,161,438,597 stalled-cycles-backend:u # 37.22% backend cycles idle (75.03%) + 2,808,430,330 instructions:u # 0.90 insn per cycle + # 0.41 stalled cycles per insn (75.40%) + 1.068198918 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 2.028811e+00 -Avg ME (F77/CUDA) = 2.0288499749731272 -Relative difference = 1.9210746159747678e-05 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 2.028815e+00 +Avg ME (F77/CUDA) = 2.0288173652952537 +Relative difference = 1.1658506339321586e-06 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe -p 2048 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.220233e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.293728e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.293728e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 -TOTAL : 4.812282 sec - 13,901,639,181 cycles # 2.885 GHz - 37,078,732,469 instructions # 2.67 insn per cycle - 4.824222975 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 2.977157e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.066362e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.066362e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079573e+00 +- 3.404712e-03 ) GeV^0 +TOTAL : 3.666908 sec + 12,716,669,353 cycles:u # 3.442 GHz (74.90%) + 7,108,813 stalled-cycles-frontend:u # 0.06% frontend cycles idle (74.90%) + 12,449,971 stalled-cycles-backend:u # 0.10% backend cycles idle (74.92%) + 37,088,931,898 instructions:u # 2.92 insn per cycle + # 0.00 stalled cycles per insn (75.03%) + 3.696733758 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 578) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028820e+00 -Avg ME (F77/C++) = 2.0288197983754799 -Relative difference = 9.938019153537065e-08 +Avg ME (F77/C++) = 2.0288198367925361 +Relative difference = 8.044452636897417e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.150516e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.595808e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.595808e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 -TOTAL : 2.124737 sec - 6,168,101,005 cycles # 2.895 GHz - 15,212,489,109 instructions # 2.47 insn per cycle - 2.142108549 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2459) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 6.083038e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.487057e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.487057e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079573e+00 +- 3.404713e-03 ) GeV^0 +TOTAL : 1.868298 sec + 6,412,469,462 cycles:u # 3.381 GHz (74.90%) + 7,184,992 stalled-cycles-frontend:u # 0.11% frontend cycles idle (75.06%) + 2,239,939,343 stalled-cycles-backend:u # 34.93% backend cycles idle (75.12%) + 15,206,507,608 instructions:u # 2.37 insn per cycle + # 0.15 stalled cycles per insn (75.12%) + 1.899876170 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 2463) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028819e+00 -Avg ME (F77/C++) = 2.0288191968575120 -Relative difference = 9.703059369476286e-08 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028820e+00 +Avg ME (F77/C++) = 2.0288198773050681 +Relative difference = 6.047600673895608e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.954385e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.029179e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.029179e+06 ) sec^-1 -MeanMatrixElemValue = ( 2.086810e+00 +- 3.414230e-03 ) GeV^0 -TOTAL : 1.259990 sec - 3,437,290,204 cycles # 2.715 GHz - 7,715,643,345 instructions # 2.24 insn per cycle - 1.287994689 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3071) (512y: 0) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288179996423423 -Relative difference = 1.7628858734720142e-10 -OK (relative difference <= 5E-3) +EvtsPerSec[Rmb+ME] (23) = ( 1.221758e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.379085e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.379085e+06 ) sec^-1 +MeanMatrixElemValue = ( 2.079551e+00 +- 3.404208e-03 ) GeV^0 +TOTAL : 1.002858 sec + 3,400,909,569 cycles:u # 3.299 GHz (74.55%) + 8,045,324 stalled-cycles-frontend:u # 0.24% frontend cycles idle (74.72%) + 917,395,280 stalled-cycles-backend:u # 26.97% backend cycles idle (75.10%) + 7,672,145,231 instructions:u # 2.26 insn per cycle + # 0.12 stalled cycles per insn (75.17%) + 1.034197542 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3055) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe -p 2048 256 2 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.805420e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.144112e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.144112e+06 ) sec^-1 -MeanMatrixElemValue = ( 2.086810e+00 +- 3.414230e-03 ) GeV^0 -TOTAL : 1.162105 sec - 3,179,163,625 cycles # 2.727 GHz - 7,109,925,739 instructions # 2.24 insn per cycle - 1.178171652 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2733) (512y: 13) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288179996423423 -Relative difference = 1.7628858734720142e-10 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028819e+00 +Avg ME (F77/C++) = 2.0288186294492334 +Relative difference = 1.826435805832187e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe -p 2048 256 2 OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.071814e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.862424e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.862424e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.572617 sec - 2,980,157,633 cycles # 1.888 GHz - 5,763,820,562 instructions # 1.93 insn per cycle - 1.590552097 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2088) (512y: 20) (512z: 1914) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. 
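Note on the "-p 2048 256 2" arguments used throughout these runs: the first two values are the GPU grid (2048 blocks of 256 threads, i.e. 524288 events per iteration, matching the Bridge warnings "nevt=524288, gpublocks=2048, gputhreads=256"), and the third is apparently the number of iterations, giving 1048576 events in total. The sketch below only recomputes these counts; the reading that each EvtsPerSec counter divides this event total by the time of its own instrumented section, rather than by the TOTAL wall time, is an assumption.

// Back-of-envelope event count for "-p 2048 256 2" (illustration only).
#include <cstdio>

int main()
{
  const long long gpublocks = 2048, gputhreads = 256, niter = 2;
  const long long nevtPerIter = gpublocks * gputhreads; // 524288, as in the Bridge warnings
  const long long nevtTotal = nevtPerIter * niter;      // 1048576 events overall
  std::printf( "nevt per iteration = %lld, total nevt = %lld\n", nevtPerIter, nevtTotal );
  return 0;
}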
+/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288183195516467 -Relative difference = 1.5750631496822894e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_bridge.txt index 3a80a864ae..6339bf0352 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_bridge.txt @@ -1,222 +1,170 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasCurand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) +RNDGEN=hasNoCurand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-01-30_05:47:08 +DATE: 2024-01-31_14:39:53 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 2 --bridge OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.753522e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.358863e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.358863e+07 ) sec^-1 -MeanMatrixElemValue = ( 2.086805e+00 +- 3.414078e-03 ) GeV^0 -TOTAL : 0.684539 sec - 2,591,525,623 cycles # 2.839 GHz - 3,989,244,311 instructions # 1.54 insn per cycle - 0.972564077 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost -WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -==PROF== Profiling "sigmaKin": launch__registers_per_thread 128 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 7.452616e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.045320e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.045320e+08 ) sec^-1 +MeanMatrixElemValue = ( 2.079682e+00 +- 3.408341e-03 ) GeV^0 +TOTAL : 1.157777 sec + 3,529,050,057 cycles:u # 2.961 GHz (75.18%) + 21,090,755 stalled-cycles-frontend:u # 0.60% frontend cycles idle (75.23%) + 581,377,578 stalled-cycles-backend:u # 16.47% backend cycles idle (75.21%) + 3,877,066,359 instructions:u # 1.10 insn per cycle + # 0.15 stalled cycles per insn (74.93%) + 1.216337024 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 2.028811e+00 -Avg ME (F77/CUDA) = 2.0288499749731272 -Relative difference = 1.9210746159747678e-05 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 2.028815e+00 +Avg ME (F77/CUDA) = 2.0288173652952537 +Relative difference = 1.1658506339321586e-06 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.212668e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.285744e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.285744e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 -TOTAL : 4.870798 sec - 14,070,285,227 cycles # 2.885 GHz - 37,122,197,019 instructions # 2.64 insn per cycle - 4.878379515 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 2.976508e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.066016e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.066016e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079573e+00 +- 3.404712e-03 ) GeV^0 +TOTAL : 3.708112 sec + 12,739,588,572 cycles:u # 3.405 GHz (74.99%) + 7,663,639 stalled-cycles-frontend:u # 0.06% frontend cycles idle (74.99%) + 22,130,215 stalled-cycles-backend:u # 0.17% backend cycles idle (75.00%) + 37,075,008,796 instructions:u # 2.91 insn per cycle + # 0.00 stalled cycles per insn (75.01%) + 3.743899139 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 578) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028820e+00 -Avg ME (F77/C++) = 2.0288197983754799 -Relative difference = 9.938019153537065e-08 +Avg ME (F77/C++) = 2.0288198367925361 +Relative difference = 8.044452636897417e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.080420e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.515170e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.515170e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 -TOTAL : 2.198766 sec - 6,358,773,769 cycles # 2.884 GHz - 15,492,113,204 instructions # 2.44 insn per cycle - 2.206392318 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2459) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 6.249587e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.668513e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.668513e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079573e+00 +- 3.404713e-03 ) GeV^0 +TOTAL : 1.869596 sec + 6,317,353,993 cycles:u # 3.319 GHz (74.87%) + 7,706,055 stalled-cycles-frontend:u # 0.12% frontend cycles idle (74.79%) + 2,155,272,535 stalled-cycles-backend:u # 34.12% backend cycles idle (74.79%) + 15,501,273,701 instructions:u # 2.45 insn per cycle + # 0.14 stalled cycles per insn (74.98%) + 1.906860784 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 2463) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028819e+00 -Avg ME (F77/C++) = 2.0288191968575120 -Relative difference = 9.703059369476286e-08 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028820e+00 +Avg ME (F77/C++) = 2.0288198773050681 +Relative difference = 6.047600673895608e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.787706e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.007873e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.007873e+06 ) sec^-1 -MeanMatrixElemValue = ( 2.086810e+00 +- 3.414230e-03 ) GeV^0 -TOTAL : 1.328722 sec - 3,633,771,509 cycles # 2.722 GHz - 7,954,097,743 instructions # 2.19 insn per cycle - 1.336366634 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3071) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.207007e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.360484e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.360484e+06 ) sec^-1 +MeanMatrixElemValue = ( 2.079551e+00 +- 3.404208e-03 ) GeV^0 +TOTAL : 1.058756 sec + 3,443,949,436 cycles:u # 3.152 GHz (75.02%) + 7,472,976 stalled-cycles-frontend:u # 0.22% frontend cycles idle (75.11%) + 916,987,954 stalled-cycles-backend:u # 26.63% backend cycles idle (75.11%) + 7,891,775,014 instructions:u # 2.29 insn per cycle + # 0.12 stalled cycles per insn (75.17%) + 1.096109008 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3055) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288179996423423 -Relative difference = 1.7628858734720142e-10 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.612179e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.118037e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.118037e+06 ) sec^-1 -MeanMatrixElemValue = ( 2.086810e+00 +- 3.414230e-03 ) GeV^0 -TOTAL : 1.225517 sec - 3,366,927,421 cycles # 2.733 GHz - 7,347,508,752 instructions # 2.18 insn per cycle - 1.232992993 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2733) (512y: 13) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288179996423423 -Relative difference = 1.7628858734720142e-10 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028819e+00 +Avg ME (F77/C++) = 2.0288186294492334 +Relative difference = 1.826435805832187e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.960005e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.722467e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.722467e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.642570 sec - 3,181,631,608 cycles # 1.930 GHz - 6,021,725,956 instructions # 1.89 insn per cycle - 1.650041277 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2088) (512y: 20) (512z: 1914) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. 
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288183195516467 -Relative difference = 1.5750631496822894e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_common.txt index 38a7216065..b9b451f7a7 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_common.txt @@ -1,209 +1,164 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasCurand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) +RNDGEN=hasNoCurand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-01-30_06:00:59 +DATE: 2024-01-31_14:53:44 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 2 --common OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 2 --common OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.412461e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.631522e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.951868e+08 ) sec^-1 -MeanMatrixElemValue = ( 2.079446e+00 +- 3.403306e-03 ) GeV^0 -TOTAL : 0.573986 sec - 2,244,851,144 cycles # 2.822 GHz - 3,300,445,554 instructions # 1.47 insn per cycle - 0.853607464 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --common -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 128 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 8.530039e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.873981e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.029409e+08 ) sec^-1 +MeanMatrixElemValue = ( 2.080169e+00 +- 3.463853e-03 ) GeV^0 +TOTAL : 1.011207 sec + 3,109,468,699 cycles:u # 2.989 GHz (74.64%) + 10,675,373 stalled-cycles-frontend:u # 0.34% frontend cycles idle (75.03%) + 1,155,728,733 stalled-cycles-backend:u # 37.17% backend cycles idle (75.40%) + 2,757,158,995 instructions:u # 0.89 insn per cycle + # 0.42 stalled cycles per insn (75.40%) + 1.063509800 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 2.028811e+00 -Avg ME (F77/CUDA) = 2.0288499749731272 -Relative difference = 1.9210746159747678e-05 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 2.028815e+00 +Avg ME (F77/CUDA) = 2.0288173652952537 +Relative difference = 1.1658506339321586e-06 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.218192e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.291861e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.291861e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079572e+00 +- 3.404712e-03 ) GeV^0 -TOTAL : 4.876695 sec - 14,064,697,494 cycles # 2.884 GHz - 37,110,369,611 instructions # 2.64 insn per cycle - 4.882981134 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 2.984186e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.073806e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.073806e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079573e+00 +- 3.404712e-03 ) GeV^0 +TOTAL : 3.659266 sec + 12,698,234,408 cycles:u # 3.444 GHz (74.88%) + 7,066,710 stalled-cycles-frontend:u # 0.06% frontend cycles idle (74.98%) + 12,699,589 stalled-cycles-backend:u # 0.10% backend cycles idle (75.05%) + 37,066,954,874 instructions:u # 2.92 insn per cycle + # 0.00 stalled cycles per insn (75.05%) + 3.688889541 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 578) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028820e+00 -Avg ME (F77/C++) = 2.0288197983754799 -Relative difference = 9.938019153537065e-08 +Avg ME (F77/C++) = 2.0288198367925361 +Relative difference = 8.044452636897417e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.131220e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.575839e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.575839e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079572e+00 +- 3.404711e-03 ) GeV^0 -TOTAL : 2.187823 sec - 6,322,431,284 cycles # 2.883 GHz - 15,223,876,723 instructions # 2.41 insn per cycle - 2.194184928 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2459) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 6.291162e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.715010e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.715010e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079573e+00 +- 3.404713e-03 ) GeV^0 +TOTAL : 1.811368 sec + 6,217,112,223 cycles:u # 3.380 GHz (74.81%) + 6,897,784 stalled-cycles-frontend:u # 0.11% frontend cycles idle (74.78%) + 2,145,885,127 stalled-cycles-backend:u # 34.52% backend cycles idle (75.00%) + 15,247,265,769 instructions:u # 2.45 insn per cycle + # 0.14 stalled cycles per insn (75.00%) + 1.841031156 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 2463) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028819e+00 -Avg ME (F77/C++) = 2.0288191968575120 -Relative difference = 9.703059369476286e-08 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028820e+00 +Avg ME (F77/C++) = 2.0288198773050681 +Relative difference = 6.047600673895608e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.948892e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.027773e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.027773e+06 ) sec^-1 -MeanMatrixElemValue = ( 2.079550e+00 +- 3.404207e-03 ) GeV^0 -TOTAL : 1.320835 sec - 3,601,071,923 cycles # 2.719 GHz - 7,699,828,133 instructions # 2.14 insn per cycle - 1.327138068 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3071) (512y: 0) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288179996423423 -Relative difference = 1.7628858734720142e-10 -OK (relative difference <= 5E-3) +EvtsPerSec[Rmb+ME] (23) = ( 1.221639e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.378458e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.378458e+06 ) sec^-1 +MeanMatrixElemValue = ( 2.079551e+00 +- 3.404208e-03 ) GeV^0 +TOTAL : 1.002912 sec + 3,392,615,870 cycles:u # 3.292 GHz (74.55%) + 7,908,977 stalled-cycles-frontend:u # 0.23% frontend cycles idle (74.95%) + 905,004,374 stalled-cycles-backend:u # 26.68% backend cycles idle (75.17%) + 7,653,140,390 instructions:u # 2.26 insn per cycle + # 0.12 stalled cycles per insn (75.17%) + 1.033182842 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3055) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.790537e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.142626e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.142626e+06 ) sec^-1 -MeanMatrixElemValue = ( 2.079550e+00 +- 3.404207e-03 ) GeV^0 -TOTAL : 1.218690 sec - 3,342,798,362 cycles # 2.731 GHz - 7,059,572,278 instructions # 2.11 insn per cycle - 1.225217680 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2733) (512y: 13) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288179996423423 -Relative difference = 1.7628858734720142e-10 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028819e+00 +Avg ME (F77/C++) = 2.0288186294492334 +Relative difference = 1.826435805832187e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.022088e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.806836e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.806836e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079550e+00 +- 3.404208e-03 ) GeV^0 -TOTAL : 1.641075 sec - 3,147,503,652 cycles # 1.912 GHz - 5,713,849,148 instructions # 1.82 insn per cycle - 1.647331874 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2088) (512y: 20) (512z: 1914) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. 
+/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288183195516467 -Relative difference = 1.5750631496822894e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_curhst.txt index cb54d3236b..0f700f95ab 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_curhst.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_curhst.txt @@ -1,209 +1,133 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasCurand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) +RNDGEN=hasNoCurand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-01-30_05:57:29 +DATE: 2024-01-31_14:50:42 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 2 --curhst OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 2 --curhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.414196e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.655173e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.981062e+08 ) sec^-1 -MeanMatrixElemValue = ( 2.086718e+00 +- 3.413389e-03 ) GeV^0 -TOTAL : 0.514856 sec - 2,089,539,478 cycles # 2.840 GHz - 3,296,506,746 instructions # 1.58 insn per cycle - 0.794530995 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --curhst -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 128 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe: Aborted + 53,570,294 cycles:u # 2.465 GHz (63.21%) + 44,370 stalled-cycles-frontend:u # 0.08% frontend cycles idle (63.22%) + 600,308 stalled-cycles-backend:u # 1.12% backend cycles idle (63.22%) + 41,293,137 instructions:u # 0.77 insn per cycle + # 0.01 stalled cycles per insn (65.24%) + 0.022677751 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 2.028811e+00 -Avg ME (F77/CUDA) = 2.0288499749731272 -Relative difference = 1.9210746159747678e-05 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 2.028815e+00 +Avg ME (F77/CUDA) = 2.0288173652952537 +Relative difference = 1.1658506339321586e-06 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.227183e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.300870e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.300870e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 -TOTAL : 4.796202 sec - 13,896,514,461 cycles # 2.894 GHz - 37,078,595,071 instructions # 2.67 insn per cycle - 4.803618427 seconds time elapsed +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe: Aborted + 54,888,405 cycles:u # 2.552 GHz (62.84%) + 34,498 stalled-cycles-frontend:u # 0.06% frontend cycles idle (62.84%) + 614,835 stalled-cycles-backend:u # 1.12% backend cycles idle (62.84%) + 39,911,154 instructions:u # 0.73 insn per cycle + # 0.02 stalled cycles per insn (64.62%) + 0.022797442 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 578) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028820e+00 -Avg ME (F77/C++) = 2.0288197983754799 -Relative difference = 9.938019153537065e-08 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.077190e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.527451e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.527451e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 -TOTAL : 2.154367 sec - 6,177,704,022 cycles # 2.870 GHz - 15,215,532,210 instructions # 2.46 insn per cycle - 2.160620609 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2459) (avx2: 0) (512y: 0) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028819e+00 -Avg ME (F77/C++) = 2.0288191968575120 -Relative difference = 9.703059369476286e-08 +Avg ME (F77/C++) = 2.0288198367925361 +Relative difference = 8.044452636897417e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.911398e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.023141e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.023141e+06 ) sec^-1 -MeanMatrixElemValue = ( 2.086810e+00 +- 3.414230e-03 ) GeV^0 -TOTAL : 1.265022 sec - 3,447,761,650 cycles # 2.714 GHz - 7,715,058,636 instructions # 2.24 insn per cycle - 1.271511064 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3071) (512y: 0) (512z: 0) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe: Aborted + 54,129,943 cycles:u # 2.514 GHz (62.87%) + 44,123 stalled-cycles-frontend:u # 0.08% frontend cycles idle (62.88%) + 616,440 stalled-cycles-backend:u # 1.14% backend cycles idle (62.88%) + 40,740,714 instructions:u # 0.75 insn per cycle + # 0.02 stalled cycles per insn (64.53%) + 0.022769848 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 2463) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288179996423423 -Relative difference = 1.7628858734720142e-10 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028820e+00 +Avg ME (F77/C++) = 2.0288198773050681 +Relative difference = 6.047600673895608e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.829060e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.147412e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.147412e+06 ) sec^-1 -MeanMatrixElemValue = ( 2.086810e+00 +- 3.414230e-03 ) GeV^0 -TOTAL : 1.155347 sec - 3,170,001,813 cycles # 2.731 GHz - 7,109,524,161 instructions # 2.24 insn per cycle - 1.161808340 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2733) (512y: 13) (512z: 0) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe: Aborted + 51,261,943 cycles:u # 2.382 GHz (62.86%) + 41,540 stalled-cycles-frontend:u # 0.08% frontend cycles idle (62.87%) + 557,588 stalled-cycles-backend:u # 1.09% backend cycles idle (62.87%) + 43,301,886 instructions:u # 0.84 insn per cycle + # 0.01 stalled cycles per insn (64.72%) + 0.022831988 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3055) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288179996423423 -Relative difference = 1.7628858734720142e-10 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028819e+00 +Avg ME (F77/C++) = 2.0288186294492334 +Relative difference = 1.826435805832187e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.999480e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.774350e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.774350e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.586909 sec - 2,978,718,352 cycles # 1.871 GHz - 5,762,941,941 instructions # 1.93 insn per cycle - 1.593095591 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2088) (512y: 20) (512z: 1914) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288183195516467 -Relative difference = 1.5750631496822894e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_rmbhst.txt index 5939268227..874a0d9227 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_rmbhst.txt @@ -1,211 +1,164 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasCurand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) +RNDGEN=hasNoCurand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-01-30_05:54:04 +DATE: 2024-01-31_14:47:40 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 2 --rmbhst OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 2 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.468280e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.632924e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.955888e+08 ) sec^-1 -MeanMatrixElemValue = ( 2.086805e+00 +- 3.414078e-03 ) GeV^0 -TOTAL : 0.625640 sec - 2,402,533,839 cycles # 2.841 GHz - 3,758,306,095 instructions # 1.56 insn per cycle - 0.905223049 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --rmbhst -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -==PROF== Profiling "sigmaKin": launch__registers_per_thread 128 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 8.194070e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.863045e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.019637e+08 ) sec^-1 +MeanMatrixElemValue = ( 2.079682e+00 +- 3.408341e-03 ) GeV^0 +TOTAL : 1.126583 sec + 3,509,941,194 cycles:u # 3.030 GHz (75.16%) + 22,018,880 stalled-cycles-frontend:u # 0.63% frontend cycles idle (75.21%) + 1,148,370,067 stalled-cycles-backend:u # 32.72% backend cycles idle (75.20%) + 3,765,296,391 instructions:u # 1.07 insn per cycle + # 0.30 stalled cycles per insn (75.13%) + 1.175184575 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 2.028811e+00 -Avg ME (F77/CUDA) = 2.0288499749731272 -Relative difference = 1.9210746159747678e-05 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 2.028815e+00 +Avg ME (F77/CUDA) = 2.0288173652952537 +Relative difference = 1.1658506339321586e-06 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.221197e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.294941e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.294941e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 -TOTAL : 4.809027 sec - 13,889,421,482 cycles # 2.885 GHz - 37,078,742,557 instructions # 2.67 insn per cycle - 4.815296717 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 2.984226e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.074104e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.074104e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079573e+00 +- 3.404712e-03 ) GeV^0 +TOTAL : 3.658913 sec + 12,701,443,134 cycles:u # 3.445 GHz (74.86%) + 7,095,454 stalled-cycles-frontend:u # 0.06% frontend cycles idle (74.97%) + 9,911,546 stalled-cycles-backend:u # 0.08% backend cycles idle (75.05%) + 37,066,452,270 instructions:u # 2.92 insn per cycle + # 0.00 stalled cycles per insn (75.05%) + 3.688367018 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 578) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028820e+00 -Avg ME (F77/C++) = 2.0288197983754799 -Relative difference = 9.938019153537065e-08 +Avg ME (F77/C++) = 2.0288198367925361 +Relative difference = 8.044452636897417e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.146065e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.592205e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.592205e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 -TOTAL : 2.125330 sec - 6,161,438,553 cycles # 2.892 GHz - 15,211,397,983 instructions # 2.47 insn per cycle - 2.131726868 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2459) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 6.286494e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.712387e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.712387e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079573e+00 +- 3.404713e-03 ) GeV^0 +TOTAL : 1.812169 sec + 6,203,283,474 cycles:u # 3.371 GHz (74.82%) + 7,177,180 stalled-cycles-frontend:u # 0.12% frontend cycles idle (74.82%) + 2,144,315,451 stalled-cycles-backend:u # 34.57% backend cycles idle (74.79%) + 15,227,021,935 instructions:u # 2.45 insn per cycle + # 0.14 stalled cycles per insn (75.01%) + 1.841698994 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 2463) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028819e+00 -Avg ME (F77/C++) = 2.0288191968575120 -Relative difference = 9.703059369476286e-08 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028820e+00 +Avg ME (F77/C++) = 2.0288198773050681 +Relative difference = 6.047600673895608e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.991330e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.034099e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.034099e+06 ) sec^-1 -MeanMatrixElemValue = ( 2.086810e+00 +- 3.414230e-03 ) GeV^0 -TOTAL : 1.255088 sec - 3,440,029,043 cycles # 2.730 GHz - 7,714,775,848 instructions # 2.24 insn per cycle - 1.261283713 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3071) (512y: 0) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288179996423423 -Relative difference = 1.7628858734720142e-10 -OK (relative difference <= 5E-3) +EvtsPerSec[Rmb+ME] (23) = ( 1.220443e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.376990e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.376990e+06 ) sec^-1 +MeanMatrixElemValue = ( 2.079551e+00 +- 3.404208e-03 ) GeV^0 +TOTAL : 1.004270 sec + 3,403,854,776 cycles:u # 3.298 GHz (74.43%) + 7,965,823 stalled-cycles-frontend:u # 0.23% frontend cycles idle (74.81%) + 905,514,489 stalled-cycles-backend:u # 26.60% backend cycles idle (75.19%) + 7,659,434,862 instructions:u # 2.25 insn per cycle + # 0.12 stalled cycles per insn (75.21%) + 1.034669678 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3055) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.843583e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.149362e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.149362e+06 ) sec^-1 -MeanMatrixElemValue = ( 2.086810e+00 +- 3.414230e-03 ) GeV^0 -TOTAL : 1.153935 sec - 3,172,826,861 cycles # 2.738 GHz - 7,109,210,779 instructions # 2.24 insn per cycle - 1.160268530 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2733) (512y: 13) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288179996423423 -Relative difference = 1.7628858734720142e-10 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028819e+00 +Avg ME (F77/C++) = 2.0288186294492334 +Relative difference = 1.826435805832187e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.077925e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.872855e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.872855e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.570762 sec - 2,979,903,068 cycles # 1.891 GHz - 5,762,829,882 instructions # 1.93 insn per cycle - 1.577195857 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2088) (512y: 20) (512z: 1914) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. 
+/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288183195516467 -Relative difference = 1.5750631496822894e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd1.txt index c96a0bb3db..d4bb6181a9 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd1.txt @@ -1,209 +1,164 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasCurand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) +RNDGEN=hasNoCurand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.512y_f_inl0_hrd1' +CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-01-30_04:56:32 +DATE: 2024-01-31_13:52:05 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/gcheck.exe -p 2048 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/gcheck.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.421312e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.704045e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.041754e+08 ) sec^-1 -MeanMatrixElemValue = ( 2.086718e+00 +- 3.413389e-03 ) GeV^0 -TOTAL : 0.486618 sec - 2,018,521,842 cycles # 2.827 GHz - 2,837,894,141 instructions # 1.41 insn per cycle - 0.795623791 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/gcheck.exe -p 2048 256 1 -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 127 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 9.340799e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.048487e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.223095e+08 ) sec^-1 +MeanMatrixElemValue = ( 2.080169e+00 +- 3.463853e-03 ) GeV^0 +TOTAL : 1.012814 sec + 3,102,074,072 cycles:u # 2.978 GHz (74.84%) + 10,684,621 stalled-cycles-frontend:u # 0.34% frontend cycles idle (74.97%) + 1,156,397,766 stalled-cycles-backend:u # 37.28% backend cycles idle (75.14%) + 2,771,360,480 instructions:u # 0.89 insn per cycle + # 0.42 stalled cycles per insn (75.40%) + 1.068102599 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 2.028811e+00 -Avg ME (F77/CUDA) = 2.0288499749731272 -Relative difference = 1.9210746159747678e-05 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 2.028815e+00 +Avg ME (F77/CUDA) = 2.0288173652952537 +Relative difference = 1.1658506339321586e-06 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/check.exe -p 2048 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.245629e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.320181e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.320181e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 -TOTAL : 4.758439 sec - 13,805,800,630 cycles # 2.898 GHz - 37,480,161,839 instructions # 2.71 insn per cycle - 4.770650257 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 2.961291e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.049383e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.049383e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079573e+00 +- 3.404712e-03 ) GeV^0 +TOTAL : 3.686242 sec + 12,771,638,095 cycles:u # 3.438 GHz (75.02%) + 7,371,044 stalled-cycles-frontend:u # 0.06% frontend cycles idle (75.02%) + 9,932,633 stalled-cycles-backend:u # 0.08% backend cycles idle (75.02%) + 37,443,030,124 instructions:u # 2.93 insn per cycle + # 0.00 stalled cycles per insn (75.02%) + 3.716660029 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 503) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028820e+00 -Avg ME (F77/C++) = 2.0288197983754799 -Relative difference = 9.938019153537065e-08 +Avg ME (F77/C++) = 2.0288198367925361 +Relative difference = 8.044452636897417e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/check.exe -p 2048 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.821274e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.398672e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.398672e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 -TOTAL : 1.889404 sec - 5,475,292,589 cycles # 2.889 GHz - 15,244,893,114 instructions # 2.78 insn per cycle - 1.908184587 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2330) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 7.349724e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.935331e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.935331e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079573e+00 +- 3.404713e-03 ) GeV^0 +TOTAL : 1.571428 sec + 5,356,446,210 cycles:u # 3.349 GHz (75.00%) + 7,741,465 stalled-cycles-frontend:u # 0.14% frontend cycles idle (75.00%) + 1,299,135,011 stalled-cycles-backend:u # 24.25% backend cycles idle (75.00%) + 15,197,192,844 instructions:u # 2.84 insn per cycle + # 0.09 stalled cycles per insn (75.01%) + 1.602763498 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 2334) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028819e+00 -Avg ME (F77/C++) = 2.0288191968575120 -Relative difference = 9.703059369476286e-08 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028820e+00 +Avg ME (F77/C++) = 2.0288198773050681 +Relative difference = 6.047600673895608e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/check.exe -p 2048 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.385813e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.037637e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.037637e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086810e+00 +- 3.414230e-03 ) GeV^0 -TOTAL : 1.731302 sec - 4,719,001,422 cycles # 2.717 GHz - 9,850,811,081 instructions # 2.09 insn per cycle - 1.750777348 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3750) (512y: 0) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288180243223906 -Relative difference = 1.1988453753912676e-08 -OK (relative difference <= 5E-3) +EvtsPerSec[Rmb+ME] (23) = ( 8.892015e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.695000e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.695000e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079551e+00 +- 3.404208e-03 ) GeV^0 +TOTAL : 1.324222 sec + 4,525,402,442 cycles:u # 3.346 GHz (74.69%) + 8,007,297 stalled-cycles-frontend:u # 0.18% frontend cycles idle (74.71%) + 1,664,879,635 stalled-cycles-backend:u # 36.79% backend cycles idle (75.00%) + 9,811,549,184 instructions:u # 2.17 insn per cycle + # 0.17 stalled cycles per insn (75.16%) + 1.355868030 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3734) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/check.exe -p 2048 256 2 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.683577e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.409489e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.409489e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086810e+00 +- 3.414230e-03 ) GeV^0 -TOTAL : 1.659075 sec - 4,492,699,411 cycles # 2.699 GHz - 9,202,452,349 instructions # 2.05 insn per cycle - 1.671352513 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3384) (512y: 0) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288180243223906 -Relative difference = 1.1988453753912676e-08 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028819e+00 +Avg ME (F77/C++) = 2.0288186428369954 +Relative difference = 1.7604478492421832e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/check.exe -p 2048 256 2 OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.938211e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.486110e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.486110e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.854854 sec - 3,463,720,216 cycles # 1.861 GHz - 6,875,040,962 instructions # 1.98 insn per cycle - 1.876340349 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2257) (512y: 8) (512z: 2261) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. 
+/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288183217635378 -Relative difference = 1.5859655131013432e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd0.txt index 993f4107d6..61615af3b6 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd0.txt @@ -1,209 +1,164 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasCurand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) +RNDGEN=hasNoCurand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.512y_f_inl1_hrd0' +CUDACPP_BUILDDIR='build.avx2_f_inl1_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-01-30_05:36:48 +DATE: 2024-01-31_14:21:43 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/gcheck.exe -p 2048 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/gcheck.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.377362e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.649325e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.974675e+08 ) sec^-1 -MeanMatrixElemValue = ( 2.086718e+00 +- 3.413389e-03 ) GeV^0 -TOTAL : 0.484150 sec - 2,005,186,574 cycles # 2.831 GHz - 2,872,226,914 instructions # 1.43 insn per cycle - 0.768013554 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/gcheck.exe -p 2048 256 1 -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 128 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 7.735814e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.877863e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.033538e+08 ) sec^-1 +MeanMatrixElemValue = ( 2.080169e+00 +- 3.463853e-03 ) GeV^0 +TOTAL : 1.012208 sec + 3,119,807,941 cycles:u # 2.993 GHz (74.63%) + 10,767,069 stalled-cycles-frontend:u # 0.35% frontend cycles idle (74.63%) + 1,158,448,795 stalled-cycles-backend:u # 37.13% backend cycles idle (75.06%) + 2,770,838,012 instructions:u # 0.89 insn per cycle + # 0.42 stalled cycles per insn (75.39%) + 1.063714508 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 2.028811e+00 -Avg ME (F77/CUDA) = 2.0288499749731272 -Relative difference = 1.9210746159747678e-05 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 2.028815e+00 +Avg ME (F77/CUDA) = 2.0288173652952537 +Relative difference = 1.1658506339321586e-06 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/check.exe -p 2048 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.479081e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.570421e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.570421e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 -TOTAL : 4.318821 sec - 12,411,469,267 cycles # 2.871 GHz - 34,216,954,204 instructions # 2.76 insn per cycle - 4.325006925 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 3.216757e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.321079e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.321079e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079573e+00 +- 3.404712e-03 ) GeV^0 +TOTAL : 3.404693 sec + 11,796,914,940 cycles:u # 3.437 GHz (74.90%) + 6,850,731 stalled-cycles-frontend:u # 0.06% frontend cycles idle (75.01%) + 1,685,582,811 stalled-cycles-backend:u # 14.29% backend cycles idle (75.06%) + 34,222,491,331 instructions:u # 2.90 insn per cycle + # 0.05 stalled cycles per insn (75.07%) + 3.434705962 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 768) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028820e+00 Avg ME (F77/C++) = 2.0288199088536203 Relative difference = 4.4925808981097166e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/check.exe -p 2048 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.935196e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.540988e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.540988e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 -TOTAL : 1.856505 sec - 5,363,525,325 cycles # 2.881 GHz - 14,587,825,944 instructions # 2.72 insn per cycle - 1.863141926 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 7.229229e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.799387e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.799387e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079573e+00 +- 3.404713e-03 ) GeV^0 +TOTAL : 1.604851 sec + 5,480,363,361 cycles:u # 3.356 GHz (74.93%) + 7,418,885 stalled-cycles-frontend:u # 0.14% frontend cycles idle (75.02%) + 2,030,766,975 stalled-cycles-backend:u # 37.06% backend cycles idle (75.02%) + 14,594,495,372 instructions:u # 2.66 insn per cycle + # 0.14 stalled cycles per insn (75.02%) + 1.635926264 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2947) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028819e+00 -Avg ME (F77/C++) = 2.0288192580919713 -Relative difference = 1.2721291123071246e-07 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028820e+00 +Avg ME (F77/C++) = 2.0288198769558221 +Relative difference = 6.06481491495597e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/check.exe -p 2048 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.475828e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.385170e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.385170e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086810e+00 +- 3.414230e-03 ) GeV^0 -TOTAL : 1.492139 sec - 4,058,079,431 cycles # 2.710 GHz - 9,088,895,483 instructions # 2.24 insn per cycle - 1.498802038 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4501) (512y: 0) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288180499337614 -Relative difference = 2.4612242975974814e-08 -OK (relative difference <= 5E-3) +EvtsPerSec[Rmb+ME] (23) = ( 9.445478e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.035633e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.035633e+06 ) sec^-1 +MeanMatrixElemValue = ( 2.079551e+00 +- 3.404208e-03 ) GeV^0 +TOTAL : 1.255073 sec + 4,255,529,094 cycles:u # 3.316 GHz (75.10%) + 7,973,246 stalled-cycles-frontend:u # 0.19% frontend cycles idle (75.07%) + 1,640,245,882 stalled-cycles-backend:u # 38.54% backend cycles idle (75.07%) + 9,026,902,760 instructions:u # 2.12 insn per cycle + # 0.18 stalled cycles per insn (75.07%) + 1.286698152 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4485) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd0/check.exe -p 2048 256 2 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.052179e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.125609e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.125609e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086810e+00 +- 3.414230e-03 ) GeV^0 -TOTAL : 1.390912 sec - 3,795,132,868 cycles # 2.718 GHz - 8,440,638,214 instructions # 2.22 insn per cycle - 1.397579629 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4043) (512y: 0) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288180499337614 -Relative difference = 2.4612242975974814e-08 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028819e+00 +Avg ME (F77/C++) = 2.0288186752004549 +Relative difference = 1.6009291367898262e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd0/check.exe -p 2048 256 2 OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.426211e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.889827e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.889827e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 2.021883 sec - 3,727,709,927 cycles # 1.839 GHz - 7,572,021,248 instructions # 2.03 insn per cycle - 2.028341317 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3646) (512y: 1) (512z: 2853) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd0/runTest.exe -[ PASSED ] 6 tests. 
+/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288183350348845 -Relative difference = 1.6513796936156652e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd1.txt index 2891f046ff..30d2d52191 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd1.txt @@ -1,209 +1,164 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasCurand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) +RNDGEN=hasNoCurand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.512y_f_inl1_hrd1' +CUDACPP_BUILDDIR='build.avx2_f_inl1_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-01-30_05:37:12 +DATE: 2024-01-31_14:22:04 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/gcheck.exe -p 2048 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/gcheck.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.485748e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.689974e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.027356e+08 ) sec^-1 -MeanMatrixElemValue = ( 2.086718e+00 +- 3.413389e-03 ) GeV^0 -TOTAL : 0.482102 sec - 1,996,662,355 cycles # 2.812 GHz - 2,850,200,230 instructions # 1.43 insn per cycle - 0.768087139 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/gcheck.exe -p 2048 256 1 -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 127 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 8.949611e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.045213e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.219863e+08 ) sec^-1 +MeanMatrixElemValue = ( 2.080169e+00 +- 3.463853e-03 ) GeV^0 +TOTAL : 1.014593 sec + 3,118,954,529 cycles:u # 2.988 GHz (74.59%) + 10,853,428 stalled-cycles-frontend:u # 0.35% frontend cycles idle (74.98%) + 1,156,319,559 stalled-cycles-backend:u # 37.07% backend cycles idle (75.43%) + 2,823,007,789 instructions:u # 0.91 insn per cycle + # 0.41 stalled cycles per insn (75.43%) + 1.067855681 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 2.028811e+00 -Avg ME (F77/CUDA) = 2.0288499749731272 -Relative difference = 1.9210746159747678e-05 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 2.028815e+00 +Avg ME (F77/CUDA) = 2.0288173652952537 +Relative difference = 1.1658506339321586e-06 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/check.exe -p 2048 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.596095e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.696763e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.696763e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 -TOTAL : 4.127183 sec - 11,946,394,247 cycles # 2.891 GHz - 35,407,075,530 instructions # 2.96 insn per cycle - 4.133301161 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 3.443408e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.563240e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.563240e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079573e+00 +- 3.404712e-03 ) GeV^0 +TOTAL : 3.189746 sec + 11,035,685,759 cycles:u # 3.430 GHz (74.89%) + 7,526,740 stalled-cycles-frontend:u # 0.07% frontend cycles idle (74.91%) + 246,540,718 stalled-cycles-backend:u # 2.23% backend cycles idle (74.91%) + 35,471,888,877 instructions:u # 3.21 insn per cycle + # 0.01 stalled cycles per insn (74.97%) + 3.219755111 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 469) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028820e+00 Avg ME (F77/C++) = 2.0288199088536203 Relative difference = 4.4925808981097166e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/check.exe -p 2048 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.250434e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.927787e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.927787e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 -TOTAL : 1.766919 sec - 5,069,845,731 cycles # 2.861 GHz - 14,044,971,447 instructions # 2.77 insn per cycle - 1.773365949 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 7.747202e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.406647e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.406647e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079573e+00 +- 3.404713e-03 ) GeV^0 +TOTAL : 1.497749 sec + 5,106,890,668 cycles:u # 3.347 GHz (74.93%) + 7,729,401 stalled-cycles-frontend:u # 0.15% frontend cycles idle (74.84%) + 1,357,822,804 stalled-cycles-backend:u # 26.59% backend cycles idle (74.62%) + 14,100,437,399 instructions:u # 2.76 insn per cycle + # 0.10 stalled cycles per insn (74.62%) + 1.529202877 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2487) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028819e+00 -Avg ME (F77/C++) = 2.0288192554144189 -Relative difference = 1.2589315209891237e-07 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028820e+00 +Avg ME (F77/C++) = 2.0288198892958462 +Relative difference = 5.4565783974899003e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/check.exe -p 2048 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.559784e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.492213e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.492213e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086810e+00 +- 3.414230e-03 ) GeV^0 -TOTAL : 1.476405 sec - 3,988,953,115 cycles # 2.692 GHz - 8,629,569,798 instructions # 2.16 insn per cycle - 1.482936821 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3422) (512y: 0) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288180815987289 -Relative difference = 4.021983692325164e-08 -OK (relative difference <= 5E-3) +EvtsPerSec[Rmb+ME] (23) = ( 1.016100e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.123106e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.123106e+06 ) sec^-1 +MeanMatrixElemValue = ( 2.079551e+00 +- 3.404208e-03 ) GeV^0 +TOTAL : 1.176337 sec + 3,983,849,774 cycles:u # 3.308 GHz (74.50%) + 6,512,233 stalled-cycles-frontend:u # 0.16% frontend cycles idle (75.03%) + 1,443,046,357 stalled-cycles-backend:u # 36.22% backend cycles idle (75.03%) + 8,629,208,977 instructions:u # 2.17 insn per cycle + # 0.17 stalled cycles per insn (75.10%) + 1.207645395 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3406) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd1/check.exe -p 2048 256 2 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.210818e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.331985e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.331985e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086810e+00 +- 3.414230e-03 ) GeV^0 -TOTAL : 1.366166 sec - 3,694,176,022 cycles # 2.694 GHz - 8,100,845,822 instructions # 2.19 insn per cycle - 1.372646371 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3105) (512y: 0) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288180815987289 -Relative difference = 4.021983692325164e-08 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028819e+00 +Avg ME (F77/C++) = 2.0288186836987734 +Relative difference = 1.559041129563128e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd1/check.exe -p 2048 256 2 OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.670710e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.170464e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.170464e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.938240 sec - 3,580,879,514 cycles # 1.843 GHz - 7,373,942,234 instructions # 2.06 insn per cycle - 1.944698982 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2803) (512y: 1) (512z: 2230) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd1/runTest.exe -[ PASSED ] 6 tests. 
+/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288183569209650 -Relative difference = 1.7592557106041962e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt index 26cb412a69..a82d3154e4 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt @@ -1,209 +1,164 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasCurand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) +RNDGEN=hasNoCurand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-01-30_04:56:57 +DATE: 2024-01-31_13:52:25 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/gcheck.exe -p 2048 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/gcheck.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.567190e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.153367e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.271156e+08 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.531785 sec - 2,166,596,506 cycles # 2.818 GHz - 3,096,992,570 instructions # 1.43 insn per cycle - 0.839064322 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/gcheck.exe -p 2048 256 1 -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 2.794432e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.005141e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.060273e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 1.064603 sec + 3,215,836,002 cycles:u # 2.927 GHz (74.53%) + 10,653,381 stalled-cycles-frontend:u # 0.33% frontend cycles idle (74.84%) + 568,951,210 stalled-cycles-backend:u # 17.69% backend cycles idle (75.30%) + 2,976,432,260 instructions:u # 0.93 insn per cycle + # 0.19 stalled cycles per insn (75.26%) + 1.119979873 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 2.028807e+00 -Avg ME (F77/CUDA) = 2.0288063423243874 -Relative difference = 3.241686432649386e-07 +Avg ME (F77/CUDA) = 2.0288063423243869 +Relative difference = 3.241686434838304e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/check.exe -p 2048 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.035137e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.096372e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.096372e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 5.263186 sec - 15,248,441,904 cycles # 2.894 GHz - 39,293,765,746 instructions # 2.58 insn per cycle - 5.273287972 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 2.478378e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.541836e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.541836e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 4.412755 sec + 15,231,146,764 cycles:u # 3.426 GHz (75.00%) + 10,099,136 stalled-cycles-frontend:u # 0.07% frontend cycles idle (74.99%) + 136,784,882 stalled-cycles-backend:u # 0.90% backend cycles idle (74.99%) + 39,286,082,116 instructions:u # 2.58 insn per cycle + # 0.00 stalled cycles per insn (75.01%) + 4.448390218 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 740) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063903750300 Relative difference = 3.0048445715164216e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check.exe -p 2048 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.565129e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.766484e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.766484e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.050997 sec - 8,847,131,595 cycles # 2.894 GHz - 24,093,216,326 instructions # 2.72 insn per cycle - 3.069927720 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 4.548558e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.780380e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.780380e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 2.488012 sec + 8,458,483,870 cycles:u # 3.355 GHz (74.94%) + 9,397,002 stalled-cycles-frontend:u # 0.11% frontend cycles idle (74.96%) + 888,302,300 stalled-cycles-backend:u # 10.50% backend cycles idle (74.96%) + 24,127,794,576 instructions:u # 2.85 insn per cycle + # 0.04 stalled cycles per insn (74.94%) + 2.525312813 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2102) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063903750300 Relative difference = 3.0048445715164216e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check.exe -p 2048 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.446912e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.914435e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.914435e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.035886 sec - 5,501,574,982 cycles # 2.694 GHz - 11,449,152,902 instructions # 2.08 insn per cycle - 2.052044507 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2467) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 7.851235e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.471021e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.471021e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 1.518408 sec + 5,089,441,147 cycles:u # 3.279 GHz (74.75%) + 9,358,375 stalled-cycles-frontend:u # 0.18% frontend cycles idle (74.74%) + 466,224,607 stalled-cycles-backend:u # 9.16% backend cycles idle (74.91%) + 11,400,997,607 instructions:u # 2.24 insn per cycle + # 0.04 stalled cycles per insn (75.17%) + 1.555951668 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2451) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063930599014 Relative difference = 2.9916108265801754e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/check.exe -p 2048 256 2 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.398707e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.055840e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.055840e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.750074 sec - 4,773,598,492 cycles # 2.718 GHz - 10,317,257,525 instructions # 2.16 insn per cycle - 1.763056572 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2076) (512y: 133) (512z: 0) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063930599014 -Relative difference = 2.9916108265801754e-07 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/check.exe -p 2048 256 2 OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.115786e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.377584e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.377584e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.659793 sec - 4,851,599,101 cycles # 1.820 GHz - 7,367,812,046 instructions # 1.52 insn per cycle - 2.678537528 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1366) (512y: 69) (512z: 1611) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. 
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063930599014 -Relative difference = 2.9916108265801754e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd1.txt index 3aadf8f9be..dc3e5431af 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd1.txt @@ -1,209 +1,164 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasCurand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) +RNDGEN=hasNoCurand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.512y_m_inl0_hrd1' +CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-01-30_04:57:25 +DATE: 2024-01-31_13:52:48 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/gcheck.exe -p 2048 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/gcheck.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = HIP:MIX+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.571537e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.158030e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.273800e+08 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.527677 sec - 2,187,527,722 cycles # 2.838 GHz - 3,113,906,107 instructions # 1.42 insn per cycle - 0.843196902 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/gcheck.exe -p 2048 256 1 -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 208 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 2.773626e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.913093e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.965935e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 1.064090 sec + 3,185,169,658 cycles:u # 2.901 GHz (75.25%) + 10,700,271 stalled-cycles-frontend:u # 0.34% frontend cycles idle (75.27%) + 1,150,129,221 stalled-cycles-backend:u # 36.11% backend cycles idle (75.27%) + 2,952,946,614 instructions:u # 0.93 insn per cycle + # 0.39 stalled cycles per insn (75.06%) + 1.123660960 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/fgcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 2.028807e+00 -Avg ME (F77/CUDA) = 2.0288063423243874 -Relative difference = 3.241686432649386e-07 +Avg ME (F77/CUDA) = 2.0288063423243869 +Relative difference = 3.241686434838304e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/check.exe -p 2048 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.053597e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.114429e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.114429e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 5.216554 sec - 15,076,935,035 cycles # 2.887 GHz - 40,115,062,840 instructions # 2.66 insn per cycle - 5.225437216 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 2.426536e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.487007e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.487007e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 4.502472 sec + 15,540,927,135 cycles:u # 3.427 GHz (74.95%) + 9,844,079 stalled-cycles-frontend:u # 0.06% frontend cycles idle (74.97%) + 19,078,127 stalled-cycles-backend:u # 0.12% backend cycles idle (74.96%) + 40,163,707,232 instructions:u # 2.58 insn per cycle + # 0.00 stalled cycles per insn (74.96%) + 4.538214277 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 630) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063903750300 Relative difference = 3.0048445715164216e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/check.exe -p 2048 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.498695e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.695294e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.695294e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.107625 sec - 8,698,982,275 cycles # 2.794 GHz - 23,534,504,437 instructions # 2.71 insn per cycle - 3.124975720 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 4.510516e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.740202e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.740202e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 2.506920 sec + 8,569,629,800 cycles:u # 3.373 GHz (74.81%) + 10,871,179 stalled-cycles-frontend:u # 0.13% frontend cycles idle (74.86%) + 651,048,749 stalled-cycles-backend:u # 7.60% backend cycles idle (75.01%) + 23,430,751,066 instructions:u # 2.73 insn per cycle + # 0.03 stalled cycles per insn (75.13%) + 2.544191965 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1993) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063903750300 Relative difference = 3.0048445715164216e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/check.exe -p 2048 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.826638e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.191418e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.191418e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.282934 sec - 6,198,059,216 cycles # 2.708 GHz - 13,103,377,766 instructions # 2.11 insn per cycle - 2.300648997 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2711) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 6.852986e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.321455e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.321455e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 1.713280 sec + 5,760,398,352 cycles:u # 3.298 GHz (74.89%) + 9,611,570 stalled-cycles-frontend:u # 0.17% frontend cycles idle (74.84%) + 719,104,526 stalled-cycles-backend:u # 12.48% backend cycles idle (74.81%) + 13,118,757,379 instructions:u # 2.28 insn per cycle + # 0.05 stalled cycles per insn (74.84%) + 1.750550085 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2695) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063930599014 Relative difference = 2.9916108265801754e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/check.exe -p 2048 256 2 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.224417e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.653642e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.653642e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.117622 sec - 5,754,647,700 cycles # 2.709 GHz - 12,210,180,073 instructions # 2.12 insn per cycle - 2.133681313 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2201) (512y: 282) (512z: 0) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063930599014 -Relative difference = 2.9916108265801754e-07 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/check.exe -p 2048 256 2 OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.752218e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.971190e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.971190e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.905718 sec - 5,261,261,771 cycles # 1.807 GHz - 8,449,535,603 instructions # 1.61 insn per cycle - 2.918034623 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1324) (512y: 84) (512z: 1919) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. 
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063930599014 -Relative difference = 2.9916108265801754e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt index 93e04f110e..2c4872a3ef 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt @@ -1,223 +1,181 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasCurand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) +RNDGEN=hasNoCurand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2024-01-30_04:57:55 +DATE: 2024-01-31_13:53:11 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 10 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.751466e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.044991e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.059567e+07 ) sec^-1 -MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 0.471853 sec - 1,938,631,197 cycles # 2.818 GHz - 2,775,429,754 instructions # 1.43 insn per cycle - 0.768838740 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 1.881857e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.034056e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.041695e+06 ) sec^-1 +MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 +TOTAL : 0.535207 sec + 1,561,352,723 cycles:u # 2.771 GHz (75.68%) + 7,936,172 stalled-cycles-frontend:u # 0.51% frontend cycles idle (76.46%) + 276,377,827 stalled-cycles-backend:u # 17.70% backend cycles idle (75.95%) + 1,807,555,937 instructions:u # 1.16 insn per cycle + # 0.15 stalled cycles per insn (75.19%) + 0.583404745 seconds time elapsed ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.083310e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.323559e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.337755e+07 ) sec^-1 -MeanMatrixElemValue = ( 6.734461e+02 +- 4.775415e+02 ) GeV^-2 -TOTAL : 0.612248 sec - 2,402,912,694 cycles # 2.815 GHz - 3,669,599,520 instructions # 1.53 insn per cycle - 0.914185147 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 3.577002e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.847336e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.852472e+06 ) sec^-1 +MeanMatrixElemValue = ( 2.948724e+03 +- 1.840727e+03 ) GeV^-2 +TOTAL : 1.137772 sec + 3,521,764,321 cycles:u # 2.997 GHz (74.34%) + 21,218,659 stalled-cycles-frontend:u # 0.60% frontend cycles idle (74.56%) + 853,108,865 stalled-cycles-backend:u # 24.22% backend cycles idle (74.82%) + 3,256,826,481 instructions:u # 0.92 insn per cycle + # 0.26 stalled cycles per insn (74.81%) + 1.195192794 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.413122e+00 -Avg ME (F77/CUDA) = 1.4131213684418649 -Relative difference = 4.469239988637851e-07 +Avg ME (F77/CUDA) = 1.4131213684418642 +Relative difference = 4.4692399933517674e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.436781e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.449292e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.449292e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 6.748293 sec - 19,527,368,133 cycles # 2.892 GHz - 57,921,410,950 instructions # 2.97 insn per cycle - 6.756473501 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 2.949926e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.962209e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.962209e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 +TOTAL : 5.577914 sec + 19,611,893,072 cycles:u # 3.502 GHz (74.99%) + 2,551,919 stalled-cycles-frontend:u # 0.01% frontend cycles idle (75.00%) + 3,431,866,251 stalled-cycles-backend:u # 17.50% backend cycles idle (75.00%) + 57,909,311,784 instructions:u # 2.95 insn per cycle + # 0.06 stalled cycles per insn (75.00%) + 5.602758302 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1134) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213684432431 Relative difference = 4.4692302355460254e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.689715e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.736371e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.736371e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 3.517761 sec - 10,204,769,485 cycles # 2.897 GHz - 29,944,325,485 instructions # 2.93 insn per cycle - 3.533017528 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 6.035773e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.086870e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.086870e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 +TOTAL : 2.738169 sec + 9,650,787,663 cycles:u # 3.496 GHz (74.83%) + 2,652,787 stalled-cycles-frontend:u # 0.03% frontend cycles idle (74.97%) + 2,369,218,335 stalled-cycles-backend:u # 24.55% backend cycles idle (75.08%) + 29,963,506,375 instructions:u # 3.10 insn per cycle + # 0.08 stalled cycles per insn (75.09%) + 2.764116067 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 4742) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213684432433 Relative difference = 4.46923023397472e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.110539e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.290286e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.290286e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.822880 sec - 4,929,256,319 cycles # 2.697 GHz - 11,212,094,634 instructions # 2.27 insn per cycle - 1.842452367 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4396) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.238244e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.259535e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.259535e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 +TOTAL : 1.346669 sec + 4,753,803,853 cycles:u # 3.472 GHz (74.88%) + 2,041,888 stalled-cycles-frontend:u # 0.04% frontend cycles idle (74.88%) + 1,473,673,367 stalled-cycles-backend:u # 31.00% backend cycles idle (74.88%) + 11,252,845,250 instructions:u # 2.37 insn per cycle + # 0.13 stalled cycles per insn (74.72%) + 1.372622722 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4378) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213684416484 Relative difference = 4.469241520660492e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/check.exe -p 64 256 10 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.045459e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.068242e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.068242e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.591153 sec - 4,310,771,194 cycles # 2.701 GHz - 10,188,135,001 instructions # 2.36 insn per cycle - 1.604477930 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3895) (512y: 81) (512z: 0) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.413122e+00 -Avg ME (F77/C++) = 1.4131213684416484 -Relative difference = 4.469241520660492e-07 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/check.exe -p 64 256 10 OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.350984e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.465337e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.465337e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 2.255127 sec - 3,913,955,092 cycles # 1.732 GHz - 5,709,470,043 instructions # 1.46 insn per cycle - 2.269083887 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1258) (512y: 74) (512z: 3396) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. 
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.413122e+00 -Avg ME (F77/C++) = 1.4131213684416484 -Relative difference = 4.469241520660492e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0_bridge.txt index ec4707eb36..5f5164ce2d 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0_bridge.txt @@ -1,240 +1,190 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasCurand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) +RNDGEN=hasNoCurand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2024-01-30_05:47:33 +DATE: 2024-01-31_14:40:14 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 10 --bridge OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.528893e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.736864e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.736864e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 0.499721 sec - 2,019,319,467 cycles # 2.834 GHz - 3,049,308,251 instructions # 1.51 insn per cycle - 0.770515897 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost -WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 1.466419e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.968594e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.968594e+06 ) sec^-1 +MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 +TOTAL : 0.563248 sec + 1,634,603,369 cycles:u # 2.762 GHz (75.78%) + 10,357,414 stalled-cycles-frontend:u # 0.63% frontend cycles idle (75.67%) + 290,360,114 stalled-cycles-backend:u # 17.76% backend cycles idle (75.69%) + 2,033,614,753 instructions:u # 1.24 insn per cycle + # 0.14 stalled cycles per insn (75.10%) + 0.614864085 seconds time elapsed ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.631733e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.469522e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.469522e+06 ) sec^-1 -MeanMatrixElemValue = ( 6.734461e+02 +- 4.775415e+02 ) GeV^-2 -TOTAL : 0.838079 sec - 3,105,645,423 cycles # 2.841 GHz - 4,885,001,867 instructions # 1.57 insn per cycle - 1.151319170 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 2.195303e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.674915e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.674915e+06 ) sec^-1 +MeanMatrixElemValue = ( 2.948724e+03 +- 1.840727e+03 ) GeV^-2 +TOTAL : 1.265744 sec + 3,836,823,091 cycles:u # 2.922 GHz (75.03%) + 30,357,682 stalled-cycles-frontend:u # 0.79% frontend cycles idle (75.10%) + 857,532,035 stalled-cycles-backend:u # 22.35% backend cycles idle (75.29%) + 3,845,436,342 instructions:u # 1.00 insn per cycle + # 0.22 stalled cycles per insn (75.29%) + 1.331728087 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe 
/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.413122e+00 -Avg ME (F77/CUDA) = 1.4131213684418649 -Relative difference = 4.469239988637851e-07 +Avg ME (F77/CUDA) = 1.4131213684418642 +Relative difference = 4.4692399933517674e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.430928e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.443345e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.443345e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 6.772792 sec - 19,550,332,735 cycles # 2.885 GHz - 57,928,238,854 instructions # 2.96 insn per cycle - 6.778111068 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 2.938856e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.951070e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.951070e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 +TOTAL : 5.602623 sec + 19,677,012,387 cycles:u # 3.498 GHz (74.97%) + 2,756,857 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.97%) + 3,430,818,000 stalled-cycles-backend:u # 17.44% backend cycles idle (74.97%) + 57,888,320,227 instructions:u # 2.94 insn per cycle + # 0.06 stalled cycles per insn (74.97%) + 5.627403169 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1134) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213684432431 Relative difference = 4.4692302355460254e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.642090e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.688492e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.688492e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 3.562259 sec - 10,259,962,003 cycles # 2.883 GHz - 29,997,071,393 instructions # 2.92 insn per cycle - 3.567805037 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 6.033146e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.084267e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.084267e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 +TOTAL : 2.743942 sec + 9,647,917,917 cycles:u # 3.487 GHz (74.85%) + 2,560,334 stalled-cycles-frontend:u # 0.03% frontend cycles idle (74.87%) + 2,372,794,359 stalled-cycles-backend:u # 24.59% backend cycles idle (74.95%) + 30,019,587,369 instructions:u # 3.11 insn per cycle + # 0.08 stalled cycles per insn (75.07%) + 2.770430958 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 4742) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213684432433 Relative difference = 4.46923023397472e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.060333e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.240360e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.240360e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.842606 sec - 4,975,429,359 cycles # 2.695 GHz - 11,262,132,806 instructions # 2.26 insn per cycle - 1.848498494 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4396) (512y: 0) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. 
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.413122e+00 -Avg ME (F77/C++) = 1.4131213684416484 -Relative difference = 4.469241520660492e-07 -OK (relative difference <= 5E-3) +EvtsPerSec[Rmb+ME] (23) = ( 1.236121e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.257315e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.257315e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 +TOTAL : 1.353591 sec + 4,776,745,975 cycles:u # 3.470 GHz (75.08%) + 2,320,254 stalled-cycles-frontend:u # 0.05% frontend cycles idle (74.73%) + 1,475,503,524 stalled-cycles-backend:u # 30.89% backend cycles idle (74.73%) + 11,254,402,988 instructions:u # 2.36 insn per cycle + # 0.13 stalled cycles per insn (75.01%) + 1.379911947 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4378) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.041344e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.064837e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.064837e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.605579 sec - 4,356,497,896 cycles # 2.706 GHz - 10,236,092,665 instructions # 2.35 insn per cycle - 1.611218031 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3895) (512y: 81) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213684416484 Relative difference = 4.469241520660492e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.341333e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.457820e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.457820e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 2.270029 sec - 3,960,771,261 cycles # 1.743 GHz - 5,748,864,563 instructions # 1.45 insn per cycle - 2.275659808 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1258) (512y: 74) (512z: 3396) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. 
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.413122e+00 -Avg ME (F77/C++) = 1.4131213684416484 -Relative difference = 4.469241520660492e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd1.txt index e0fcb209a0..c89c4acdab 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd1.txt @@ -1,223 +1,181 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasCurand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) +RNDGEN=hasNoCurand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.512y_d_inl0_hrd1' +CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.none_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2024-01-30_04:58:25 +DATE: 2024-01-31_13:53:35 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/gcheck.exe -p 64 256 10 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/gcheck.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.715814e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.042075e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.056833e+07 ) sec^-1 -MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 0.470883 sec - 1,939,912,503 cycles # 2.822 GHz - 2,790,884,564 instructions # 1.44 insn per cycle - 0.765236939 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/gcheck.exe -p 64 256 1 -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 1.862441e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.015054e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.020911e+06 ) sec^-1 +MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 +TOTAL : 0.533285 sec + 1,578,933,550 cycles:u # 2.806 GHz (74.51%) + 7,789,260 stalled-cycles-frontend:u # 0.49% frontend cycles idle (75.59%) + 272,593,452 stalled-cycles-backend:u # 17.26% backend cycles idle (75.74%) + 1,818,439,514 instructions:u # 1.15 insn per cycle + # 0.15 stalled cycles per insn (75.78%) + 0.578653790 seconds time elapsed ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.074401e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.309128e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.323134e+07 ) sec^-1 -MeanMatrixElemValue = ( 6.734461e+02 +- 4.775415e+02 ) GeV^-2 -TOTAL : 0.606508 sec - 2,399,848,951 cycles # 2.837 GHz - 3,558,977,452 instructions # 1.48 insn per cycle - 0.907497861 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 3.546447e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.812875e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.817892e+06 ) sec^-1 +MeanMatrixElemValue = ( 2.948724e+03 +- 1.840727e+03 ) GeV^-2 +TOTAL : 1.134405 sec + 3,499,370,566 cycles:u # 2.987 GHz (74.50%) + 21,125,895 stalled-cycles-frontend:u # 0.60% frontend cycles idle (74.69%) + 853,546,816 stalled-cycles-backend:u # 24.39% backend cycles idle (74.75%) + 3,209,634,213 instructions:u # 0.92 insn per cycle + # 0.27 stalled cycles per insn (74.74%) + 1.190728884 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/fgcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.413122e+00 -Avg ME (F77/CUDA) = 1.4131213684418649 -Relative difference = 4.469239988637851e-07 +Avg ME (F77/CUDA) = 1.4131213684418642 +Relative difference = 4.4692399933517674e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.442527e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.455052e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.455052e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 6.732470 sec - 19,518,863,765 cycles # 2.898 GHz - 57,747,544,085 instructions # 2.96 insn per cycle - 6.739693684 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 2.937626e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.949805e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.949805e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 +TOTAL : 5.600925 sec + 19,689,549,010 cycles:u # 3.502 GHz (74.96%) + 2,204,846 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.96%) + 2,983,226,749 stalled-cycles-backend:u # 15.15% backend cycles idle (74.96%) + 57,700,780,946 instructions:u # 2.93 insn per cycle + # 0.05 stalled cycles per insn (74.96%) + 5.625400034 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1087) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213684432431 Relative difference = 4.4692302355460254e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.661123e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.707073e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.707073e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 3.538386 sec - 10,268,038,737 cycles # 2.898 GHz - 30,334,584,369 instructions # 2.95 insn per cycle - 3.554140482 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 5.955815e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.005495e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.005495e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 +TOTAL : 2.774401 sec + 9,772,730,996 cycles:u # 3.494 GHz (74.88%) + 2,329,022 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.75%) + 2,288,097,457 stalled-cycles-backend:u # 23.41% backend cycles idle (74.89%) + 30,363,583,450 instructions:u # 3.11 insn per cycle + # 0.08 stalled cycles per insn (75.12%) + 2.800409321 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 4806) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213684432433 Relative difference = 4.46923023397472e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.842618e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.012045e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.012045e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.876874 sec - 5,068,616,518 cycles # 2.693 GHz - 11,664,707,542 instructions # 2.30 insn per cycle - 1.896780245 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4489) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.198043e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.217941e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.217941e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 +TOTAL : 1.390870 sec + 4,932,527,405 cycles:u # 3.490 GHz (74.74%) + 2,111,451 stalled-cycles-frontend:u # 0.04% frontend cycles idle (74.93%) + 1,676,406,494 stalled-cycles-backend:u # 33.99% backend cycles idle (75.10%) + 11,671,634,167 instructions:u # 2.37 insn per cycle + # 0.14 stalled cycles per insn (75.10%) + 1.416813284 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4471) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213684416484 Relative difference = 4.469241520660492e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd1/check.exe -p 64 256 10 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.766097e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.969139e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.969139e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.701579 sec - 4,623,474,911 cycles # 2.710 GHz - 10,806,178,257 instructions # 2.34 insn per cycle - 1.712732749 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3988) (512y: 237) (512z: 0) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.413122e+00 -Avg ME (F77/C++) = 1.4131213684416484 -Relative difference = 4.469241520660492e-07 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd1/check.exe -p 64 256 10 OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.261988e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.377447e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.377447e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 2.282726 sec - 3,962,643,032 cycles # 1.733 GHz - 5,999,265,657 instructions # 1.51 insn per cycle - 2.297742409 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1241) (512y: 81) (512z: 3500) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. 
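(For reference, the "insn per cycle" figures printed by perf in the blocks above are simply instructions:u divided by cycles:u; for the avx2_d_inl0_hrd1 run, 11,671,634,167 / 4,932,527,405 ≈ 2.37, matching the logged value.)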
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.413122e+00 -Avg ME (F77/C++) = 1.4131213684416484 -Relative difference = 4.469241520660492e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt index 809c0d4a45..9d749af286 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt @@ -1,223 +1,181 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasCurand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) +RNDGEN=hasNoCurand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2024-01-30_04:58:56 +DATE: 2024-01-31_13:54:00 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 10 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.450759e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.307242e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.403943e+07 ) sec^-1 -MeanMatrixElemValue = ( 1.008472e+02 +- 5.002447e+01 ) GeV^-2 -TOTAL : 0.453655 sec - 1,885,130,441 cycles # 2.809 GHz - 2,653,723,410 instructions # 1.41 insn per cycle - 0.747134110 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 254 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 6.233207e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.295589e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.365340e+06 ) sec^-1 +MeanMatrixElemValue = ( 5.334114e+02 +- 3.089427e+02 ) GeV^-2 +TOTAL : 0.467908 sec + 1,320,505,833 cycles:u # 2.660 GHz (75.02%) + 8,142,141 stalled-cycles-frontend:u # 0.62% frontend cycles idle (74.97%) + 274,045,440 stalled-cycles-backend:u # 20.75% backend cycles idle (75.77%) + 1,686,814,296 instructions:u # 1.28 insn per cycle + # 0.16 stalled cycles per insn (74.73%) + 0.515396159 seconds time elapsed ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.211065e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.390139e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.474767e+07 ) sec^-1 -MeanMatrixElemValue = ( 6.630099e+02 +- 4.770719e+02 ) GeV^-2 -TOTAL : 0.497953 sec - 2,053,184,300 cycles # 2.823 GHz - 2,862,941,904 instructions # 1.39 insn per cycle - 0.785494017 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 1.299597e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.630371e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.635274e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.954952e+03 +- 1.880090e+03 ) GeV^-2 +TOTAL : 0.959215 sec + 2,923,861,212 cycles:u # 2.949 GHz (74.96%) + 21,308,476 stalled-cycles-frontend:u # 0.73% frontend cycles idle (74.83%) + 855,007,726 stalled-cycles-backend:u # 29.24% backend cycles idle (74.83%) + 2,774,819,087 instructions:u # 0.95 insn per cycle + # 0.31 stalled cycles per insn (74.95%) + 1.011833195 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 1.412608e+00 -Avg ME (F77/CUDA) = 1.4132214346515752 -Relative difference = 0.00043425681546129636 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 1.412404e+00 +Avg ME (F77/CUDA) = 1.4131669530965212 +Relative difference = 0.0005401804983001964 OK (relative difference <= 5E-3) ========================================================================= -runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.619709e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.634289e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.634289e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2 -TOTAL : 6.276631 sec - 18,176,411,104 cycles # 2.894 GHz - 55,238,700,170 instructions # 3.04 insn per cycle - 6.284146623 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 3.235316e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.250553e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.250553e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.724764e+02 +- 2.665343e+02 ) GeV^-2 +TOTAL : 5.085842 sec + 17,880,394,593 cycles:u # 3.501 GHz (74.94%) + 2,390,994 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.94%) + 3,699,282,902 stalled-cycles-backend:u # 20.69% backend cycles idle (74.94%) + 55,296,875,849 instructions:u # 3.09 insn per cycle + # 0.07 stalled cycles per insn (74.96%) + 5.110215731 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1229) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.412998e+00 -Avg ME (F77/C++) = 1.4129977771372637 -Relative difference = 1.5772332039074602e-07 +Avg ME (F77/C++) = 1.4129978146120550 +Relative difference = 1.3120184529301602e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.447433e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.602543e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.602543e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2 -TOTAL : 1.961648 sec - 5,691,843,956 cycles # 2.895 GHz - 16,128,541,176 instructions # 2.83 insn per cycle - 1.980848485 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 1.080505e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.097747e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.097747e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.724763e+02 +- 2.665342e+02 ) GeV^-2 +TOTAL : 1.537653 sec + 5,440,452,169 cycles:u # 3.487 GHz (74.94%) + 2,164,642 stalled-cycles-frontend:u # 0.04% frontend cycles idle (74.88%) + 1,664,978,467 stalled-cycles-backend:u # 30.60% backend cycles idle (74.88%) + 16,160,698,434 instructions:u # 2.97 insn per cycle + # 0.10 stalled cycles per insn (74.88%) + 1.563750461 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 5205) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.412986e+00 -Avg ME (F77/C++) = 1.4129864902818952 -Relative difference = 3.469828399449743e-07 +Avg ME (F77/C++) = 1.4129857118325333 +Relative difference = 2.039421953066926e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.757867e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.823085e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.823085e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.008855e+02 +- 5.002467e+01 ) GeV^-2 -TOTAL : 0.954501 sec - 2,591,810,421 cycles # 2.702 GHz - 6,085,915,267 instructions # 2.35 insn per cycle - 0.966912682 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4878) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.355325e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.435129e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.435129e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.743733e+02 +- 2.676611e+02 ) GeV^-2 +TOTAL : 0.716897 sec + 2,573,702,457 cycles:u # 3.481 GHz (74.68%) + 2,112,380 stalled-cycles-frontend:u # 0.08% frontend cycles idle (74.99%) + 824,658,506 stalled-cycles-backend:u # 32.04% backend cycles idle (75.12%) + 6,101,403,334 instructions:u # 2.37 insn per cycle + # 0.14 stalled cycles per insn (75.12%) + 0.742580939 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4860) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413316e+00 -Avg ME (F77/C++) = 1.4133158486847037 -Relative difference = 1.0706402269051248e-07 +Avg ME (F77/C++) = 1.4133162680784324 +Relative difference = 1.896804623606238e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/check.exe -p 64 256 10 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.986474e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.069956e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.069956e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.008855e+02 +- 5.002467e+01 ) GeV^-2 -TOTAL : 0.846832 sec - 2,295,114,840 cycles # 2.696 GHz - 5,552,751,365 instructions # 2.42 insn per cycle - 0.861502194 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4415) (512y: 30) (512z: 0) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.413316e+00 -Avg ME (F77/C++) = 1.4133158486847037 -Relative difference = 1.0706402269051248e-07 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/check.exe -p 64 256 10 OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.460942e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.506292e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.506292e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.008856e+02 +- 5.002468e+01 ) GeV^-2 -TOTAL : 1.145570 sec - 2,022,184,795 cycles # 1.758 GHz - 3,286,748,929 instructions # 1.63 insn per cycle - 1.163999883 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1905) (512y: 28) (512z: 3597) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. 
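(The "... is not supported (no avx512vl in /proc/cpuinfo)" lines above show the 512y and 512z binaries being skipped on this AMD EPYC host because the CPU does not advertise the avx512vl feature flag. A minimal sketch of such a guard, assuming a plain scan of /proc/cpuinfo; the actual check used by the test scripts is not shown in these logs:

  #include <fstream>
  #include <iostream>
  #include <string>

  // Hypothetical avx512vl detection: scan the Linux CPU feature flags for the
  // token that gates the 512y/512z SIMD builds.
  bool cpuHasAvx512vl()
  {
    std::ifstream cpuinfo( "/proc/cpuinfo" );
    std::string line;
    while( std::getline( cpuinfo, line ) )
      if( line.find( "avx512vl" ) != std::string::npos ) return true;
    return false;
  }

  int main()
  {
    if( !cpuHasAvx512vl() )
      std::cout << "check.exe is not supported (no avx512vl in /proc/cpuinfo)" << std::endl;
    return 0;
  }
)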
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.413316e+00 -Avg ME (F77/C++) = 1.4133164031689205 -Relative difference = 2.852645271622733e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0_bridge.txt index 8f1e29c773..5b3fc6c5c0 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0_bridge.txt @@ -1,240 +1,190 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasCurand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) +RNDGEN=hasNoCurand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2024-01-30_05:48:04 +DATE: 2024-01-31_14:40:39 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 10 --bridge OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.794241e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.099961e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.099961e+07 ) sec^-1 -MeanMatrixElemValue = ( 1.009071e+02 +- 5.002295e+01 ) GeV^-2 -TOTAL : 0.464854 sec - 1,913,128,801 cycles # 2.831 GHz - 2,814,269,280 instructions # 1.47 insn per cycle - 0.735191571 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost -WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -==PROF== Profiling "sigmaKin": launch__registers_per_thread 254 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 3.247862e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.496785e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.496785e+06 ) sec^-1 +MeanMatrixElemValue = ( 4.755516e+02 +- 2.671055e+02 ) GeV^-2 +TOTAL : 0.504260 sec + 1,461,991,234 cycles:u # 2.779 GHz (74.45%) + 10,944,552 stalled-cycles-frontend:u # 0.75% frontend cycles idle (74.09%) + 252,397,308 stalled-cycles-backend:u # 17.26% backend cycles idle (75.56%) + 1,861,835,376 instructions:u # 1.27 insn per cycle + # 0.14 stalled cycles per insn (76.18%) + 0.550688201 seconds time elapsed ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.563056e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.567773e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.567773e+07 ) sec^-1 -MeanMatrixElemValue = ( 6.737500e+02 +- 4.776370e+02 ) GeV^-2 -TOTAL : 0.649992 sec - 2,514,728,119 cycles # 2.840 GHz - 3,857,856,675 instructions # 1.53 insn per cycle - 0.945286461 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 4.122959e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.467701e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.467701e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.855934e+03 +- 1.791981e+03 ) GeV^-2 +TOTAL : 1.067243 sec + 3,269,617,170 cycles:u # 2.964 GHz (74.39%) + 30,130,595 stalled-cycles-frontend:u # 0.92% frontend cycles idle (74.58%) + 861,248,742 stalled-cycles-backend:u # 26.34% backend cycles idle (74.61%) + 3,480,018,755 instructions:u # 1.06 insn per cycle + # 0.25 stalled cycles per insn (74.93%) + 1.122752145 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 1.412608e+00 -Avg ME (F77/CUDA) = 1.4132214346515752 
-Relative difference = 0.00043425681546129636 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 1.412404e+00 +Avg ME (F77/CUDA) = 1.4131669530965212 +Relative difference = 0.0005401804983001964 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.612332e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.626818e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.626818e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2 -TOTAL : 6.299552 sec - 18,207,767,275 cycles # 2.889 GHz - 55,242,943,760 instructions # 3.03 insn per cycle - 6.304483382 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 3.234859e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.250186e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.250186e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.724764e+02 +- 2.665343e+02 ) GeV^-2 +TOTAL : 5.088747 sec + 17,884,436,952 cycles:u # 3.499 GHz (74.96%) + 2,327,651 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.96%) + 3,681,573,463 stalled-cycles-backend:u # 20.59% backend cycles idle (74.96%) + 55,280,013,344 instructions:u # 3.09 insn per cycle + # 0.07 stalled cycles per insn (74.91%) + 5.113324972 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1229) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.412998e+00 -Avg ME (F77/C++) = 1.4129977771372637 -Relative difference = 1.5772332039074602e-07 +Avg ME (F77/C++) = 1.4129978146120550 +Relative difference = 1.3120184529301602e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.365917e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.522444e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.522444e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2 -TOTAL : 1.986287 sec - 5,717,011,577 cycles # 2.873 GHz - 16,175,954,346 instructions # 2.83 insn per cycle - 1.991587162 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 1.070830e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.087647e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.087647e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.724763e+02 +- 2.665342e+02 ) GeV^-2 +TOTAL : 1.554368 sec + 5,491,975,937 cycles:u # 3.483 GHz (74.77%) + 2,162,040 stalled-cycles-frontend:u # 0.04% frontend cycles idle (75.15%) + 1,708,931,547 stalled-cycles-backend:u # 31.12% backend cycles idle (75.14%) + 16,215,779,203 instructions:u # 2.95 insn per cycle + # 0.11 stalled cycles per insn (75.15%) + 1.580187360 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 5205) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.412986e+00 -Avg ME (F77/C++) = 1.4129864902818952 -Relative difference = 3.469828399449743e-07 +Avg ME (F77/C++) = 1.4129857118325333 +Relative difference = 2.039421953066926e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.741687e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.807547e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.807547e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.008855e+02 +- 5.002467e+01 ) GeV^-2 -TOTAL : 0.968315 sec - 2,618,792,433 cycles # 2.693 GHz - 6,122,206,815 instructions # 2.34 insn per cycle - 0.973667021 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4878) (512y: 0) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. 
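The throughput figures in these logs come from invocations like `check.exe -p 64 256 10`, i.e. a grid of 64 blocks times 256 threads iterated 10 times, for 163840 events in total. To a first approximation, each EvtsPerSec line is that event count divided by the time spent in the corresponding section; the section timers are internal to check.exe, so the sketch below (with hypothetical variable names) only roughly reproduces the logged rates:

```cpp
#include <cstdio>

int main()
{
  // check.exe -p <gpublocks> <gputhreads> <iterations>
  const int gpublocks = 64, gputhreads = 256, niter = 10;
  const long nevtTotal = static_cast<long>( gpublocks ) * gputhreads * niter; // 163840 events
  // Elapsed time of the measured section (illustrative value from the log above)
  const double seconds = 5.088747;
  std::printf( "Events processed    = %ld\n", nevtTotal );
  std::printf( "EvtsPerSec (approx) = %e sec^-1\n", nevtTotal / seconds ); // ~3.2e4, close to the logged 3.234859e+04
  return 0;
}
```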
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.413316e+00 -Avg ME (F77/C++) = 1.4133158486847037 -Relative difference = 1.0706402269051248e-07 -OK (relative difference <= 5E-3) +EvtsPerSec[Rmb+ME] (23) = ( 2.357776e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.438301e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.438301e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.743733e+02 +- 2.676611e+02 ) GeV^-2 +TOTAL : 0.718857 sec + 2,561,351,001 cycles:u # 3.454 GHz (74.82%) + 2,067,002 stalled-cycles-frontend:u # 0.08% frontend cycles idle (75.19%) + 820,609,810 stalled-cycles-backend:u # 32.04% backend cycles idle (75.19%) + 6,136,329,384 instructions:u # 2.40 insn per cycle + # 0.13 stalled cycles per insn (75.20%) + 0.744904138 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4860) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.976057e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.060749e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.060749e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.008855e+02 +- 5.002467e+01 ) GeV^-2 -TOTAL : 0.855993 sec - 2,321,654,642 cycles # 2.699 GHz - 5,589,002,861 instructions # 2.41 insn per cycle - 0.861171520 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4415) (512y: 30) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413316e+00 -Avg ME (F77/C++) = 1.4133158486847037 -Relative difference = 1.0706402269051248e-07 +Avg ME (F77/C++) = 1.4133162680784324 +Relative difference = 1.896804623606238e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.455132e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.500155e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.500155e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.008856e+02 +- 5.002468e+01 ) GeV^-2 -TOTAL : 1.154878 sec - 2,044,999,339 cycles # 1.765 GHz - 3,327,504,110 instructions # 1.63 insn per cycle - 1.160035358 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1905) (512y: 28) (512z: 3597) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. 
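On this AMD EPYC host the 512y and 512z binaries are skipped at run time with "no avx512vl in /proc/cpuinfo": before launching an AVX512 build, the test driver checks whether the avx512vl token appears among the CPU flags. A sketch of such a guard, assuming a simple scan of the flags field (the driver's actual implementation may differ):

```cpp
#include <fstream>
#include <iostream>
#include <sstream>
#include <string>

// Sketch of a runtime CPU-capability guard like the one producing the
// "is not supported (no avx512vl in /proc/cpuinfo)" messages in these logs.
bool hasCpuFlag( const std::string& flag )
{
  std::ifstream cpuinfo( "/proc/cpuinfo" );
  std::string line;
  while( std::getline( cpuinfo, line ) )
  {
    if( line.rfind( "flags", 0 ) != 0 ) continue; // only inspect "flags : ..." lines
    std::istringstream tokens( line );
    std::string token;
    while( tokens >> token )
      if( token == flag ) return true;
  }
  return false;
}

int main()
{
  if( !hasCpuFlag( "avx512vl" ) )
    std::cout << "check.exe is not supported (no avx512vl in /proc/cpuinfo)" << std::endl;
  return 0;
}
```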
+/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.413316e+00 -Avg ME (F77/C++) = 1.4133164031689205 -Relative difference = 2.852645271622733e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd1.txt index 71f99cc0f9..c5078df4b5 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd1.txt @@ -1,223 +1,181 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasCurand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) +RNDGEN=hasNoCurand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.512y_f_inl0_hrd1' +CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.none_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2024-01-30_04:59:21 +DATE: 2024-01-31_13:54:22 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/gcheck.exe -p 64 256 10 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/gcheck.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.454028e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.326749e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.426065e+07 ) sec^-1 -MeanMatrixElemValue = ( 1.008472e+02 +- 5.002447e+01 ) GeV^-2 -TOTAL : 0.451891 sec - 1,884,922,304 cycles # 2.826 GHz - 2,675,153,942 instructions # 1.42 insn per cycle - 0.742708600 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/gcheck.exe -p 64 256 1 -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 248 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 5.163509e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.451426e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.501420e+06 ) sec^-1 +MeanMatrixElemValue = ( 5.334114e+02 +- 3.089427e+02 ) GeV^-2 +TOTAL : 0.472511 sec + 1,392,395,013 cycles:u # 2.784 GHz (73.56%) + 8,100,702 stalled-cycles-frontend:u # 0.58% frontend cycles idle (74.41%) + 276,496,565 stalled-cycles-backend:u # 19.86% backend cycles idle (74.42%) + 1,721,815,009 instructions:u # 1.24 insn per cycle + # 0.16 stalled cycles per insn (75.78%) + 0.517855915 seconds time elapsed ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.211971e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.383449e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.465566e+07 ) sec^-1 -MeanMatrixElemValue = ( 6.630099e+02 +- 4.770719e+02 ) GeV^-2 -TOTAL : 0.498447 sec - 2,066,378,383 cycles # 2.841 GHz - 2,912,189,828 instructions # 1.41 insn per cycle - 0.785425719 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 1.325083e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.685923e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.691170e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.954952e+03 +- 1.880090e+03 ) GeV^-2 +TOTAL : 0.958757 sec + 2,928,770,710 cycles:u # 2.956 GHz (75.00%) + 21,347,446 stalled-cycles-frontend:u # 0.73% frontend cycles idle (74.80%) + 855,040,551 stalled-cycles-backend:u # 29.19% backend cycles idle (74.80%) + 2,819,342,051 instructions:u # 0.96 insn per cycle + # 0.30 stalled cycles per insn (74.98%) + 1.011115588 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 1.412608e+00 -Avg ME (F77/CUDA) = 1.4132214346515752 -Relative difference = 0.00043425681546129636 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 1.412404e+00 +Avg ME (F77/CUDA) = 1.4131669531526541 +Relative difference = 0.0005401805380429868 OK (relative difference <= 5E-3) ========================================================================= -runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.621420e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.635929e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.635929e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2 -TOTAL : 6.272536 sec - 18,133,908,438 cycles # 2.889 GHz - 54,991,536,969 instructions # 3.03 insn per cycle - 6.280002049 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 3.235914e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.251183e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.251183e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.724764e+02 +- 2.665343e+02 ) GeV^-2 +TOTAL : 5.084718 sec + 17,882,683,220 cycles:u # 3.502 GHz (74.94%) + 2,375,550 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.94%) + 2,987,825,837 stalled-cycles-backend:u # 16.71% backend cycles idle (74.94%) + 55,049,728,854 instructions:u # 3.08 insn per cycle + # 0.05 stalled cycles per insn (74.97%) + 5.109246107 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1171) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.412998e+00 -Avg ME (F77/C++) = 1.4129977771372637 -Relative difference = 1.5772332039074602e-07 +Avg ME (F77/C++) = 1.4129978146120550 +Relative difference = 1.3120184529301602e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.675526e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.845155e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.845155e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2 -TOTAL : 1.910252 sec - 5,541,476,355 cycles # 2.894 GHz - 16,222,950,904 instructions # 2.93 insn per cycle - 1.926546393 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 1.118880e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.137296e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.137296e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.724763e+02 +- 2.665342e+02 ) GeV^-2 +TOTAL : 1.485425 sec + 5,266,166,102 cycles:u # 3.492 GHz (74.86%) + 2,220,595 stalled-cycles-frontend:u # 0.04% frontend cycles idle (75.03%) + 1,527,619,356 stalled-cycles-backend:u # 29.01% backend cycles idle (75.07%) + 16,255,209,513 instructions:u # 3.09 insn per cycle + # 0.09 stalled cycles per insn (75.07%) + 1.511090515 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 5136) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.412986e+00 -Avg ME (F77/C++) = 1.4129863487235070 -Relative difference = 2.4679898241023883e-07 +Avg ME (F77/C++) = 1.4129857712652836 +Relative difference = 1.618803841657786e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.524928e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.573974e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.573974e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.008855e+02 +- 5.002467e+01 ) GeV^-2 -TOTAL : 1.096795 sec - 2,981,881,341 cycles # 2.708 GHz - 6,708,240,605 instructions # 2.25 insn per cycle - 1.109848469 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 5430) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.120936e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.185424e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.185424e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.743733e+02 +- 2.676611e+02 ) GeV^-2 +TOTAL : 0.793589 sec + 2,820,836,759 cycles:u # 3.457 GHz (74.52%) + 2,303,382 stalled-cycles-frontend:u # 0.08% frontend cycles idle (74.59%) + 801,753,862 stalled-cycles-backend:u # 28.42% backend cycles idle (74.91%) + 6,748,831,142 instructions:u # 2.39 insn per cycle + # 0.12 stalled cycles per insn (75.34%) + 0.819143454 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 5412) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413316e+00 -Avg ME (F77/C++) = 1.4133158486847037 -Relative difference = 1.0706402269051248e-07 +Avg ME (F77/C++) = 1.4133162680784324 +Relative difference = 1.896804623606238e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd1/check.exe -p 64 256 10 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.679205e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.738776e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.738776e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.008855e+02 +- 5.002467e+01 ) GeV^-2 -TOTAL : 0.997879 sec - 2,711,169,290 cycles # 2.704 GHz - 6,222,713,478 instructions # 2.30 insn per cycle - 1.012945753 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 5056) (512y: 24) (512z: 0) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.413316e+00 -Avg ME (F77/C++) = 1.4133158486847037 -Relative difference = 1.0706402269051248e-07 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd1/check.exe -p 64 256 10 OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.374736e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.414577e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.414577e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.008856e+02 +- 5.002468e+01 ) GeV^-2 -TOTAL : 1.216016 sec - 2,159,440,418 cycles # 1.769 GHz - 3,642,249,109 instructions # 1.69 insn per cycle - 1.228978695 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2070) (512y: 21) (512z: 3922) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. 
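The perf-stat style percentages quoted on the AMD host are simple ratios of the raw counters: "insn per cycle" is instructions/cycles, the "cycles idle" percentages are stalled cycles over total cycles, and "stalled cycles per insn" is stalled-backend cycles over instructions. A quick sketch reproducing the figures from the sse4_f_inl0_hrd1 run above (counter values copied from the log):

```cpp
#include <cstdio>

int main()
{
  // Raw counters from the sse4_f_inl0_hrd1 run above
  const double cycles = 5266166102., frontendStalled = 2220595.;
  const double backendStalled = 1527619356., instructions = 16255209513.;
  std::printf( "insn per cycle          = %.2f\n", instructions / cycles );              // 3.09
  std::printf( "frontend cycles idle    = %.2f%%\n", 100. * frontendStalled / cycles );  // 0.04%
  std::printf( "backend cycles idle     = %.2f%%\n", 100. * backendStalled / cycles );   // 29.01%
  std::printf( "stalled cycles per insn = %.2f\n", backendStalled / instructions );      // 0.09
  return 0;
}
```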
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.413316e+00 -Avg ME (F77/C++) = 1.4133164031689205 -Relative difference = 2.852645271622733e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt index c3bf1d184f..bf09fe872a 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt @@ -1,223 +1,181 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasCurand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) +RNDGEN=hasNoCurand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2024-01-30_04:59:47 +DATE: 2024-01-31_13:54:44 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/gcheck.exe -p 64 256 10 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/gcheck.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.711100e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.041363e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.056144e+07 ) sec^-1 -MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 0.470768 sec - 1,937,905,575 cycles # 2.825 GHz - 2,769,085,725 instructions # 1.43 insn per cycle - 0.764309757 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/gcheck.exe -p 64 256 1 -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 1.873525e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.030688e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.036136e+06 ) sec^-1 +MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 +TOTAL : 0.533324 sec + 1,558,673,708 cycles:u # 2.775 GHz (74.98%) + 7,904,191 stalled-cycles-frontend:u # 0.51% frontend cycles idle (75.72%) + 278,417,769 stalled-cycles-backend:u # 17.86% backend cycles idle (75.85%) + 1,806,640,294 instructions:u # 1.16 insn per cycle + # 0.15 stalled cycles per insn (75.92%) + 0.583004824 seconds time elapsed ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.077034e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.312199e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.326400e+07 ) sec^-1 -MeanMatrixElemValue = ( 6.734461e+02 +- 4.775415e+02 ) GeV^-2 -TOTAL : 0.614129 sec - 2,415,403,751 cycles # 2.830 GHz - 3,662,132,699 instructions # 1.52 insn per cycle - 0.915037914 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 3.613599e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.845121e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.850461e+06 ) sec^-1 +MeanMatrixElemValue = ( 2.948724e+03 +- 1.840727e+03 ) GeV^-2 +TOTAL : 1.136287 sec + 3,465,854,034 cycles:u # 2.952 GHz (74.81%) + 21,431,149 stalled-cycles-frontend:u # 0.62% frontend cycles idle (74.72%) + 855,206,102 stalled-cycles-backend:u # 24.68% backend cycles idle (75.04%) + 3,115,833,555 instructions:u # 0.90 insn per cycle + # 0.27 stalled cycles per insn (75.48%) + 1.194221843 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.413122e+00 -Avg ME (F77/CUDA) = 1.4131213755569487 -Relative difference = 4.418889885423659e-07 +Avg ME (F77/CUDA) = 1.4131213755569483 +Relative difference = 4.4188898885662695e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.370924e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.382846e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.382846e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 6.936117 sec - 19,978,394,912 cycles # 2.879 GHz - 59,162,561,873 instructions # 2.96 insn per cycle - 6.944191465 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 2.885755e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.897495e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.897495e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 +TOTAL : 5.701399 sec + 20,036,013,217 cycles:u # 3.501 GHz (74.98%) + 2,942,876 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.98%) + 3,907,613,044 stalled-cycles-backend:u # 19.50% backend cycles idle (74.98%) + 59,165,739,340 instructions:u # 2.95 insn per cycle + # 0.07 stalled cycles per insn (74.98%) + 5.725948879 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1149) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213859069593 Relative difference = 4.345647726386255e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.694585e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.741387e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.741387e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 3.513998 sec - 10,104,341,088 cycles # 2.872 GHz - 29,763,867,436 instructions # 2.95 insn per cycle - 3.532062820 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 6.095746e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.147754e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.147754e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 +TOTAL : 2.711476 sec + 9,548,524,839 cycles:u # 3.492 GHz (74.85%) + 2,483,907 stalled-cycles-frontend:u # 0.03% frontend cycles idle (74.86%) + 2,429,422,838 stalled-cycles-backend:u # 25.44% backend cycles idle (74.95%) + 29,789,093,599 instructions:u # 3.12 insn per cycle + # 0.08 stalled cycles per insn (75.07%) + 2.737506225 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 4873) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213792564823 Relative difference = 4.392710025734405e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.157849e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.336120e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.336120e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.813275 sec - 4,888,809,789 cycles # 2.689 GHz - 11,200,775,616 instructions # 2.29 insn per cycle - 1.831194346 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4581) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.242709e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.264174e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.264174e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 +TOTAL : 1.341938 sec + 4,749,950,097 cycles:u # 3.481 GHz (74.86%) + 2,985,260 stalled-cycles-frontend:u # 0.06% frontend cycles idle (74.79%) + 1,573,333,068 stalled-cycles-backend:u # 33.12% backend cycles idle (74.79%) + 11,219,420,879 instructions:u # 2.36 insn per cycle + # 0.14 stalled cycles per insn (74.84%) + 1.367798392 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4563) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213600217192 Relative difference = 4.5288254008796884e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd0/check.exe -p 64 256 10 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.059295e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.083013e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.083013e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.571072 sec - 4,240,948,322 cycles # 2.691 GHz - 10,146,075,765 instructions # 2.39 insn per cycle - 1.585395140 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4064) (512y: 73) (512z: 0) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.413122e+00 -Avg ME (F77/C++) = 1.4131213600217192 -Relative difference = 4.5288254008796884e-07 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd0/check.exe -p 64 256 10 OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.157625e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.268151e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.268151e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 2.315387 sec - 4,011,221,101 cycles # 1.729 GHz - 5,838,969,816 instructions # 1.46 insn per cycle - 2.328222904 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1778) (512y: 97) (512z: 3502) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. 
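The build directories and tags that recur throughout these logs are assembled from the same handful of knobs: the AVX level (none/sse4/avx2/512y/512z), FPTYPE (d/f/m), HELINL, HRDCOD, and the random-number backend, yielding names like build.avx2_m_inl0_hrd0 and tags like avx2_d_inl0_hrd0_hasNoCurand. A sketch of that naming scheme follows; the real composition lives in cudacpp.mk, so this is an illustration of the pattern, not the makefile logic verbatim:

```cpp
#include <iostream>
#include <string>

int main()
{
  // Knobs printed at the top of each log (values from the runs above; note that
  // the summary block and the per-build directory can differ in FPTYPE)
  const std::string avx = "avx2";           // none, sse4, avx2, 512y, 512z
  const std::string fptype = "m";           // d=double, f=float, m=mixed
  const int helinl = 0, hrdcod = 0;
  const std::string rndgen = "hasNoCurand"; // hasCurand on the NVidia host
  const std::string base = avx + "_" + fptype + "_inl" + std::to_string( helinl ) + "_hrd" + std::to_string( hrdcod );
  std::cout << "CUDACPP_BUILDDIR='build." << base << "'\n"; // e.g. build.avx2_m_inl0_hrd0
  std::cout << "tag=" << base << "_" << rndgen << "\n";     // e.g. avx2_m_inl0_hrd0_hasNoCurand
  return 0;
}
```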
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.413122e+00 -Avg ME (F77/C++) = 1.4131213600217192 -Relative difference = 4.5288254008796884e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd1.txt index 0465a21327..cca92229f9 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd1.txt @@ -1,223 +1,181 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasCurand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) +RNDGEN=hasNoCurand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.512y_m_inl0_hrd1' +CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.none_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2024-01-30_05:00:18 +DATE: 2024-01-31_13:55:09 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/gcheck.exe -p 64 256 10 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/gcheck.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = HIP:MIX+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.666023e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.032901e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.046936e+07 ) sec^-1 -MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 0.468226 sec - 1,937,873,508 cycles # 2.824 GHz - 2,759,069,461 instructions # 1.42 insn per cycle - 0.754370762 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/gcheck.exe -p 64 256 1 -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 1.872976e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.021933e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.027485e+06 ) sec^-1 +MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 +TOTAL : 0.531980 sec + 1,602,270,971 cycles:u # 2.853 GHz (74.22%) + 7,715,458 stalled-cycles-frontend:u # 0.48% frontend cycles idle (75.00%) + 261,350,265 stalled-cycles-backend:u # 16.31% backend cycles idle (75.44%) + 1,855,168,819 instructions:u # 1.16 insn per cycle + # 0.14 stalled cycles per insn (75.52%) + 0.579289170 seconds time elapsed ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = HIP:MIX+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.070939e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.304690e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.318717e+07 ) sec^-1 -MeanMatrixElemValue = ( 6.734461e+02 +- 4.775415e+02 ) GeV^-2 -TOTAL : 0.607643 sec - 2,403,195,178 cycles # 2.827 GHz - 3,555,740,714 instructions # 1.48 insn per cycle - 0.909921855 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 3.600681e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.840185e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.845245e+06 ) sec^-1 +MeanMatrixElemValue = ( 2.948724e+03 +- 1.840727e+03 ) GeV^-2 +TOTAL : 1.131599 sec + 3,445,297,026 cycles:u # 2.948 GHz (74.64%) + 21,298,915 stalled-cycles-frontend:u # 0.62% frontend cycles idle (74.96%) + 851,127,019 stalled-cycles-backend:u # 24.70% backend cycles idle (75.40%) + 3,162,433,332 instructions:u # 0.92 insn per cycle + # 0.27 stalled cycles per insn (75.37%) + 1.190120861 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/fgcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.413122e+00 -Avg ME (F77/CUDA) = 1.4131213755569487 -Relative difference = 4.418889885423659e-07 +Avg ME (F77/CUDA) = 1.4131213755569483 +Relative difference = 4.4188898885662695e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.404971e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.417275e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.417275e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 6.840299 sec - 19,736,673,501 cycles # 2.886 GHz - 58,709,690,472 instructions # 2.97 insn per cycle - 6.847451518 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 2.882570e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.894309e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.894309e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 +TOTAL : 5.707488 sec + 20,069,251,028 cycles:u # 3.503 GHz (74.97%) + 2,635,071 stalled-cycles-frontend:u # 0.01% frontend cycles idle (75.01%) + 3,505,383,952 stalled-cycles-backend:u # 17.47% backend cycles idle (75.01%) + 58,708,728,731 instructions:u # 2.93 insn per cycle + # 0.06 stalled cycles per insn (75.01%) + 5.732080718 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1026) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213859069593 Relative difference = 4.345647726386255e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.708829e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.755468e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.755468e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 3.503073 sec - 10,118,973,746 cycles # 2.885 GHz - 30,158,905,101 instructions # 2.98 insn per cycle - 3.519090284 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 6.161494e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.214740e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.214740e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 +TOTAL : 2.682706 sec + 9,449,572,209 cycles:u # 3.492 GHz (74.88%) + 2,338,158 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.87%) + 2,118,122,832 stalled-cycles-backend:u # 22.42% backend cycles idle (74.89%) + 30,196,934,771 instructions:u # 3.20 insn per cycle + # 0.07 stalled cycles per insn (75.01%) + 2.709003612 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 4944) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213792564823 Relative difference = 4.392710025734405e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.784663e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.950747e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.950747e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.889159 sec - 5,039,949,395 cycles # 2.661 GHz - 11,663,409,755 instructions # 2.31 insn per cycle - 1.981495827 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4685) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.219089e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.239793e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.239793e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 +TOTAL : 1.367376 sec + 4,836,893,280 cycles:u # 3.480 GHz (74.74%) + 2,207,379 stalled-cycles-frontend:u # 0.05% frontend cycles idle (74.53%) + 1,569,612,446 stalled-cycles-backend:u # 32.45% backend cycles idle (74.82%) + 11,665,002,188 instructions:u # 2.41 insn per cycle + # 0.13 stalled cycles per insn (75.26%) + 1.393332297 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4667) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213600217192 Relative difference = 4.5288254008796884e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd1/check.exe -p 64 256 10 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.838137e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.004758e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.004758e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.689521 sec - 4,555,347,979 cycles # 2.689 GHz - 10,787,640,248 instructions # 2.37 insn per cycle - 1.702819632 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4159) (512y: 233) (512z: 0) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.413122e+00 -Avg ME (F77/C++) = 1.4131213600217192 -Relative difference = 4.5288254008796884e-07 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd1/check.exe -p 64 256 10 OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.077813e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.181685e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.181685e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 2.340568 sec - 4,064,413,524 cycles # 1.733 GHz - 6,073,601,897 instructions # 1.49 insn per cycle - 2.356439472 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1725) (512y: 104) (512z: 3609) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. 
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.413122e+00 -Avg ME (F77/C++) = 1.4131213600217192 -Relative difference = 4.5288254008796884e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt index 53bd28a5bd..65cb87aab5 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt @@ -1,223 +1,181 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasCurand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) +RNDGEN=hasNoCurand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-01-30_05:00:48 +DATE: 2024-01-31_13:55:33 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.507010e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.536029e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.538733e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.531942 sec - 2,193,462,671 cycles # 2.834 GHz - 3,356,973,773 instructions # 1.53 insn per cycle - 0.849346656 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 7.342079e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.525537e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.526983e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 +TOTAL : 0.654104 sec + 1,995,712,276 cycles:u # 2.936 GHz (74.74%) + 2,415,481 stalled-cycles-frontend:u # 0.12% frontend cycles idle (75.62%) + 34,360,604 stalled-cycles-backend:u # 1.72% backend cycles idle (75.39%) + 2,158,157,518 instructions:u # 1.08 insn per cycle + # 0.02 stalled cycles per insn (75.40%) + 0.705404958 seconds time elapsed ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.126743e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.160620e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.162100e+05 ) sec^-1 -MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.043092 sec - 9,489,937,514 cycles # 2.875 GHz - 19,463,317,431 instructions # 2.05 insn per cycle - 3.359518634 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 1.229184e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.236148e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.236208e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.252232e+02 +- 1.234346e+02 ) GeV^-4 +TOTAL : 8.451568 sec + 29,043,491,099 cycles:u # 3.428 GHz (74.98%) + 11,890,412 stalled-cycles-frontend:u # 0.04% frontend cycles idle (75.01%) + 1,124,011,436 stalled-cycles-backend:u # 3.87% backend cycles idle (74.98%) + 22,740,172,566 instructions:u # 0.78 insn per cycle + # 0.05 stalled cycles per insn (75.02%) + 8.512180841 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 6.626675e-04 -Avg ME (F77/CUDA) = 6.6266731198158133E-004 -Relative difference = 2.837296512218831e-07 +Avg ME (F77/CUDA) = 6.6266731198158101E-004 +Relative difference = 2.837296517127185e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe -p 64 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.787937e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.788754e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.788754e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 9.184792 sec - 26,445,376,310 cycles # 2.879 GHz - 81,759,262,253 instructions # 3.09 insn per cycle - 9.200099621 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 2.214670e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.215541e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.215541e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 +TOTAL : 7.412895 sec + 26,034,504,301 cycles:u # 3.502 GHz (74.94%) + 9,000,011 stalled-cycles-frontend:u # 0.03% frontend cycles idle (74.97%) + 3,852,087,436 stalled-cycles-backend:u # 14.80% backend cycles idle (75.02%) + 81,739,960,011 instructions:u # 3.14 insn per cycle + # 0.05 stalled cycles per insn (75.04%) + 7.437382060 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 6614) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141133E-004 Relative difference = 2.8372990776517314e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.595033e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.598347e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.598347e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 4.572055 sec - 12,894,491,420 cycles # 2.818 GHz - 39,242,650,330 instructions # 3.04 insn per cycle - 4.588188651 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 4.984166e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.988749e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.988749e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 +TOTAL : 3.298200 sec + 11,606,582,111 cycles:u # 3.495 GHz (74.97%) + 4,624,697 stalled-cycles-frontend:u # 0.04% frontend cycles idle (74.95%) + 1,735,045,451 stalled-cycles-backend:u # 14.95% backend cycles idle (74.95%) + 39,243,660,592 instructions:u # 3.38 insn per cycle + # 0.04 stalled cycles per insn (74.95%) + 3.324118895 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:12814) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141122E-004 Relative difference = 2.837299079287849e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.988905e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.005063e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.005063e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.062926 sec - 5,559,157,847 cycles # 2.689 GHz - 13,789,744,695 instructions # 2.48 insn per cycle - 2.079268197 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11059) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.191588e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.194143e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.194143e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 +TOTAL : 1.384178 sec + 4,884,932,429 cycles:u # 3.472 GHz (74.98%) + 3,982,190 stalled-cycles-frontend:u # 0.08% frontend cycles idle (74.98%) + 594,600,674 stalled-cycles-backend:u # 12.17% backend cycles idle (74.98%) + 13,805,085,145 instructions:u # 2.83 insn per cycle + # 0.04 stalled cycles per insn (74.99%) + 1.409990161 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11041) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198157309E-004 Relative difference = 2.837296636563793e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe -p 64 256 1 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.113130e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.134504e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.134504e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.809806 sec - 4,899,980,729 cycles # 2.701 GHz - 12,319,200,932 instructions # 2.51 insn per cycle - 1.824526773 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 9762) (512y: 94) (512z: 0) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198157309E-004 -Relative difference = 2.837296636563793e-07 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe -p 64 256 1 OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.926484e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.938620e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.938620e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.377893 sec - 4,078,713,187 cycles # 1.712 GHz - 6,287,612,851 instructions # 1.54 insn per cycle - 2.391138362 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1516) (512y: 94) (512z: 9019) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. 
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198157309E-004 -Relative difference = 2.837296636563793e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_bridge.txt index ba45d149aa..63a4f3691e 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_bridge.txt @@ -1,240 +1,190 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasCurand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) +RNDGEN=hasNoCurand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-01-30_05:49:05 +DATE: 2024-01-31_14:41:30 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.099881e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.447326e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.447326e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.520843 sec - 2,128,361,154 cycles # 2.833 GHz - 3,379,769,914 instructions # 1.59 insn per cycle - 0.811374229 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost -WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 7.314396e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.453907e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.453907e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 +TOTAL : 0.659716 sec + 1,965,200,705 cycles:u # 2.867 GHz (75.35%) + 2,789,644 stalled-cycles-frontend:u # 0.14% frontend cycles idle (75.45%) + 37,818,859 stalled-cycles-backend:u # 1.92% backend cycles idle (75.55%) + 2,155,673,862 instructions:u # 1.10 insn per cycle + # 0.02 stalled cycles per insn (75.18%) + 0.707979793 seconds time elapsed ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.602295e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.096469e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.096469e+05 ) sec^-1 -MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.329754 sec - 10,358,756,104 cycles # 2.872 GHz - 22,944,085,739 instructions # 2.21 insn per cycle - 3.663397648 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 1.208093e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.242810e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.242810e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.252232e+02 +- 1.234346e+02 ) GeV^-4 +TOTAL : 8.553727 sec + 29,338,209,795 cycles:u # 3.410 GHz (75.00%) + 22,527,728 stalled-cycles-frontend:u # 0.08% frontend cycles idle (75.00%) + 1,131,409,563 stalled-cycles-backend:u # 3.86% backend cycles idle (75.00%) + 23,563,180,220 instructions:u # 0.80 insn per cycle + # 0.05 stalled cycles per insn (74.99%) + 8.626260229 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe 
/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 6.626675e-04 -Avg ME (F77/CUDA) = 6.6266731198158133E-004 -Relative difference = 2.837296512218831e-07 +Avg ME (F77/CUDA) = 6.6266731198158101E-004 +Relative difference = 2.837296517127185e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.794771e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.795632e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.795632e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 9.153536 sec - 26,441,951,782 cycles # 2.888 GHz - 81,759,972,796 instructions # 3.09 insn per cycle - 9.158879879 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 2.220774e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.221672e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.221672e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 +TOTAL : 7.396401 sec + 25,960,894,256 cycles:u # 3.499 GHz (74.98%) + 1,712,367 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.98%) + 3,860,397,243 stalled-cycles-backend:u # 14.87% backend cycles idle (74.98%) + 81,732,223,719 instructions:u # 3.15 insn per cycle + # 0.05 stalled cycles per insn (74.99%) + 7.421348797 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 6614) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141133E-004 Relative difference = 2.8372990776517314e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.577595e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.580993e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.580993e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 4.598491 sec - 12,916,287,273 cycles # 2.806 GHz - 39,254,753,938 instructions # 3.04 insn per cycle - 4.603937867 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 5.017866e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.022441e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.022441e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 +TOTAL : 3.280001 sec + 11,548,271,811 cycles:u # 3.496 GHz (74.86%) + 851,534 stalled-cycles-frontend:u # 0.01% frontend cycles idle (75.05%) + 1,666,698,780 stalled-cycles-backend:u # 14.43% backend cycles idle (75.06%) + 39,231,416,956 instructions:u # 3.40 insn per cycle + # 0.04 stalled cycles per insn (75.06%) + 3.306510540 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:12814) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141122E-004 Relative difference = 2.837299079287849e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.852795e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.869019e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.869019e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.103011 sec - 5,568,678,671 cycles # 2.642 GHz - 13,799,771,926 instructions # 2.48 insn per cycle - 2.108561686 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11059) (512y: 0) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. 
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198157309E-004 -Relative difference = 2.837296636563793e-07 -OK (relative difference <= 5E-3) +EvtsPerSec[Rmb+ME] (23) = ( 1.200156e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.202733e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.202733e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 +TOTAL : 1.378042 sec + 4,849,581,934 cycles:u # 3.462 GHz (74.88%) + 1,287,264 stalled-cycles-frontend:u # 0.03% frontend cycles idle (74.88%) + 601,990,040 stalled-cycles-backend:u # 12.41% backend cycles idle (74.88%) + 13,817,429,650 instructions:u # 2.85 insn per cycle + # 0.04 stalled cycles per insn (74.92%) + 1.404318947 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11041) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.035305e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.056800e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.056800e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.829572 sec - 4,921,598,332 cycles # 2.684 GHz - 12,328,469,851 instructions # 2.50 insn per cycle - 1.835230648 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 9762) (512y: 94) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198157309E-004 Relative difference = 2.837296636563793e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.926825e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.939647e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.939647e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.382359 sec - 4,075,002,441 cycles # 1.707 GHz - 6,297,411,526 instructions # 1.55 insn per cycle - 2.387952463 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1516) (512y: 94) (512z: 9019) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. 
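[Editor's note: the "is not supported (no avx512vl in /proc/cpuinfo)" lines above show the new behaviour on this AMD host: instead of running (and crashing in) the 512y/512z binaries, the driver now skips them when the CPU does not advertise the avx512vl flag. A minimal sketch of such a guard, assuming the flag is looked up in /proc/cpuinfo as the message suggests (simplified substring match, not a full flag parser):

// Minimal sketch (assumed logic, not the actual test driver): skip an
// AVX512 build on hosts whose /proc/cpuinfo lacks the avx512vl flag.
#include <fstream>
#include <iostream>
#include <string>

bool hasCpuFlag( const std::string& flag ) // e.g. "avx512vl"
{
  std::ifstream cpuinfo( "/proc/cpuinfo" );
  std::string line;
  while( std::getline( cpuinfo, line ) )
    if( line.rfind( "flags", 0 ) == 0 && line.find( " " + flag ) != std::string::npos )
      return true; // simplified: substring match on the "flags" line
  return false;
}

int main()
{
  if( !hasCpuFlag( "avx512vl" ) )
  {
    std::cout << "check.exe is not supported (no avx512vl in /proc/cpuinfo)" << std::endl;
    return 0; // skip the 512y/512z binaries instead of hitting SIGILL
  }
  // ... run the AVX512 checks ...
  return 0;
}
]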
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198157309E-004 -Relative difference = 2.837296636563793e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_common.txt index 2624aa384f..b374ab3593 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_common.txt @@ -1,223 +1,181 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasCurand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) +RNDGEN=hasNoCurand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-01-30_06:01:23 +DATE: 2024-01-31_14:54:05 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --common OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --common OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.497090e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.524372e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.526818e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.329430e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.511862e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.512447e+04 ) sec^-1 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 0.512897 sec - 2,098,983,374 cycles # 2.834 GHz - 3,277,353,449 instructions # 1.56 insn per cycle - 0.803360908 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --common -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +TOTAL : 0.651783 sec + 1,962,849,327 cycles:u # 2.904 GHz (75.27%) + 2,413,372 stalled-cycles-frontend:u # 0.12% frontend cycles idle (75.16%) + 33,678,909 stalled-cycles-backend:u # 1.72% backend cycles idle (75.16%) + 2,163,070,926 instructions:u # 1.10 insn per cycle + # 0.02 stalled cycles per insn (75.35%) + 0.694638226 seconds time elapsed ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --common OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --common OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.141054e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.174803e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.176256e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.240071e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.242972e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.243032e+05 ) sec^-1 MeanMatrixElemValue = ( 1.252232e+02 +- 1.234346e+02 ) GeV^-4 -TOTAL : 3.134385 sec - 9,742,325,189 cycles # 2.872 GHz - 21,219,396,735 instructions # 2.18 insn per cycle - 3.451782991 seconds time elapsed +TOTAL : 8.410192 sec + 28,921,900,074 cycles:u # 3.425 GHz (75.02%) + 11,771,288 stalled-cycles-frontend:u # 0.04% frontend cycles idle (75.04%) + 1,138,813,727 stalled-cycles-backend:u # 3.94% backend cycles idle (74.98%) + 22,751,204,493 instructions:u # 0.79 insn per cycle + # 0.05 stalled cycles per insn (74.97%) + 8.471415614 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 6.626675e-04 -Avg ME (F77/CUDA) = 6.6266731198158133E-004 -Relative difference = 2.837296512218831e-07 +Avg ME (F77/CUDA) = 6.6266731198158101E-004 +Relative difference = 2.837296517127185e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe -p 64 256 1 --common OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe -p 64 256 1 
--common OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.789322e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.790133e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.790133e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.223034e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.223927e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.223927e+03 ) sec^-1 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 9.178912 sec - 26,467,085,384 cycles # 2.885 GHz - 81,758,395,147 instructions # 3.09 insn per cycle - 9.184185479 seconds time elapsed +TOTAL : 7.385145 sec + 25,940,859,836 cycles:u # 3.502 GHz (74.95%) + 1,545,950 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.94%) + 4,038,405,772 stalled-cycles-backend:u # 15.57% backend cycles idle (74.95%) + 81,778,395,890 instructions:u # 3.15 insn per cycle + # 0.05 stalled cycles per insn (75.00%) + 7.409443065 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 6614) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141133E-004 Relative difference = 2.8372990776517314e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 1 --common OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 1 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.580434e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.583873e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.583873e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.018482e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.023068e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.023068e+03 ) sec^-1 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 4.592041 sec - 12,908,303,532 cycles # 2.809 GHz - 39,241,301,392 instructions # 3.04 insn per cycle - 4.597199751 seconds time elapsed +TOTAL : 3.275778 sec + 11,537,497,698 cycles:u # 3.498 GHz (74.95%) + 829,315 stalled-cycles-frontend:u # 0.01% frontend cycles idle (75.02%) + 1,692,036,026 stalled-cycles-backend:u # 14.67% backend cycles idle (75.02%) + 39,243,269,509 instructions:u # 3.40 insn per cycle + # 0.04 stalled cycles per insn (75.02%) + 3.300021140 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:12814) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141122E-004 Relative difference = 2.837299079287849e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 1 --common OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 1 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.006274e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.022952e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.022952e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.191885e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.194427e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.194427e+04 ) sec^-1 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 2.060575 sec - 5,561,277,799 cycles # 2.694 GHz - 13,787,529,346 instructions # 2.48 insn per cycle - 2.065507699 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11059) (512y: 0) (512z: 0) +TOTAL : 1.383808 sec + 4,888,153,917 cycles:u # 3.477 GHz (74.97%) + 3,032,785 stalled-cycles-frontend:u # 0.06% frontend cycles idle (74.97%) + 617,814,695 stalled-cycles-backend:u # 12.64% backend cycles idle (74.97%) + 13,805,762,990 instructions:u # 2.82 insn per cycle + # 0.04 stalled cycles per insn (74.98%) + 1.407829447 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11041) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198157309E-004 Relative difference = 2.837296636563793e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe -p 64 256 1 --common OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.108001e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.130506e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.130506e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 1.812494 sec - 4,903,037,786 cycles # 2.699 GHz - 12,315,866,756 instructions # 2.51 insn per cycle - 1.817504411 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 9762) (512y: 94) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198157309E-004 -Relative difference = 2.837296636563793e-07 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe -p 64 256 1 --common OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.888313e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.900941e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.900941e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 2.393126 sec - 4,056,497,728 cycles # 1.692 GHz - 6,284,230,028 instructions # 1.55 insn per cycle - 2.398190383 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1516) (512y: 94) (512z: 9019) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. 
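[Editor's note: the "=Symbols in CPPProcess.o=" tallies above count SIMD instructions in the built object, which is how one can verify that e.g. an avx2 build really contains ymm code and no zmm code. A rough sketch of one way such a tally could be approximated (hypothetical helper based on objdump output, not the project's actual counting script):

// Rough sketch (hypothetical, not the project's tally script): count SIMD
// register usage in a disassembly to approximate the
// "=Symbols in CPPProcess.o= (~sse4/avx2/512y/512z)" lines above.
#include <cstdio> // popen/pclose are POSIX, available on Linux
#include <iostream>
#include <string>

int main()
{
  int nxmm = 0, nymm = 0, nzmm = 0;
  FILE* pipe = popen( "objdump -d CPPProcess.o", "r" ); // assumes binutils objdump
  if( !pipe ) return 1;
  char buf[1024];
  while( fgets( buf, sizeof buf, pipe ) )
  {
    const std::string line( buf );
    if( line.find( "%zmm" ) != std::string::npos ) ++nzmm;      // 512-bit AVX512 ("512z")
    else if( line.find( "%ymm" ) != std::string::npos ) ++nymm; // 256-bit AVX2/"512y"
    else if( line.find( "%xmm" ) != std::string::npos ) ++nxmm; // 128-bit SSE4
  }
  pclose( pipe );
  std::cout << "(~sse4: " << nxmm << ") (avx2: " << nymm << ") (512z: " << nzmm << ")" << std::endl;
  return 0;
}
]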
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198157309E-004 -Relative difference = 2.837296636563793e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_curhst.txt index 711141aac6..fd6be47ed8 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_curhst.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_curhst.txt @@ -1,223 +1,143 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasCurand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) +RNDGEN=hasNoCurand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-01-30_05:57:54 +DATE: 2024-01-31_14:50:55 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --curhst OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.493459e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.521487e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.524248e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.509745 sec - 2,103,800,649 cycles # 2.836 GHz - 3,325,789,020 instructions # 1.58 insn per cycle - 0.801087014 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --curhst +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --curhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe: Aborted + 55,467,149 cycles:u # 2.539 GHz (63.41%) + 45,969 stalled-cycles-frontend:u # 0.08% frontend cycles idle (63.41%) + 520,435 stalled-cycles-backend:u # 0.94% backend cycles idle (63.41%) + 43,374,922 instructions:u # 0.78 insn per cycle + # 0.01 stalled cycles per insn (59.26%) + 0.022778455 seconds time elapsed ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --curhst OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --curhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.145753e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.180203e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.181702e+05 ) sec^-1 -MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.079125 sec - 9,600,126,952 cycles # 2.878 GHz - 21,681,876,510 instructions # 2.26 insn per cycle - 3.393235673 seconds time elapsed +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe: Aborted + 50,274,667 cycles:u # 2.342 GHz (62.76%) + 46,338 stalled-cycles-frontend:u # 0.09% frontend cycles idle (62.77%) + 274,830 stalled-cycles-backend:u # 0.55% backend cycles idle (62.77%) + 44,684,416 instructions:u # 0.89 insn per cycle + # 0.01 stalled cycles per insn (64.88%) + 0.022435061 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 6.626675e-04 -Avg ME (F77/CUDA) = 6.6266731198158133E-004 -Relative difference = 2.837296512218831e-07 +Avg ME (F77/CUDA) = 6.6266731198158101E-004 +Relative difference = 2.837296517127185e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.797758e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.798583e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.798583e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 9.135643 sec - 26,454,582,305 cycles # 2.896 GHz - 81,754,058,548 instructions # 3.09 insn per cycle - 9.140745485 seconds time elapsed +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe: Aborted + 56,504,472 cycles:u # 2.636 GHz (62.71%) + 39,699 stalled-cycles-frontend:u # 0.07% frontend cycles idle (62.72%) + 638,001 stalled-cycles-backend:u # 1.13% backend cycles idle (62.72%) + 42,217,033 instructions:u # 0.75 insn per cycle + # 0.02 stalled cycles per insn (59.30%) + 0.022705545 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 6614) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141133E-004 Relative difference = 2.8372990776517314e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.597207e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.600539e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.600539e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 4.569387 sec - 12,892,653,048 cycles # 2.819 GHz - 39,241,760,724 instructions # 3.04 insn per cycle - 4.574378716 seconds time elapsed +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe: Aborted + 55,692,026 cycles:u # 2.595 GHz (62.76%) + 42,533 stalled-cycles-frontend:u # 0.08% frontend cycles idle (62.76%) + 538,603 stalled-cycles-backend:u # 0.97% backend cycles idle (62.76%) + 39,503,692 instructions:u # 0.71 insn per cycle + # 0.01 stalled cycles per insn (64.30%) + 0.022723669 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:12814) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141122E-004 Relative difference = 2.837299079287849e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.978625e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.995447e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.995447e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.065609 sec - 5,559,302,417 cycles # 2.687 GHz - 13,789,202,442 instructions # 2.48 insn per cycle - 2.071000161 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11059) (512y: 0) (512z: 0) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe: Aborted + 53,931,580 cycles:u # 2.499 GHz (62.96%) + 45,686 stalled-cycles-frontend:u # 0.08% frontend cycles idle (62.96%) + 607,688 stalled-cycles-backend:u # 1.13% backend cycles idle (62.96%) + 40,872,369 instructions:u # 0.76 insn per cycle + # 0.01 stalled cycles per insn (64.59%) + 0.022908456 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11041) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198157309E-004 Relative difference = 2.837296636563793e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.097696e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.119952e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.119952e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.812868 sec - 4,896,837,509 cycles # 2.695 GHz - 12,317,770,581 instructions # 2.52 insn per cycle - 1.818257681 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 9762) (512y: 94) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198157309E-004 -Relative difference = 2.837296636563793e-07 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.967466e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.979997e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.979997e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.364567 sec - 4,060,623,360 cycles # 1.715 GHz - 6,286,167,500 instructions # 1.55 insn per cycle - 2.369629620 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1516) (512y: 94) (512z: 9019) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. 
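[Editor's note: the Aborted "--curhst" runs above are expected on this platform: curand is a CUDA-only library, so a HIP build on AMD GPUs has no curand-based random-number source, and the build preamble accordingly reports RNDGEN=hasNoCurand. An illustrative sketch of a clean refusal at the option-parsing level (the MGONGPU_HAS_NO_CURAND macro is an assumption modelled on the hasNoCurand build tag):

// Illustrative sketch (assumed macro and behaviour): refuse a curand-based
// option cleanly when the binary was built without curand, e.g. for HIP.
#include <cstring>
#include <iostream>

int main( int argc, char** argv )
{
  bool curhst = false;
  for( int i = 1; i < argc; ++i )
    if( std::strcmp( argv[i], "--curhst" ) == 0 ) curhst = true;
#ifdef MGONGPU_HAS_NO_CURAND // assumed macro, modelled on RNDGEN=hasNoCurand
  if( curhst )
  {
    std::cerr << "ERROR: this build has no curand, --curhst is not supported" << std::endl;
    return 1; // refuse cleanly instead of aborting at runtime
  }
#endif
  (void)curhst; // silence unused warning in curand-enabled builds
  // ... otherwise proceed with the selected random-number source ...
  return 0;
}
]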
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198157309E-004 -Relative difference = 2.837296636563793e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_rmbhst.txt index de6151d7b3..d2d9dea879 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_rmbhst.txt @@ -1,226 +1,181 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasCurand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) +RNDGEN=hasNoCurand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-01-30_05:54:28 +DATE: 2024-01-31_14:48:00 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --rmbhst OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.181803e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.496640e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.499302e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.513215 sec - 2,110,243,378 cycles # 2.841 GHz - 3,364,158,559 instructions # 1.59 insn per cycle - 0.803846009 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --rmbhst -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 7.265352e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.394013e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.395125e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 +TOTAL : 0.666267 sec + 1,997,556,313 cycles:u # 2.914 GHz (75.49%) + 2,860,510 stalled-cycles-frontend:u # 0.14% frontend cycles idle (75.50%) + 33,947,641 stalled-cycles-backend:u # 1.70% backend cycles idle (75.50%) + 2,182,905,169 instructions:u # 1.09 insn per cycle + # 0.02 stalled cycles per insn (75.43%) + 0.710655270 seconds time elapsed ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --rmbhst OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.724341e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.176977e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.178501e+05 ) sec^-1 -MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.211617 sec - 9,930,008,722 cycles # 2.863 GHz - 21,629,593,771 instructions # 2.18 insn per cycle - 3.536993543 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 1.213332e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.245422e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.245484e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.252232e+02 +- 1.234346e+02 ) GeV^-4 +TOTAL : 8.499679 sec + 29,303,278,877 cycles:u # 3.431 GHz (74.92%) + 22,905,012 stalled-cycles-frontend:u # 0.08% frontend cycles idle (74.97%) + 1,135,894,583 stalled-cycles-backend:u # 3.88% backend cycles idle (75.00%) + 23,523,863,711 instructions:u # 0.80 insn per cycle + # 0.05 stalled cycles per insn (75.01%) + 8.558405499 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 6.626675e-04 -Avg ME (F77/CUDA) = 6.6266731198158133E-004 -Relative difference = 2.837296512218831e-07 +Avg ME (F77/CUDA) = 6.6266731198158101E-004 +Relative difference = 2.837296517127185e-07 OK (relative difference <= 5E-3) 
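[Editor's note: each cmpExe block above accepts the run if the average matrix elements from the C++/CUDA/HIP side and the Fortran (F77) bridge side agree within a relative tolerance of 5E-3. A minimal sketch of that acceptance test, using the numbers from the block above:

// Minimal sketch of the acceptance criterion printed above:
// compare two average MEs and require |a-b|/|a| <= 5E-3.
#include <cmath>
#include <cstdio>

int main()
{
  const double avgCpp = 6.626675e-04;           // Avg ME (C++/CUDA)
  const double avgF77 = 6.6266731198158101e-04; // Avg ME (F77/CUDA)
  const double reldif = std::fabs( avgCpp - avgF77 ) / std::fabs( avgCpp );
  std::printf( "Relative difference = %g\n", reldif ); // ~2.84e-07 here
  std::printf( reldif <= 5e-3 ? "OK (relative difference <= 5E-3)\n"
                              : "ERROR (relative difference > 5E-3)\n" );
  return 0;
}
]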
========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.795958e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.796814e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.796814e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 9.141774 sec - 26,442,082,623 cycles # 2.892 GHz - 81,755,899,902 instructions # 3.09 insn per cycle - 9.146895276 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 2.216718e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.217614e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.217614e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 +TOTAL : 7.406194 sec + 26,035,489,997 cycles:u # 3.505 GHz (74.96%) + 1,701,351 stalled-cycles-frontend:u # 0.01% frontend cycles idle (75.00%) + 3,983,610,228 stalled-cycles-backend:u # 15.30% backend cycles idle (75.02%) + 81,732,369,026 instructions:u # 3.14 insn per cycle + # 0.05 stalled cycles per insn (75.02%) + 7.430419339 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 6614) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141133E-004 Relative difference = 2.8372990776517314e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.584252e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.587667e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.587667e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 4.585664 sec - 12,903,354,074 cycles # 2.812 GHz - 39,243,037,589 instructions # 3.04 insn per cycle - 4.591083081 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 5.018118e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.022700e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.022700e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 +TOTAL : 3.276016 sec + 11,535,823,383 cycles:u # 3.497 GHz (74.95%) + 846,680 stalled-cycles-frontend:u # 0.01% frontend cycles idle (75.02%) + 1,656,392,067 stalled-cycles-backend:u # 14.36% backend cycles idle (75.02%) + 39,237,814,528 instructions:u # 3.40 insn per cycle + # 0.04 stalled cycles per insn (75.02%) + 3.300280855 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:12814) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141122E-004 Relative difference = 2.837299079287849e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.993074e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.009513e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.009513e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.061753 sec - 5,556,410,491 cycles # 2.690 GHz - 13,788,754,708 instructions # 2.48 insn per cycle - 2.066810636 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11059) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.202443e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.205027e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.205027e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 +TOTAL : 1.371746 sec + 4,849,521,512 cycles:u # 3.479 GHz (74.75%) + 702,228 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.75%) + 579,718,051 stalled-cycles-backend:u # 11.95% backend cycles idle (74.79%) + 13,837,586,039 instructions:u # 2.85 insn per cycle + # 0.04 stalled cycles per insn (74.95%) + 1.395711173 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11041) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198157309E-004 Relative difference = 2.837296636563793e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.089775e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.111272e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.111272e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.814284 sec - 4,898,229,262 cycles # 2.694 GHz - 12,317,871,193 instructions # 2.51 insn per cycle - 1.819291757 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 9762) (512y: 94) (512z: 0) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198157309E-004 -Relative difference = 2.837296636563793e-07 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.893591e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.906421e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.906421e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.392306 sec - 4,056,818,337 cycles # 1.695 GHz - 6,287,135,022 instructions # 1.55 insn per cycle - 2.397424437 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1516) (512y: 94) (512z: 9019) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. 
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198157309E-004 -Relative difference = 2.837296636563793e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd1.txt index ce8b9bfd9b..5b91a0822e 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd1.txt @@ -1,223 +1,181 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasCurand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) +RNDGEN=hasNoCurand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.512y_d_inl0_hrd1' +CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-01-30_05:01:27 +DATE: 2024-01-31_13:56:10 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/gcheck.exe -p 64 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/gcheck.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.464704e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.493618e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.496312e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.530379 sec - 2,191,459,528 cycles # 2.836 GHz - 3,378,194,635 instructions # 1.54 insn per cycle - 0.862349447 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/gcheck.exe -p 64 256 1 -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 1.385794e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.447460e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.448035e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 +TOTAL : 0.530879 sec + 1,589,166,725 cycles:u # 2.860 GHz (73.97%) + 2,465,598 stalled-cycles-frontend:u # 0.16% frontend cycles idle (75.37%) + 33,426,297 stalled-cycles-backend:u # 2.10% backend cycles idle (75.53%) + 1,838,676,697 instructions:u # 1.16 insn per cycle + # 0.02 stalled cycles per insn (75.54%) + 0.578463301 seconds time elapsed ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.136041e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.170363e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.171805e+05 ) sec^-1 -MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.033894 sec - 9,468,330,012 cycles # 2.874 GHz - 21,262,061,450 instructions # 2.25 insn per cycle - 3.350179318 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 1.741198e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.747669e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.747790e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.252232e+02 +- 1.234346e+02 ) GeV^-4 +TOTAL : 7.025241 sec + 24,087,705,387 cycles:u # 3.411 GHz (74.98%) + 11,687,983 stalled-cycles-frontend:u # 0.05% frontend cycles idle (74.97%) + 1,123,050,418 stalled-cycles-backend:u # 4.66% backend cycles idle (74.97%) + 18,923,158,094 instructions:u # 0.79 insn per cycle + # 0.06 stalled cycles per insn (75.02%) + 7.086487861 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/fgcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 6.626675e-04 -Avg ME (F77/CUDA) = 6.6266731198158133E-004 -Relative difference = 2.837296512218831e-07 +Avg ME (F77/CUDA) = 6.6266731198158101E-004 +Relative difference = 2.837296517127185e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/check.exe -p 64 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/check.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.798336e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.799213e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.799213e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 9.130694 sec - 26,439,863,153 cycles # 2.895 GHz - 81,781,637,155 instructions # 3.09 insn per cycle - 9.163718345 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 2.211410e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.212300e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.212300e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 +TOTAL : 7.423786 sec + 26,074,419,907 cycles:u # 3.502 GHz (74.97%) + 10,403,579 stalled-cycles-frontend:u # 0.04% frontend cycles idle (74.97%) + 3,413,804,917 stalled-cycles-backend:u # 13.09% backend cycles idle (74.97%) + 81,804,879,740 instructions:u # 3.14 insn per cycle + # 0.04 stalled cycles per insn (74.94%) + 7.448423564 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 6589) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141133E-004 Relative difference = 2.8372990776517314e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd1/check.exe -p 64 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd1/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.559639e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.562995e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.562995e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 4.616847 sec - 12,919,257,236 cycles # 2.796 GHz - 39,249,733,665 instructions # 3.04 insn per cycle - 4.636578065 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 5.002702e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.007290e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.007290e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 +TOTAL : 3.285875 sec + 11,567,697,257 cycles:u # 3.496 GHz (74.88%) + 1,070,210 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.87%) + 1,512,067,030 stalled-cycles-backend:u # 13.07% backend cycles idle (74.93%) + 39,276,843,409 instructions:u # 3.40 insn per cycle + # 0.04 stalled cycles per insn (75.04%) + 3.312043369 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:12771) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141122E-004 Relative difference = 2.837299079287849e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/check.exe -p 64 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.030089e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.046612e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.046612e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.052281 sec - 5,556,604,473 cycles # 2.701 GHz - 13,805,088,947 instructions # 2.48 insn per cycle - 2.071717259 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11048) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.190584e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.193150e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.193150e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 +TOTAL : 1.385155 sec + 4,903,114,933 cycles:u # 3.483 GHz (75.00%) + 1,622,594 stalled-cycles-frontend:u # 0.03% frontend cycles idle (75.00%) + 598,423,933 stalled-cycles-backend:u # 12.20% backend cycles idle (75.00%) + 13,813,010,627 instructions:u # 2.82 insn per cycle + # 0.04 stalled cycles per insn (75.00%) + 1.410944151 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11030) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198157309E-004 Relative difference = 2.837296636563793e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd1/check.exe -p 64 256 1 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.135265e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.157006e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.157006e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.804981 sec - 4,885,090,375 cycles # 2.700 GHz - 12,330,030,988 instructions # 2.52 insn per cycle - 1.821790981 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 9736) (512y: 94) (512z: 0) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198157309E-004 -Relative difference = 2.837296636563793e-07 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd1/check.exe -p 64 256 1 OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.917661e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.930225e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.930225e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.381269 sec - 4,053,625,505 cycles # 1.699 GHz - 6,293,972,632 instructions # 1.55 insn per cycle - 2.398074513 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1497) (512y: 94) (512z: 9019) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. 
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198157309E-004 -Relative difference = 2.837296636563793e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd0.txt index 466f11943e..a9fbe8bd9d 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd0.txt @@ -1,223 +1,181 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasCurand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) +RNDGEN=hasNoCurand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.512y_d_inl1_hrd0' +CUDACPP_BUILDDIR='build.avx2_d_inl1_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512y_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-01-30_05:37:36 +DATE: 2024-01-31_14:22:23 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/gcheck.exe -p 64 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/gcheck.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.224805e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.249067e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.252384e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.539977 sec - 2,169,108,553 cycles # 2.827 GHz - 3,309,870,321 instructions # 1.53 insn per cycle - 0.827427226 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/gcheck.exe -p 64 256 1 -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 7.316764e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.519436e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.520510e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 +TOTAL : 0.655608 sec + 1,965,790,092 cycles:u # 2.900 GHz (75.21%) + 2,480,952 stalled-cycles-frontend:u # 0.13% frontend cycles idle (75.29%) + 33,557,267 stalled-cycles-backend:u # 1.71% backend cycles idle (75.04%) + 2,195,486,875 instructions:u # 1.12 insn per cycle + # 0.02 stalled cycles per insn (74.43%) + 0.703005371 seconds time elapsed ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/gcheck.exe -p 2048 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/gcheck.exe -p 2048 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.771192e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.799798e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.801021e+05 ) sec^-1 -MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.309705 sec - 10,258,793,873 cycles # 2.876 GHz - 23,623,503,831 instructions # 2.30 insn per cycle - 3.624842760 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 1.236213e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.239204e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.239260e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.252232e+02 +- 1.234346e+02 ) GeV^-4 +TOTAL : 8.496923 sec + 29,197,774,810 cycles:u # 3.429 GHz (75.02%) + 11,878,663 stalled-cycles-frontend:u # 0.04% frontend cycles idle (75.00%) + 1,389,627,915 stalled-cycles-backend:u # 4.76% backend cycles idle (74.98%) + 22,780,351,175 instructions:u # 0.78 insn per cycle + # 0.06 stalled cycles per insn (75.00%) + 8.558710646 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/fgcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 6.626675e-04 -Avg ME (F77/CUDA) = 6.6266731198158122E-004 -Relative difference = 2.837296513854949e-07 +Avg ME (F77/CUDA) = 6.6266731198158101E-004 +Relative difference = 2.837296517127185e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/check.exe -p 64 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/check.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.186471e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.186937e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.186937e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 39.185516 sec - 112,945,518,025 cycles # 2.882 GHz - 141,519,786,794 instructions # 1.25 insn per cycle - 39.190901211 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:21365) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 4.563607e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.563978e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.563978e+02 ) sec^-1 +MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 +TOTAL : 35.944813 sec + 126,032,172,389 cycles:u # 3.504 GHz (74.99%) + 88,793,755 stalled-cycles-frontend:u # 0.07% frontend cycles idle (75.00%) + 18,058,658,215 stalled-cycles-backend:u # 14.33% backend cycles idle (75.00%) + 141,495,624,349 instructions:u # 1.12 insn per cycle + # 0.13 stalled cycles per insn (75.00%) + 35.969446771 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:21543) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198140461E-004 Relative difference = 2.8372991790910424e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/check.exe -p 64 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.072790e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.075243e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.075243e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 5.347436 sec - 14,950,247,924 cycles # 2.794 GHz - 37,533,141,644 instructions # 2.51 insn per cycle - 5.352716029 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 3.578797e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.581112e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.581112e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 +TOTAL : 4.590582 sec + 16,141,671,363 cycles:u # 3.499 GHz (74.96%) + 6,645,999 stalled-cycles-frontend:u # 0.04% frontend cycles idle (75.03%) + 5,778,102,062 stalled-cycles-backend:u # 35.80% backend cycles idle (75.03%) + 37,534,757,291 instructions:u # 2.33 insn per cycle + # 0.15 stalled cycles per insn (75.03%) + 4.616614862 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:68052) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141220E-004 Relative difference = 2.837299064562788e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/check.exe -p 64 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.349404e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.363561e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.363561e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.242056 sec - 6,032,020,393 cycles # 2.685 GHz - 12,947,712,227 instructions # 2.15 insn per cycle - 2.247452761 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:46593) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 7.354714e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.364357e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.364357e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 +TOTAL : 2.237879 sec + 7,889,725,427 cycles:u # 3.490 GHz (74.91%) + 9,840,862 stalled-cycles-frontend:u # 0.12% frontend cycles idle (74.88%) + 4,384,220,733 stalled-cycles-backend:u # 55.57% backend cycles idle (74.88%) + 12,967,933,363 instructions:u # 1.64 insn per cycle + # 0.34 stalled cycles per insn (74.90%) + 2.263613144 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:46575) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198156778E-004 Relative difference = 2.837296716733571e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd0/check.exe -p 64 256 1 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.895381e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.916043e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.916043e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.857617 sec - 4,999,907,297 cycles # 2.689 GHz - 11,364,404,504 instructions # 2.27 insn per cycle - 1.863061758 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:40158) (512y: 279) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd0/runTest.exe -[ PASSED ] 6 tests. +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198156778E-004 -Relative difference = 2.837296716733571e-07 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd0/check.exe -p 64 256 1 OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.220172e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.234094e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.234094e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.282224 sec - 3,899,980,695 cycles # 1.706 GHz - 5,854,430,419 instructions # 1.50 insn per cycle - 2.287473513 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2112) (512y: 142) (512z:39211) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd0/runTest.exe -[ PASSED ] 6 tests. 
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198156789E-004 -Relative difference = 2.837296715097453e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd1.txt index 5156a1b6a3..06761d6418 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd1.txt @@ -1,223 +1,181 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasCurand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) +RNDGEN=hasNoCurand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.512y_d_inl1_hrd1' +CUDACPP_BUILDDIR='build.avx2_d_inl1_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512y_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-01-30_05:38:48 +DATE: 2024-01-31_14:23:33 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/gcheck.exe -p 64 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/gcheck.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.248555e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.273233e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.276174e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.534739 sec - 2,168,342,582 cycles # 2.838 GHz - 3,393,710,794 instructions # 1.57 insn per cycle - 0.822006178 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/gcheck.exe -p 64 256 1 -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 1.356237e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.417719e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.417952e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 +TOTAL : 0.536538 sec + 1,571,606,709 cycles:u # 2.804 GHz (74.36%) + 2,305,832 stalled-cycles-frontend:u # 0.15% frontend cycles idle (75.60%) + 38,025,331 stalled-cycles-backend:u # 2.42% backend cycles idle (75.67%) + 1,821,956,477 instructions:u # 1.16 insn per cycle + # 0.02 stalled cycles per insn (75.57%) + 0.583015065 seconds time elapsed ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/gcheck.exe -p 2048 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/gcheck.exe -p 2048 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.787191e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.816239e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.817488e+05 ) sec^-1 -MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.282278 sec - 10,172,932,501 cycles # 2.876 GHz - 20,641,658,708 instructions # 2.03 insn per cycle - 3.596374566 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 1.740825e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.746480e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.746592e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.252232e+02 +- 1.234346e+02 ) GeV^-4 +TOTAL : 7.028717 sec + 24,122,519,384 cycles:u # 3.413 GHz (75.01%) + 11,778,428 stalled-cycles-frontend:u # 0.05% frontend cycles idle (75.04%) + 1,137,368,186 stalled-cycles-backend:u # 4.71% backend cycles idle (75.00%) + 19,029,939,825 instructions:u # 0.79 insn per cycle + # 0.06 stalled cycles per insn (74.99%) + 7.090955914 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/fgcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 6.626675e-04 -Avg ME (F77/CUDA) = 6.6266731198158122E-004 -Relative difference = 2.837296513854949e-07 +Avg ME (F77/CUDA) = 6.6266731198158101E-004 +Relative difference = 2.837296517127185e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/check.exe -p 64 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/check.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.152053e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.152498e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.152498e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 39.509903 sec - 113,989,864,763 cycles # 2.886 GHz - 141,709,117,860 instructions # 1.24 insn per cycle - 39.515181315 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:21615) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 4.559718e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.560092e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.560092e+02 ) sec^-1 +MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 +TOTAL : 35.974834 sec + 126,130,590,034 cycles:u # 3.504 GHz (75.00%) + 24,529,804 stalled-cycles-frontend:u # 0.02% frontend cycles idle (75.00%) + 19,190,115,318 stalled-cycles-backend:u # 15.21% backend cycles idle (75.00%) + 141,679,486,407 instructions:u # 1.12 insn per cycle + # 0.14 stalled cycles per insn (75.00%) + 35.999556566 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:21831) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198140461E-004 Relative difference = 2.8372991790910424e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/check.exe -p 64 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.077703e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.080226e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.080226e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 5.341972 sec - 14,900,472,017 cycles # 2.788 GHz - 37,594,155,695 instructions # 2.52 insn per cycle - 5.347186768 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 3.638494e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.640842e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.640842e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 +TOTAL : 4.515199 sec + 15,865,964,729 cycles:u # 3.497 GHz (74.97%) + 4,260,249 stalled-cycles-frontend:u # 0.03% frontend cycles idle (74.97%) + 7,290,072,550 stalled-cycles-backend:u # 45.95% backend cycles idle (74.97%) + 37,630,400,088 instructions:u # 2.37 insn per cycle + # 0.19 stalled cycles per insn (74.90%) + 4.541354971 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:68056) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141220E-004 Relative difference = 2.837299064562788e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/check.exe -p 64 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.479123e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.493428e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.493428e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.203003 sec - 5,937,038,542 cycles # 2.690 GHz - 12,831,821,287 instructions # 2.16 insn per cycle - 2.208347742 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:45663) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 7.729319e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.740074e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.740074e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 +TOTAL : 2.129602 sec + 7,509,265,676 cycles:u # 3.489 GHz (74.86%) + 764,819 stalled-cycles-frontend:u # 0.01% frontend cycles idle (75.02%) + 4,248,991,619 stalled-cycles-backend:u # 56.58% backend cycles idle (75.10%) + 12,850,957,521 instructions:u # 1.71 insn per cycle + # 0.33 stalled cycles per insn (75.10%) + 2.155477884 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:45645) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198156778E-004 Relative difference = 2.837296716733571e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd1/check.exe -p 64 256 1 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.959391e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.980227e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.980227e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.840604 sec - 4,989,362,539 cycles # 2.704 GHz - 11,359,801,014 instructions # 2.28 insn per cycle - 1.846082122 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:39855) (512y: 212) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd1/runTest.exe -[ PASSED ] 6 tests. +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198156778E-004 -Relative difference = 2.837296716733571e-07 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd1/check.exe -p 64 256 1 OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.264695e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.278525e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.278525e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.267781 sec - 3,893,427,498 cycles # 1.714 GHz - 5,843,815,532 instructions # 1.50 insn per cycle - 2.273034135 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1687) (512y: 116) (512z:38946) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd1/runTest.exe -[ PASSED ] 6 tests. 
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198156789E-004 -Relative difference = 2.837296715097453e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt index aecab864cd..096085a906 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt @@ -1,223 +1,181 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasCurand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) +RNDGEN=hasNoCurand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-01-30_05:02:06 +DATE: 2024-01-31_13:56:44 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.329622e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.381296e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.387810e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4 -TOTAL : 0.486367 sec - 1,996,254,093 cycles # 2.831 GHz - 2,951,017,935 instructions # 1.48 insn per cycle - 0.792595596 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 2.416294e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.687411e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.688288e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.202247e-01 +- 3.251485e-01 ) GeV^-4 +TOTAL : 0.442494 sec + 1,237,425,658 cycles:u # 2.645 GHz (74.37%) + 2,811,171 stalled-cycles-frontend:u # 0.23% frontend cycles idle (75.38%) + 45,430,504 stalled-cycles-backend:u # 3.67% backend cycles idle (75.91%) + 1,560,108,059 instructions:u # 1.26 insn per cycle + # 0.03 stalled cycles per insn (75.76%) + 0.491868540 seconds time elapsed ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.619469e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.695026e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.698446e+05 ) sec^-1 -MeanMatrixElemValue = ( 6.664703e+00 +- 5.072736e+00 ) GeV^-4 -TOTAL : 1.718352 sec - 5,604,348,056 cycles # 2.870 GHz - 11,484,891,091 instructions # 2.05 insn per cycle - 2.010002941 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 4.684493e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.714250e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.714674e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.213664e+02 +- 1.195366e+02 ) GeV^-4 +TOTAL : 3.319944 sec + 11,188,825,919 cycles:u # 3.338 GHz (74.87%) + 27,886,740 stalled-cycles-frontend:u # 0.25% frontend cycles idle (74.92%) + 1,148,242,085 stalled-cycles-backend:u # 10.26% backend cycles idle (74.94%) + 9,094,703,062 instructions:u # 0.81 insn per cycle + # 0.13 stalled cycles per insn (74.93%) + 3.374531897 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 6.626454e-04 -Avg ME (F77/CUDA) = 6.6262659968156085E-004 -Relative difference = 2.8371612387547027e-05 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 6.626791e-04 +Avg ME (F77/CUDA) = 6.6270899361878938E-004 +Relative difference = 4.511024836808726e-05 OK (relative difference <= 5E-3) ========================================================================= 
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe -p 64 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.963446e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.964435e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.964435e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4 -TOTAL : 8.362127 sec - 24,202,873,915 cycles # 2.893 GHz - 75,878,244,924 instructions # 3.14 insn per cycle - 8.372784572 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 2.452205e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.453244e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.453244e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.208458e-01 +- 3.253446e-01 ) GeV^-4 +TOTAL : 6.693914 sec + 23,513,432,599 cycles:u # 3.501 GHz (74.99%) + 1,337,526 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.99%) + 2,827,462,762 stalled-cycles-backend:u # 12.02% backend cycles idle (74.99%) + 75,882,073,474 instructions:u # 3.23 insn per cycle + # 0.04 stalled cycles per insn (74.99%) + 6.718000010 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 3898) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627487e-04 -Avg ME (F77/C++) = 6.6274870439686495E-004 -Relative difference = 6.634286759220428e-09 +Avg ME (F77/C++) = 6.6274866115424713E-004 +Relative difference = 5.861309557415831e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.122204e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.135618e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.135618e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.060119e+00 +- 2.367901e+00 ) GeV^-4 -TOTAL : 2.311249 sec - 6,498,315,380 cycles # 2.806 GHz - 20,115,878,445 instructions # 3.10 insn per cycle - 2.327706318 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 9.889595e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.906996e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.906996e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.208459e-01 +- 3.253446e-01 ) GeV^-4 +TOTAL : 1.664449 sec + 5,885,960,259 cycles:u # 3.489 GHz (74.92%) + 775,103 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.87%) + 868,726,269 stalled-cycles-backend:u # 14.76% backend cycles idle (74.87%) + 20,132,161,373 instructions:u # 3.42 insn per cycle + # 0.04 stalled cycles per insn (74.87%) + 1.690018521 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:13237) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627485e-04 -Avg ME (F77/C++) = 6.6274853360924479E-004 -Relative difference = 5.071191384964548e-08 +Avg ME (F77/C++) = 6.6274845946848876E-004 +Relative difference = 6.115670001294808e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.585863e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.592266e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.592266e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 1.042848 sec - 2,820,748,390 cycles # 2.693 GHz - 7,038,277,049 instructions # 2.50 insn per cycle - 1.060611053 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11604) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.336602e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.346692e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.346692e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.214980e-01 +- 3.255523e-01 ) GeV^-4 +TOTAL : 0.708050 sec + 2,530,864,472 cycles:u # 3.463 GHz (74.93%) + 722,207 stalled-cycles-frontend:u # 0.03% frontend cycles idle (74.83%) + 253,709,095 stalled-cycles-backend:u # 10.02% backend cycles idle (74.83%) + 7,056,928,449 instructions:u # 2.79 insn per cycle + # 0.04 stalled cycles per insn (74.83%) + 0.734034664 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11586) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627193e-04 -Avg ME (F77/C++) = 6.6271927529261421E-004 -Relative difference = 3.728182620967159e-08 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe -p 64 256 1 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.805764e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.814413e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.814413e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 0.916917 sec - 2,479,527,909 cycles # 2.691 GHz - 6,280,728,930 instructions # 2.53 insn per cycle - 0.937569165 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10320) (512y: 50) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627193e-04 -Avg ME (F77/C++) = 6.6271927529261421E-004 -Relative difference = 3.728182620967159e-08 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.627195e-04 +Avg ME (F77/C++) = 6.6271947045332125E-004 +Relative difference = 4.4583988847766445e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe -p 64 256 1 OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.395801e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.400853e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.400853e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4 -TOTAL : 1.183787 sec - 2,037,112,677 cycles # 1.714 GHz - 3,249,000,234 instructions # 1.59 insn per cycle - 1.203517458 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2165) (512y: 48) (512z: 9219) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. 
+/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627195e-04 -Avg ME (F77/C++) = 6.6271952818273971E-004 -Relative difference = 4.252589469696448e-08 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_bridge.txt index cfd5bd9f60..8812663b4a 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_bridge.txt @@ -1,240 +1,190 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasCurand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) +RNDGEN=hasNoCurand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-01-30_05:49:44 +DATE: 2024-01-31_14:42:06 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) WARNING! 
Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.575134e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.304295e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.304295e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.048178e+00 +- 2.364571e+00 ) GeV^-4 -TOTAL : 0.470892 sec - 1,938,590,562 cycles # 2.832 GHz - 2,932,139,577 instructions # 1.51 insn per cycle - 0.742517096 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 2.511327e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.675698e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.675698e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.202335e-01 +- 3.251521e-01 ) GeV^-4 +TOTAL : 0.445455 sec + 1,282,528,041 cycles:u # 2.719 GHz (74.08%) + 3,250,260 stalled-cycles-frontend:u # 0.25% frontend cycles idle (74.46%) + 33,737,049 stalled-cycles-backend:u # 2.63% backend cycles idle (74.58%) + 1,622,372,256 instructions:u # 1.26 insn per cycle + # 0.02 stalled cycles per insn (75.52%) + 0.495132750 seconds time elapsed ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.189558e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.483327e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.483327e+05 ) sec^-1 -MeanMatrixElemValue = ( 6.641710e+00 +- 4.994249e+00 ) GeV^-4 -TOTAL : 1.911946 sec - 6,179,624,048 cycles # 2.874 GHz - 12,701,880,125 instructions # 2.06 insn per cycle - 2.209063416 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 4.253302e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.695076e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.695076e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.213799e+02 +- 1.195366e+02 ) GeV^-4 +TOTAL : 3.453691 sec + 11,542,236,235 cycles:u # 3.309 GHz (75.01%) + 38,052,369 stalled-cycles-frontend:u # 0.33% frontend cycles idle (75.03%) + 1,137,356,098 stalled-cycles-backend:u # 9.85% backend cycles idle (75.03%) + 9,943,319,874 instructions:u # 0.86 insn per cycle + # 0.11 stalled cycles per insn (74.97%) + 3.511572406 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 6.626454e-04 -Avg ME (F77/CUDA) = 6.6262659968156085E-004 -Relative difference = 2.8371612387547027e-05 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 6.626791e-04 +Avg ME (F77/CUDA) = 6.6270899361878938E-004 +Relative difference = 4.511024836808726e-05 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.966573e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.967552e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.967552e+03 ) sec^-1
-MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4
-TOTAL : 8.353267 sec
- 24,210,307,332 cycles # 2.898 GHz
- 75,882,231,103 instructions # 3.13 insn per cycle
- 8.358202878 seconds time elapsed
+EvtsPerSec[Rmb+ME] (23) = ( 2.451008e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.452057e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.452057e+03 ) sec^-1
+MeanMatrixElemValue = ( 4.208458e-01 +- 3.253446e-01 ) GeV^-4
+TOTAL : 6.699230 sec
+ 23,539,412,014 cycles:u # 3.502 GHz (74.98%)
+ 1,325,927 stalled-cycles-frontend:u # 0.01% frontend cycles idle (75.01%)
+ 2,764,954,908 stalled-cycles-backend:u # 11.75% backend cycles idle (75.01%)
+ 75,882,013,709 instructions:u # 3.22 insn per cycle
+ # 0.04 stalled cycles per insn (75.01%)
+ 6.723781000 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 3898) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest.exe
+runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest.exe
 [ PASSED ] 6 tests.
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2
 Avg ME (C++/C++) = 6.627487e-04
-Avg ME (F77/C++) = 6.6274870439686495E-004
-Relative difference = 6.634286759220428e-09
+Avg ME (F77/C++) = 6.6274866115424713E-004
+Relative difference = 5.861309557415831e-08
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP=
+runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 WARNING! Instantiate host Bridge (nevt=16384)
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 7.010932e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.023878e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.023878e+03 ) sec^-1
-MeanMatrixElemValue = ( 4.060119e+00 +- 2.367901e+00 ) GeV^-4
-TOTAL : 2.350780 sec
- 6,507,988,967 cycles # 2.764 GHz
- 20,124,211,431 instructions # 3.09 insn per cycle
- 2.355993372 seconds time elapsed
+EvtsPerSec[Rmb+ME] (23) = ( 9.884448e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 9.902255e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 9.902255e+03 ) sec^-1
+MeanMatrixElemValue = ( 4.208459e-01 +- 3.253446e-01 ) GeV^-4
+TOTAL : 1.667427 sec
+ 5,876,182,869 cycles:u # 3.476 GHz (74.92%)
+ 755,115 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.92%)
+ 885,630,582 stalled-cycles-backend:u # 15.07% backend cycles idle (74.92%)
+ 20,136,475,777 instructions:u # 3.43 insn per cycle
+ # 0.04 stalled cycles per insn (74.93%)
+ 1.693626257 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4:13237) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest.exe
+runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest.exe
 [ PASSED ] 6 tests.
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2
 Avg ME (C++/C++) = 6.627485e-04
-Avg ME (F77/C++) = 6.6274853360924479E-004
-Relative difference = 5.071191384964548e-08
+Avg ME (F77/C++) = 6.6274845946848876E-004
+Relative difference = 6.115670001294808e-08
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP=
+runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 WARNING! Instantiate host Bridge (nevt=16384)
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.585110e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.591932e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.591932e+04 ) sec^-1
-MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4
-TOTAL : 1.046222 sec
- 2,830,060,229 cycles # 2.694 GHz
- 7,047,238,365 instructions # 2.49 insn per cycle
- 1.051506977 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11604) (512y: 0) (512z: 0)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest.exe
-[ PASSED ] 6 tests.
--------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 6.627193e-04
-Avg ME (F77/C++) = 6.6271927529261421E-004
-Relative difference = 3.728182620967159e-08
-OK (relative difference <= 5E-3)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP=
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-WARNING! Instantiate host Bridge (nevt=16384)
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK
-FP precision = FLOAT (NaN/abnormal=0, zero=0)
-Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.805765e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.814390e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.814390e+04 ) sec^-1
-MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4
-TOTAL : 0.919776 sec
- 2,488,595,721 cycles # 2.693 GHz
- 6,289,461,030 instructions # 2.53 insn per cycle
- 0.925186931 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10320) (512y: 50) (512z: 0)
+EvtsPerSec[Rmb+ME] (23) = ( 2.350229e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.360492e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.360492e+04 ) sec^-1
+MeanMatrixElemValue = ( 4.214980e-01 +- 3.255523e-01 ) GeV^-4
+TOTAL : 0.705971 sec
+ 2,503,907,768 cycles:u # 3.438 GHz (74.78%)
+ 805,820 stalled-cycles-frontend:u # 0.03% frontend cycles idle (74.74%)
+ 250,706,028 stalled-cycles-backend:u # 10.01% backend cycles idle (74.74%)
+ 7,078,577,500 instructions:u # 2.83 insn per cycle
+ # 0.04 stalled cycles per insn (74.84%)
+ 0.731878359 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11586) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest.exe
+runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest.exe
 [ PASSED ] 6 tests.
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 6.627193e-04
-Avg ME (F77/C++) = 6.6271927529261421E-004
-Relative difference = 3.728182620967159e-08
+cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2
+Avg ME (C++/C++) = 6.627195e-04
+Avg ME (F77/C++) = 6.6271947045332125E-004
+Relative difference = 4.4583988847766445e-08
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP=
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-WARNING! Instantiate host Bridge (nevt=16384)
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK
-FP precision = FLOAT (NaN/abnormal=0, zero=0)
-Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.390787e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.395884e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.395884e+04 ) sec^-1
-MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4
-TOTAL : 1.191044 sec
- 2,045,888,825 cycles # 1.712 GHz
- 3,258,286,239 instructions # 1.59 insn per cycle
- 1.196330024 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2165) (512y: 48) (512z: 9219)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest.exe
-[ PASSED ] 6 tests.
+/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo)
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 6.627195e-04
-Avg ME (F77/C++) = 6.6271952818273971E-004
-Relative difference = 4.252589469696448e-08
-OK (relative difference <= 5E-3)
+/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo)
 =========================================================================
 TEST COMPLETED
diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_common.txt
index 18818d76f2..b57e941bfc 100644
--- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_common.txt
+++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_common.txt
@@ -1,223 +1,181 @@
 export CUDACPP_RUNTIME_ENABLEFPE=on
-Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg
-OMPFLAGS=-fopenmp
-AVX=512y
+Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg
+OMPFLAGS=
+AVX=avx2
 FPTYPE=d
 HELINL=0
 HRDCOD=0
-RNDGEN=hasCurand
-Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1)
+RNDGEN=hasNoCurand
+Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1)
 make: Nothing to be done for 'gtestlibs'.
-CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0'
+CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0'
 make USEBUILDDIR=1 AVX=none
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 CUDACPP_BUILDDIR='build.none_f_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 make USEBUILDDIR=1 AVX=sse4
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 make USEBUILDDIR=1 AVX=avx2
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 make USEBUILDDIR=1 AVX=512y
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 make USEBUILDDIR=1 AVX=512z
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
-DATE: 2024-01-30_06:02:02
+DATE: 2024-01-31_14:54:41
-On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
+On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]:
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --common OMP=
+runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --common OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:FLT+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK
+Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 6.319163e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.372298e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.378244e+05 ) sec^-1
-MeanMatrixElemValue = ( 4.159397e-01 +- 3.238804e-01 ) GeV^-4
-TOTAL : 0.470285 sec
- 1,953,187,910 cycles # 2.826 GHz
- 2,879,626,230 instructions # 1.47 insn per cycle
- 0.750725256 seconds time elapsed
-runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --common
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
-==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
+EvtsPerSec[Rmb+ME] (23) = ( 2.484047e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.683097e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.684518e+05 ) sec^-1
+MeanMatrixElemValue = ( 4.202247e-01 +- 3.251485e-01 ) GeV^-4
+TOTAL : 0.462588 sec
+ 1,263,747,733 cycles:u # 2.709 GHz (75.37%)
+ 2,821,312 stalled-cycles-frontend:u # 0.22% frontend cycles idle (74.27%)
+ 33,721,485 stalled-cycles-backend:u # 2.67% backend cycles idle (74.04%)
+ 1,629,313,626 instructions:u # 1.29 insn per cycle
+ # 0.02 stalled cycles per insn (74.32%)
+ 0.505592407 seconds time elapsed
 .........................................................................
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --common OMP=
+runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --common OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:FLT+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK
+Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 8.571852e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.645768e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.649137e+05 ) sec^-1
-MeanMatrixElemValue = ( 1.094367e+02 +- 1.071509e+02 ) GeV^-4
-TOTAL : 1.807519 sec
- 5,850,952,354 cycles # 2.861 GHz
- 11,909,032,858 instructions # 2.04 insn per cycle
- 2.113707665 seconds time elapsed
+EvtsPerSec[Rmb+ME] (23) = ( 4.679193e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.711323e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.711744e+05 ) sec^-1
+MeanMatrixElemValue = ( 1.213664e+02 +- 1.195366e+02 ) GeV^-4
+TOTAL : 3.315193 sec
+ 11,143,533,112 cycles:u # 3.330 GHz (74.93%)
+ 28,071,485 stalled-cycles-frontend:u # 0.25% frontend cycles idle (74.90%)
+ 1,147,162,310 stalled-cycles-backend:u # 10.29% backend cycles idle (74.97%)
+ 8,994,649,560 instructions:u # 0.81 insn per cycle
+ # 0.13 stalled cycles per insn (75.11%)
+ 3.366939054 seconds time elapsed
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2
-Avg ME (C++/CUDA) = 6.626454e-04
-Avg ME (F77/CUDA) = 6.6262659968156085E-004
-Relative difference = 2.8371612387547027e-05
+cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2
+Avg ME (C++/CUDA) = 6.626791e-04
+Avg ME (F77/CUDA) = 6.6270899361878938E-004
+Relative difference = 4.511024836808726e-05
 OK (relative difference <= 5E-3)
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe -p 64 256 1 --common OMP=
+runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe -p 64 256 1 --common OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.964186e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.965186e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.965186e+03 ) sec^-1
+EvtsPerSec[Rmb+ME] (23) = ( 2.453118e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.454148e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.454148e+03 ) sec^-1
 MeanMatrixElemValue = ( 4.208458e-01 +- 3.253446e-01 ) GeV^-4
-TOTAL : 8.362848 sec
- 24,219,340,843 cycles # 2.896 GHz
- 75,878,803,024 instructions # 3.13 insn per cycle
- 8.367752014 seconds time elapsed
+TOTAL : 6.691448 sec
+ 23,508,412,612 cycles:u # 3.502 GHz (74.98%)
+ 1,325,577 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.98%)
+ 2,760,550,099 stalled-cycles-backend:u # 11.74% backend cycles idle (74.98%)
+ 75,863,951,980 instructions:u # 3.23 insn per cycle
+ # 0.04 stalled cycles per insn (74.98%)
+ 6.715424956 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 3898) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest.exe
+runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest.exe
 [ PASSED ] 6 tests.
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2
 Avg ME (C++/C++) = 6.627487e-04
-Avg ME (F77/C++) = 6.6274870439686495E-004
-Relative difference = 6.634286759220428e-09
+Avg ME (F77/C++) = 6.6274866115424713E-004
+Relative difference = 5.861309557415831e-08
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 1 --common OMP=
+runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 1 --common OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 7.106063e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.119817e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.119817e+03 ) sec^-1
-MeanMatrixElemValue = ( 4.208458e-01 +- 3.253446e-01 ) GeV^-4
-TOTAL : 2.317534 sec
- 6,502,161,706 cycles # 2.801 GHz
- 20,113,148,136 instructions # 3.09 insn per cycle
- 2.322610994 seconds time elapsed
+EvtsPerSec[Rmb+ME] (23) = ( 9.874463e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 9.891747e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 9.891747e+03 ) sec^-1
+MeanMatrixElemValue = ( 4.208459e-01 +- 3.253446e-01 ) GeV^-4
+TOTAL : 1.667003 sec
+ 5,891,590,353 cycles:u # 3.488 GHz (74.90%)
+ 696,983 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.90%)
+ 876,571,432 stalled-cycles-backend:u # 14.88% backend cycles idle (74.90%)
+ 20,133,226,301 instructions:u # 3.42 insn per cycle
+ # 0.04 stalled cycles per insn (74.91%)
+ 1.690910397 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4:13237) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest.exe
+runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest.exe
 [ PASSED ] 6 tests.
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2
 Avg ME (C++/C++) = 6.627485e-04
-Avg ME (F77/C++) = 6.6274853360924479E-004
-Relative difference = 5.071191384964548e-08
+Avg ME (F77/C++) = 6.6274845946848876E-004
+Relative difference = 6.115670001294808e-08
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 1 --common OMP=
+runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 1 --common OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.586948e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.593562e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.593562e+04 ) sec^-1
-MeanMatrixElemValue = ( 4.214979e-01 +- 3.255522e-01 ) GeV^-4
-TOTAL : 1.043186 sec
- 2,822,730,977 cycles # 2.696 GHz
- 7,035,059,102 instructions # 2.49 insn per cycle
- 1.048122577 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11604) (512y: 0) (512z: 0)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest.exe
-[ PASSED ] 6 tests.
--------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 6.627193e-04
-Avg ME (F77/C++) = 6.6271927529261421E-004
-Relative difference = 3.728182620967159e-08
-OK (relative difference <= 5E-3)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe -p 64 256 1 --common OMP=
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK
-FP precision = FLOAT (NaN/abnormal=0, zero=0)
-Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.807119e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.816011e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.816011e+04 ) sec^-1
-MeanMatrixElemValue = ( 4.214979e-01 +- 3.255522e-01 ) GeV^-4
-TOTAL : 0.917444 sec
- 2,481,419,746 cycles # 2.693 GHz
- 6,275,834,953 instructions # 2.53 insn per cycle
- 0.922842065 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10320) (512y: 50) (512z: 0)
+EvtsPerSec[Rmb+ME] (23) = ( 2.358073e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.368273e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.368273e+04 ) sec^-1
+MeanMatrixElemValue = ( 4.214980e-01 +- 3.255523e-01 ) GeV^-4
+TOTAL : 0.701644 sec
+ 2,504,924,592 cycles:u # 3.462 GHz (74.58%)
+ 477,115 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.58%)
+ 242,307,045 stalled-cycles-backend:u # 9.67% backend cycles idle (74.58%)
+ 7,093,468,748 instructions:u # 2.83 insn per cycle
+ # 0.03 stalled cycles per insn (74.76%)
+ 0.725253014 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11586) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest.exe
+runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest.exe
 [ PASSED ] 6 tests.
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 6.627193e-04
-Avg ME (F77/C++) = 6.6271927529261421E-004
-Relative difference = 3.728182620967159e-08
+cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2
+Avg ME (C++/C++) = 6.627195e-04
+Avg ME (F77/C++) = 6.6271947045332125E-004
+Relative difference = 4.4583988847766445e-08
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe -p 64 256 1 --common OMP=
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK
-FP precision = FLOAT (NaN/abnormal=0, zero=0)
-Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.399447e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.404609e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.404609e+04 ) sec^-1
-MeanMatrixElemValue = ( 4.214981e-01 +- 3.255523e-01 ) GeV^-4
-TOTAL : 1.182503 sec
- 2,042,245,375 cycles # 1.722 GHz
- 3,246,419,225 instructions # 1.59 insn per cycle
- 1.187753193 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2165) (512y: 48) (512z: 9219)
+/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest.exe
-[ PASSED ] 6 tests.
--------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 6.627195e-04
-Avg ME (F77/C++) = 6.6271952818273971E-004
-Relative difference = 4.252589469696448e-08
-OK (relative difference <= 5E-3)
+/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo)
 =========================================================================
 TEST COMPLETED
diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_curhst.txt
index e0bdb664e1..0f279d1f96 100644
--- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_curhst.txt
+++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_curhst.txt
@@ -1,223 +1,143 @@
 export CUDACPP_RUNTIME_ENABLEFPE=on
-Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg
-OMPFLAGS=-fopenmp
-AVX=512y
+Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg
+OMPFLAGS=
+AVX=avx2
 FPTYPE=d
 HELINL=0
 HRDCOD=0
-RNDGEN=hasCurand
-Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1)
+RNDGEN=hasNoCurand
+Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1)
 make: Nothing to be done for 'gtestlibs'.
-CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0'
+CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0'
 make USEBUILDDIR=1 AVX=none
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 CUDACPP_BUILDDIR='build.none_f_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 make USEBUILDDIR=1 AVX=sse4
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 make USEBUILDDIR=1 AVX=avx2
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 make USEBUILDDIR=1 AVX=512y
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 make USEBUILDDIR=1 AVX=512z
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
-DATE: 2024-01-30_05:58:32
+DATE: 2024-01-31_14:51:10
-On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
+On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]:
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --curhst OMP=
+runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --curhst OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:FLT+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK
-FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 6.316613e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.368482e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.375052e+05 ) sec^-1
-MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4
-TOTAL : 0.466730 sec
- 1,919,349,851 cycles # 2.829 GHz
- 2,893,848,641 instructions # 1.51 insn per cycle
- 0.736177730 seconds time elapsed
-runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --curhst
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
-==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
+/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe: Aborted
+ 53,660,520 cycles:u # 2.471 GHz (63.19%)
+ 42,812 stalled-cycles-frontend:u # 0.08% frontend cycles idle (63.19%)
+ 627,310 stalled-cycles-backend:u # 1.17% backend cycles idle (63.19%)
+ 41,361,739 instructions:u # 0.77 insn per cycle
+ # 0.02 stalled cycles per insn (64.98%)
+ 0.022613916 seconds time elapsed
 .........................................................................
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --curhst OMP=
+runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --curhst OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:FLT+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK
-FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 8.573214e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.646713e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.650131e+05 ) sec^-1
-MeanMatrixElemValue = ( 6.664703e+00 +- 5.072736e+00 ) GeV^-4
-TOTAL : 1.756106 sec
- 5,695,142,145 cycles # 2.868 GHz
- 11,326,470,226 instructions # 1.99 insn per cycle
- 2.046387591 seconds time elapsed
+/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe: Aborted
+ 44,320,877 cycles:u # 2.057 GHz (62.89%)
+ 57,125 stalled-cycles-frontend:u # 0.13% frontend cycles idle (62.90%)
+ 498,199 stalled-cycles-backend:u # 1.12% backend cycles idle (62.90%)
+ 46,848,203 instructions:u # 1.06 insn per cycle
+ # 0.01 stalled cycles per insn (69.98%)
+ 0.022399963 seconds time elapsed
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2
-Avg ME (C++/CUDA) = 6.626454e-04
-Avg ME (F77/CUDA) = 6.6262659968156085E-004
-Relative difference = 2.8371612387547027e-05
+cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2
+Avg ME (C++/CUDA) = 6.626791e-04
+Avg ME (F77/CUDA) = 6.6270899361878938E-004
+Relative difference = 4.511024836808726e-05
 OK (relative difference <= 5E-3)
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP=
+runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
-FP precision = FLOAT (NaN/abnormal=0, zero=0)
-Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.965319e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.966317e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.966317e+03 ) sec^-1
-MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4
-TOTAL : 8.353416 sec
- 24,206,918,909 cycles # 2.897 GHz
- 75,878,282,077 instructions # 3.13 insn per cycle
- 8.358253425 seconds time elapsed
+/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe: Aborted
+ 59,645,586 cycles:u # 2.770 GHz (62.87%)
+ 31,816 stalled-cycles-frontend:u # 0.05% frontend cycles idle (62.88%)
+ 595,900 stalled-cycles-backend:u # 1.00% backend cycles idle (62.88%)
+ 37,227,581 instructions:u # 0.62 insn per cycle
+ # 0.02 stalled cycles per insn (62.87%)
+ 0.022799665 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 3898) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest.exe
+runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest.exe
 [ PASSED ] 6 tests.
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2
 Avg ME (C++/C++) = 6.627487e-04
-Avg ME (F77/C++) = 6.6274870439686495E-004
-Relative difference = 6.634286759220428e-09
+Avg ME (F77/C++) = 6.6274866115424713E-004
+Relative difference = 5.861309557415831e-08
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP=
+runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
-FP precision = FLOAT (NaN/abnormal=0, zero=0)
-Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 6.994720e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.007761e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.007761e+03 ) sec^-1
-MeanMatrixElemValue = ( 4.060119e+00 +- 2.367901e+00 ) GeV^-4
-TOTAL : 2.353332 sec
- 6,524,875,303 cycles # 2.768 GHz
- 20,114,868,262 instructions # 3.08 insn per cycle
- 2.358279130 seconds time elapsed
+/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe: Aborted
+ 52,420,208 cycles:u # 2.424 GHz (63.04%)
+ 35,421 stalled-cycles-frontend:u # 0.07% frontend cycles idle (63.05%)
+ 608,287 stalled-cycles-backend:u # 1.16% backend cycles idle (63.05%)
+ 42,179,278 instructions:u # 0.80 insn per cycle
+ # 0.01 stalled cycles per insn (64.63%)
+ 0.022892465 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4:13237) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest.exe
+runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest.exe
 [ PASSED ] 6 tests.
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2
 Avg ME (C++/C++) = 6.627485e-04
-Avg ME (F77/C++) = 6.6274853360924479E-004
-Relative difference = 5.071191384964548e-08
+Avg ME (F77/C++) = 6.6274845946848876E-004
+Relative difference = 6.115670001294808e-08
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP=
+runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
-FP precision = FLOAT (NaN/abnormal=0, zero=0)
-Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.578556e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.585147e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.585147e+04 ) sec^-1
-MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4
-TOTAL : 1.047733 sec
- 2,820,818,870 cycles # 2.682 GHz
- 7,037,506,961 instructions # 2.49 insn per cycle
- 1.053002937 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11604) (512y: 0) (512z: 0)
+/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe: Aborted
+ 56,394,214 cycles:u # 2.623 GHz (62.82%)
+ 44,678 stalled-cycles-frontend:u # 0.08% frontend cycles idle (62.82%)
+ 611,735 stalled-cycles-backend:u # 1.08% backend cycles idle (62.82%)
+ 42,481,271 instructions:u # 0.75 insn per cycle
+ # 0.01 stalled cycles per insn (58.94%)
+ 0.022737225 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11586) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest.exe
+runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest.exe
 [ PASSED ] 6 tests.
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 6.627193e-04
-Avg ME (F77/C++) = 6.6271927529261421E-004
-Relative difference = 3.728182620967159e-08
-OK (relative difference <= 5E-3)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP=
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
-FP precision = FLOAT (NaN/abnormal=0, zero=0)
-Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.765542e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.773827e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.773827e+04 ) sec^-1
-MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4
-TOTAL : 0.937560 sec
- 2,478,872,591 cycles # 2.633 GHz
- 6,279,446,291 instructions # 2.53 insn per cycle
- 0.942558881 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10320) (512y: 50) (512z: 0)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest.exe
-[ PASSED ] 6 tests.
--------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 6.627193e-04
-Avg ME (F77/C++) = 6.6271927529261421E-004
-Relative difference = 3.728182620967159e-08
+cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2
+Avg ME (C++/C++) = 6.627195e-04
+Avg ME (F77/C++) = 6.6271947045332125E-004
+Relative difference = 4.4583988847766445e-08
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP=
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
-FP precision = FLOAT (NaN/abnormal=0, zero=0)
-Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.394421e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.399630e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.399630e+04 ) sec^-1
-MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4
-TOTAL : 1.184942 sec
- 2,037,351,256 cycles # 1.714 GHz
- 3,247,924,134 instructions # 1.59 insn per cycle
- 1.189828303 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2165) (512y: 48) (512z: 9219)
+/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest.exe
-[ PASSED ] 6 tests.
--------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 6.627195e-04
-Avg ME (F77/C++) = 6.6271952818273971E-004
-Relative difference = 4.252589469696448e-08
-OK (relative difference <= 5E-3)
+/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo)
 =========================================================================
 TEST COMPLETED
diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_rmbhst.txt
index d4941d3986..98d2ae55ac 100644
--- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_rmbhst.txt
+++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_rmbhst.txt
@@ -1,226 +1,181 @@
 export CUDACPP_RUNTIME_ENABLEFPE=on
-Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg
-OMPFLAGS=-fopenmp
-AVX=512y
+Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg
+OMPFLAGS=
+AVX=avx2
 FPTYPE=d
 HELINL=0
 HRDCOD=0
-RNDGEN=hasCurand
-Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1)
+RNDGEN=hasNoCurand
+Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1)
 make: Nothing to be done for 'gtestlibs'.
-CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0'
+CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0'
 make USEBUILDDIR=1 AVX=none
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 CUDACPP_BUILDDIR='build.none_f_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 make USEBUILDDIR=1 AVX=sse4
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 make USEBUILDDIR=1 AVX=avx2
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-01-30_05:55:07 +DATE: 2024-01-31_14:48:36 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --rmbhst OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.730552e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.395791e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.401561e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.048178e+00 +- 2.364571e+00 ) GeV^-4 -TOTAL : 0.472770 sec - 1,942,039,449 cycles # 2.839 GHz - 2,914,569,721 instructions # 1.50 insn per cycle - 0.744083634 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --rmbhst -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 2.524884e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.683094e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.683954e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.202335e-01 +- 3.251521e-01 ) GeV^-4 +TOTAL : 0.439268 sec + 1,258,352,153 cycles:u # 2.692 GHz (75.14%) + 3,340,197 stalled-cycles-frontend:u # 0.27% frontend cycles idle (74.82%) + 33,637,961 stalled-cycles-backend:u # 2.67% backend cycles idle (74.32%) + 1,655,103,779 instructions:u # 1.32 insn per cycle + # 0.02 stalled cycles per insn (74.30%) + 0.482678047 seconds time elapsed ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --rmbhst OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.426812e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.621213e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.624728e+05 ) sec^-1 -MeanMatrixElemValue = ( 6.641710e+00 +- 4.994249e+00 ) GeV^-4 -TOTAL : 1.841459 sec - 5,951,272,102 cycles # 2.874 GHz - 12,317,260,326 instructions # 2.07 insn per cycle - 2.133121829 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 4.282605e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.707454e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.707873e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.213799e+02 +- 1.195366e+02 ) GeV^-4 +TOTAL : 3.418285 sec + 11,531,026,630 cycles:u # 3.339 GHz (74.90%) + 38,956,621 stalled-cycles-frontend:u # 0.34% frontend cycles idle (74.92%) + 1,147,015,037 stalled-cycles-backend:u # 9.95% backend cycles idle (74.98%) + 9,884,251,524 instructions:u # 0.86 insn per cycle + # 0.12 stalled cycles per insn (74.98%) + 3.471091734 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 6.626454e-04 -Avg ME (F77/CUDA) = 6.6262659968156085E-004 -Relative difference = 2.8371612387547027e-05 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 6.626791e-04 +Avg ME (F77/CUDA) = 6.6270899361878938E-004 +Relative difference = 4.511024836808726e-05 OK (relative 
difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.960613e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.961562e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.961562e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4 -TOTAL : 8.374119 sec - 24,216,955,817 cycles # 2.891 GHz - 75,878,033,044 instructions # 3.13 insn per cycle - 8.378947710 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 2.454097e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.455129e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.455129e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.208458e-01 +- 3.253446e-01 ) GeV^-4 +TOTAL : 6.688804 sec + 23,511,496,407 cycles:u # 3.504 GHz (74.97%) + 1,304,107 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.97%) + 2,796,839,675 stalled-cycles-backend:u # 11.90% backend cycles idle (74.97%) + 75,915,720,981 instructions:u # 3.23 insn per cycle + # 0.04 stalled cycles per insn (74.93%) + 6.712942623 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 3898) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627487e-04 -Avg ME (F77/C++) = 6.6274870439686495E-004 -Relative difference = 6.634286759220428e-09 +Avg ME (F77/C++) = 6.6274866115424713E-004 +Relative difference = 5.861309557415831e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.136107e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.149132e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.149132e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.060119e+00 +- 2.367901e+00 ) GeV^-4 -TOTAL : 2.306790 sec - 6,504,696,579 cycles # 2.815 GHz - 20,114,676,918 instructions # 3.09 insn per cycle - 2.311724672 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 9.876098e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.893495e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.893495e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.208459e-01 +- 3.253446e-01 ) GeV^-4 +TOTAL : 1.666665 sec + 5,888,181,296 cycles:u # 3.487 GHz (74.89%) + 699,454 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.89%) + 877,010,524 stalled-cycles-backend:u # 14.89% backend cycles idle (74.89%) + 20,135,030,726 instructions:u # 3.42 insn per cycle + # 0.04 stalled cycles per insn (74.90%) + 1.690484284 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:13237) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627485e-04 -Avg ME (F77/C++) = 6.6274853360924479E-004 -Relative difference = 5.071191384964548e-08 +Avg ME (F77/C++) = 6.6274845946848876E-004 +Relative difference = 6.115670001294808e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.585387e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.592052e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.592052e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 1.043340 sec - 2,821,286,489 cycles # 2.694 GHz - 7,037,435,358 instructions # 2.49 insn per cycle - 1.048505999 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11604) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.356734e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.366903e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.366903e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.214980e-01 +- 3.255523e-01 ) GeV^-4 +TOTAL : 0.702011 sec + 2,503,642,455 cycles:u # 3.458 GHz (74.59%) + 524,618 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.59%) + 242,217,672 stalled-cycles-backend:u # 9.67% backend cycles idle (74.59%) + 7,096,971,568 instructions:u # 2.83 insn per cycle + # 0.03 stalled cycles per insn (74.75%) + 0.725726716 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11586) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627193e-04 -Avg ME (F77/C++) = 6.6271927529261421E-004 -Relative difference = 3.728182620967159e-08 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.743919e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.751789e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.751789e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 0.949190 sec - 2,568,265,414 cycles # 2.694 GHz - 6,279,620,229 instructions # 2.45 insn per cycle - 0.954345697 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10320) (512y: 50) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627193e-04 -Avg ME (F77/C++) = 6.6271927529261421E-004 -Relative difference = 3.728182620967159e-08 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.627195e-04 +Avg ME (F77/C++) = 6.6271947045332125E-004 +Relative difference = 4.4583988847766445e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.404393e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.409463e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.409463e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4 -TOTAL : 1.176805 sec - 2,037,562,738 cycles # 1.726 GHz - 3,247,895,210 instructions # 1.59 insn per cycle - 1.182054069 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2165) (512y: 48) (512z: 9219) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627195e-04 -Avg ME (F77/C++) = 6.6271952818273971E-004 -Relative difference = 4.252589469696448e-08 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd1.txt index 391ab3d24f..a8df518f17 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd1.txt @@ -1,223 +1,181 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasCurand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) +RNDGEN=hasNoCurand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.512y_f_inl0_hrd1' +CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-01-30_05:02:36 +DATE: 2024-01-31_13:57:12 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/gcheck.exe -p 64 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/gcheck.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.280133e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.331305e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.337921e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4 -TOTAL : 0.487324 sec - 1,984,658,948 cycles # 2.819 GHz - 2,919,152,547 instructions # 1.47 insn per cycle - 0.799038148 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/gcheck.exe -p 64 256 1 -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 2.482914e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.674266e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.675640e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.202247e-01 +- 3.251485e-01 ) GeV^-4 +TOTAL : 0.441735 sec + 1,257,000,745 cycles:u # 2.694 GHz (74.18%) + 2,938,235 stalled-cycles-frontend:u # 0.23% frontend cycles idle (74.26%) + 34,932,732 stalled-cycles-backend:u # 2.78% backend cycles idle (74.36%) + 1,595,954,562 instructions:u # 1.27 insn per cycle + # 0.02 stalled cycles per insn (75.81%) + 0.487039944 seconds time elapsed ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.572518e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.647175e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.650566e+05 ) sec^-1 -MeanMatrixElemValue = ( 6.664703e+00 +- 5.072736e+00 ) GeV^-4 -TOTAL : 1.731714 sec - 5,664,416,860 cycles # 2.869 GHz - 11,423,818,247 instructions # 2.02 insn per cycle - 2.033192051 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 4.703269e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.734220e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.734654e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.213664e+02 +- 1.195366e+02 ) GeV^-4 +TOTAL : 3.331567 sec + 11,165,331,380 cycles:u # 3.340 GHz (74.67%) + 27,887,783 stalled-cycles-frontend:u # 0.25% frontend cycles idle (74.78%) + 1,142,800,192 stalled-cycles-backend:u # 10.24% backend cycles idle (75.09%) + 9,036,921,384 instructions:u # 0.81 insn per cycle + # 0.13 stalled cycles per insn (75.12%) + 3.382507531 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 6.626454e-04 -Avg ME (F77/CUDA) = 6.6262659968156085E-004 -Relative difference = 2.8371612387547027e-05 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 6.626791e-04 +Avg ME (F77/CUDA) = 6.6270899361878938E-004 +Relative difference = 4.511024836808726e-05 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/check.exe -p 64 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.928583e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.929543e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.929543e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4 -TOTAL : 8.512607 sec - 24,191,141,745 cycles # 2.843 GHz - 75,807,282,467 instructions # 3.13 insn per cycle - 8.524714483 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 2.449944e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.450980e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.450980e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.208458e-01 +- 3.253446e-01 ) GeV^-4 +TOTAL : 6.699894 sec + 23,548,513,811 cycles:u # 3.503 GHz (75.01%) + 1,267,687 stalled-cycles-frontend:u # 0.01% frontend cycles idle (75.01%) + 2,567,037,364 stalled-cycles-backend:u # 10.90% backend cycles idle (75.01%) + 75,796,936,744 instructions:u # 3.22 insn per cycle + # 0.03 stalled cycles per insn (75.01%) + 6.724269019 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 3848) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627487e-04 -Avg ME (F77/C++) = 6.6274870430095556E-004 -Relative difference = 6.489572191632735e-09 +Avg ME (F77/C++) = 6.6274866108667618E-004 +Relative difference = 5.871505118544242e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/check.exe -p 64 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.113368e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.126874e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.126874e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.060119e+00 +- 2.367901e+00 ) GeV^-4 -TOTAL : 2.313934 sec - 6,500,918,155 cycles # 2.804 GHz - 20,111,364,543 instructions # 3.09 insn per cycle - 2.332783497 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 9.895398e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.912823e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.912823e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.208459e-01 +- 3.253446e-01 ) GeV^-4 +TOTAL : 1.663257 sec + 5,880,803,726 cycles:u # 3.489 GHz (74.90%) + 703,258 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.85%) + 835,914,961 stalled-cycles-backend:u # 14.21% backend cycles idle (74.85%) + 20,173,522,806 instructions:u # 3.43 insn per cycle + # 0.04 stalled cycles per insn (74.71%) + 1.688905666 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:13231) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627485e-04 -Avg ME (F77/C++) = 6.6274853360924479E-004 -Relative difference = 5.071191384964548e-08 +Avg ME (F77/C++) = 6.6274845946848876E-004 +Relative difference = 6.115670001294808e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/check.exe -p 64 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.589760e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.596530e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.596530e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 1.040223 sec - 2,815,442,217 cycles # 2.695 GHz - 7,038,519,370 instructions # 2.50 insn per cycle - 1.057514134 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11587) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.350572e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.360751e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.360751e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.214980e-01 +- 3.255523e-01 ) GeV^-4 +TOTAL : 0.703636 sec + 2,513,257,412 cycles:u # 3.463 GHz (74.80%) + 1,099,873 stalled-cycles-frontend:u # 0.04% frontend cycles idle (74.69%) + 295,163,738 stalled-cycles-backend:u # 11.74% backend cycles idle (74.65%) + 7,089,697,038 instructions:u # 2.82 insn per cycle + # 0.04 stalled cycles per insn (74.31%) + 0.729216779 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11569) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627193e-04 -Avg ME (F77/C++) = 6.6271927529261421E-004 -Relative difference = 3.728182620967159e-08 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd1/check.exe -p 64 256 1 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.751311e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.759469e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.759469e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 0.945030 sec - 2,478,506,957 cycles # 2.610 GHz - 6,280,796,881 instructions # 2.53 insn per cycle - 0.988336476 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10302) (512y: 50) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627193e-04 -Avg ME (F77/C++) = 6.6271927529261421E-004 -Relative difference = 3.728182620967159e-08 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.627195e-04 +Avg ME (F77/C++) = 6.6271947045332125E-004 +Relative difference = 4.4583988847766445e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd1/check.exe -p 64 256 1 OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.386273e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.391271e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.391271e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4 -TOTAL : 1.191730 sec - 2,039,311,665 cycles # 1.704 GHz - 3,248,072,614 instructions # 1.59 insn per cycle - 1.208300824 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2140) (512y: 48) (512z: 9219) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. 
+/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627195e-04 -Avg ME (F77/C++) = 6.6271952818273971E-004 -Relative difference = 4.252589469696448e-08 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd0.txt index 77eae3ae9c..c6f116f62e 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd0.txt @@ -1,223 +1,181 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasCurand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) +RNDGEN=hasNoCurand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.512y_f_inl1_hrd0' +CUDACPP_BUILDDIR='build.avx2_f_inl1_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512y_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-01-30_05:40:00 +DATE: 2024-01-31_14:24:42 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/gcheck.exe -p 64 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/gcheck.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.547321e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.587547e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.593359e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4 -TOTAL : 0.493771 sec - 2,067,778,848 cycles # 2.808 GHz - 3,079,367,454 instructions # 1.49 insn per cycle - 0.793803934 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/gcheck.exe -p 64 256 1 -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 2.493346e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.687069e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.687946e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.202247e-01 +- 3.251485e-01 ) GeV^-4 +TOTAL : 0.440319 sec + 1,273,508,778 cycles:u # 2.739 GHz (73.60%) + 2,778,353 stalled-cycles-frontend:u # 0.22% frontend cycles idle (74.20%) + 35,103,722 stalled-cycles-backend:u # 2.76% backend cycles idle (75.55%) + 1,611,965,427 instructions:u # 1.27 insn per cycle + # 0.02 stalled cycles per insn (75.74%) + 0.486403940 seconds time elapsed ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/gcheck.exe -p 2048 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/gcheck.exe -p 2048 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.730139e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.790957e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.793762e+05 ) sec^-1 -MeanMatrixElemValue = ( 6.664703e+00 +- 5.072736e+00 ) GeV^-4 -TOTAL : 1.861027 sec - 6,035,258,548 cycles # 2.873 GHz - 13,088,532,370 instructions # 2.17 insn per cycle - 2.157681364 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 4.682520e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.715611e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.716036e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.213664e+02 +- 1.195366e+02 ) GeV^-4 +TOTAL : 3.338845 sec + 11,168,920,412 cycles:u # 3.333 GHz (74.87%) + 27,880,455 stalled-cycles-frontend:u # 0.25% frontend cycles idle (74.95%) + 1,145,991,851 stalled-cycles-backend:u # 10.26% backend cycles idle (74.94%) + 9,103,853,238 instructions:u # 0.82 insn per cycle + # 0.13 stalled cycles per insn (74.93%) + 3.392372577 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 6.626454e-04 -Avg ME (F77/CUDA) = 6.6262660579844562E-004 -Relative difference = 2.836238137986709e-05 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 6.626791e-04 +Avg ME (F77/CUDA) = 6.6270899361878938E-004 +Relative difference = 4.511024836808726e-05 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/check.exe -p 64 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.418156e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.418889e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.418889e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.059968e+00 +- 2.367799e+00 ) GeV^-4 -TOTAL : 30.278912 sec - 87,193,893,967 cycles # 2.880 GHz - 133,999,567,781 instructions # 1.54 insn per cycle - 30.284052553 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:16123) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 6.267558e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.268235e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.268235e+02 ) sec^-1 +MeanMatrixElemValue = ( 4.204931e-01 +- 3.252404e-01 ) GeV^-4 +TOTAL : 26.172768 sec + 91,809,761,052 cycles:u # 3.505 GHz (74.99%) + 520,748,208 stalled-cycles-frontend:u # 0.57% frontend cycles idle (74.99%) + 7,042,480,285 stalled-cycles-backend:u # 7.67% backend cycles idle (74.99%) + 134,080,128,680 instructions:u # 1.46 insn per cycle + # 0.05 stalled cycles per insn (74.99%) + 26.197030587 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:16252) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627535e-04 -Avg ME (F77/C++) = 6.6275354356437610E-004 -Relative difference = 6.573239683366044e-08 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.627534e-04 +Avg ME (F77/C++) = 6.6275340697351248E-004 +Relative difference = 1.052203199451665e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/check.exe -p 64 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.858617e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.871131e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.871131e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.059961e+00 +- 2.367791e+00 ) GeV^-4 -TOTAL : 2.400232 sec - 6,719,203,240 cycles # 2.795 GHz - 19,163,412,782 instructions # 2.85 insn per cycle - 2.405407499 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 8.369298e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.382288e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.382288e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.211992e-01 +- 3.254573e-01 ) GeV^-4 +TOTAL : 1.965775 sec + 6,952,016,013 cycles:u # 3.497 GHz (74.84%) + 661,883 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.95%) + 3,318,805,188 stalled-cycles-backend:u # 47.74% backend cycles idle (75.05%) + 19,180,596,065 instructions:u # 2.76 insn per cycle + # 0.17 stalled cycles per insn (75.05%) + 1.991242612 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:68898) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627486e-04 -Avg ME (F77/C++) = 6.6274859783433532E-004 -Relative difference = 3.2677016209485094e-09 +Avg ME (F77/C++) = 6.6274857053714997E-004 +Relative difference = 4.445554471174176e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/check.exe -p 64 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.418642e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.423893e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.423893e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.060903e+00 +- 2.367376e+00 ) GeV^-4 -TOTAL : 1.168526 sec - 3,140,858,608 cycles # 2.683 GHz - 6,747,205,943 instructions # 2.15 insn per cycle - 1.173847287 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:48625) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.458741e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.462654e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.462654e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.211846e-01 +- 3.254638e-01 ) GeV^-4 +TOTAL : 1.130561 sec + 4,016,590,780 cycles:u # 3.483 GHz (74.97%) + 536,332 stalled-cycles-frontend:u # 0.01% frontend cycles idle (75.03%) + 2,228,245,775 stalled-cycles-backend:u # 55.48% backend cycles idle (75.03%) + 6,764,434,762 instructions:u # 1.68 insn per cycle + # 0.33 stalled cycles per insn (75.03%) + 1.157743533 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:48607) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627272e-04 -Avg ME (F77/C++) = 6.6272724143469353E-004 -Relative difference = 6.252149235286529e-08 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.627274e-04 +Avg ME (F77/C++) = 6.6272735722101156E-004 +Relative difference = 6.454990161554483e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd0/check.exe -p 64 256 1 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.703185e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.710717e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.710717e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.060903e+00 +- 2.367376e+00 ) GeV^-4 -TOTAL : 0.972109 sec - 2,610,520,883 cycles # 2.675 GHz - 5,931,408,487 instructions # 2.27 insn per cycle - 0.977161465 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:42219) (512y: 24) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd0/runTest.exe -[ PASSED ] 6 tests. +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627272e-04 -Avg ME (F77/C++) = 6.6272724143469353E-004 -Relative difference = 6.252149235286529e-08 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd0/check.exe -p 64 256 1 OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.380375e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.385342e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.385342e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.060905e+00 +- 2.367377e+00 ) GeV^-4 -TOTAL : 1.197476 sec - 2,050,152,648 cycles # 1.706 GHz - 3,435,996,672 instructions # 1.68 insn per cycle - 1.202741015 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4188) (512y: 9) (512z:44489) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd0/runTest.exe -[ PASSED ] 6 tests. 
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627275e-04 -Avg ME (F77/C++) = 6.6272748295826550E-004 -Relative difference = 2.5714542480216212e-08 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd1.txt index 0e738d355a..e09e89969a 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd1.txt @@ -1,223 +1,181 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasCurand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) +RNDGEN=hasNoCurand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.512y_f_inl1_hrd1' +CUDACPP_BUILDDIR='build.avx2_f_inl1_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512y_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-01-30_05:40:55 +DATE: 2024-01-31_14:25:31 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/gcheck.exe -p 64 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/gcheck.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.495403e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.535166e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.540654e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4 -TOTAL : 0.492553 sec - 2,044,553,803 cycles # 2.834 GHz - 3,023,997,415 instructions # 1.48 insn per cycle - 0.781011529 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/gcheck.exe -p 64 256 1 -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 2.469687e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.672014e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.672626e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.202247e-01 +- 3.251485e-01 ) GeV^-4 +TOTAL : 0.443251 sec + 1,282,818,741 cycles:u # 2.745 GHz (73.89%) + 2,895,680 stalled-cycles-frontend:u # 0.23% frontend cycles idle (74.09%) + 33,452,918 stalled-cycles-backend:u # 2.61% backend cycles idle (74.34%) + 1,622,110,235 instructions:u # 1.26 insn per cycle + # 0.02 stalled cycles per insn (75.15%) + 0.490883722 seconds time elapsed ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/gcheck.exe -p 2048 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/gcheck.exe -p 2048 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.639095e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.697524e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.700186e+05 ) sec^-1 -MeanMatrixElemValue = ( 6.664703e+00 +- 5.072736e+00 ) GeV^-4 -TOTAL : 1.866900 sec - 6,069,227,607 cycles # 2.871 GHz - 11,631,061,560 instructions # 1.92 insn per cycle - 2.173672620 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 4.697685e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.729954e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.730376e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.213664e+02 +- 1.195366e+02 ) GeV^-4 +TOTAL : 3.310115 sec + 11,125,820,088 cycles:u # 3.330 GHz (74.83%) + 28,002,737 stalled-cycles-frontend:u # 0.25% frontend cycles idle (74.87%) + 1,144,368,068 stalled-cycles-backend:u # 10.29% backend cycles idle (75.15%) + 8,957,167,166 instructions:u # 0.81 insn per cycle + # 0.13 stalled cycles per insn (75.24%) + 3.362032400 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 6.626454e-04 -Avg ME (F77/CUDA) = 6.6262660579844562E-004 -Relative difference = 2.836238137986709e-05 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 6.626791e-04 +Avg ME (F77/CUDA) = 6.6270899361878938E-004 +Relative difference = 4.511024836808726e-05 OK (relative difference <= 5E-3) ========================================================================= 
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/check.exe -p 64 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/check.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.528053e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.528817e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.528817e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.059968e+00 +- 2.367799e+00 ) GeV^-4 -TOTAL : 29.676151 sec - 85,692,453,161 cycles # 2.888 GHz - 134,120,579,675 instructions # 1.57 insn per cycle - 29.681167734 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:16109) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 6.233355e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.234020e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.234020e+02 ) sec^-1 +MeanMatrixElemValue = ( 4.204931e-01 +- 3.252404e-01 ) GeV^-4 +TOTAL : 26.316160 sec + 92,312,051,453 cycles:u # 3.505 GHz (74.99%) + 451,235,615 stalled-cycles-frontend:u # 0.49% frontend cycles idle (75.00%) + 7,117,997,834 stalled-cycles-backend:u # 7.71% backend cycles idle (75.00%) + 134,002,336,312 instructions:u # 1.45 insn per cycle + # 0.05 stalled cycles per insn (75.00%) + 26.340478391 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:16105) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627536e-04 -Avg ME (F77/C++) = 6.6275357377482830E-004 -Relative difference = 3.95700176737784e-08 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.627535e-04 +Avg ME (F77/C++) = 6.6275346486299042E-004 +Relative difference = 5.301670926116898e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/check.exe -p 64 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/check.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.924333e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.936823e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.936823e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.059961e+00 +- 2.367791e+00 ) GeV^-4 -TOTAL : 2.377362 sec - 6,721,293,685 cycles # 2.823 GHz - 19,223,635,236 instructions # 2.86 insn per cycle - 2.382317911 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 8.419571e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.432174e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.432174e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.211992e-01 +- 3.254573e-01 ) GeV^-4 +TOTAL : 1.953922 sec + 6,896,140,793 cycles:u # 3.489 GHz (74.95%) + 725,375 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.91%) + 3,060,527,991 stalled-cycles-backend:u # 44.38% backend cycles idle (74.91%) + 19,245,542,364 instructions:u # 2.79 insn per cycle + # 0.16 stalled cycles per insn (74.91%) + 1.979830500 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:68882) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627486e-04 -Avg ME (F77/C++) = 6.6274859765498573E-004 -Relative difference = 3.538316437387639e-09 +Avg ME (F77/C++) = 6.6274857044990032E-004 +Relative difference = 4.4587192899226015e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/check.exe -p 64 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/check.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.449646e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.455242e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.455242e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.060903e+00 +- 2.367376e+00 ) GeV^-4 -TOTAL : 1.140025 sec - 3,079,658,771 cycles # 2.692 GHz - 6,686,222,708 instructions # 2.17 insn per cycle - 1.145080651 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:47416) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.500433e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.504571e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.504571e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.211846e-01 +- 3.254638e-01 ) GeV^-4 +TOTAL : 1.099115 sec + 3,910,134,327 cycles:u # 3.486 GHz (74.91%) + 558,653 stalled-cycles-frontend:u # 0.01% frontend cycles idle (75.04%) + 2,182,277,989 stalled-cycles-backend:u # 55.81% backend cycles idle (75.04%) + 6,705,411,876 instructions:u # 1.71 insn per cycle + # 0.33 stalled cycles per insn (75.04%) + 1.124840402 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:47398) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627272e-04 -Avg ME (F77/C++) = 6.6272724133897148E-004 -Relative difference = 6.237705578619894e-08 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.627274e-04 +Avg ME (F77/C++) = 6.6272735755491807E-004 +Relative difference = 6.404606472340801e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd1/check.exe -p 64 256 1 OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.717993e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.725785e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.725785e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.060903e+00 +- 2.367376e+00 ) GeV^-4 -TOTAL : 0.963197 sec - 2,607,305,399 cycles # 2.696 GHz - 5,935,632,787 instructions # 2.28 insn per cycle - 0.968307475 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:41564) (512y: 18) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd1/runTest.exe -[ PASSED ] 6 tests. +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627272e-04 -Avg ME (F77/C++) = 6.6272724133897148E-004 -Relative difference = 6.237705578619894e-08 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd1/check.exe -p 64 256 1 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.382587e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.387561e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.387561e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.060905e+00 +- 2.367377e+00 ) GeV^-4 -TOTAL : 1.195178 sec - 2,050,651,524 cycles # 1.710 GHz - 3,422,960,187 instructions # 1.67 insn per cycle - 1.200266882 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3375) (512y: 11) (512z:43966) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd1/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627275e-04 -Avg ME (F77/C++) = 6.6272749650985591E-004 -Relative difference = 5.26633351741962e-09 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt index 7714401e20..d26c28a736 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt @@ -1,223 +1,181 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasCurand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) +RNDGEN=hasNoCurand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-01-30_05:03:07 +DATE: 2024-01-31_13:57:39 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/gcheck.exe -p 64 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/gcheck.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.456900e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.484722e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.487399e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.528831 sec - 2,192,645,567 cycles # 2.833 GHz - 3,378,106,633 instructions # 1.54 insn per cycle - 0.861052908 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/gcheck.exe -p 64 256 1 -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 7.354215e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.516811e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.517858e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 +TOTAL : 0.655808 sec + 1,958,889,401 cycles:u # 2.870 GHz (75.38%) + 2,412,382 stalled-cycles-frontend:u # 0.12% frontend cycles idle (75.40%) + 37,254,762 stalled-cycles-backend:u # 1.90% backend cycles idle (75.03%) + 2,178,606,509 instructions:u # 1.11 insn per cycle + # 0.02 stalled cycles per insn (75.16%) + 0.708127648 seconds time elapsed ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.113905e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.147620e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.149017e+05 ) sec^-1 -MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.049267 sec - 9,507,642,735 cycles # 2.871 GHz - 19,066,132,971 instructions # 2.01 insn per cycle - 3.371164553 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 1.238397e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.241190e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.241245e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.252232e+02 +- 1.234346e+02 ) GeV^-4 +TOTAL : 8.397194 sec + 28,888,354,559 cycles:u # 3.424 GHz (75.01%) + 11,624,614 stalled-cycles-frontend:u # 0.04% frontend cycles idle (75.01%) + 1,121,527,055 stalled-cycles-backend:u # 3.88% backend cycles idle (75.02%) + 22,649,300,700 instructions:u # 0.78 insn per cycle + # 0.05 stalled cycles per insn (75.06%) + 8.457715515 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 6.626675e-04 Avg ME (F77/CUDA) = 6.6266732376103494E-004 Relative difference = 2.659538381540814e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/check.exe -p 64 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.769606e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.770411e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.770411e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 9.278224 sec - 26,812,823,901 cycles # 2.889 GHz - 82,462,709,559 instructions # 3.08 insn per cycle - 9.289930135 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 2.162585e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.163416e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.163416e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 +TOTAL : 7.591256 sec + 26,650,428,737 cycles:u # 3.501 GHz (74.99%) + 44,584,150 stalled-cycles-frontend:u # 0.17% frontend cycles idle (74.99%) + 3,994,903,658 stalled-cycles-backend:u # 14.99% backend cycles idle (74.99%) + 82,451,104,184 instructions:u # 3.09 insn per cycle + # 0.05 stalled cycles per insn (74.99%) + 7.615919588 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 6623) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731406016235E-004 Relative difference = 2.8059296349552523e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/check.exe -p 64 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.509625e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.512894e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.512894e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 4.686363 sec - 12,638,766,565 cycles # 2.696 GHz - 38,538,047,706 instructions # 3.05 insn per cycle - 4.708715306 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 5.081118e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.085777e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.085777e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 +TOTAL : 3.235482 sec + 11,379,481,573 cycles:u # 3.493 GHz (74.96%) + 3,503,367 stalled-cycles-frontend:u # 0.03% frontend cycles idle (74.96%) + 1,193,771,329 stalled-cycles-backend:u # 10.49% backend cycles idle (74.96%) + 38,541,307,073 instructions:u # 3.39 insn per cycle + # 0.03 stalled cycles per insn (74.96%) + 3.261507264 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:12755) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266730246908442E-004 Relative difference = 2.98084507782618e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/check.exe -p 64 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.005037e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.021640e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.021640e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.058850 sec - 5,538,789,085 cycles # 2.684 GHz - 13,583,257,196 instructions # 2.45 insn per cycle - 2.079297542 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10944) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.217108e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.219788e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.219788e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 +TOTAL : 1.355326 sec + 4,803,691,181 cycles:u # 3.486 GHz (74.91%) + 846,490 stalled-cycles-frontend:u # 0.02% frontend cycles idle (75.04%) + 519,948,195 stalled-cycles-backend:u # 10.82% backend cycles idle (75.04%) + 13,592,579,448 instructions:u # 2.83 insn per cycle + # 0.04 stalled cycles per insn (75.04%) + 1.381084672 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10926) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266730409276836E-004 Relative difference = 2.9563428359824236e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/check.exe -p 64 256 1 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.175649e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.196938e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.196938e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.797590 sec - 4,843,535,516 cycles # 2.687 GHz - 12,110,039,110 instructions # 2.50 insn per cycle - 1.813279758 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 9682) (512y: 76) (512z: 0) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266730409276836E-004 -Relative difference = 2.9563428359824236e-07 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/check.exe -p 64 256 1 OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.862805e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.874864e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.874864e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.399875 sec - 4,096,013,404 cycles # 1.704 GHz - 6,283,624,620 instructions # 1.53 insn per cycle - 2.418716991 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1528) (512y: 76) (512z: 9010) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. 
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266730409276836E-004 -Relative difference = 2.9563428359824236e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd1.txt index 9cdb5ea5b9..7f51395c68 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd1.txt @@ -1,223 +1,181 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasCurand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) +RNDGEN=hasNoCurand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.512y_m_inl0_hrd1' +CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-01-30_05:03:46 +DATE: 2024-01-31_13:58:15 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/gcheck.exe -p 64 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/gcheck.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = HIP:MIX+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.463401e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.491582e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.494105e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.528345 sec - 2,191,366,155 cycles # 2.835 GHz - 3,376,981,873 instructions # 1.54 insn per cycle - 0.868249282 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/gcheck.exe -p 64 256 1 -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 1.358889e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.423680e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.423885e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 +TOTAL : 0.535324 sec + 1,548,672,373 cycles:u # 2.766 GHz (75.36%) + 2,323,136 stalled-cycles-frontend:u # 0.15% frontend cycles idle (75.39%) + 50,258,647 stalled-cycles-backend:u # 3.25% backend cycles idle (75.85%) + 1,831,558,733 instructions:u # 1.18 insn per cycle + # 0.03 stalled cycles per insn (75.10%) + 0.583324308 seconds time elapsed ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = HIP:MIX+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.141311e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.175321e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.176779e+05 ) sec^-1 -MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.034775 sec - 9,461,569,572 cycles # 2.871 GHz - 21,570,730,622 instructions # 2.28 insn per cycle - 3.354365055 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 1.738857e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.744595e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.744713e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.252232e+02 +- 1.234346e+02 ) GeV^-4 +TOTAL : 7.034363 sec + 24,120,218,080 cycles:u # 3.411 GHz (74.97%) + 11,414,117 stalled-cycles-frontend:u # 0.05% frontend cycles idle (75.01%) + 1,120,074,241 stalled-cycles-backend:u # 4.64% backend cycles idle (75.00%) + 18,957,844,450 instructions:u # 0.79 insn per cycle + # 0.06 stalled cycles per insn (75.04%) + 7.093773300 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/fgcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 6.626675e-04 Avg ME (F77/CUDA) = 6.6266732376103494E-004 Relative difference = 2.659538381540814e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/check.exe 
-p 64 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/check.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.763986e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.764820e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.764820e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 9.308542 sec - 26,818,191,963 cycles # 2.880 GHz - 82,362,969,124 instructions # 3.07 insn per cycle - 9.331807277 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 2.200217e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.201081e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.201081e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 +TOTAL : 7.461298 sec + 26,216,002,251 cycles:u # 3.503 GHz (74.99%) + 8,570,456 stalled-cycles-frontend:u # 0.03% frontend cycles idle (74.99%) + 3,485,137,026 stalled-cycles-backend:u # 13.29% backend cycles idle (74.99%) + 82,338,842,956 instructions:u # 3.14 insn per cycle + # 0.04 stalled cycles per insn (74.99%) + 7.485833169 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 6491) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731406016235E-004 Relative difference = 2.8059296349552523e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/check.exe -p 64 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.494755e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.497969e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.497969e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 4.702744 sec - 12,651,856,685 cycles # 2.688 GHz - 38,557,643,348 instructions # 3.05 insn per cycle - 4.723006762 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 5.063768e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.068407e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.068407e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 +TOTAL : 3.246351 sec + 11,439,475,472 cycles:u # 3.499 GHz (74.89%) + 5,196,887 stalled-cycles-frontend:u # 0.05% frontend cycles idle (74.97%) + 1,364,315,247 stalled-cycles-backend:u # 11.93% backend cycles idle (75.04%) + 38,562,841,783 instructions:u # 3.37 insn per cycle + # 0.04 stalled cycles per insn (75.04%) + 3.272390742 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:12729) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266730246908442E-004 Relative difference = 2.98084507782618e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/check.exe -p 64 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.057026e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.073448e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.073448e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.045356 sec - 5,503,322,263 cycles # 2.685 GHz - 13,599,131,001 instructions # 2.47 insn per cycle - 2.065937163 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10926) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.213273e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.215906e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.215906e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 +TOTAL : 1.359387 sec + 4,814,914,164 cycles:u # 3.484 GHz (74.73%) + 701,661 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.84%) + 488,467,217 stalled-cycles-backend:u # 10.14% backend cycles idle (75.08%) + 13,606,707,744 instructions:u # 2.83 insn per cycle + # 0.04 stalled cycles per insn (75.11%) + 1.386481494 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10908) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266730409276836E-004 Relative difference = 2.9563428359824236e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd1/check.exe -p 64 256 1 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.173965e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.195231e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.195231e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.797623 sec - 4,836,406,491 cycles # 2.684 GHz - 12,123,840,407 instructions # 2.51 insn per cycle - 1.816744592 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 9659) (512y: 76) (512z: 0) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266730409276836E-004 -Relative difference = 2.9563428359824236e-07 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd1/check.exe -p 64 256 1 OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.872297e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.884618e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.884618e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.396265 sec - 4,088,419,794 cycles # 1.703 GHz - 6,289,480,909 instructions # 1.54 insn per cycle - 2.414194012 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1508) (512y: 76) (512z: 9009) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. 
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266730409276836E-004 -Relative difference = 2.9563428359824236e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt index 10dc25694a..362d9e06ac 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt @@ -1,223 +1,181 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasCurand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) +RNDGEN=hasNoCurand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2024-01-30_05:06:15 +DATE: 2024-01-31_14:00:16 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe -p 1 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe -p 1 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.064289e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.064686e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.064874e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 2.459219 sec - 7,914,579,350 cycles # 2.876 GHz - 17,414,362,649 instructions # 2.20 insn per cycle - 2.856648920 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe -p 1 256 1 -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 7.905901e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.911485e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.911576e+01 ) sec^-1 +MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 +TOTAL : 9.522044 sec + 32,979,124,021 cycles:u # 3.453 GHz (74.97%) + 3,578,135 stalled-cycles-frontend:u # 0.01% frontend cycles idle (75.00%) + 7,104,095 stalled-cycles-backend:u # 0.02% backend cycles idle (75.03%) + 26,013,812,715 instructions:u # 0.79 insn per cycle + # 0.00 stalled cycles per insn (75.04%) + 9.574363198 seconds time elapsed ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.261836e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.264181e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.264456e+03 ) sec^-1 -MeanMatrixElemValue = ( 1.856249e-04 +- 8.329951e-05 ) GeV^-6 -TOTAL : 4.001930 sec - 12,466,660,301 cycles # 2.881 GHz - 28,598,806,424 instructions # 2.29 insn per cycle - 4.385332309 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 3.470769e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.474626e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.474648e+03 ) sec^-1 +MeanMatrixElemValue = ( 1.221264e+00 +- 1.219329e+00 ) GeV^-6 +TOTAL : 9.160361 sec + 31,690,243,585 cycles:u # 3.450 GHz (74.99%) + 3,785,612 stalled-cycles-frontend:u # 0.01% frontend cycles idle (75.00%) + 60,055,016 stalled-cycles-backend:u # 0.19% backend cycles idle (75.01%) + 25,048,526,645 instructions:u # 0.79 insn per cycle + # 0.00 stalled cycles per insn (75.02%) + 9.207355038 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 9.872263e-03 -Avg ME (F77/CUDA) = 9.8722595284406640E-003 -Relative difference = 3.5164777671934515e-07 +Avg ME (F77/CUDA) = 9.8722595284406710E-003 +Relative difference = 3.516477760164775e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/check.exe -p 1 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/check.exe -p 1 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.667093e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.667308e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.667308e+01 ) sec^-1 -MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 6.887821 sec - 18,997,365,246 cycles # 2.759 GHz - 55,182,817,229 instructions # 2.90 insn per cycle - 6.894930966 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 1.019899e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.019928e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.019928e+02 ) sec^-1 +MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 +TOTAL : 5.178513 sec + 18,195,723,872 cycles:u # 3.499 GHz (74.93%) + 30,630,770 stalled-cycles-frontend:u # 0.17% frontend cycles idle (74.94%) + 2,065,218,530 stalled-cycles-backend:u # 11.35% backend cycles idle (74.99%) + 55,194,104,763 instructions:u # 3.03 insn per cycle + # 0.04 stalled cycles per insn (75.06%) + 5.202702937 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:44874) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595285514851E-003 Relative difference = 3.5163655122073967e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/check.exe -p 1 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.565125e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.565211e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.565211e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 3.384131 sec - 9,789,568,447 cycles # 2.893 GHz - 27,057,217,068 instructions # 2.76 insn per cycle - 3.398188002 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 2.232094e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.232222e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.232222e+02 ) sec^-1 +MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 +TOTAL : 2.367099 sec + 8,357,225,941 cycles:u # 3.498 GHz (74.92%) + 1,713,232 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.89%) + 783,489,125 stalled-cycles-backend:u # 9.37% backend cycles idle (74.89%) + 27,125,618,956 instructions:u # 3.25 insn per cycle + # 0.03 stalled cycles per insn (74.81%) + 2.392556781 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:97234) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595285514851E-003 Relative difference = 3.5163655122073967e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/check.exe -p 1 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.331784e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.332213e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.332213e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.593056 sec - 4,251,132,724 cycles # 2.667 GHz - 9,566,982,441 instructions # 2.25 insn per cycle - 1.603318722 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:84279) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 5.162576e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.163242e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.163242e+02 ) sec^-1 +MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 +TOTAL : 1.024351 sec + 3,648,814,216 cycles:u # 3.487 GHz (74.85%) + 1,279,142 stalled-cycles-frontend:u # 0.04% frontend cycles idle (74.77%) + 302,436,489 stalled-cycles-backend:u # 8.29% backend cycles idle (74.77%) + 9,598,531,237 instructions:u # 2.63 insn per cycle + # 0.03 stalled cycles per insn (74.77%) + 1.049480308 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:84261) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595285411531E-003 Relative difference = 3.516375977906115e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/check.exe -p 1 256 2 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.782288e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.782847e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.782847e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.405349 sec - 3,719,980,949 cycles # 2.646 GHz - 8,451,730,597 instructions # 2.27 insn per cycle - 1.418908281 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:79441) (512y: 90) (512z: 0) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 9.872263e-03 -Avg ME (F77/C++) = 9.8722595285411531E-003 -Relative difference = 3.516375977906115e-07 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/check.exe -p 1 256 2 OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.332107e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.332611e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.332611e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.593467 sec - 2,690,971,905 cycles # 1.687 GHz - 4,249,909,932 instructions # 1.58 insn per cycle - 1.609272621 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2166) (512y: 90) (512z:78318) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. 
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 9.872263e-03 -Avg ME (F77/C++) = 9.8722595285411531E-003 -Relative difference = 3.516375977906115e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0_bridge.txt index 14598d99fd..316f7c5721 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0_bridge.txt @@ -1,240 +1,190 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasCurand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) +RNDGEN=hasNoCurand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2024-01-30_05:50:15 +DATE: 2024-01-31_14:42:34 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe -p 1 256 2 --bridge OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe -p 1 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=256, gpublocks=1, gputhreads=256, gpublocks*gputhreads=256) WARNING! Set grid in Bridge (nevt=256, gpublocks=1, gputhreads=256, gpublocks*gputhreads=256) -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.062580e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.063573e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.063573e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 2.393250 sec - 7,805,787,223 cycles # 2.878 GHz - 17,759,546,689 instructions # 2.28 insn per cycle - 2.771767839 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe -p 1 256 1 --bridge -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -WARNING! Instantiate device Bridge (nevt=256, gpublocks=1, gputhreads=256, gpublocks*gputhreads=256) -WARNING! Set grid in Bridge (nevt=256, gpublocks=1, gputhreads=256, gpublocks*gputhreads=256) -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 7.903771e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.904490e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.904490e+01 ) sec^-1 +MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 +TOTAL : 9.602069 sec + 33,267,707,940 cycles:u # 3.455 GHz (74.99%) + 3,635,257 stalled-cycles-frontend:u # 0.01% frontend cycles idle (75.00%) + 7,352,981 stalled-cycles-backend:u # 0.02% backend cycles idle (75.01%) + 26,291,061,699 instructions:u # 0.79 insn per cycle + # 0.00 stalled cycles per insn (75.03%) + 9.654521726 seconds time elapsed ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) WARNING! 
Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.205412e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.241153e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.241153e+03 ) sec^-1 -MeanMatrixElemValue = ( 1.856249e-04 +- 8.329951e-05 ) GeV^-6 -TOTAL : 3.999133 sec - 12,487,046,648 cycles # 2.887 GHz - 29,181,392,973 instructions # 2.34 insn per cycle - 4.379902707 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 3.467512e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.471029e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.471029e+03 ) sec^-1 +MeanMatrixElemValue = ( 1.221264e+00 +- 1.219329e+00 ) GeV^-6 +TOTAL : 9.169872 sec + 31,755,546,975 cycles:u # 3.453 GHz (74.97%) + 4,329,583 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.95%) + 57,451,118 stalled-cycles-backend:u # 0.18% backend cycles idle (74.97%) + 25,082,541,299 instructions:u # 0.79 insn per cycle + # 0.00 stalled cycles per insn (75.00%) + 9.216880999 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 9.872263e-03 -Avg ME (F77/CUDA) = 9.8722595284406640E-003 -Relative difference = 3.5164777671934515e-07 +Avg ME (F77/CUDA) = 9.8722595284406710E-003 +Relative difference = 3.516477760164775e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=256) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.924049e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.924280e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.924280e+01 ) sec^-1 -MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 6.668799 sec - 18,978,883,548 cycles # 2.845 GHz - 55,181,310,686 instructions # 2.91 insn per cycle - 6.673958990 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 1.021926e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.021953e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.021953e+02 ) sec^-1 +MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 +TOTAL : 5.168329 sec + 18,190,353,086 cycles:u # 3.505 GHz (74.91%) + 29,000,533 stalled-cycles-frontend:u # 0.16% frontend cycles idle (75.03%) + 2,110,974,888 stalled-cycles-backend:u # 11.60% backend cycles idle (75.03%) + 55,195,082,247 instructions:u # 3.03 insn per cycle + # 0.04 stalled cycles per insn (75.03%) + 5.192332090 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:44874) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595285514851E-003 Relative difference = 3.5163655122073967e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=256)
-Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK
+Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.558442e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.558530e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.558530e+02 ) sec^-1
-MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6
-TOTAL : 3.394020 sec
- 9,815,752,501 cycles # 2.889 GHz
- 27,056,612,659 instructions # 2.76 insn per cycle
- 3.399148950 seconds time elapsed
+EvtsPerSec[Rmb+ME] (23) = ( 2.228820e+02 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.228947e+02 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.228947e+02 ) sec^-1
+MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6
+TOTAL : 2.370786 sec
+ 8,357,476,280 cycles:u # 3.492 GHz (74.93%)
+ 1,525,757 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.93%)
+ 763,908,253 stalled-cycles-backend:u # 9.14% backend cycles idle (74.93%)
+ 27,111,418,735 instructions:u # 3.24 insn per cycle
+ # 0.03 stalled cycles per insn (74.84%)
+ 2.396266638 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4:97234) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/runTest.exe
+runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/runTest.exe
 [ PASSED ] 6 tests.
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2
 Avg ME (C++/C++) = 9.872263e-03
 Avg ME (F77/C++) = 9.8722595285514851E-003
 Relative difference = 3.5163655122073967e-07
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP=
+runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 WARNING! Instantiate host Bridge (nevt=256)
-Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK
+Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 3.345002e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.345461e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.345461e+02 ) sec^-1
-MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6
-TOTAL : 1.587609 sec
- 4,248,692,453 cycles # 2.674 GHz
- 9,567,437,136 instructions # 2.25 insn per cycle
- 1.592590793 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2:84279) (512y: 0) (512z: 0)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/runTest.exe
-[ PASSED ] 6 tests.
--------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 9.872263e-03
-Avg ME (F77/C++) = 9.8722595285411531E-003
-Relative difference = 3.516375977906115e-07
-OK (relative difference <= 5E-3)
+EvtsPerSec[Rmb+ME] (23) = ( 5.231422e+02 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.232115e+02 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.232115e+02 ) sec^-1
+MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6
+TOTAL : 1.011154 sec
+ 3,602,330,298 cycles:u # 3.485 GHz (74.57%)
+ 908,225 stalled-cycles-frontend:u # 0.03% frontend cycles idle (74.66%)
+ 243,683,993 stalled-cycles-backend:u # 6.76% backend cycles idle (75.01%)
+ 9,599,377,384 instructions:u # 2.66 insn per cycle
+ # 0.03 stalled cycles per insn (75.23%)
+ 1.036728920 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2:84261) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP=
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-WARNING! Instantiate host Bridge (nevt=256)
-Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK
-FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 3.873515e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.874138e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.874138e+02 ) sec^-1
-MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6
-TOTAL : 1.369431 sec
- 3,692,449,005 cycles # 2.689 GHz
- 8,450,968,058 instructions # 2.29 insn per cycle
- 1.374284426 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2:79441) (512y: 90) (512z: 0)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/runTest.exe
+runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/runTest.exe
 [ PASSED ] 6 tests.
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2
 Avg ME (C++/C++) = 9.872263e-03
 Avg ME (F77/C++) = 9.8722595285411531E-003
 Relative difference = 3.516375977906115e-07
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP=
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-WARNING! Instantiate host Bridge (nevt=256)
-Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK
-FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 3.369341e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.369854e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.369854e+02 ) sec^-1
-MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6
-TOTAL : 1.574041 sec
- 2,686,211,452 cycles # 1.702 GHz
- 4,249,274,815 instructions # 1.58 insn per cycle
- 1.579137128 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2166) (512y: 90) (512z:78318)
+/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/runTest.exe
-[ PASSED ] 6 tests.
--------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 9.872263e-03
-Avg ME (F77/C++) = 9.8722595285411531E-003
-Relative difference = 3.516375977906115e-07
-OK (relative difference <= 5E-3)
+/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo)
 =========================================================================
 TEST COMPLETED
diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd1.txt
index 869fccfa2f..1d53f5cee3 100644
--- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd1.txt
+++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd1.txt
@@ -1,223 +1,181 @@
 export CUDACPP_RUNTIME_ENABLEFPE=on
-Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg
-OMPFLAGS=-fopenmp
-AVX=512y
+Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg
+OMPFLAGS=
+AVX=avx2
 FPTYPE=d
 HELINL=0
 HRDCOD=0
-RNDGEN=hasCurand
-Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1)
+RNDGEN=hasNoCurand
+Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1)
 make: Nothing to be done for 'gtestlibs'.
-CUDACPP_BUILDDIR='build.512y_d_inl0_hrd1'
+CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd1'
 make USEBUILDDIR=1 AVX=none
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
 CUDACPP_BUILDDIR='build.none_d_inl0_hrd1'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
 make USEBUILDDIR=1 AVX=sse4
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
 CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd1'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
 make USEBUILDDIR=1 AVX=avx2
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
 CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd1'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
 make USEBUILDDIR=1 AVX=512y
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
 CUDACPP_BUILDDIR='build.512y_d_inl0_hrd1'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
 make USEBUILDDIR=1 AVX=512z
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
 CUDACPP_BUILDDIR='build.512z_d_inl0_hrd1'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
-DATE: 2024-01-30_05:07:21
+DATE: 2024-01-31_14:01:57
-On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
+On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]:
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/gcheck.exe -p 1 256 2 OMP=
+runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/gcheck.exe -p 1 256 2 OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1]
-Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
+Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1]
+Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 4.062893e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.063296e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.063519e+02 ) sec^-1
-MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6
-TOTAL : 2.463696 sec
- 7,904,124,830 cycles # 2.867 GHz
- 17,962,469,169 instructions # 2.27 insn per cycle
- 2.863348110 seconds time elapsed
-runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/gcheck.exe -p 1 256 1
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
-==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
+EvtsPerSec[Rmb+ME] (23) = ( 7.848562e+01 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.854180e+01 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.854240e+01 ) sec^-1
+MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6
+TOTAL : 9.652377 sec
+ 33,505,735,625 cycles:u # 3.461 GHz (74.93%)
+ 3,572,159 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.95%)
+ 8,527,968 stalled-cycles-backend:u # 0.03% backend cycles idle (75.01%)
+ 26,418,167,401 instructions:u # 0.79 insn per cycle
+ # 0.00 stalled cycles per insn (75.02%)
+ 9.700899936 seconds time elapsed
 .........................................................................
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/gcheck.exe -p 64 256 1 OMP=
+runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/gcheck.exe -p 64 256 1 OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1]
-Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
+Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1]
+Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 9.275434e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 9.277655e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 9.278066e+03 ) sec^-1
-MeanMatrixElemValue = ( 1.856249e-04 +- 8.329951e-05 ) GeV^-6
-TOTAL : 4.004406 sec
- 12,472,043,203 cycles # 2.872 GHz
- 27,476,431,943 instructions # 2.20 insn per cycle
- 4.397866058 seconds time elapsed
+EvtsPerSec[Rmb+ME] (23) = ( 3.481649e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.485248e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.485281e+03 ) sec^-1
+MeanMatrixElemValue = ( 1.221264e+00 +- 1.219329e+00 ) GeV^-6
+TOTAL : 9.149586 sec
+ 31,710,895,432 cycles:u # 3.455 GHz (74.97%)
+ 4,043,797 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.98%)
+ 45,699,251 stalled-cycles-backend:u # 0.14% backend cycles idle (75.00%)
+ 25,027,683,835 instructions:u # 0.79 insn per cycle
+ # 0.00 stalled cycles per insn (75.04%)
+ 9.197900477 seconds time elapsed
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/gcheck.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/fgcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/gcheck.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/fgcheck.exe 2 64 2
 Avg ME (C++/CUDA) = 9.872263e-03
-Avg ME (F77/CUDA) = 9.8722595284406640E-003
-Relative difference = 3.5164777671934515e-07
+Avg ME (F77/CUDA) = 9.8722595284406710E-003
+Relative difference = 3.516477760164775e-07
 OK (relative difference <= 5E-3)
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/check.exe -p 1 256 2 OMP=
+runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/check.exe -p 1 256 2 OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
+Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1]
+Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 7.993181e+01 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.993429e+01 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.993429e+01 ) sec^-1
-MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6
-TOTAL : 6.616864 sec
- 18,937,214,023 cycles # 2.863 GHz
- 55,162,675,285 instructions # 2.91 insn per cycle
- 6.624084944 seconds time elapsed
+EvtsPerSec[Rmb+ME] (23) = ( 1.024961e+02 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.024989e+02 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.024989e+02 ) sec^-1
+MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6
+TOTAL : 5.153075 sec
+ 18,122,203,390 cycles:u # 3.502 GHz (74.96%)
+ 29,439,456 stalled-cycles-frontend:u # 0.16% frontend cycles idle (74.96%)
+ 2,282,705,716 stalled-cycles-backend:u # 12.60% backend cycles idle (74.96%)
+ 55,175,170,632 instructions:u # 3.04 insn per cycle
+ # 0.04 stalled cycles per insn (74.96%)
+ 5.177202891 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4:44747) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/runTest.exe
+runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/runTest.exe
 [ PASSED ] 6 tests.
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/fcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/check.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/fcheck.exe 2 64 2
 Avg ME (C++/C++) = 9.872263e-03
 Avg ME (F77/C++) = 9.8722595285514851E-003
 Relative difference = 3.5163655122073967e-07
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd1/check.exe -p 1 256 2 OMP=
+runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd1/check.exe -p 1 256 2 OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
+Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1]
+Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.560244e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.560337e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.560337e+02 ) sec^-1
-MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6
-TOTAL : 3.391033 sec
- 9,810,909,577 cycles # 2.891 GHz
- 27,064,931,751 instructions # 2.76 insn per cycle
- 3.404410372 seconds time elapsed
+EvtsPerSec[Rmb+ME] (23) = ( 2.235560e+02 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.235688e+02 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.235688e+02 ) sec^-1
+MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6
+TOTAL : 2.362983 sec
+ 8,327,226,063 cycles:u # 3.491 GHz (74.85%)
+ 2,207,034 stalled-cycles-frontend:u # 0.03% frontend cycles idle (74.87%)
+ 817,488,808 stalled-cycles-backend:u # 9.82% backend cycles idle (74.94%)
+ 27,104,075,495 instructions:u # 3.25 insn per cycle
+ # 0.03 stalled cycles per insn (75.09%)
+ 2.388617027 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4:97230) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd1/runTest.exe
+runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd1/runTest.exe
 [ PASSED ] 6 tests.
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd1/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd1/fcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd1/check.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd1/fcheck.exe 2 64 2
 Avg ME (C++/C++) = 9.872263e-03
 Avg ME (F77/C++) = 9.8722595285514851E-003
 Relative difference = 3.5163655122073967e-07
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd1/check.exe -p 1 256 2 OMP=
+runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd1/check.exe -p 1 256 2 OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
+Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1]
+Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 3.366743e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.367151e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.367151e+02 ) sec^-1
-MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6
-TOTAL : 1.577213 sec
- 4,241,194,499 cycles # 2.687 GHz
- 9,570,392,055 instructions # 2.26 insn per cycle
- 1.590680511 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2:84249) (512y: 0) (512z: 0)
+EvtsPerSec[Rmb+ME] (23) = ( 5.166153e+02 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.166819e+02 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.166819e+02 ) sec^-1
+MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6
+TOTAL : 1.023339 sec
+ 3,644,451,901 cycles:u # 3.486 GHz (74.84%)
+ 997,870 stalled-cycles-frontend:u # 0.03% frontend cycles idle (74.75%)
+ 269,479,223 stalled-cycles-backend:u # 7.39% backend cycles idle (74.75%)
+ 9,606,641,224 instructions:u # 2.64 insn per cycle
+ # 0.03 stalled cycles per insn (74.79%)
+ 1.048702145 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2:84231) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd1/runTest.exe
+runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd1/runTest.exe
 [ PASSED ] 6 tests.
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd1/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd1/fcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd1/check.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd1/fcheck.exe 2 64 2
 Avg ME (C++/C++) = 9.872263e-03
 Avg ME (F77/C++) = 9.8722595285411531E-003
 Relative difference = 3.516375977906115e-07
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd1/check.exe -p 1 256 2 OMP=
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
-FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 3.823083e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.823621e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.823621e+02 ) sec^-1
-MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6
-TOTAL : 1.389663 sec
- 3,742,544,913 cycles # 2.690 GHz
- 8,455,558,047 instructions # 2.26 insn per cycle
- 1.401942381 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2:79386) (512y: 90) (512z: 0)
+/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd1/runTest.exe
-[ PASSED ] 6 tests.
--------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd1/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd1/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 9.872263e-03
-Avg ME (F77/C++) = 9.8722595285411531E-003
-Relative difference = 3.516375977906115e-07
-OK (relative difference <= 5E-3)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd1/check.exe -p 1 256 2 OMP=
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
-FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 3.367545e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.368096e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.368096e+02 ) sec^-1
-MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6
-TOTAL : 1.578445 sec
- 2,686,793,480 cycles # 1.702 GHz
- 4,251,847,609 instructions # 1.58 insn per cycle
- 1.591347897 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2130) (512y: 90) (512z:78289)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd1/runTest.exe
-[ PASSED ] 6 tests.
--------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd1/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd1/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 9.872263e-03
-Avg ME (F77/C++) = 9.8722595285411531E-003
-Relative difference = 3.516375977906115e-07
-OK (relative difference <= 5E-3)
+/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo)
 =========================================================================
 TEST COMPLETED
diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt
index a75bd83e48..bfc75014f0 100644
--- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt
+++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt
@@ -1,223 +1,181 @@
 export CUDACPP_RUNTIME_ENABLEFPE=on
-Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg
-OMPFLAGS=-fopenmp
-AVX=512y
+Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg
+OMPFLAGS=
+AVX=avx2
 FPTYPE=d
 HELINL=0
 HRDCOD=0
-RNDGEN=hasCurand
-Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1)
+RNDGEN=hasNoCurand
+Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1)
 make: Nothing to be done for 'gtestlibs'.
-CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0'
+CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0'
 make USEBUILDDIR=1 AVX=none
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
 CUDACPP_BUILDDIR='build.none_f_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
 make USEBUILDDIR=1 AVX=sse4
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
 CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
 make USEBUILDDIR=1 AVX=avx2
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
 CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
 make USEBUILDDIR=1 AVX=512y
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
 CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
 make USEBUILDDIR=1 AVX=512z
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
 CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
-DATE: 2024-01-30_05:08:27
+DATE: 2024-01-31_14:03:39
-On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
+On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]:
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe -p 1 256 2 OMP=
+runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe -p 1 256 2 OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
+Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 6.769847e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.770754e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.771164e+02 ) sec^-1
-MeanMatrixElemValue = ( 1.186984e-05 +- 9.824899e-06 ) GeV^-6
-TOTAL : 1.703094 sec
- 5,571,181,653 cycles # 2.867 GHz
- 11,974,166,174 instructions # 2.15 insn per cycle
- 2.057946232 seconds time elapsed
-runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe -p 1 256 1
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
-==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
+EvtsPerSec[Rmb+ME] (23) = ( 1.756123e+02 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.759235e+02 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.759270e+02 ) sec^-1
+MeanMatrixElemValue = ( 4.927921e-03 +- 4.922372e-03 ) GeV^-6
+TOTAL : 4.678163 sec
+ 16,049,952,313 cycles:u # 3.411 GHz (74.99%)
+ 2,790,953 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.99%)
+ 6,464,230 stalled-cycles-backend:u # 0.04% backend cycles idle (74.90%)
+ 13,048,590,801 instructions:u # 0.81 insn per cycle
+ # 0.00 stalled cycles per insn (74.89%)
+ 4.728278426 seconds time elapsed
 .........................................................................
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 OMP=
+runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
+Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 2.318486e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.319261e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.319430e+04 ) sec^-1
-MeanMatrixElemValue = ( 1.856829e-04 +- 8.333435e-05 ) GeV^-6
-TOTAL : 1.904733 sec
- 6,266,697,659 cycles # 2.868 GHz
- 13,596,680,456 instructions # 2.17 insn per cycle
- 2.241129899 seconds time elapsed
+EvtsPerSec[Rmb+ME] (23) = ( 7.158642e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.173754e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.173876e+03 ) sec^-1
+MeanMatrixElemValue = ( 1.216523e+00 +- 1.214588e+00 ) GeV^-6
+TOTAL : 4.796927 sec
+ 16,485,997,216 cycles:u # 3.418 GHz (74.95%)
+ 3,094,127 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.95%)
+ 52,025,079 stalled-cycles-backend:u # 0.32% backend cycles idle (74.85%)
+ 13,325,972,509 instructions:u # 0.81 insn per cycle
+ # 0.00 stalled cycles per insn (74.91%)
+ 4.844549271 seconds time elapsed
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2
-Avg ME (C++/CUDA) = 9.849636e-03
-Avg ME (F77/CUDA) = 9.8712405367667715E-003
-Relative difference = 0.0021934350433631634
+cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2
+Avg ME (C++/CUDA) = 9.855155e-03
+Avg ME (F77/CUDA) = 9.8696023209835834E-003
+Relative difference = 0.0014659658811639687
 OK (relative difference <= 5E-3)
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check.exe -p 1 256 2 OMP=
+runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check.exe -p 1 256 2 OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
+Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 8.651013e+01 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.651286e+01 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.651286e+01 ) sec^-1
-MeanMatrixElemValue = ( 1.187013e-05 +- 9.825040e-06 ) GeV^-6
-TOTAL : 6.115216 sec
- 17,580,950,028 cycles # 2.876 GHz
- 51,788,424,956 instructions # 2.95 insn per cycle
- 6.122234952 seconds time elapsed
+EvtsPerSec[Rmb+ME] (23) = ( 1.095569e+02 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.095603e+02 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.095603e+02 ) sec^-1
+MeanMatrixElemValue = ( 4.924324e-03 +- 4.918778e-03 ) GeV^-6
+TOTAL : 4.820874 sec
+ 16,955,395,720 cycles:u # 3.501 GHz (74.90%)
+ 14,012,697 stalled-cycles-frontend:u # 0.08% frontend cycles idle (74.92%)
+ 1,919,969,828 stalled-cycles-backend:u # 11.32% backend cycles idle (75.00%)
+ 51,814,509,039 instructions:u # 3.06 insn per cycle
+ # 0.04 stalled cycles per insn (75.06%)
+ 4.845302656 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4:27812) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/runTest.exe
+runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/runTest.exe
 [ PASSED ] 6 tests.
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2
 Avg ME (C++/C++) = 9.847961e-03
-Avg ME (F77/C++) = 9.8479612087330436E-003
-Relative difference = 2.119555946686223e-08
+Avg ME (F77/C++) = 9.8479612087414119E-003
+Relative difference = 2.1196409216982896e-08
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check.exe -p 1 256 2 OMP=
+runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check.exe -p 1 256 2 OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
+Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 3.365857e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.366295e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.366295e+02 ) sec^-1
-MeanMatrixElemValue = ( 1.187013e-05 +- 9.825038e-06 ) GeV^-6
-TOTAL : 1.576617 sec
- 4,544,162,423 cycles # 2.878 GHz
- 13,760,085,205 instructions # 3.03 insn per cycle
- 1.587566374 seconds time elapsed
+EvtsPerSec[Rmb+ME] (23) = ( 4.577122e+02 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.577657e+02 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.577657e+02 ) sec^-1
+MeanMatrixElemValue = ( 4.924322e-03 +- 4.918776e-03 ) GeV^-6
+TOTAL : 1.155184 sec
+ 4,095,286,029 cycles:u # 3.479 GHz (74.86%)
+ 845,198 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.86%)
+ 406,702,774 stalled-cycles-backend:u # 9.93% backend cycles idle (74.86%)
+ 13,777,565,579 instructions:u # 3.36 insn per cycle
+ # 0.03 stalled cycles per insn (74.89%)
+ 1.180459682 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4:97762) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/runTest.exe
+runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/runTest.exe
 [ PASSED ] 6 tests.
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 9.847955e-03
-Avg ME (F77/C++) = 9.8479546894727158E-003
-Relative difference = 3.1532159158088894e-08
+cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2
+Avg ME (C++/C++) = 9.847957e-03
+Avg ME (F77/C++) = 9.8479574833965355E-003
+Relative difference = 4.9085971470122835e-08
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check.exe -p 1 256 2 OMP=
+runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check.exe -p 1 256 2 OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
+Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 6.652038e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.653755e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.653755e+02 ) sec^-1
-MeanMatrixElemValue = ( 1.187187e-05 +- 9.826763e-06 ) GeV^-6
-TOTAL : 0.803941 sec
- 2,147,173,176 cycles # 2.667 GHz
- 4,827,637,015 instructions # 2.25 insn per cycle
- 0.818354401 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2:84831) (512y: 0) (512z: 0)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/runTest.exe
-[ PASSED ] 6 tests.
--------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 9.892973e-03
-Avg ME (F77/C++) = 9.8929728159608508E-003
-Relative difference = 1.8603017364363385e-08
-OK (relative difference <= 5E-3)
+EvtsPerSec[Rmb+ME] (23) = ( 1.020655e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.020912e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.020912e+03 ) sec^-1
+MeanMatrixElemValue = ( 4.946830e-03 +- 4.941261e-03 ) GeV^-6
+TOTAL : 0.518680 sec
+ 1,875,512,901 cycles:u # 3.467 GHz (75.01%)
+ 1,293,148 stalled-cycles-frontend:u # 0.07% frontend cycles idle (74.87%)
+ 191,096,015 stalled-cycles-backend:u # 10.19% backend cycles idle (74.87%)
+ 4,860,923,155 instructions:u # 2.59 insn per cycle
+ # 0.04 stalled cycles per insn (74.87%)
+ 0.543969116 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2:84813) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/check.exe -p 1 256 2 OMP=
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
-FP precision = FLOAT (NaN/abnormal=0, zero=0)
-Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 7.264093e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.266084e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.266084e+02 ) sec^-1
-MeanMatrixElemValue = ( 1.187187e-05 +- 9.826763e-06 ) GeV^-6
-TOTAL : 0.735161 sec
- 1,890,652,826 cycles # 2.565 GHz
- 4,260,215,320 instructions # 2.25 insn per cycle
- 0.752160652 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2:80038) (512y: 46) (512z: 0)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/runTest.exe
+runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/runTest.exe
 [ PASSED ] 6 tests.
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2
 Avg ME (C++/C++) = 9.892973e-03
-Avg ME (F77/C++) = 9.8929728159608508E-003
-Relative difference = 1.8603017364363385e-08
+Avg ME (F77/C++) = 9.8929728161012351E-003
+Relative difference = 1.8588827066662492e-08
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/check.exe -p 1 256 2 OMP=
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
-FP precision = FLOAT (NaN/abnormal=0, zero=0)
-Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 6.595587e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.597618e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.597618e+02 ) sec^-1
-MeanMatrixElemValue = ( 1.187188e-05 +- 9.826770e-06 ) GeV^-6
-TOTAL : 0.809620 sec
- 1,357,631,253 cycles # 1.673 GHz
- 2,149,171,041 instructions # 1.58 insn per cycle
- 0.843747051 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2820) (512y: 44) (512z:78510)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/runTest.exe
-[ PASSED ] 6 tests.
+/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo)
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 9.892980e-03
-Avg ME (F77/C++) = 9.8929802670331551E-003
-Relative difference = 2.699218597469717e-08
-OK (relative difference <= 5E-3)
+/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo)
 =========================================================================
 TEST COMPLETED
diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0_bridge.txt
index dd846fe890..2e35431afe 100644
--- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0_bridge.txt
+++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0_bridge.txt
@@ -1,240 +1,190 @@
 export CUDACPP_RUNTIME_ENABLEFPE=on
-Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg
-OMPFLAGS=-fopenmp
-AVX=512y
+Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg
+OMPFLAGS=
+AVX=avx2
 FPTYPE=d
 HELINL=0
 HRDCOD=0
-RNDGEN=hasCurand
-Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1)
+RNDGEN=hasNoCurand
+Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1)
 make: Nothing to be done for 'gtestlibs'.
-CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0'
+CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0'
 make USEBUILDDIR=1 AVX=none
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
 CUDACPP_BUILDDIR='build.none_f_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
 make USEBUILDDIR=1 AVX=sse4
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
 CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
 make USEBUILDDIR=1 AVX=avx2
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
 CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
 make USEBUILDDIR=1 AVX=512y
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
 CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
 make USEBUILDDIR=1 AVX=512z
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
 CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
-DATE: 2024-01-30_05:51:20
+DATE: 2024-01-31_14:44:15
-On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
+On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]:
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe -p 1 256 2 --bridge OMP=
+runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe -p 1 256 2 --bridge OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost
-WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
 WARNING! Instantiate device Bridge (nevt=256, gpublocks=1, gputhreads=256, gpublocks*gputhreads=256)
 WARNING! Set grid in Bridge (nevt=256, gpublocks=1, gputhreads=256, gpublocks*gputhreads=256)
-Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
+Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 6.783457e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.785575e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.785575e+02 ) sec^-1
-MeanMatrixElemValue = ( 1.187094e-05 +- 9.825664e-06 ) GeV^-6
-TOTAL : 1.618103 sec
- 5,426,392,715 cycles # 2.867 GHz
- 11,041,442,286 instructions # 2.03 insn per cycle
- 1.951599799 seconds time elapsed
-runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe -p 1 256 1 --bridge
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost
-WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
-WARNING! Instantiate device Bridge (nevt=256, gpublocks=1, gputhreads=256, gpublocks*gputhreads=256)
-WARNING! Set grid in Bridge (nevt=256, gpublocks=1, gputhreads=256, gpublocks*gputhreads=256)
-==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
-==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
+EvtsPerSec[Rmb+ME] (23) = ( 1.717389e+02 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.717731e+02 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.717731e+02 ) sec^-1
+MeanMatrixElemValue = ( 4.935145e-03 +- 4.929588e-03 ) GeV^-6
+TOTAL : 4.722233 sec
+ 16,232,287,395 cycles:u # 3.418 GHz (74.90%)
+ 2,799,618 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.96%)
+ 7,657,548 stalled-cycles-backend:u # 0.05% backend cycles idle (75.03%)
+ 13,101,972,238 instructions:u # 0.81 insn per cycle
+ # 0.00 stalled cycles per insn (75.07%)
+ 4.773465845 seconds time elapsed
 .........................................................................
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge OMP=
+runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost
-WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
 WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384)
 WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384)
-Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
+Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 2.306053e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.319762e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.319762e+04 ) sec^-1
-MeanMatrixElemValue = ( 1.856441e-04 +- 8.331096e-05 ) GeV^-6
-TOTAL : 1.925915 sec
- 6,319,647,912 cycles # 2.872 GHz
- 13,785,417,374 instructions # 2.18 insn per cycle
- 2.259607907 seconds time elapsed
+EvtsPerSec[Rmb+ME] (23) = ( 7.157734e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.172831e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.172831e+03 ) sec^-1
+MeanMatrixElemValue = ( 1.258769e+00 +- 1.256832e+00 ) GeV^-6
+TOTAL : 4.797989 sec
+ 16,471,525,685 cycles:u # 3.415 GHz (74.93%)
+ 3,709,532 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.92%)
+ 51,132,370 stalled-cycles-backend:u # 0.31% backend cycles idle (74.87%)
+ 13,336,883,973 instructions:u # 0.81 insn per cycle
+ # 0.00 stalled cycles per insn (74.88%)
+ 4.844853430 seconds time elapsed
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2
-Avg ME (C++/CUDA) = 9.849636e-03
-Avg ME (F77/CUDA) = 9.8712405367667715E-003
-Relative difference = 0.0021934350433631634
+cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2
+Avg ME (C++/CUDA) = 9.855155e-03
+Avg ME (F77/CUDA) = 9.8696023209835834E-003
+Relative difference = 0.0014659658811639687
 OK (relative difference <= 5E-3)
 =========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP=
+runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 WARNING! Instantiate host Bridge (nevt=256)
-Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK
+Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 8.635189e+01 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.635467e+01 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.635467e+01 ) sec^-1
-MeanMatrixElemValue = ( 1.187013e-05 +- 9.825040e-06 ) GeV^-6
-TOTAL : 6.119375 sec
- 17,637,027,949 cycles # 2.881 GHz
- 51,787,792,256 instructions # 2.94 insn per cycle
- 6.124243714 seconds time elapsed
+EvtsPerSec[Rmb+ME] (23) = ( 1.092875e+02 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.092906e+02 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.092906e+02 ) sec^-1
+MeanMatrixElemValue = ( 4.924324e-03 +- 4.918778e-03 ) GeV^-6
+TOTAL : 4.833035 sec
+ 17,000,113,882 cycles:u # 3.502 GHz (74.96%)
+ 14,451,376 stalled-cycles-frontend:u # 0.09% frontend cycles idle (74.96%)
+ 1,952,835,950 stalled-cycles-backend:u # 11.49% backend cycles idle (74.96%)
+ 51,813,611,140 instructions:u # 3.05 insn per cycle
+ # 0.04 stalled cycles per insn (74.96%)
+ 4.857155972 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4:27812) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/runTest.exe
+runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/runTest.exe
 [ PASSED ] 6 tests.
 -------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2
 Avg ME (C++/C++) = 9.847961e-03
-Avg ME (F77/C++) = 9.8479612087330436E-003
-Relative difference = 2.119555946686223e-08
+Avg ME (F77/C++) = 9.8479612087414119E-003
+Relative difference = 2.1196409216982896e-08
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP=
+runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 WARNING!
Instantiate host Bridge (nevt=256) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.362357e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.362789e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.362789e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187013e-05 +- 9.825038e-06 ) GeV^-6 -TOTAL : 1.576085 sec - 4,544,551,937 cycles # 2.877 GHz - 13,759,350,934 instructions # 3.03 insn per cycle - 1.581388093 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 4.562792e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.563331e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.563331e+02 ) sec^-1 +MeanMatrixElemValue = ( 4.924322e-03 +- 4.918776e-03 ) GeV^-6 +TOTAL : 1.158831 sec + 4,108,365,889 cycles:u # 3.479 GHz (74.94%) + 1,184,231 stalled-cycles-frontend:u # 0.03% frontend cycles idle (74.94%) + 419,202,788 stalled-cycles-backend:u # 10.20% backend cycles idle (74.94%) + 13,778,170,314 instructions:u # 3.35 insn per cycle + # 0.03 stalled cycles per insn (74.95%) + 1.184304500 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:97762) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 9.847955e-03 -Avg ME (F77/C++) = 9.8479546894727158E-003 -Relative difference = 3.1532159158088894e-08 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 9.847957e-03 +Avg ME (F77/C++) = 9.8479574833965355E-003 +Relative difference = 4.9085971470122835e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=256) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.701025e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.702845e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.702845e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187187e-05 +- 9.826763e-06 ) GeV^-6 -TOTAL : 0.794089 sec - 2,138,661,629 cycles # 2.680 GHz - 4,826,930,405 instructions # 2.26 insn per cycle - 0.798991405 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:84831) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.038104e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.038375e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.038375e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.946830e-03 +- 4.941261e-03 ) GeV^-6 +TOTAL : 0.510431 sec + 1,846,138,323 cycles:u # 3.465 GHz (74.63%) + 427,215 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.48%) + 151,340,586 stalled-cycles-backend:u # 8.20% backend cycles idle (74.48%) + 4,873,469,205 instructions:u # 2.64 insn per cycle + # 0.03 stalled cycles per insn (74.59%) + 0.535898006 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:84813) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.892973e-03 -Avg ME (F77/C++) = 9.8929728159608508E-003 -Relative difference = 1.8603017364363385e-08 +Avg ME (F77/C++) = 9.8929728161012351E-003 +Relative difference = 1.8588827066662492e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
Instantiate host Bridge (nevt=256) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.613418e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.615510e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.615510e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187187e-05 +- 9.826763e-06 ) GeV^-6 -TOTAL : 0.699716 sec - 1,882,009,512 cycles # 2.675 GHz - 4,259,439,384 instructions # 2.26 insn per cycle - 0.704552121 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:80038) (512y: 46) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 9.892973e-03 -Avg ME (F77/C++) = 9.8929728159608508E-003 -Relative difference = 1.8603017364363385e-08 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! Instantiate host Bridge (nevt=256) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.688489e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.690546e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.690546e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187188e-05 +- 9.826770e-06 ) GeV^-6 -TOTAL : 0.795848 sec - 1,355,819,871 cycles # 1.696 GHz - 2,148,215,879 instructions # 1.58 insn per cycle - 0.800761416 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2820) (512y: 44) (512z:78510) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. 
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 9.892980e-03 -Avg ME (F77/C++) = 9.8929802670331551E-003 -Relative difference = 2.699218597469717e-08 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd1.txt index 90b9187b98..5518cc9752 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd1.txt @@ -1,223 +1,181 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasCurand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) +RNDGEN=hasNoCurand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.512y_f_inl0_hrd1' +CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.none_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2024-01-30_05:09:16 +DATE: 2024-01-31_14:04:43 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/gcheck.exe -p 1 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/gcheck.exe -p 1 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.764318e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.765250e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.765666e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.186984e-05 +- 9.824899e-06 ) GeV^-6 -TOTAL : 1.705167 sec - 5,556,067,435 cycles # 2.852 GHz - 10,985,634,618 instructions # 1.98 insn per cycle - 2.060310498 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/gcheck.exe -p 1 256 1 -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 1.778497e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.781890e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.781921e+02 ) sec^-1 +MeanMatrixElemValue = ( 4.927921e-03 +- 4.922372e-03 ) GeV^-6 +TOTAL : 4.628898 sec + 15,907,548,391 cycles:u # 3.417 GHz (74.91%) + 2,873,282 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.86%) + 7,959,031 stalled-cycles-backend:u # 0.05% backend cycles idle (74.88%) + 12,933,345,737 instructions:u # 0.81 insn per cycle + # 0.00 stalled cycles per insn (74.95%) + 4.679899063 seconds time elapsed ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/gcheck.exe -p 64 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/gcheck.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.344230e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.345038e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.345205e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.856829e-04 +- 8.333435e-05 ) GeV^-6 -TOTAL : 1.928465 sec - 6,365,812,176 cycles # 2.870 GHz - 12,742,048,160 instructions # 2.00 insn per cycle - 2.275067290 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 7.132093e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.150020e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.150141e+03 ) sec^-1 +MeanMatrixElemValue = ( 1.216523e+00 +- 1.214588e+00 ) GeV^-6 +TOTAL : 4.798027 sec + 16,471,697,372 cycles:u # 3.415 GHz (75.01%) + 3,060,544 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.93%) + 46,584,268 stalled-cycles-backend:u # 0.28% backend cycles idle (74.95%) + 13,318,650,654 instructions:u # 0.81 insn per cycle + # 0.00 stalled cycles per insn (74.94%) + 4.845336497 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 9.849636e-03 -Avg ME (F77/CUDA) = 9.8712405367667715E-003 -Relative difference = 0.0021934350433631634 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 9.855155e-03 +Avg ME (F77/CUDA) = 9.8696023209835834E-003 +Relative difference = 0.0014659658811639687 OK (relative difference <= 5E-3) 
========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/check.exe -p 1 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/check.exe -p 1 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.700294e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.700564e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.700564e+01 ) sec^-1 -MeanMatrixElemValue = ( 1.187013e-05 +- 9.825040e-06 ) GeV^-6 -TOTAL : 6.077926 sec - 17,558,502,709 cycles # 2.889 GHz - 51,759,109,121 instructions # 2.95 insn per cycle - 6.085026833 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 1.091904e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.091934e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.091934e+02 ) sec^-1 +MeanMatrixElemValue = ( 4.924324e-03 +- 4.918778e-03 ) GeV^-6 +TOTAL : 4.836743 sec + 16,996,773,057 cycles:u # 3.498 GHz (74.97%) + 17,073,316 stalled-cycles-frontend:u # 0.10% frontend cycles idle (74.97%) + 1,656,388,245 stalled-cycles-backend:u # 9.75% backend cycles idle (74.97%) + 51,786,416,825 instructions:u # 3.05 insn per cycle + # 0.03 stalled cycles per insn (75.00%) + 4.860981674 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:27678) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.847961e-03 -Avg ME (F77/C++) = 9.8479612087313262E-003 -Relative difference = 2.1195385077844924e-08 +Avg ME (F77/C++) = 9.8479612087396841E-003 +Relative difference = 2.119623377106246e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/check.exe -p 1 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.376771e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.377174e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.377174e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187013e-05 +- 9.825038e-06 ) GeV^-6 -TOTAL : 1.572289 sec - 4,548,603,521 cycles # 2.891 GHz - 13,758,604,883 instructions # 3.02 insn per cycle - 1.583710945 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 4.559383e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.559952e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.559952e+02 ) sec^-1 +MeanMatrixElemValue = ( 4.924322e-03 +- 4.918776e-03 ) GeV^-6 +TOTAL : 1.159399 sec + 4,119,848,450 cycles:u # 3.487 GHz (75.03%) + 611,148 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.95%) + 401,429,588 stalled-cycles-backend:u # 9.74% backend cycles idle (74.95%) + 13,796,637,357 instructions:u # 3.35 insn per cycle + # 0.03 stalled cycles per insn (74.95%) + 1.184833688 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:97728) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 9.847955e-03 -Avg ME (F77/C++) = 9.8479546894727158E-003 -Relative difference = 3.1532159158088894e-08 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 9.847957e-03 +Avg ME (F77/C++) = 9.8479574833965355E-003 +Relative difference = 4.9085971470122835e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/check.exe -p 1 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.592179e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.593820e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.593820e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187187e-05 +- 9.826763e-06 ) GeV^-6 -TOTAL : 0.808909 sec - 2,140,416,404 cycles # 2.637 GHz - 4,826,824,873 instructions # 2.26 insn per cycle - 0.906681144 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:84793) (512y: 0) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 9.892973e-03 -Avg ME (F77/C++) = 9.8929728159608508E-003 -Relative difference = 1.8603017364363385e-08 -OK (relative difference <= 5E-3) +EvtsPerSec[Rmb+ME] (23) = ( 1.020958e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.021239e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.021239e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.946830e-03 +- 4.941261e-03 ) GeV^-6 +TOTAL : 0.518489 sec + 1,875,147,196 cycles:u # 3.469 GHz (75.01%) + 964,959 stalled-cycles-frontend:u # 0.05% frontend cycles idle (74.85%) + 170,331,058 stalled-cycles-backend:u # 9.08% backend cycles idle (74.85%) + 4,863,058,667 instructions:u # 2.59 insn per cycle + # 0.04 stalled cycles per insn (74.85%) + 0.543762355 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:84775) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd1/check.exe -p 1 256 2 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.677326e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.679741e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.679741e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187187e-05 +- 9.826763e-06 ) GeV^-6 -TOTAL : 0.695609 sec - 1,868,752,206 cycles # 2.678 GHz - 4,259,067,854 instructions # 2.28 insn per cycle - 0.708960929 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:79978) (512y: 46) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.892973e-03 -Avg ME (F77/C++) = 9.8929728159608508E-003 -Relative difference = 1.8603017364363385e-08 +Avg ME (F77/C++) = 9.8929728161012351E-003 +Relative difference = 1.8588827066662492e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd1/check.exe -p 1 256 2 OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.775075e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.777182e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.777182e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187188e-05 +- 9.826770e-06 ) GeV^-6 -TOTAL : 0.788693 sec - 1,354,650,321 cycles # 1.715 GHz - 2,148,091,187 instructions # 1.59 insn per cycle - 0.801177717 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2776) (512y: 44) (512z:78501) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. 
+/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 9.892980e-03 -Avg ME (F77/C++) = 9.8929802670331551E-003 -Relative difference = 2.699218597469717e-08 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt index 4eda45e114..5178c8cf68 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt @@ -1,223 +1,181 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasCurand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) +RNDGEN=hasNoCurand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2024-01-30_05:10:05 +DATE: 2024-01-31_14:05:47 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/gcheck.exe -p 1 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/gcheck.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.692959e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.693612e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.693848e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 2.179406 sec - 7,155,207,889 cycles # 2.861 GHz - 14,615,335,571 instructions # 2.04 insn per cycle - 2.559881855 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/gcheck.exe -p 1 256 1 -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 7.387501e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.392699e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.392765e+01 ) sec^-1 +MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 +TOTAL : 10.047124 sec + 34,815,668,507 cycles:u # 3.456 GHz (75.00%) + 3,625,381 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.94%) + 8,034,622 stalled-cycles-backend:u # 0.02% backend cycles idle (74.94%) + 27,505,142,119 instructions:u # 0.79 insn per cycle + # 0.00 stalled cycles per insn (75.01%) + 10.097930678 seconds time elapsed ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/gcheck.exe -p 64 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/gcheck.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.111470e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.111782e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.111825e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.856249e-04 +- 8.329951e-05 ) GeV^-6 -TOTAL : 3.413893 sec - 10,746,707,284 cycles # 2.875 GHz - 23,674,149,917 instructions # 2.20 insn per cycle - 3.796927749 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 3.191872e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.195128e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.195154e+03 ) sec^-1 +MeanMatrixElemValue = ( 1.221264e+00 +- 1.219329e+00 ) GeV^-6 +TOTAL : 9.583926 sec + 33,210,848,611 cycles:u # 3.456 GHz (74.96%) + 3,988,741 stalled-cycles-frontend:u # 0.01% frontend cycles idle (75.00%) + 56,320,623 stalled-cycles-backend:u # 0.17% backend cycles idle (75.03%) + 26,173,670,767 instructions:u # 0.79 insn per cycle + # 0.00 stalled cycles per insn (75.03%) + 9.633227442 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 9.872263e-03 -Avg ME (F77/CUDA) = 9.8722599015656498E-003 -Relative difference = 3.1385249252060663e-07 +Avg ME (F77/CUDA) = 9.8722599015656533E-003 +Relative difference = 3.138524921691728e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/check.exe -p 1 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.884803e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.885022e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.885022e+01 ) sec^-1 -MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 6.704107 sec - 19,257,123,030 cycles # 2.874 GHz - 55,394,447,460 instructions # 2.88 insn per cycle - 6.709385430 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 1.017456e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.017484e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.017484e+02 ) sec^-1 +MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 +TOTAL : 5.191105 sec + 18,261,367,497 cycles:u # 3.503 GHz (74.99%) + 32,360,708 stalled-cycles-frontend:u # 0.18% frontend cycles idle (74.99%) + 2,211,411,467 stalled-cycles-backend:u # 12.11% backend cycles idle (74.99%) + 55,409,256,470 instructions:u # 3.03 insn per cycle + # 0.04 stalled cycles per insn (74.99%) + 5.215335296 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:44898) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595861831675E-003 Relative difference = 3.457988134687711e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/check.exe -p 1 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.509946e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.510039e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.510039e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 3.502177 sec - 9,384,694,038 cycles # 2.677 GHz - 25,874,743,625 instructions # 2.76 insn per cycle - 3.507349921 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 2.327899e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.328033e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.328033e+02 ) sec^-1 +MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 +TOTAL : 2.269681 sec + 8,004,167,296 cycles:u # 3.492 GHz (74.87%) + 1,017,916 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.87%) + 775,823,367 stalled-cycles-backend:u # 9.69% backend cycles idle (74.90%) + 25,920,796,565 instructions:u # 3.24 insn per cycle + # 0.03 stalled cycles per insn (75.01%) + 2.295300917 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:96804) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722594844308162E-003 Relative difference = 3.5610570575237004e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/check.exe -p 1 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.557555e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.558062e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.558062e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 1.490188 sec - 4,000,749,453 cycles # 2.678 GHz - 9,119,038,902 instructions # 2.28 insn per cycle - 1.495279789 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:83820) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 5.411886e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.412622e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.412622e+02 ) sec^-1 +MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 +TOTAL : 0.977034 sec + 3,480,538,606 cycles:u # 3.483 GHz (74.73%) + 1,928,349 stalled-cycles-frontend:u # 0.06% frontend cycles idle (75.07%) + 290,676,915 stalled-cycles-backend:u # 8.35% backend cycles idle (75.19%) + 9,134,232,930 instructions:u # 2.62 insn per cycle + # 0.03 stalled cycles per insn (75.19%) + 1.002459987 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:83802) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722594324461913E-003 Relative difference = 3.613714310412983e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd0/check.exe -p 1 256 2 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.057405e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.058069e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.058069e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 1.307627 sec - 3,513,640,690 cycles # 2.679 GHz - 8,029,011,845 instructions # 2.29 insn per cycle - 1.312711431 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:79028) (512y: 70) (512z: 0) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 9.872263e-03 -Avg ME (F77/C++) = 9.8722594324461913E-003 -Relative difference = 3.613714310412983e-07 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd0/check.exe -p 1 256 2 OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.350506e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.351010e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.351010e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 1.581908 sec - 2,606,864,065 cycles # 1.673 GHz - 4,077,382,976 instructions # 1.56 insn per cycle - 1.587144818 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1903) (512y: 70) (512z:78042) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. 
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 9.872263e-03 -Avg ME (F77/C++) = 9.8722594324461913E-003 -Relative difference = 3.613714310412983e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd1.txt index 328b61834e..6ac6df302f 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd1.txt @@ -1,223 +1,181 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasCurand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) +RNDGEN=hasNoCurand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.512y_m_inl0_hrd1' +CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.none_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2024-01-30_05:11:08 +DATE: 2024-01-31_14:07:31 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/gcheck.exe -p 1 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/gcheck.exe -p 1 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = HIP:MIX+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.684370e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.684951e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.685153e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 2.181079 sec - 7,148,088,261 cycles # 2.853 GHz - 14,239,530,947 instructions # 1.99 insn per cycle - 2.562146879 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/gcheck.exe -p 1 256 1 -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +EvtsPerSec[Rmb+ME] (23) = ( 7.497198e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.502430e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.502469e+01 ) sec^-1 +MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 +TOTAL : 9.923800 sec + 34,389,805,654 cycles:u # 3.456 GHz (75.01%) + 3,707,069 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.99%) + 7,302,220 stalled-cycles-backend:u # 0.02% backend cycles idle (74.99%) + 27,148,100,293 instructions:u # 0.79 insn per cycle + # 0.00 stalled cycles per insn (74.99%) + 9.977924992 seconds time elapsed ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/gcheck.exe -p 64 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/gcheck.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = HIP:MIX+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.111591e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.111914e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.111956e+04 ) sec^-1 -MeanMatrixElemValue = ( 1.856249e-04 +- 8.329951e-05 ) GeV^-6 -TOTAL : 3.413150 sec - 10,755,861,454 cycles # 2.876 GHz - 23,518,245,564 instructions # 2.19 insn per cycle - 3.796500341 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 3.224267e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.227630e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.227658e+03 ) sec^-1 +MeanMatrixElemValue = ( 1.221264e+00 +- 1.219329e+00 ) GeV^-6 +TOTAL : 9.529095 sec + 33,040,071,235 cycles:u # 3.458 GHz (74.96%) + 3,926,744 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.98%) + 55,408,476 stalled-cycles-backend:u # 0.17% backend cycles idle (75.03%) + 26,027,529,168 instructions:u # 0.79 insn per cycle + # 0.00 stalled cycles per insn (75.05%) + 9.578892739 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/fgcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 9.872263e-03 -Avg ME (F77/CUDA) = 9.8722599015656498E-003 -Relative difference = 3.1385249252060663e-07 +Avg ME (F77/CUDA) = 9.8722599015656533E-003 +Relative difference = 3.138524921691728e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe 
/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/check.exe -p 1 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/check.exe -p 1 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.912565e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.912803e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.912803e+01 ) sec^-1 -MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 6.680088 sec - 19,228,329,737 cycles # 2.877 GHz - 55,419,296,273 instructions # 2.88 insn per cycle - 6.685533383 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 1.020515e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.020542e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.020542e+02 ) sec^-1 +MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 +TOTAL : 5.175128 sec + 18,196,693,033 cycles:u # 3.502 GHz (74.91%) + 28,359,547 stalled-cycles-frontend:u # 0.16% frontend cycles idle (74.92%) + 2,265,467,080 stalled-cycles-backend:u # 12.45% backend cycles idle (74.97%) + 55,450,832,000 instructions:u # 3.05 insn per cycle + # 0.04 stalled cycles per insn (75.04%) + 5.199149057 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:44806) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595861831675E-003 Relative difference = 3.457988134687711e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/check.exe -p 1 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.515454e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.515537e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.515537e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 3.490021 sec - 9,348,051,078 cycles # 2.676 GHz - 25,823,110,897 instructions # 2.76 insn per cycle - 3.495053121 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 2.338198e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.338337e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.338337e+02 ) sec^-1 +MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 +TOTAL : 2.259325 sec + 7,979,854,438 cycles:u # 3.497 GHz (74.80%) + 1,426,621 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.98%) + 851,798,980 stalled-cycles-backend:u # 10.67% backend cycles idle (75.11%) + 25,830,275,782 instructions:u # 3.24 insn per cycle + # 0.03 stalled cycles per insn (75.11%) + 2.285086237 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:96765) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722594844308162E-003 Relative difference = 3.5610570575237004e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/check.exe -p 1 256 2 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.556805e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.557285e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.557285e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 1.490221 sec - 4,003,060,439 cycles # 2.680 GHz - 9,098,942,911 instructions # 2.27 insn per cycle - 1.495311791 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:83378) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 5.471821e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.472557e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.472557e+02 ) sec^-1 +MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 +TOTAL : 0.966338 sec + 3,444,953,941 cycles:u # 3.485 GHz (75.03%) + 1,775,471 stalled-cycles-frontend:u # 0.05% frontend cycles idle (74.92%) + 287,668,953 stalled-cycles-backend:u # 8.35% backend cycles idle (74.92%) + 9,128,413,240 instructions:u # 2.65 insn per cycle + # 0.03 stalled cycles per insn (74.92%) + 0.991652980 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:83360) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/runTest.exe +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722594324461913E-003 Relative difference = 3.613714310412983e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd1/check.exe -p 1 256 2 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.083203e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.083821e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.083821e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 1.299137 sec - 3,488,850,980 cycles # 2.678 GHz - 8,010,474,997 instructions # 2.30 insn per cycle - 1.304443015 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:78540) (512y: 70) (512z: 0) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 9.872263e-03 -Avg ME (F77/C++) = 9.8722594324461913E-003 -Relative difference = 3.613714310412983e-07 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd1/check.exe -p 1 256 2 OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.440905e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.441442e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.441442e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 1.541232 sec - 2,598,862,718 cycles # 1.682 GHz - 4,064,975,706 instructions # 1.56 insn per cycle - 1.546247038 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1420) (512y: 70) (512z:78026) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. 
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 9.872263e-03 -Avg ME (F77/C++) = 9.8722594324461913E-003 -Relative difference = 3.613714310412983e-07 -OK (relative difference <= 5E-3) +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt index 5667ce458e..69902e7516 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt @@ -1,223 +1,108 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasCurand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) +RNDGEN=hasNoCurand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2024-01-30_05:04:25 +DATE: 2024-01-31_13:58:50 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 10 OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.650880e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.304183e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.677107e+07 ) sec^-1 -MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.453043 sec - 1,889,864,608 cycles # 2.824 GHz - 2,684,689,341 instructions # 1.42 insn per cycle - 0.749142975 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe: Segmentation fault + 762,819,983 cycles:u # 2.210 GHz (74.53%) + 2,579,368 stalled-cycles-frontend:u # 0.34% frontend cycles idle (75.81%) + 24,249,934 stalled-cycles-backend:u # 3.18% backend cycles idle (75.94%) + 1,261,592,345 instructions:u # 1.65 insn per cycle + # 0.02 stalled cycles per insn (74.30%) + 0.374903403 seconds time elapsed ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.266493e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.111955e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.526543e+07 ) sec^-1 -MeanMatrixElemValue = ( 2.602505e+02 +- 2.116328e+02 ) GeV^-2 -TOTAL : 0.538455 sec - 2,216,644,376 cycles # 2.828 GHz - 3,102,394,165 instructions # 1.40 insn per cycle - 0.841378524 seconds time elapsed +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe: Segmentation fault + 2,642,925,381 cycles:u # 2.746 GHz (75.05%) + 14,864,654 stalled-cycles-frontend:u # 0.56% frontend cycles idle (75.07%) + 291,338,514 stalled-cycles-backend:u # 11.02% backend cycles idle (74.64%) + 2,547,011,174 instructions:u # 0.96 insn per cycle + # 0.11 stalled cycles per insn (74.36%) + 0.985874750 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 1.424749e-01 -Avg ME (F77/CUDA) = 0.14247482467490466 -Relative difference = 5.286902838873106e-07 -OK (relative difference <= 5E-3) +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +Memory access fault by GPU node-4 (Agent handle: 0x6939ee0) on address 0x1456d31c9000. Reason: Unknown. + +Program received signal SIGABRT: Process abort signal. + +Backtrace for this error: +#0 0x145968539dbf in ??? +#1 0x145968539d2b in ??? +#2 0x14596853b3e4 in ??? +#3 0x145960a0cb64 in ??? +#4 0x145960a09b38 in ??? +#5 0x1459609c7496 in ??? +#6 0x1459684d36e9 in ??? +#7 0x14596860749e in ??? +#8 0xffffffffffffffff in ??? +Avg ME (C++/CUDA) = +Avg ME (F77/CUDA) = +ERROR! 
Fortran calculation (F77/CUDA) crashed ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.822300e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.003024e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.003024e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 1.695633 sec - 4,892,910,077 cycles # 2.883 GHz - 13,801,787,359 instructions # 2.82 insn per cycle - 1.705964185 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 1.173381e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.193114e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.193114e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.914935e+02 +- 1.163297e+02 ) GeV^-2 +TOTAL : 1.419762 sec + 5,029,284,507 cycles:u # 3.488 GHz (74.89%) + 2,404,209 stalled-cycles-frontend:u # 0.05% frontend cycles idle (75.04%) + 674,718,308 stalled-cycles-backend:u # 13.42% backend cycles idle (75.04%) + 13,812,210,452 instructions:u # 2.75 insn per cycle + # 0.05 stalled cycles per insn (75.04%) + 1.444206774 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1166) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482467499481 -Relative difference = 5.286896511435107e-07 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/check.exe -p 64 256 10 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.896648e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.972375e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.972375e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.887238 sec - 2,571,261,116 cycles # 2.883 GHz - 7,401,200,610 instructions # 2.88 insn per cycle - 0.906229412 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2895) (avx2: 0) (512y: 0) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482467499475 -Relative difference = 5.286896515331313e-07 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/check.exe -p 64 256 10 OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.154928e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.367723e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.367723e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.543357 sec - 1,480,133,709 cycles # 2.701 GHz - 3,136,765,286 instructions # 2.12 insn per cycle - 0.561297241 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2890) (512y: 0) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. 
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482467492595 -Relative difference = 5.286901344678233e-07 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/check.exe -p 64 256 10 OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.571891e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.844626e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.844626e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.482249 sec - 1,314,348,676 cycles # 2.699 GHz - 2,923,288,921 instructions # 2.22 insn per cycle - 0.498803372 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2543) (512y: 93) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482467492595 -Relative difference = 5.286901344678233e-07 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/check.exe -p 64 256 10 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.408041e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.532332e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.532332e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.706292 sec - 1,273,944,985 cycles # 1.792 GHz - 1,900,262,296 instructions # 1.49 insn per cycle - 0.723222352 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1135) (512y: 62) (512z: 2165) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482467492595 -Relative difference = 5.286901344678233e-07 -OK (relative difference <= 5E-3) -========================================================================= - -TEST COMPLETED +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/runTest.exe +Memory access fault by GPU node-4 (Agent handle: 0x63f5d0) on address 0x153411a99000. Reason: Unknown. diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0_bridge.txt index 7b59743406..32380b0244 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0_bridge.txt @@ -1,240 +1,115 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasCurand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) +RNDGEN=hasNoCurand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2024-01-30_05:48:29 +DATE: 2024-01-31_14:41:01 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 10 --bridge OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -WARNING! 
Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.408359e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.101986e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.101986e+07 ) sec^-1 -MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.481872 sec - 1,962,034,459 cycles # 2.824 GHz - 2,925,170,965 instructions # 1.49 insn per cycle - 0.753942373 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe: Aborted + 933,839,268 cycles:u # 2.433 GHz (74.16%) + 2,759,499 stalled-cycles-frontend:u # 0.30% frontend cycles idle (74.40%) + 38,991,447 stalled-cycles-backend:u # 4.18% backend cycles idle (75.69%) + 1,391,319,621 instructions:u # 1.49 insn per cycle + # 0.03 stalled cycles per insn (75.61%) + 0.714875699 seconds time elapsed ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.119182e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.257748e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.257748e+07 ) sec^-1 -MeanMatrixElemValue = ( 2.602505e+02 +- 2.116328e+02 ) GeV^-2 -TOTAL : 0.770186 sec - 2,924,566,680 cycles # 2.837 GHz - 4,475,846,392 instructions # 1.53 insn per cycle - 1.089161093 seconds time elapsed +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe: Aborted + 3,210,881,286 cycles:u # 2.813 GHz (74.81%) + 30,342,476 stalled-cycles-frontend:u # 0.94% frontend cycles idle (75.10%) + 856,548,279 stalled-cycles-backend:u # 26.68% backend cycles idle (75.42%) + 3,341,623,817 instructions:u # 1.04 insn per cycle + # 0.26 stalled cycles per insn (75.24%) + 1.262099462 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 1.424749e-01 -Avg ME (F77/CUDA) = 0.14247482467490466 -Relative difference = 5.286902838873106e-07 -OK (relative difference <= 5E-3) +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +Memory access fault by GPU node-4 (Agent handle: 0x6939ee0) on address 0x153f85189000. Reason: Unknown. + +Program received signal SIGABRT: Process abort signal. + +Backtrace for this error: +#0 0x15421a4fbdbf in ??? +#1 0x15421a4fbd2b in ??? +#2 0x15421a4fd3e4 in ??? +#3 0x1542129ceb64 in ??? +#4 0x1542129cbb38 in ??? +#5 0x154212989496 in ??? +#6 0x15421a4956e9 in ??? +#7 0x15421a5c949e in ??? +#8 0xffffffffffffffff in ??? +Avg ME (C++/CUDA) = +Avg ME (F77/CUDA) = +ERROR! Fortran calculation (F77/CUDA) crashed ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.824024e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.002926e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.002926e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 1.698586 sec - 4,927,814,709 cycles # 2.894 GHz - 13,806,118,322 instructions # 2.80 insn per cycle - 1.704123738 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 1.172468e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.192155e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.192155e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.914935e+02 +- 1.163297e+02 ) GeV^-2 +TOTAL : 1.424936 sec + 5,033,997,033 cycles:u # 3.476 GHz (74.84%) + 2,462,644 stalled-cycles-frontend:u # 0.05% frontend cycles idle (75.07%) + 665,583,775 stalled-cycles-backend:u # 13.22% backend cycles idle (75.15%) + 13,809,895,327 instructions:u # 2.74 insn per cycle + # 0.05 stalled cycles per insn (75.15%) + 1.450408660 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1166) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482467499481 -Relative difference = 5.286896511435107e-07 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.886173e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.963508e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.963508e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.900867 sec - 2,618,017,951 cycles # 2.892 GHz - 7,450,102,141 instructions # 2.85 insn per cycle - 0.906367581 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2895) (avx2: 0) (512y: 0) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482467499475 -Relative difference = 5.286896515331313e-07 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.122916e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.345144e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.345144e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.557525 sec - 1,528,674,468 cycles # 2.721 GHz - 3,187,083,360 instructions # 2.08 insn per cycle - 0.563020024 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2890) (512y: 0) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. 
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482467492595 -Relative difference = 5.286901344678233e-07 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.528840e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.810605e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.810605e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.496872 sec - 1,359,999,193 cycles # 2.712 GHz - 2,973,904,476 instructions # 2.19 insn per cycle - 0.502643224 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2543) (512y: 93) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482467492595 -Relative difference = 5.286901344678233e-07 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.332416e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.457397e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.457397e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.738182 sec - 1,327,509,915 cycles # 1.788 GHz - 1,939,124,841 instructions # 1.46 insn per cycle - 0.743808066 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1135) (512y: 62) (512z: 2165) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482467492595 -Relative difference = 5.286901344678233e-07 -OK (relative difference <= 5E-3) -========================================================================= - -TEST COMPLETED +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/runTest.exe +Memory access fault by GPU node-4 (Agent handle: 0x63f5d0) on address 0x14e81f6b9000. Reason: Unknown. diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd1.txt index 4deacb88f2..bccb6906c3 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd1.txt @@ -1,223 +1,108 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasCurand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) +RNDGEN=hasNoCurand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.512y_d_inl0_hrd1' +CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.none_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2024-01-30_05:04:44 +DATE: 2024-01-31_13:59:04 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/gcheck.exe -p 64 256 10 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.642894e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.200887e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.567165e+07 ) sec^-1 -MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.451244 sec - 1,883,873,657 cycles # 2.821 GHz - 2,671,262,226 instructions # 1.42 insn per cycle - 0.747348766 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/gcheck.exe -p 64 256 1 +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/gcheck.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/gcheck.exe: Segmentation fault + 740,918,663 cycles:u # 2.126 GHz (74.83%) + 2,590,438 stalled-cycles-frontend:u # 0.35% frontend cycles idle (76.36%) + 35,134,179 stalled-cycles-backend:u # 4.74% backend cycles idle (78.16%) + 1,223,153,146 instructions:u # 1.65 insn per cycle + # 0.03 stalled cycles per insn (74.88%) + 0.399372258 seconds time elapsed ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.228371e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.990649e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.395918e+07 ) sec^-1 -MeanMatrixElemValue = ( 2.602505e+02 +- 2.116328e+02 ) GeV^-2 -TOTAL : 0.540844 sec - 2,218,030,903 cycles # 2.829 GHz - 3,154,136,532 instructions # 1.42 insn per cycle - 0.843504278 seconds time elapsed +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/gcheck.exe: Segmentation fault + 2,633,479,698 cycles:u # 2.738 GHz (75.04%) + 21,030,407 stalled-cycles-frontend:u # 0.80% frontend cycles idle (75.06%) + 860,456,505 stalled-cycles-backend:u # 32.67% backend cycles idle (74.56%) + 2,537,896,925 instructions:u # 0.96 insn per cycle + # 0.34 stalled cycles per insn (74.39%) + 0.985747886 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 1.424749e-01 -Avg ME (F77/CUDA) = 0.14247482467490466 -Relative difference = 5.286902838873106e-07 -OK (relative difference <= 5E-3) +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/fgcheck.exe 2 64 2 +Memory access fault by GPU node-4 (Agent handle: 0x6939e30) on address 0x152882c79000. Reason: Unknown. + +Program received signal SIGABRT: Process abort signal. + +Backtrace for this error: +#0 0x152b17febdbf in ??? +#1 0x152b17febd2b in ??? +#2 0x152b17fed3e4 in ??? +#3 0x152b104beb64 in ??? +#4 0x152b104bbb38 in ??? +#5 0x152b10479496 in ??? +#6 0x152b17f856e9 in ??? +#7 0x152b180b949e in ??? +#8 0xffffffffffffffff in ??? +Avg ME (C++/CUDA) = +Avg ME (F77/CUDA) = +ERROR! Fortran calculation (F77/CUDA) crashed ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.831536e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.003712e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.003712e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 1.690067 sec - 4,884,610,591 cycles # 2.883 GHz - 13,807,943,276 instructions # 2.83 insn per cycle - 1.700194727 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 1.173724e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.193378e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.193378e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.914935e+02 +- 1.163297e+02 ) GeV^-2 +TOTAL : 1.419080 sec + 5,007,260,230 cycles:u # 3.475 GHz (75.02%) + 2,377,599 stalled-cycles-frontend:u # 0.05% frontend cycles idle (75.02%) + 879,033,575 stalled-cycles-backend:u # 17.56% backend cycles idle (75.02%) + 13,839,303,650 instructions:u # 2.76 insn per cycle + # 0.06 stalled cycles per insn (75.03%) + 1.443186270 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1161) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482467499481 -Relative difference = 5.286896511435107e-07 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd1/check.exe -p 64 256 10 OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.876876e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.953061e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.953061e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.896918 sec - 2,573,000,483 cycles # 2.854 GHz - 7,407,132,972 instructions # 2.88 insn per cycle - 0.971480588 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2892) (avx2: 0) (512y: 0) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. 
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482467499475 -Relative difference = 5.286896515331313e-07 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd1/check.exe -p 64 256 10 OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.133331e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.344053e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.344053e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.546739 sec - 1,486,856,812 cycles # 2.696 GHz - 3,137,676,944 instructions # 2.11 insn per cycle - 0.563341736 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2875) (512y: 0) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482467492595 -Relative difference = 5.286901344678233e-07 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd1/check.exe -p 64 256 10 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.567673e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.839669e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.839669e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.482732 sec - 1,314,507,412 cycles # 2.697 GHz - 2,925,746,939 instructions # 2.23 insn per cycle - 0.501062508 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2527) (512y: 93) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482467492595 -Relative difference = 5.286901344678233e-07 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd1/check.exe -p 64 256 10 OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = DOUBLE (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.394430e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.516439e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.516439e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.710071 sec - 1,273,890,672 cycles # 1.782 GHz - 1,899,944,131 instructions # 1.49 insn per cycle - 0.727352268 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1118) (512y: 62) (512z: 2165) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. 
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482467492595 -Relative difference = 5.286901344678233e-07 -OK (relative difference <= 5E-3) -========================================================================= - -TEST COMPLETED +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/runTest.exe +Memory access fault by GPU node-4 (Agent handle: 0x666280) on address 0x14c889299000. Reason: Unknown. diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt index 1362a87ac8..b69cf112a4 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt @@ -1,223 +1,108 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasCurand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) +RNDGEN=hasNoCurand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2024-01-30_05:05:03 +DATE: 2024-01-31_13:59:19 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 10 OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.327203e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.210086e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.349272e+08 ) sec^-1 -MeanMatrixElemValue = ( 2.018174e+01 +- 1.429492e+01 ) GeV^-2 -TOTAL : 0.446159 sec - 1,908,363,704 cycles # 2.829 GHz - 2,678,040,252 instructions # 1.40 insn per cycle - 0.749417997 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -==PROF== Profiling "sigmaKin": launch__registers_per_thread 167 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe: Segmentation fault + 760,941,120 cycles:u # 2.190 GHz (75.67%) + 2,577,448 stalled-cycles-frontend:u # 0.34% frontend cycles idle (74.40%) + 21,547,300 stalled-cycles-backend:u # 2.83% backend cycles idle (75.80%) + 1,198,745,029 instructions:u # 1.58 insn per cycle + # 0.02 stalled cycles per insn (76.69%) + 0.375099933 seconds time elapsed ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.267889e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.817352e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.969269e+08 ) sec^-1 -MeanMatrixElemValue = ( 2.571361e+02 +- 2.114021e+02 ) GeV^-2 -TOTAL : 0.483640 sec - 2,013,701,507 cycles # 2.833 GHz - 2,869,047,503 instructions # 1.42 insn per cycle - 0.770237631 seconds time elapsed +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe: Segmentation fault + 2,539,411,178 cycles:u # 2.802 GHz (75.28%) + 21,101,958 stalled-cycles-frontend:u # 0.83% frontend cycles idle (75.32%) + 853,164,959 stalled-cycles-backend:u # 33.60% backend cycles idle (75.30%) + 2,424,669,574 instructions:u # 0.95 insn per cycle + # 0.35 stalled cycles per insn (75.22%) + 0.944209481 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 1.424226e-01 -Avg ME (F77/CUDA) = 0.14247488790821983 -Relative difference = 0.00036713209996037764 -OK (relative difference <= 5E-3) +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +Memory access fault by GPU node-4 (Agent handle: 0x6937f00) on address 0x150f923fc000. Reason: Unknown. + +Program received signal SIGABRT: Process abort signal. + +Backtrace for this error: +#0 0x151227767dbf in ??? +#1 0x151227767d2b in ??? +#2 0x1512277693e4 in ??? +#3 0x15121fc3ab64 in ??? +#4 0x15121fc37b38 in ??? +#5 0x15121fbf5496 in ??? +#6 0x1512277016e9 in ??? +#7 0x15122783549e in ??? +#8 0xffffffffffffffff in ??? +Avg ME (C++/CUDA) = +Avg ME (F77/CUDA) = +ERROR! 
Fortran calculation (F77/CUDA) crashed ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.109983e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.136218e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.136218e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018563e+01 +- 1.429902e+01 ) GeV^-2 -TOTAL : 1.498447 sec - 4,345,988,139 cycles # 2.893 GHz - 12,596,967,872 instructions # 2.90 insn per cycle - 1.511882134 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 1.428037e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.458241e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.458241e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.945525e+02 +- 1.186197e+02 ) GeV^-2 +TOTAL : 1.168612 sec + 4,133,128,597 cycles:u # 3.471 GHz (75.15%) + 2,028,802 stalled-cycles-frontend:u # 0.05% frontend cycles idle (75.15%) + 246,889,606 stalled-cycles-backend:u # 5.97% backend cycles idle (74.82%) + 12,624,578,314 instructions:u # 3.05 insn per cycle + # 0.02 stalled cycles per insn (74.82%) + 1.192857887 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 773) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.424686e-01 -Avg ME (F77/C++) = 0.14246860569653919 -Relative difference = 3.998452420257791e-08 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/check.exe -p 64 256 10 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.116392e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.330955e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.330955e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018563e+01 +- 1.429902e+01 ) GeV^-2 -TOTAL : 0.547469 sec - 1,595,191,710 cycles # 2.889 GHz - 4,246,785,925 instructions # 2.66 insn per cycle - 0.566121323 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 3265) (avx2: 0) (512y: 0) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.424686e-01 -Avg ME (F77/C++) = 0.14246860808920836 -Relative difference = 5.677888572434963e-08 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/check.exe -p 64 256 10 OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.705122e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.431194e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.431194e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2 -TOTAL : 0.309154 sec - 853,106,357 cycles # 2.719 GHz - 1,916,236,758 instructions # 2.25 insn per cycle - 0.322202646 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3488) (512y: 0) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. 
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247490815036912 -Relative difference = 5.7205649062398515e-08 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/check.exe -p 64 256 10 OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.291153e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.186493e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.186493e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2 -TOTAL : 0.282114 sec - 781,605,305 cycles # 2.726 GHz - 1,797,850,243 instructions # 2.30 insn per cycle - 0.301017972 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3186) (512y: 15) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247490815036912 -Relative difference = 5.7205649062398515e-08 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/check.exe -p 64 256 10 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.544342e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.998908e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.998908e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018829e+01 +- 1.429922e+01 ) GeV^-2 -TOTAL : 0.384301 sec - 720,859,118 cycles # 1.854 GHz - 1,288,039,773 instructions # 1.79 insn per cycle - 0.402338897 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1730) (512y: 24) (512z: 2387) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247490450137867 -Relative difference = 3.159418737238044e-08 -OK (relative difference <= 5E-3) -========================================================================= - -TEST COMPLETED +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/runTest.exe +Memory access fault by GPU node-4 (Agent handle: 0x61d1a0) on address 0x14bb83fc4000. Reason: Unknown. diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0_bridge.txt index 8cb59221d4..3197cced27 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0_bridge.txt @@ -1,240 +1,115 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux -OMPFLAGS=-fopenmp -AVX=512y +Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux +OMPFLAGS= +AVX=avx2 FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasCurand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) +RNDGEN=hasNoCurand +Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2024-01-30_05:48:48 +DATE: 2024-01-31_14:41:16 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 10 --bridge OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -WARNING! 
Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.444132e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.000397e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.000397e+07 ) sec^-1 -MeanMatrixElemValue = ( 2.017654e+01 +- 1.429184e+01 ) GeV^-2 -TOTAL : 0.460393 sec - 1,902,959,760 cycles # 2.835 GHz - 2,813,040,217 instructions # 1.48 insn per cycle - 0.731144371 seconds time elapsed -runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -==PROF== Profiling "sigmaKin": launch__registers_per_thread 167 -==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe: Segmentation fault + 802,769,560 cycles:u # 2.308 GHz (73.70%) + 2,883,518 stalled-cycles-frontend:u # 0.36% frontend cycles idle (75.71%) + 21,503,278 stalled-cycles-backend:u # 2.68% backend cycles idle (74.79%) + 1,254,584,929 instructions:u # 1.56 insn per cycle + # 0.02 stalled cycles per insn (74.79%) + 0.371915527 seconds time elapsed ......................................................................... -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost -WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.962207e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.533260e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.533260e+07 ) sec^-1 -MeanMatrixElemValue = ( 2.609942e+02 +- 2.115590e+02 ) GeV^-2 -TOTAL : 0.631701 sec - 2,471,933,418 cycles # 2.836 GHz - 3,725,494,141 instructions # 1.51 insn per cycle - 0.929474422 seconds time elapsed +/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe: Segmentation fault + 2,881,932,730 cycles:u # 2.848 GHz (74.69%) + 29,590,208 stalled-cycles-frontend:u # 1.03% frontend cycles idle (75.61%) + 854,011,561 stalled-cycles-backend:u # 29.63% backend cycles idle (75.50%) + 3,070,710,792 instructions:u # 1.07 insn per cycle + # 0.28 stalled cycles per insn (75.46%) + 1.032695848 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 1.424226e-01 -Avg ME (F77/CUDA) = 0.14247488790821983 -Relative difference = 0.00036713209996037764 -OK (relative difference <= 5E-3) +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +Memory access fault by GPU node-4 (Agent handle: 0x6937f00) on address 0x1463d05f4000. Reason: Unknown. + +Program received signal SIGABRT: Process abort signal. + +Backtrace for this error: +#0 0x14666595fdbf in ??? +#1 0x14666595fd2b in ??? +#2 0x1466659613e4 in ??? +#3 0x14665de32b64 in ??? +#4 0x14665de2fb38 in ??? +#5 0x14665dded496 in ??? +#6 0x1466658f96e9 in ??? +#7 0x146665a2d49e in ??? +#8 0xffffffffffffffff in ??? +Avg ME (C++/CUDA) = +Avg ME (F77/CUDA) = +ERROR! Fortran calculation (F77/CUDA) crashed ========================================================================= -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.095228e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.121318e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.121318e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018563e+01 +- 1.429902e+01 ) GeV^-2 -TOTAL : 1.522701 sec - 4,367,827,655 cycles # 2.862 GHz - 12,601,331,452 instructions # 2.89 insn per cycle - 1.527862957 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 1.426628e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.456791e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.456791e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.945525e+02 +- 1.186197e+02 ) GeV^-2 +TOTAL : 1.171749 sec + 4,150,620,502 cycles:u # 3.475 GHz (74.60%) + 2,341,963 stalled-cycles-frontend:u # 0.06% frontend cycles idle (74.71%) + 249,108,394 stalled-cycles-backend:u # 6.00% backend cycles idle (75.00%) + 12,628,090,084 instructions:u # 3.04 insn per cycle + # 0.02 stalled cycles per insn (75.23%) + 1.196363960 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 773) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.424686e-01 -Avg ME (F77/C++) = 0.14246860569653919 -Relative difference = 3.998452420257791e-08 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -FP precision = FLOAT (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.075499e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.292736e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.292736e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018563e+01 +- 1.429902e+01 ) GeV^-2 -TOTAL : 0.559885 sec - 1,623,222,211 cycles # 2.878 GHz - 4,293,732,841 instructions # 2.65 insn per cycle - 0.565168184 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 3265) (avx2: 0) (512y: 0) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/runTest.exe -[ PASSED ] 6 tests. 
--------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 1.424686e-01
-Avg ME (F77/C++) = 0.14246860808920836
-Relative difference = 5.677888572434963e-08
-OK (relative difference <= 5E-3)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP=
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-WARNING! Instantiate host Bridge (nevt=16384)
-Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK
-FP precision = FLOAT (NaN/abnormal=0, zero=0)
-Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 5.618798e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.338072e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.338072e+05 ) sec^-1
-MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2
-TOTAL : 0.317968 sec
- 874,954,516 cycles # 2.715 GHz
- 1,952,010,632 instructions # 2.23 insn per cycle
- 0.323135602 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3488) (512y: 0) (512z: 0)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/runTest.exe
-[ PASSED ] 6 tests.
--------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 1.424749e-01
-Avg ME (F77/C++) = 0.14247490815036912
-Relative difference = 5.7205649062398515e-08
-OK (relative difference <= 5E-3)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP=
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-WARNING! Instantiate host Bridge (nevt=16384)
-Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK
-FP precision = FLOAT (NaN/abnormal=0, zero=0)
-Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 6.140069e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.015278e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.015278e+05 ) sec^-1
-MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2
-TOTAL : 0.296217 sec
- 805,080,990 cycles # 2.697 GHz
- 1,834,280,964 instructions # 2.28 insn per cycle
- 0.301462842 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3186) (512y: 15) (512z: 0)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/runTest.exe
-[ PASSED ] 6 tests.
--------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 1.424749e-01
-Avg ME (F77/C++) = 0.14247490815036912
-Relative difference = 5.7205649062398515e-08
-OK (relative difference <= 5E-3)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP=
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-WARNING! Instantiate host Bridge (nevt=16384)
-Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK
-FP precision = FLOAT (NaN/abnormal=0, zero=0)
-Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 4.472935e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.920053e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.920053e+05 ) sec^-1
-MeanMatrixElemValue = ( 2.018829e+01 +- 1.429922e+01 ) GeV^-2
-TOTAL : 0.395002 sec
- 745,120,207 cycles # 1.866 GHz
- 1,329,072,598 instructions # 1.78 insn per cycle
- 0.400211929 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1730) (512y: 24) (512z: 2387)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/runTest.exe
-[ PASSED ] 6 tests.
--------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 1.424749e-01
-Avg ME (F77/C++) = 0.14247490450137867
-Relative difference = 3.159418737238044e-08
-OK (relative difference <= 5E-3)
-=========================================================================
-
-TEST COMPLETED
+runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/runTest.exe
+Memory access fault by GPU node-4 (Agent handle: 0x61d1a0) on address 0x15313d7cc000. Reason: Unknown.
diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd1.txt
index a71ead3e03..0cf6149acc 100644
--- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd1.txt
+++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd1.txt
@@ -1,223 +1,108 @@
export CUDACPP_RUNTIME_ENABLEFPE=on
-Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux
-OMPFLAGS=-fopenmp
-AVX=512y
+Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux
+OMPFLAGS=
+AVX=avx2
FPTYPE=d
HELINL=0
HRDCOD=0
-RNDGEN=hasCurand
-Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1)
+RNDGEN=hasNoCurand
+Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1)
make: Nothing to be done for 'gtestlibs'.
-CUDACPP_BUILDDIR='build.512y_f_inl0_hrd1'
+CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd1'
make USEBUILDDIR=1 AVX=none
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
CUDACPP_BUILDDIR='build.none_f_inl0_hrd1'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
make USEBUILDDIR=1 AVX=sse4
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd1'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
make USEBUILDDIR=1 AVX=avx2
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd1'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
make USEBUILDDIR=1 AVX=512y
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
CUDACPP_BUILDDIR='build.512y_f_inl0_hrd1'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
make USEBUILDDIR=1 AVX=512z
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
CUDACPP_BUILDDIR='build.512z_f_inl0_hrd1'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
-DATE: 2024-01-30_05:05:20
+DATE: 2024-01-31_13:59:33
-On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
+On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]:
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/gcheck.exe -p 64 256 10 OMP=
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1]
-Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 5.328749e+07 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.215965e+08 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.352409e+08 ) sec^-1
-MeanMatrixElemValue = ( 2.018174e+01 +- 1.429492e+01 ) GeV^-2
-TOTAL : 0.447516 sec
- 1,904,038,107 cycles # 2.819 GHz
- 2,679,740,960 instructions # 1.41 insn per cycle
- 0.754557698 seconds time elapsed
-runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/gcheck.exe -p 64 256 1
+runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/gcheck.exe -p 64 256 10 OMP=
WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-==PROF== Profiling "sigmaKin": launch__registers_per_thread 167
-==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
+/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/gcheck.exe: Segmentation fault
+ 758,925,185 cycles:u # 2.206 GHz (74.25%)
+ 2,574,941 stalled-cycles-frontend:u # 0.34% frontend cycles idle (74.49%)
+ 28,586,343 stalled-cycles-backend:u # 3.77% backend cycles idle (77.25%)
+ 1,212,929,372 instructions:u # 1.60 insn per cycle
+ # 0.02 stalled cycles per insn (77.63%)
+ 0.366863310 seconds time elapsed
.........................................................................
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP=
+runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP=
WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1]
-Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 7.182679e+07 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.774687e+08 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.914662e+08 ) sec^-1
-MeanMatrixElemValue = ( 2.571361e+02 +- 2.114021e+02 ) GeV^-2
-TOTAL : 0.483893 sec
- 2,007,596,287 cycles # 2.824 GHz
- 2,863,986,921 instructions # 1.43 insn per cycle
- 0.770182944 seconds time elapsed
+/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/gcheck.exe: Segmentation fault
+ 2,598,152,662 cycles:u # 2.864 GHz (74.45%)
+ 20,928,138 stalled-cycles-frontend:u # 0.81% frontend cycles idle (75.39%)
+ 845,268,440 stalled-cycles-backend:u # 32.53% backend cycles idle (75.31%)
+ 2,412,052,205 instructions:u # 0.93 insn per cycle
+ # 0.35 stalled cycles per insn (75.26%)
+ 0.930068923 seconds time elapsed
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/gcheck.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/fgcheck.exe 2 64 2
-Avg ME (C++/CUDA) = 1.424226e-01
-Avg ME (F77/CUDA) = 0.14247488790821983
-Relative difference = 0.00036713209996037764
-OK (relative difference <= 5E-3)
+cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/gcheck.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/fgcheck.exe 2 64 2
+Memory access fault by GPU node-4 (Agent handle: 0x6937e50) on address 0x15510025c000. Reason: Unknown.
+
+Program received signal SIGABRT: Process abort signal.
+
+Backtrace for this error:
+#0 0x1553955c7dbf in ???
+#1 0x1553955c7d2b in ???
+#2 0x1553955c93e4 in ???
+#3 0x15538da9ab64 in ???
+#4 0x15538da97b38 in ???
+#5 0x15538da55496 in ???
+#6 0x1553955616e9 in ???
+#7 0x15539569549e in ???
+#8 0xffffffffffffffff in ???
+Avg ME (C++/CUDA) =
+Avg ME (F77/CUDA) =
+ERROR! Fortran calculation (F77/CUDA) crashed
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/check.exe -p 64 256 10 OMP=
+runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/check.exe -p 64 256 10 OMP=
WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
+Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1]
+Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK
FP precision = FLOAT (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.104449e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.131163e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.131163e+05 ) sec^-1
-MeanMatrixElemValue = ( 2.018563e+01 +- 1.429902e+01 ) GeV^-2
-TOTAL : 1.505306 sec
- 4,350,737,729 cycles # 2.883 GHz
- 12,588,700,465 instructions # 2.89 insn per cycle
- 1.517040580 seconds time elapsed
+EvtsPerSec[Rmb+ME] (23) = ( 1.424138e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.454255e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.454255e+05 ) sec^-1
+MeanMatrixElemValue = ( 1.945525e+02 +- 1.186197e+02 ) GeV^-2
+TOTAL : 1.171627 sec
+ 4,157,880,455 cycles:u # 3.484 GHz (74.58%)
+ 2,042,320 stalled-cycles-frontend:u # 0.05% frontend cycles idle (74.76%)
+ 521,141,426 stalled-cycles-backend:u # 12.53% backend cycles idle (75.04%)
+ 12,614,300,431 instructions:u # 3.03 insn per cycle
+ # 0.04 stalled cycles per insn (75.21%)
+ 1.195482351 seconds time elapsed
=Symbols in CPPProcess.o= (~sse4: 759) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/runTest.exe
-[ PASSED ] 6 tests.
--------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 1.424686e-01
-Avg ME (F77/C++) = 0.14246860569653919
-Relative difference = 3.998452420257791e-08
-OK (relative difference <= 5E-3)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd1/check.exe -p 64 256 10 OMP=
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
-FP precision = FLOAT (NaN/abnormal=0, zero=0)
-Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 3.107801e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.322563e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.322563e+05 ) sec^-1
-MeanMatrixElemValue = ( 2.018563e+01 +- 1.429902e+01 ) GeV^-2
-TOTAL : 0.548678 sec
- 1,589,053,041 cycles # 2.872 GHz
- 4,241,478,972 instructions # 2.67 insn per cycle
- 0.565533397 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 3248) (avx2: 0) (512y: 0) (512z: 0)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd1/runTest.exe
-[ PASSED ] 6 tests.
--------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd1/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd1/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 1.424686e-01
-Avg ME (F77/C++) = 0.14246860808920836
-Relative difference = 5.677888572434963e-08
-OK (relative difference <= 5E-3)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd1/check.exe -p 64 256 10 OMP=
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
-FP precision = FLOAT (NaN/abnormal=0, zero=0)
-Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 5.682195e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.406347e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.406347e+05 ) sec^-1
-MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2
-TOTAL : 0.310286 sec
- 851,032,417 cycles # 2.702 GHz
- 1,913,907,734 instructions # 2.25 insn per cycle
- 0.327654627 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3463) (512y: 0) (512z: 0)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd1/runTest.exe
-[ PASSED ] 6 tests.
--------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd1/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd1/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 1.424749e-01
-Avg ME (F77/C++) = 0.14247490815036912
-Relative difference = 5.7205649062398515e-08
-OK (relative difference <= 5E-3)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd1/check.exe -p 64 256 10 OMP=
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
-FP precision = FLOAT (NaN/abnormal=0, zero=0)
-Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 6.251030e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.131063e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.131063e+05 ) sec^-1
-MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2
-TOTAL : 0.283621 sec
- 779,432,148 cycles # 2.704 GHz
- 1,795,928,128 instructions # 2.30 insn per cycle
- 0.301196370 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3164) (512y: 15) (512z: 0)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd1/runTest.exe
-[ PASSED ] 6 tests.
--------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd1/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd1/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 1.424749e-01
-Avg ME (F77/C++) = 0.14247490815036912
-Relative difference = 5.7205649062398515e-08
-OK (relative difference <= 5E-3)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd1/check.exe -p 64 256 10 OMP=
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
-FP precision = FLOAT (NaN/abnormal=0, zero=0)
-Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 4.530328e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.979352e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.979352e+05 ) sec^-1
-MeanMatrixElemValue = ( 2.018829e+01 +- 1.429922e+01 ) GeV^-2
-TOTAL : 0.386557 sec
- 722,333,254 cycles # 1.844 GHz
- 1,287,373,146 instructions # 1.78 insn per cycle
- 0.407217093 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1709) (512y: 24) (512z: 2387)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd1/runTest.exe
-[ PASSED ] 6 tests.
--------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd1/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd1/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 1.424749e-01
-Avg ME (F77/C++) = 0.14247490450137867
-Relative difference = 3.159418737238044e-08
-OK (relative difference <= 5E-3)
-=========================================================================
-
-TEST COMPLETED
+runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/runTest.exe
+Memory access fault by GPU node-4 (Agent handle: 0x643e60) on address 0x14ffe51a4000. Reason: Unknown.
diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt
index 3f17b073e2..190fa5e35f 100644
--- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt
+++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt
@@ -1,223 +1,108 @@
export CUDACPP_RUNTIME_ENABLEFPE=on
-Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux
-OMPFLAGS=-fopenmp
-AVX=512y
+Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux
+OMPFLAGS=
+AVX=avx2
FPTYPE=d
HELINL=0
HRDCOD=0
-RNDGEN=hasCurand
-Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1)
+RNDGEN=hasNoCurand
+Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1)
make: Nothing to be done for 'gtestlibs'.
-CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0'
+CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0'
make USEBUILDDIR=1 AVX=none
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
CUDACPP_BUILDDIR='build.none_m_inl0_hrd0'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
make USEBUILDDIR=1 AVX=sse4
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
make USEBUILDDIR=1 AVX=avx2
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
make USEBUILDDIR=1 AVX=512y
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
make USEBUILDDIR=1 AVX=512z
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
-DATE: 2024-01-30_05:05:38
+DATE: 2024-01-31_13:59:47
-On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
+On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]:
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/gcheck.exe -p 64 256 10 OMP=
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-FP precision = MIXED (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 2.696364e+07 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.334716e+07 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 5.710197e+07 ) sec^-1
-MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2
-TOTAL : 0.455743 sec
- 1,899,569,009 cycles # 2.822 GHz
- 2,690,270,670 instructions # 1.42 insn per cycle
- 0.752301124 seconds time elapsed
-runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/gcheck.exe -p 64 256 1
+runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/gcheck.exe -p 64 256 10 OMP=
WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
-==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
+/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/gcheck.exe: Segmentation fault
+ 748,191,462 cycles:u # 2.174 GHz (74.27%)
+ 2,454,240 stalled-cycles-frontend:u # 0.33% frontend cycles idle (75.57%)
+ 27,578,613 stalled-cycles-backend:u # 3.69% backend cycles idle (76.65%)
+ 1,231,148,808 instructions:u # 1.65 insn per cycle
+ # 0.02 stalled cycles per insn (74.50%)
+ 0.367922296 seconds time elapsed
.........................................................................
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP=
+runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP=
WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-FP precision = MIXED (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 3.256330e+07 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.134663e+07 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.562668e+07 ) sec^-1
-MeanMatrixElemValue = ( 2.602505e+02 +- 2.116328e+02 ) GeV^-2
-TOTAL : 0.544746 sec
- 2,203,075,600 cycles # 2.810 GHz
- 3,150,811,707 instructions # 1.43 insn per cycle
- 0.843284004 seconds time elapsed
+/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/gcheck.exe: Segmentation fault
+ 2,684,163,642 cycles:u # 2.783 GHz (74.18%)
+ 21,756,682 stalled-cycles-frontend:u # 0.81% frontend cycles idle (73.20%)
+ 853,260,207 stalled-cycles-backend:u # 31.79% backend cycles idle (75.20%)
+ 2,469,863,103 instructions:u # 0.92 insn per cycle
+ # 0.35 stalled cycles per insn (76.02%)
+ 0.985386297 seconds time elapsed
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/gcheck.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/fgcheck.exe 2 64 2
-Avg ME (C++/CUDA) = 1.424749e-01
-Avg ME (F77/CUDA) = 0.14247482577104625
-Relative difference = 5.209967070245855e-07
-OK (relative difference <= 5E-3)
+cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/gcheck.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/fgcheck.exe 2 64 2
+Memory access fault by GPU node-4 (Agent handle: 0x6939ee0) on address 0x154ce61e9000. Reason: Unknown.
+
+Program received signal SIGABRT: Process abort signal.
+
+Backtrace for this error:
+#0 0x154f7b54edbf in ???
+#1 0x154f7b54ed2b in ???
+#2 0x154f7b5503e4 in ???
+#3 0x154f73a21b64 in ???
+#4 0x154f73a1eb38 in ???
+#5 0x154f739dc496 in ???
+#6 0x154f7b4e86e9 in ???
+#7 0x154f7b61c49e in ???
+#8 0xffffffffffffffff in ???
+Avg ME (C++/CUDA) =
+Avg ME (F77/CUDA) =
+ERROR! Fortran calculation (F77/CUDA) crashed
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/check.exe -p 64 256 10 OMP=
+runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/check.exe -p 64 256 10 OMP=
WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
+Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK
FP precision = MIXED (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 9.796791e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.000139e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.000139e+05 ) sec^-1
-MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2
-TOTAL : 1.696226 sec
- 4,903,205,903 cycles # 2.884 GHz
- 13,824,553,372 instructions # 2.82 insn per cycle
- 1.707005330 seconds time elapsed
+EvtsPerSec[Rmb+ME] (23) = ( 1.167145e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.186639e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.186639e+05 ) sec^-1
+MeanMatrixElemValue = ( 1.914935e+02 +- 1.163297e+02 ) GeV^-2
+TOTAL : 1.427162 sec
+ 5,051,946,561 cycles:u # 3.484 GHz (74.72%)
+ 2,305,632 stalled-cycles-frontend:u # 0.05% frontend cycles idle (75.18%)
+ 850,828,752 stalled-cycles-backend:u # 16.84% backend cycles idle (75.18%)
+ 13,842,447,352 instructions:u # 2.74 insn per cycle
+ # 0.06 stalled cycles per insn (75.19%)
+ 1.452171813 seconds time elapsed
=Symbols in CPPProcess.o= (~sse4: 1135) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/runTest.exe
-[ PASSED ] 6 tests.
--------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 1.424749e-01
-Avg ME (F77/C++) = 0.14247482734618697
-Relative difference = 5.099411406595165e-07
-OK (relative difference <= 5E-3)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd0/check.exe -p 64 256 10 OMP=
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
-FP precision = MIXED (NaN/abnormal=0, zero=0)
-Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.870381e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.944831e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.944831e+05 ) sec^-1
-MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2
-TOTAL : 0.899365 sec
- 2,603,553,029 cycles # 2.880 GHz
- 7,349,607,266 instructions # 2.82 insn per cycle
- 0.916195330 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 2967) (avx2: 0) (512y: 0) (512z: 0)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd0/runTest.exe
-[ PASSED ] 6 tests.
--------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd0/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 1.424749e-01
-Avg ME (F77/C++) = 0.14247482734618697
-Relative difference = 5.099411406595165e-07
-OK (relative difference <= 5E-3)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd0/check.exe -p 64 256 10 OMP=
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
-FP precision = MIXED (NaN/abnormal=0, zero=0)
-Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 3.167537e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.382178e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.382178e+05 ) sec^-1
-MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2
-TOTAL : 0.541013 sec
- 1,471,630,021 cycles # 2.697 GHz
- 3,084,577,547 instructions # 2.10 insn per cycle
- 0.558891839 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3008) (512y: 0) (512z: 0)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd0/runTest.exe
-[ PASSED ] 6 tests.
--------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd0/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 1.424749e-01
-Avg ME (F77/C++) = 0.14247482643254802
-Relative difference = 5.163537715318965e-07
-OK (relative difference <= 5E-3)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd0/check.exe -p 64 256 10 OMP=
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
-FP precision = MIXED (NaN/abnormal=0, zero=0)
-Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 3.661938e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.948590e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.948590e+05 ) sec^-1
-MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2
-TOTAL : 0.471097 sec
- 1,285,426,170 cycles # 2.700 GHz
- 2,873,286,331 instructions # 2.24 insn per cycle
- 0.489244149 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2653) (512y: 96) (512z: 0)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd0/runTest.exe
-[ PASSED ] 6 tests.
--------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd0/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 1.424749e-01
-Avg ME (F77/C++) = 0.14247482643254802
-Relative difference = 5.163537715318965e-07
-OK (relative difference <= 5E-3)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd0/check.exe -p 64 256 10 OMP=
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
-FP precision = MIXED (NaN/abnormal=0, zero=0)
-Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 2.322096e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.437722e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.437722e+05 ) sec^-1
-MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2
-TOTAL : 0.731479 sec
- 1,311,962,532 cycles # 1.782 GHz
- 1,915,335,630 instructions # 1.46 insn per cycle
- 0.746286183 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1493) (512y: 70) (512z: 2164)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd0/runTest.exe
-[ PASSED ] 6 tests.
--------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd0/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 1.424749e-01
-Avg ME (F77/C++) = 0.14247482643254802
-Relative difference = 5.163537715318965e-07
-OK (relative difference <= 5E-3)
-=========================================================================
-
-TEST COMPLETED
+runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/runTest.exe
+Memory access fault by GPU node-4 (Agent handle: 0x63f5d0) on address 0x146962c59000. Reason: Unknown.
diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd1.txt
index 7294ddea09..87faa41e06 100644
--- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd1.txt
+++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd1.txt
@@ -1,223 +1,108 @@
export CUDACPP_RUNTIME_ENABLEFPE=on
-Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux
-OMPFLAGS=-fopenmp
-AVX=512y
+Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux
+OMPFLAGS=
+AVX=avx2
FPTYPE=d
HELINL=0
HRDCOD=0
-RNDGEN=hasCurand
-Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1)
+RNDGEN=hasNoCurand
+Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1)
make: Nothing to be done for 'gtestlibs'.
-CUDACPP_BUILDDIR='build.512y_m_inl0_hrd1'
+CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd1'
make USEBUILDDIR=1 AVX=none
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
CUDACPP_BUILDDIR='build.none_m_inl0_hrd1'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
make USEBUILDDIR=1 AVX=sse4
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd1'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
make USEBUILDDIR=1 AVX=avx2
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd1'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
make USEBUILDDIR=1 AVX=512y
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
CUDACPP_BUILDDIR='build.512y_m_inl0_hrd1'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
make USEBUILDDIR=1 AVX=512z
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
CUDACPP_BUILDDIR='build.512z_m_inl0_hrd1'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
-DATE: 2024-01-30_05:05:56
+DATE: 2024-01-31_14:00:02
-On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
+On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]:
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/gcheck.exe -p 64 256 10 OMP=
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1]
-Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-FP precision = MIXED (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 2.635631e+07 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.151573e+07 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 5.502163e+07 ) sec^-1
-MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2
-TOTAL : 0.454560 sec
- 1,887,319,720 cycles # 2.810 GHz
- 2,686,521,155 instructions # 1.42 insn per cycle
- 0.777570467 seconds time elapsed
-runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/gcheck.exe -p 64 256 1
+runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/gcheck.exe -p 64 256 10 OMP=
WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
-==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
+/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/gcheck.exe: Segmentation fault
+ 727,187,414 cycles:u # 2.061 GHz (75.95%)
+ 2,715,028 stalled-cycles-frontend:u # 0.37% frontend cycles idle (74.84%)
+ 41,823,987 stalled-cycles-backend:u # 5.75% backend cycles idle (70.58%)
+ 1,257,681,608 instructions:u # 1.73 insn per cycle
+ # 0.03 stalled cycles per insn (74.11%)
+ 0.375261475 seconds time elapsed
.........................................................................
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP=
+runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP=
WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1]
-Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-FP precision = MIXED (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 3.262333e+07 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.007147e+07 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.410963e+07 ) sec^-1
-MeanMatrixElemValue = ( 2.602505e+02 +- 2.116328e+02 ) GeV^-2
-TOTAL : 0.538673 sec
- 2,205,224,099 cycles # 2.822 GHz
- 3,150,366,927 instructions # 1.43 insn per cycle
- 0.838863876 seconds time elapsed
+/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/gcheck.exe: Segmentation fault
+ 2,633,015,947 cycles:u # 2.735 GHz (75.09%)
+ 21,153,768 stalled-cycles-frontend:u # 0.80% frontend cycles idle (74.68%)
+ 867,174,013 stalled-cycles-backend:u # 32.93% backend cycles idle (73.99%)
+ 2,529,840,015 instructions:u # 0.96 insn per cycle
+ # 0.34 stalled cycles per insn (74.62%)
+ 0.985921030 seconds time elapsed
-------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/gcheck.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/fgcheck.exe 2 64 2
-Avg ME (C++/CUDA) = 1.424749e-01
-Avg ME (F77/CUDA) = 0.14247482577104625
-Relative difference = 5.209967070245855e-07
-OK (relative difference <= 5E-3)
+cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/gcheck.exe --common -p 2 64 2
+cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/fgcheck.exe 2 64 2
+Memory access fault by GPU node-4 (Agent handle: 0x6939e30) on address 0x14e191159000. Reason: Unknown.
+
+Program received signal SIGABRT: Process abort signal.
+
+Backtrace for this error:
+#0 0x14e4264c7dbf in ???
+#1 0x14e4264c7d2b in ???
+#2 0x14e4264c93e4 in ???
+#3 0x14e41e99ab64 in ???
+#4 0x14e41e997b38 in ???
+#5 0x14e41e955496 in ???
+#6 0x14e4264616e9 in ???
+#7 0x14e42659549e in ???
+#8 0xffffffffffffffff in ???
+Avg ME (C++/CUDA) =
+Avg ME (F77/CUDA) =
+ERROR! Fortran calculation (F77/CUDA) crashed
=========================================================================
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/check.exe -p 64 256 10 OMP=
+runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/check.exe -p 64 256 10 OMP=
WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1]
-Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
+Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1]
+Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK
FP precision = MIXED (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 9.769998e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 9.971532e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 9.971532e+04 ) sec^-1
-MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2
-TOTAL : 1.700454 sec
- 4,910,062,395 cycles # 2.880 GHz
- 13,831,764,171 instructions # 2.82 insn per cycle
- 1.712052278 seconds time elapsed
+EvtsPerSec[Rmb+ME] (23) = ( 1.168746e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.188251e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.188251e+05 ) sec^-1
+MeanMatrixElemValue = ( 1.914935e+02 +- 1.163297e+02 ) GeV^-2
+TOTAL : 1.425088 sec
+ 5,052,848,302 cycles:u # 3.491 GHz (74.63%)
+ 2,441,774 stalled-cycles-frontend:u # 0.05% frontend cycles idle (74.88%)
+ 796,259,995 stalled-cycles-backend:u # 15.76% backend cycles idle (75.11%)
+ 13,840,635,874 instructions:u # 2.74 insn per cycle
+ # 0.06 stalled cycles per insn (75.13%)
+ 1.449175351 seconds time elapsed
=Symbols in CPPProcess.o= (~sse4: 1130) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/runTest.exe
-[ PASSED ] 6 tests.
--------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 1.424749e-01
-Avg ME (F77/C++) = 0.14247482734618697
-Relative difference = 5.099411406595165e-07
-OK (relative difference <= 5E-3)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd1/check.exe -p 64 256 10 OMP=
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1]
-Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
-FP precision = MIXED (NaN/abnormal=0, zero=0)
-Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 1.857842e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.932046e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.932046e+05 ) sec^-1
-MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2
-TOTAL : 0.905414 sec
- 2,615,099,772 cycles # 2.873 GHz
- 7,353,136,311 instructions # 2.81 insn per cycle
- 0.925236073 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 2957) (avx2: 0) (512y: 0) (512z: 0)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd1/runTest.exe
-[ PASSED ] 6 tests.
--------------------------------------------------------------------------
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd1/check.exe --common -p 2 64 2
-cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd1/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 1.424749e-01
-Avg ME (F77/C++) = 0.14247482734618697
-Relative difference = 5.099411406595165e-07
-OK (relative difference <= 5E-3)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd1/check.exe -p 64 256 10 OMP=
-WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1]
-Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
-FP precision = MIXED (NaN/abnormal=0, zero=0)
-Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-OMP threads / `nproc --all` = 1 / 4
-EvtsPerSec[Rmb+ME] (23) = ( 3.160999e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.374264e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.374264e+05 ) sec^-1
-MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2
-TOTAL : 0.541919 sec
- 1,475,084,747 cycles # 2.698 GHz
- 3,084,915,220 instructions # 2.09 insn per cycle
- 0.559487031 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2986) (512y: 0) (512z: 0)
--------------------------------------------------------------------------
-runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd1/runTest.exe
-[ PASSED ] 6 tests.
-------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482643254802 -Relative difference = 5.163537715318965e-07 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd1/check.exe -p 64 256 10 OMP= -WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.676411e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.967587e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.967587e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.469154 sec - 1,285,211,957 cycles # 2.712 GHz - 2,875,140,516 instructions # 2.24 insn per cycle - 0.485058196 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2636) (512y: 96) (512z: 0) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482643254802 -Relative difference = 5.163537715318965e-07 -OK (relative difference <= 5E-3) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd1/check.exe -p 64 256 10 OMP= -WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -FP precision = MIXED (NaN/abnormal=0, zero=0) -Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.334432e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.451352e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.451352e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.728206 sec - 1,313,839,367 cycles # 1.794 GHz - 1,915,620,790 instructions # 1.46 insn per cycle - 0.743678029 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1476) (512y: 70) (512z: 2164) -------------------------------------------------------------------------- -runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd1/runTest.exe -[ PASSED ] 6 tests. -------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482643254802 -Relative difference = 5.163537715318965e-07 -OK (relative difference <= 5E-3) -========================================================================= - -TEST COMPLETED +runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/runTest.exe +Memory access fault by GPU node-4 (Agent handle: 0x666280) on address 0x146fe5329000. Reason: Unknown. 
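The "OK! xsec from fortran (...) and cpp (...) differ by less than <tol>" lines in the tmad logs of the next patch compare the Fortran and cudacpp cross sections against a precision-dependent tolerance (2E-14 for double-precision builds, 4E-4 for single-precision builds, as printed in the log lines themselves). A minimal Python sketch of such a check follows; the function names are illustrative assumptions, not the repository's actual tmad script code:

#!/usr/bin/env python3
# Hypothetical sketch (not the actual tmad script) of the xsec comparison
# reported in the logs below. Tolerances are taken from the log lines
# themselves: 2e-14 for double precision, 4e-4 for single precision.

def rel_diff(xsec_fortran: float, xsec_cpp: float) -> float:
    # Relative difference of two non-zero cross sections.
    return abs(xsec_fortran / xsec_cpp - 1.0)

def compare_xsec(xsec_fortran: float, xsec_cpp: float, tol: float) -> bool:
    rel = rel_diff(xsec_fortran, xsec_cpp)
    status = "OK!" if rel <= tol else "ERROR!"
    print(f"{status} xsec from fortran ({xsec_fortran}) and cpp ({xsec_cpp})"
          f" differ by less than {tol:.0E} ({rel})")
    return rel <= tol

# Example with values from the eemumu double-precision log below:
compare_xsec(0.21747169064681787, 0.21747169064681790, 2e-14)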
From 227ac86f2cd7b0c46560fee3b5114bbc8b05b5d5 Mon Sep 17 00:00:00 2001
From: Andrea Valassi
Date: Wed, 31 Jan 2024 18:27:34 +0200
Subject: [PATCH 95/96] [jt774] rerun all 18 tmad tests on LUMI - same issues as before for gqttq (#806)

NB this is "./tmad/allTees.sh" WITHOUT the -hip flag (no "-rorhst" added)

STARTED AT Wed 31 Jan 2024 02:54:59 PM EET
ENDED AT Wed 31 Jan 2024 06:02:10 PM EET

Status=0
16 /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt
16 /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt
16 /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt
16 /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt
16 /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt
16 /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt
16 /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt
16 /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt
16 /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt
16 /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt
16 /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt
16 /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt
16 /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt
16 /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt
16 /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt
12 /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt
12 /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt
12 /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt
---
 .../log_eemumu_mad_d_inl0_hrd0.txt | 422 ++++++----------
 .../log_eemumu_mad_f_inl0_hrd0.txt | 424 ++++++-----------
 .../log_eemumu_mad_m_inl0_hrd0.txt | 424 ++++++-----------
 .../log_ggtt_mad_d_inl0_hrd0.txt | 422 ++++++----------
 .../log_ggtt_mad_f_inl0_hrd0.txt | 422 ++++++----------
 .../log_ggtt_mad_m_inl0_hrd0.txt | 418 ++++++----------
 .../log_ggttg_mad_d_inl0_hrd0.txt | 422 ++++++----------
 .../log_ggttg_mad_f_inl0_hrd0.txt | 422 ++++++----------
 .../log_ggttg_mad_m_inl0_hrd0.txt | 420 ++++++----------
 .../log_ggttgg_mad_d_inl0_hrd0.txt | 422 ++++++----------
 .../log_ggttgg_mad_f_inl0_hrd0.txt | 420 ++++++----------
 .../log_ggttgg_mad_m_inl0_hrd0.txt | 422 ++++++----------
 .../log_ggttggg_mad_d_inl0_hrd0.txt | 424 ++++++-----------
 .../log_ggttggg_mad_f_inl0_hrd0.txt | 424 ++++++-----------
 .../log_ggttggg_mad_m_inl0_hrd0.txt | 420 ++++++----------
 .../log_gqttq_mad_d_inl0_hrd0.txt | 449 +++++-------------
 .../log_gqttq_mad_f_inl0_hrd0.txt | 449 +++++-------------
 .../log_gqttq_mad_m_inl0_hrd0.txt | 443 +++++------------
 18 files changed, 2378 insertions(+), 5291 deletions(-)

diff --git a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt
index 459e70d382..f1aae0ab32 100644
--- a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt
+++ b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt
@@ -1,42 +1,42 @@
-Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum
+Working directory (build): /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum
CUDACPP_BUILDDIR='.'
make USEBUILDDIR=1 AVX=none
make USEBUILDDIR=1 AVX=sse4
-
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
make USEBUILDDIR=1 AVX=avx2
+
make USEBUILDDIR=1 AVX=512y
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
make USEBUILDDIR=1 AVX=512z
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0'
+CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0'
CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0'
+CUDACPP_BUILDDIR='build.none_d_inl0_hrd0'
CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0'
-CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
-CUDACPP_BUILDDIR='build.none_d_inl0_hrd0'
-CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
OMP_NUM_THREADS=
-DATE: 2024-01-30_06:09:28
+DATE: 2024-01-31_15:11:39
-On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
-Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum
+On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]:
+Working directory (run): /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum
*** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) ***
--------------------
@@ -50,18 +50,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
0 ! Helicity Sum/event 0=exact
1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
--------------------
-Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/avalassi/output_eemumu_x1_fortran'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./madevent_fortran < /tmp/valassia/input_eemumu_x1_fortran > /tmp/valassia/output_eemumu_x1_fortran'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
 [NGOODHEL] ngoodhel/ncomb = 4/16
 [XSECTION] VECSIZE_USED = 8192
 [XSECTION] MultiChannel = TRUE
 [XSECTION] Configuration = 1
 [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 0.2175 [0.21747169064681776] fbridge_mode=0
+ [XSECTION] Cross section = 0.2175 [0.21747169064681787] fbridge_mode=0
 [UNWEIGHT] Wrote 3893 events (found 7395 events)
- [COUNTERS] PROGRAM TOTAL : 0.6491s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.6403s
- [COUNTERS] Fortran MEs ( 1 ) : 0.0088s for 8192 events => throughput is 9.31E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.5205s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.5147s
+ [COUNTERS] Fortran MEs ( 1 ) : 0.0059s for 8192 events => throughput is 1.40E+06 events/s
*** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) ***
--------------------
@@ -75,18 +75,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
0 ! Helicity Sum/event 0=exact
1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
--------------------
-Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/avalassi/output_eemumu_x1_fortran'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./madevent_fortran < /tmp/valassia/input_eemumu_x1_fortran > /tmp/valassia/output_eemumu_x1_fortran'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
 [NGOODHEL] ngoodhel/ncomb = 4/16
 [XSECTION] VECSIZE_USED = 8192
 [XSECTION] MultiChannel = TRUE
 [XSECTION] Configuration = 1
 [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 0.2175 [0.21747169064681776] fbridge_mode=0
+ [XSECTION] Cross section = 0.2175 [0.21747169064681787] fbridge_mode=0
 [UNWEIGHT] Wrote 1611 events (found 1616 events)
- [COUNTERS] PROGRAM TOTAL : 0.1850s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.1762s
- [COUNTERS] Fortran MEs ( 1 ) : 0.0089s for 8192 events => throughput is 9.22E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.1389s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.1329s
+ [COUNTERS] Fortran MEs ( 1 ) : 0.0059s for 8192 events => throughput is 1.38E+06 events/s
*** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) ***
--------------------
@@ -100,18 +100,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
0 ! Helicity Sum/event 0=exact
1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
--------------------
-Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x10_fortran > /tmp/avalassi/output_eemumu_x10_fortran'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./madevent_fortran < /tmp/valassia/input_eemumu_x10_fortran > /tmp/valassia/output_eemumu_x10_fortran'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
 [NGOODHEL] ngoodhel/ncomb = 4/16
 [XSECTION] VECSIZE_USED = 8192
 [XSECTION] MultiChannel = TRUE
 [XSECTION] Configuration = 1
 [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 0.0915 [9.1501919904813669E-002] fbridge_mode=0
+ [XSECTION] Cross section = 0.0915 [9.1501919904813683E-002] fbridge_mode=0
 [UNWEIGHT] Wrote 1803 events (found 1808 events)
- [COUNTERS] PROGRAM TOTAL : 0.4417s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.3482s
- [COUNTERS] Fortran MEs ( 1 ) : 0.0935s for 90112 events => throughput is 9.64E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.3234s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.2608s
+ [COUNTERS] Fortran MEs ( 1 ) : 0.0625s for 90112 events => throughput is 1.44E+06 events/s
*** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
--------------------
@@ -125,22 +125,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
0 ! Helicity Sum/event 0=exact
1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
--------------------
-Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x1_cudacpp > /tmp/valassia/output_eemumu_x1_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
 [NGOODHEL] ngoodhel/ncomb = 4/16
 [XSECTION] VECSIZE_USED = 8192
 [XSECTION] MultiChannel = TRUE
 [XSECTION] Configuration = 1
 [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 0.2175 [0.21747169064681779] fbridge_mode=1
+ [XSECTION] Cross section = 0.2175 [0.21747169064681790] fbridge_mode=1
 [UNWEIGHT] Wrote 1611 events (found 1616 events)
- [COUNTERS] PROGRAM TOTAL : 0.1896s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.1825s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0071s for 8192 events => throughput is 1.15E+06 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.1496s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.1436s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0059s for 8192 events => throughput is 1.38E+06 events/s
*** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
-OK! xsec from fortran (0.21747169064681776) and cpp (0.21747169064681779) differ by less than 2E-14 (2.220446049250313e-16)
+OK! xsec from fortran (0.21747169064681787) and cpp (0.21747169064681790) differ by less than 2E-14 (2.220446049250313e-16)
*** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
@@ -158,8 +158,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
0 ! Helicity Sum/event 0=exact
1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
--------------------
-Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x10_cudacpp > /tmp/valassia/output_eemumu_x10_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
 [NGOODHEL] ngoodhel/ncomb = 4/16
 [XSECTION] VECSIZE_USED = 8192
 [XSECTION] MultiChannel = TRUE
@@ -167,27 +167,27 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x
 [XSECTION] ChannelId = 1
 [XSECTION] Cross section = 0.0915 [9.1501919904813669E-002] fbridge_mode=1
 [UNWEIGHT] Wrote 1803 events (found 1808 events)
- [COUNTERS] PROGRAM TOTAL : 0.4377s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.3566s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0811s for 90112 events => throughput is 1.11E+06 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.3369s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.2716s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0652s for 90112 events => throughput is 1.38E+06 events/s
*** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
-OK! xsec from fortran (9.1501919904813669E-002) and cpp (9.1501919904813669E-002) differ by less than 2E-14 (0.0)
+OK! xsec from fortran (9.1501919904813683E-002) and cpp (9.1501919904813669E-002) differ by less than 2E-14 (1.1102230246251565e-16)
*** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
*** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.115404e+06 ) sec^-1
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 1.419345e+06 ) sec^-1
*** EXECUTE CHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.135181e+06 ) sec^-1
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 1.429602e+06 ) sec^-1
*** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
--------------------
@@ -201,22 +201,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
0 ! Helicity Sum/event 0=exact
1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
--------------------
-Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x1_cudacpp > /tmp/valassia/output_eemumu_x1_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
 [NGOODHEL] ngoodhel/ncomb = 4/16
 [XSECTION] VECSIZE_USED = 8192
 [XSECTION] MultiChannel = TRUE
 [XSECTION] Configuration = 1
 [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 0.2175 [0.21747169064681779] fbridge_mode=1
+ [XSECTION] Cross section = 0.2175 [0.21747169064681787] fbridge_mode=1
 [UNWEIGHT] Wrote 1611 events (found 1616 events)
- [COUNTERS] PROGRAM TOTAL : 0.1854s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.1812s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0042s for 8192 events => throughput is 1.96E+06 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.1461s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.1426s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0035s for 8192 events => throughput is 2.33E+06 events/s
*** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
-OK! xsec from fortran (0.21747169064681776) and cpp (0.21747169064681779) differ by less than 2E-14 (2.220446049250313e-16)
+OK! xsec from fortran (0.21747169064681787) and cpp (0.21747169064681787) differ by less than 2E-14 (0.0)
*** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
@@ -234,36 +234,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
0 ! Helicity Sum/event 0=exact
1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
--------------------
-Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x10_cudacpp > /tmp/valassia/output_eemumu_x10_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
 [NGOODHEL] ngoodhel/ncomb = 4/16
 [XSECTION] VECSIZE_USED = 8192
 [XSECTION] MultiChannel = TRUE
 [XSECTION] Configuration = 1
 [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 0.0915 [9.1501919904813656E-002] fbridge_mode=1
+ [XSECTION] Cross section = 0.0915 [9.1501919904813683E-002] fbridge_mode=1
 [UNWEIGHT] Wrote 1803 events (found 1808 events)
- [COUNTERS] PROGRAM TOTAL : 0.4018s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.3542s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0476s for 90112 events => throughput is 1.89E+06 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.3081s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.2697s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0384s for 90112 events => throughput is 2.35E+06 events/s
*** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
-OK! xsec from fortran (9.1501919904813669E-002) and cpp (9.1501919904813656E-002) differ by less than 2E-14 (1.1102230246251565e-16)
+OK! xsec from fortran (9.1501919904813683E-002) and cpp (9.1501919904813683E-002) differ by less than 2E-14 (0.0)
*** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
*** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.873422e+06 ) sec^-1
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 2.383624e+06 ) sec^-1
*** EXECUTE CHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.997339e+06 ) sec^-1
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 2.448451e+06 ) sec^-1
*** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
--------------------
@@ -277,22 +277,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
0 ! Helicity Sum/event 0=exact
1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
--------------------
-Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x1_cudacpp > /tmp/valassia/output_eemumu_x1_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
 [NGOODHEL] ngoodhel/ncomb = 4/16
 [XSECTION] VECSIZE_USED = 8192
 [XSECTION] MultiChannel = TRUE
 [XSECTION] Configuration = 1
 [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 0.2175 [0.21747169064681776] fbridge_mode=1
+ [XSECTION] Cross section = 0.2175 [0.21747169064681787] fbridge_mode=1
 [UNWEIGHT] Wrote 1611 events (found 1616 events)
- [COUNTERS] PROGRAM TOTAL : 0.1841s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.1809s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0031s for 8192 events => throughput is 2.62E+06 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.1436s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.1411s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0025s for 8192 events => throughput is 3.31E+06 events/s
*** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
-OK! xsec from fortran (0.21747169064681776) and cpp (0.21747169064681776) differ by less than 2E-14 (0.0)
+OK! xsec from fortran (0.21747169064681787) and cpp (0.21747169064681787) differ by less than 2E-14 (0.0)
*** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
@@ -310,188 +310,40 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
0 ! Helicity Sum/event 0=exact
1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
--------------------
-Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x10_cudacpp > /tmp/valassia/output_eemumu_x10_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
 [NGOODHEL] ngoodhel/ncomb = 4/16
 [XSECTION] VECSIZE_USED = 8192
 [XSECTION] MultiChannel = TRUE
 [XSECTION] Configuration = 1
 [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 0.0915 [9.1501919904813669E-002] fbridge_mode=1
+ [XSECTION] Cross section = 0.0915 [9.1501919904813683E-002] fbridge_mode=1
 [UNWEIGHT] Wrote 1803 events (found 1808 events)
- [COUNTERS] PROGRAM TOTAL : 0.3915s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.3558s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0357s for 90112 events => throughput is 2.53E+06 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.2943s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.2673s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0270s for 90112 events => throughput is 3.34E+06 events/s
*** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
-OK! xsec from fortran (9.1501919904813669E-002) and cpp (9.1501919904813669E-002) differ by less than 2E-14 (0.0)
+OK! xsec from fortran (9.1501919904813683E-002) and cpp (9.1501919904813683E-002) differ by less than 2E-14 (0.0)
*** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
*** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.563581e+06 ) sec^-1
-
-*** EXECUTE CHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.701936e+06 ) sec^-1
-
-*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
---------------------
-CUDACPP_RUNTIME_FBRIDGEMODE = (not set)
-CUDACPP_RUNTIME_VECSIZEUSED = 8192
---------------------
-8192 1 1 ! Number of events and max and min iterations
-0.000001 ! Accuracy (ignored because max iterations = min iterations)
-0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present)
-1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement)
-0 ! Helicity Sum/event 0=exact
-1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
---------------------
-Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
- [NGOODHEL] ngoodhel/ncomb = 4/16
- [XSECTION] VECSIZE_USED = 8192
- [XSECTION] MultiChannel = TRUE
- [XSECTION] Configuration = 1
- [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 0.2175 [0.21747169064681776] fbridge_mode=1
- [UNWEIGHT] Wrote 1611 events (found 1616 events)
- [COUNTERS] PROGRAM TOTAL : 0.1820s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.1791s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0030s for 8192 events => throughput is 2.77E+06 events/s
-
-*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
-
-OK! xsec from fortran (0.21747169064681776) and cpp (0.21747169064681776) differ by less than 2E-14 (0.0)
-
-*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
-
-OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical
-
-*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) ***
---------------------
-CUDACPP_RUNTIME_FBRIDGEMODE = (not set)
-CUDACPP_RUNTIME_VECSIZEUSED = 8192
---------------------
-81920 1 1 ! Number of events and max and min iterations
-0.000001 ! Accuracy (ignored because max iterations = min iterations)
-0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present)
-1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement)
-0 ! Helicity Sum/event 0=exact
-1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
---------------------
-Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
- [NGOODHEL] ngoodhel/ncomb = 4/16
- [XSECTION] VECSIZE_USED = 8192
- [XSECTION] MultiChannel = TRUE
- [XSECTION] Configuration = 1
- [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 0.0915 [9.1501919904813669E-002] fbridge_mode=1
- [UNWEIGHT] Wrote 1803 events (found 1808 events)
- [COUNTERS] PROGRAM TOTAL : 0.3886s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.3550s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0336s for 90112 events => throughput is 2.68E+06 events/s
-
-*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
-
-OK! xsec from fortran (9.1501919904813669E-002) and cpp (9.1501919904813669E-002) differ by less than 2E-14 (0.0)
-
-*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
-
-OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
-
-*** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.733041e+06 ) sec^-1
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 3.415643e+06 ) sec^-1
*** EXECUTE CHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.837193e+06 ) sec^-1
-
-*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
---------------------
-CUDACPP_RUNTIME_FBRIDGEMODE = (not set)
-CUDACPP_RUNTIME_VECSIZEUSED = 8192
---------------------
-8192 1 1 ! Number of events and max and min iterations
-0.000001 ! Accuracy (ignored because max iterations = min iterations)
-0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present)
-1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement)
-0 ! Helicity Sum/event 0=exact
-1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
---------------------
-Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
- [NGOODHEL] ngoodhel/ncomb = 4/16
- [XSECTION] VECSIZE_USED = 8192
- [XSECTION] MultiChannel = TRUE
- [XSECTION] Configuration = 1
- [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 0.2175 [0.21747169064681776] fbridge_mode=1
- [UNWEIGHT] Wrote 1611 events (found 1616 events)
- [COUNTERS] PROGRAM TOTAL : 0.1852s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.1819s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0033s for 8192 events => throughput is 2.50E+06 events/s
-
-*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
-
-OK! xsec from fortran (0.21747169064681776) and cpp (0.21747169064681776) differ by less than 2E-14 (0.0)
-
-*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
-
-OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical
-
-*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) ***
---------------------
-CUDACPP_RUNTIME_FBRIDGEMODE = (not set)
-CUDACPP_RUNTIME_VECSIZEUSED = 8192
---------------------
-81920 1 1 ! Number of events and max and min iterations
-0.000001 ! Accuracy (ignored because max iterations = min iterations)
-0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present)
-1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement)
-0 ! Helicity Sum/event 0=exact
-1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
---------------------
-Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
- [NGOODHEL] ngoodhel/ncomb = 4/16
- [XSECTION] VECSIZE_USED = 8192
- [XSECTION] MultiChannel = TRUE
- [XSECTION] Configuration = 1
- [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 0.0915 [9.1501919904813669E-002] fbridge_mode=1
- [UNWEIGHT] Wrote 1803 events (found 1808 events)
- [COUNTERS] PROGRAM TOTAL : 0.3986s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.3589s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0397s for 90112 events => throughput is 2.27E+06 events/s
-
-*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
-
-OK! xsec from fortran (9.1501919904813669E-002) and cpp (9.1501919904813669E-002) differ by less than 2E-14 (0.0)
-
-*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 3.512459e+06 ) sec^-1
-OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
-
-*** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.277912e+06 ) sec^-1
+*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) ***
-*** EXECUTE CHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.388817e+06 ) sec^-1
+*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) ***
*** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) ***
--------------------
@@ -505,22 +357,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
0 ! Helicity Sum/event 0=exact
1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
--------------------
-Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
- [NGOODHEL] ngoodhel/ncomb = 4/16
+Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/valassia/input_eemumu_x1_cudacpp > /tmp/valassia/output_eemumu_x1_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
+ [NGOODHEL] ngoodhel/ncomb = 16/16
 [XSECTION] VECSIZE_USED = 8192
 [XSECTION] MultiChannel = TRUE
 [XSECTION] Configuration = 1
 [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 0.2175 [0.21747169064681776] fbridge_mode=1
+ [XSECTION] Cross section = 0.2175 [0.21747169064681787] fbridge_mode=1
 [UNWEIGHT] Wrote 1611 events (found 1616 events)
- [COUNTERS] PROGRAM TOTAL : 0.6311s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.6306s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0005s for 8192 events => throughput is 1.64E+07 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.4152s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.4147s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0004s for 8192 events => throughput is 1.96E+07 events/s
*** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec ***
-OK! xsec from fortran (0.21747169064681776) and cpp (0.21747169064681776) differ by less than 2E-14 (0.0)
+OK! xsec from fortran (0.21747169064681787) and cpp (0.21747169064681787) differ by less than 2E-14 (0.0)
*** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
@@ -538,65 +390,65 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
0 ! Helicity Sum/event 0=exact
1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
--------------------
-Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
- [NGOODHEL] ngoodhel/ncomb = 4/16
+Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/valassia/input_eemumu_x10_cudacpp > /tmp/valassia/output_eemumu_x10_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
+ [NGOODHEL] ngoodhel/ncomb = 16/16
 [XSECTION] VECSIZE_USED = 8192
 [XSECTION] MultiChannel = TRUE
 [XSECTION] Configuration = 1
 [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 0.0915 [9.1501919904813656E-002] fbridge_mode=1
+ [XSECTION] Cross section = 0.0915 [9.1501919904813669E-002] fbridge_mode=1
 [UNWEIGHT] Wrote 1803 events (found 1808 events)
- [COUNTERS] PROGRAM TOTAL : 0.7893s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.7841s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0052s for 90112 events => throughput is 1.73E+07 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.5465s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.5421s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0044s for 90112 events => throughput is 2.03E+07 events/s
*** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec ***
-OK! xsec from fortran (9.1501919904813669E-002) and cpp (9.1501919904813656E-002) differ by less than 2E-14 (1.1102230246251565e-16)
+OK! xsec from fortran (9.1501919904813683E-002) and cpp (9.1501919904813669E-002) differ by less than 2E-14 (1.1102230246251565e-16)
*** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical
*** EXECUTE GCHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.924722e+07 ) sec^-1
+Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 1.179667e+07 ) sec^-1
*** EXECUTE GCHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.934037e+08 ) sec^-1
+Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 1.504851e+07 ) sec^-1
*** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge ***
-Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.691813e+07 ) sec^-1
+Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 4.224816e+07 ) sec^-1
*** EXECUTE GCHECK(MAX) -p 16384 32 1 ***
-Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 7.449601e+08 ) sec^-1
+Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 5.876504e+07 ) sec^-1
*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge ***
-Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.683159e+07 ) sec^-1
+Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 4.241030e+07 ) sec^-1
*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 ***
-Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 8.033957e+08 ) sec^-1
+Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 7.934736e+07 ) sec^-1
*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge ***
-Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.708340e+07 ) sec^-1
+Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 4.196667e+07 ) sec^-1
*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 ***
-Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.130631e+08 ) sec^-1
+Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 2.594732e+07 ) sec^-1
TEST COMPLETED
diff --git a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt
index 161c62cc9b..41d31a1a79 100644
--- a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt
+++ b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt
@@ -1,42 +1,42 @@
-Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum
+Working directory (build): /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum
CUDACPP_BUILDDIR='.'
-
make USEBUILDDIR=1 AVX=none
-
make USEBUILDDIR=1 AVX=sse4
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+
make USEBUILDDIR=1 AVX=avx2
+
make USEBUILDDIR=1 AVX=512y
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
make USEBUILDDIR=1 AVX=512z
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0'
-CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0'
+CUDACPP_BUILDDIR='build.none_f_inl0_hrd0'
CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0'
+CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0'
CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
-CUDACPP_BUILDDIR='build.none_f_inl0_hrd0'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
OMP_NUM_THREADS=
-DATE: 2024-01-30_06:09:46
+DATE: 2024-01-31_15:11:59
-On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
-Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum
+On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]:
+Working directory (run): /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum
*** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) ***
--------------------
@@ -50,18 +50,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
0 ! Helicity Sum/event 0=exact
1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
--------------------
-Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/avalassi/output_eemumu_x1_fortran'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./madevent_fortran < /tmp/valassia/input_eemumu_x1_fortran > /tmp/valassia/output_eemumu_x1_fortran'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
 [NGOODHEL] ngoodhel/ncomb = 4/16
 [XSECTION] VECSIZE_USED = 8192
 [XSECTION] MultiChannel = TRUE
 [XSECTION] Configuration = 1
 [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 0.2175 [0.21747169064681776] fbridge_mode=0
+ [XSECTION] Cross section = 0.2175 [0.21747169064681787] fbridge_mode=0
 [UNWEIGHT] Wrote 3893 events (found 7395 events)
- [COUNTERS] PROGRAM TOTAL : 0.6495s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.6408s
- [COUNTERS] Fortran MEs ( 1 ) : 0.0086s for 8192 events => throughput is 9.47E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.4682s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.4623s
+ [COUNTERS] Fortran MEs ( 1 ) : 0.0059s for 8192 events => throughput is 1.39E+06 events/s
*** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) ***
--------------------
@@ -75,18 +75,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
0 ! Helicity Sum/event 0=exact
1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
--------------------
-Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/avalassi/output_eemumu_x1_fortran'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./madevent_fortran < /tmp/valassia/input_eemumu_x1_fortran > /tmp/valassia/output_eemumu_x1_fortran'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
 [NGOODHEL] ngoodhel/ncomb = 4/16
 [XSECTION] VECSIZE_USED = 8192
 [XSECTION] MultiChannel = TRUE
 [XSECTION] Configuration = 1
 [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 0.2175 [0.21747169064681776] fbridge_mode=0
+ [XSECTION] Cross section = 0.2175 [0.21747169064681787] fbridge_mode=0
 [UNWEIGHT] Wrote 1611 events (found 1616 events)
- [COUNTERS] PROGRAM TOTAL : 0.1848s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.1762s
- [COUNTERS] Fortran MEs ( 1 ) : 0.0085s for 8192 events => throughput is 9.59E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.1412s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.1353s
+ [COUNTERS] Fortran MEs ( 1 ) : 0.0059s for 8192 events => throughput is 1.39E+06 events/s
*** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) ***
--------------------
@@ -100,18 +100,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
0 ! Helicity Sum/event 0=exact
1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
--------------------
-Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x10_fortran > /tmp/avalassi/output_eemumu_x10_fortran'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./madevent_fortran < /tmp/valassia/input_eemumu_x10_fortran > /tmp/valassia/output_eemumu_x10_fortran'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
 [NGOODHEL] ngoodhel/ncomb = 4/16
 [XSECTION] VECSIZE_USED = 8192
 [XSECTION] MultiChannel = TRUE
 [XSECTION] Configuration = 1
 [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 0.0915 [9.1501919904813669E-002] fbridge_mode=0
+ [XSECTION] Cross section = 0.0915 [9.1501919904813683E-002] fbridge_mode=0
 [UNWEIGHT] Wrote 1803 events (found 1808 events)
- [COUNTERS] PROGRAM TOTAL : 0.4421s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.3487s
- [COUNTERS] Fortran MEs ( 1 ) : 0.0934s for 90112 events => throughput is 9.65E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.3253s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.2628s
+ [COUNTERS] Fortran MEs ( 1 ) : 0.0625s for 90112 events => throughput is 1.44E+06 events/s
*** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
--------------------
@@ -125,22 +125,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
0 ! Helicity Sum/event 0=exact
1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
--------------------
-Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x1_cudacpp > /tmp/valassia/output_eemumu_x1_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
 [NGOODHEL] ngoodhel/ncomb = 4/16
 [XSECTION] VECSIZE_USED = 8192
 [XSECTION] MultiChannel = TRUE
 [XSECTION] Configuration = 1
 [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 0.2175 [0.21747165492032638] fbridge_mode=1
+ [XSECTION] Cross section = 0.2175 [0.21747165804194701] fbridge_mode=1
 [UNWEIGHT] Wrote 1611 events (found 1616 events)
- [COUNTERS] PROGRAM TOTAL : 0.1909s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.1840s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0069s for 8192 events => throughput is 1.19E+06 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.1463s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.1411s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0052s for 8192 events => throughput is 1.59E+06 events/s
*** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
-OK! xsec from fortran (0.21747169064681776) and cpp (0.21747165492032638) differ by less than 4E-4 (1.6428111293542713e-07)
+OK! xsec from fortran (0.21747169064681787) and cpp (0.21747165804194701) differ by less than 4E-4 (1.4992696639737346e-07)
*** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
@@ -158,36 +158,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
0 ! Helicity Sum/event 0=exact
1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
--------------------
-Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x10_cudacpp > /tmp/valassia/output_eemumu_x10_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
 [NGOODHEL] ngoodhel/ncomb = 4/16
 [XSECTION] VECSIZE_USED = 8192
 [XSECTION] MultiChannel = TRUE
 [XSECTION] Configuration = 1
 [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 0.0915 [9.1501905274264717E-002] fbridge_mode=1
+ [XSECTION] Cross section = 0.0915 [9.1501906417651019E-002] fbridge_mode=1
 [UNWEIGHT] Wrote 1803 events (found 1808 events)
- [COUNTERS] PROGRAM TOTAL : 0.4341s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.3569s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0773s for 90112 events => throughput is 1.17E+06 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.3262s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.2697s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0565s for 90112 events => throughput is 1.59E+06 events/s
*** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
-OK! xsec from fortran (9.1501919904813669E-002) and cpp (9.1501905274264717E-002) differ by less than 4E-4 (1.5989335488963974e-07)
+OK! xsec from fortran (9.1501919904813683E-002) and cpp (9.1501906417651019E-002) differ by less than 4E-4 (1.473975921317816e-07)
*** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
*** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.185144e+06 ) sec^-1
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 1.648536e+06 ) sec^-1
*** EXECUTE CHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.215006e+06 ) sec^-1
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 1.666313e+06 ) sec^-1
*** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
--------------------
@@ -201,22 +201,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
0 ! Helicity Sum/event 0=exact
1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
--------------------
-Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x1_cudacpp > /tmp/valassia/output_eemumu_x1_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
 [NGOODHEL] ngoodhel/ncomb = 4/16
 [XSECTION] VECSIZE_USED = 8192
 [XSECTION] MultiChannel = TRUE
 [XSECTION] Configuration = 1
 [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 0.2175 [0.21747165570339780] fbridge_mode=1
+ [XSECTION] Cross section = 0.2175 [0.21747170102104563] fbridge_mode=1
 [UNWEIGHT] Wrote 1611 events (found 1616 events)
- [COUNTERS] PROGRAM TOTAL : 0.1811s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.1786s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0025s for 8192 events => throughput is 3.30E+06 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.1412s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.1390s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0021s for 8192 events => throughput is 3.85E+06 events/s
*** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
-OK! xsec from fortran (0.21747169064681776) and cpp (0.21747165570339780) differ by less than 4E-4 (1.6068031594151932e-07)
+OK! xsec from fortran (0.21747169064681787) and cpp (0.21747170102104563) differ by less than 4E-4 (4.770380779284267e-08)
*** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
@@ -234,36 +234,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
0 ! Helicity Sum/event 0=exact
1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
--------------------
-Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x10_cudacpp > /tmp/valassia/output_eemumu_x10_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
 [NGOODHEL] ngoodhel/ncomb = 4/16
 [XSECTION] VECSIZE_USED = 8192
 [XSECTION] MultiChannel = TRUE
 [XSECTION] Configuration = 1
 [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 0.0915 [9.1501905322826635E-002] fbridge_mode=1
+ [XSECTION] Cross section = 0.0915 [9.1501924220365086E-002] fbridge_mode=1
 [UNWEIGHT] Wrote 1803 events (found 1808 events)
- [COUNTERS] PROGRAM TOTAL : 0.3824s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.3531s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0293s for 90112 events => throughput is 3.08E+06 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.2893s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.2660s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0233s for 90112 events => throughput is 3.87E+06 events/s
*** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
-OK! xsec from fortran (9.1501919904813669E-002) and cpp (9.1501905322826635E-002) differ by less than 4E-4 (1.5936263464411127e-07)
+OK! xsec from fortran (9.1501919904813683E-002) and cpp (9.1501924220365086E-002) differ by less than 4E-4 (4.716350665567859e-08)
*** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
*** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.133589e+06 ) sec^-1
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 4.091663e+06 ) sec^-1
*** EXECUTE CHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.329458e+06 ) sec^-1
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 4.169619e+06 ) sec^-1
*** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
--------------------
@@ -277,22 +277,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
0 ! Helicity Sum/event 0=exact
1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
--------------------
-Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x1_cudacpp > /tmp/valassia/output_eemumu_x1_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
 [NGOODHEL] ngoodhel/ncomb = 4/16
 [XSECTION] VECSIZE_USED = 8192
 [XSECTION] MultiChannel = TRUE
 [XSECTION] Configuration = 1
 [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 0.2175 [0.21747165593922979] fbridge_mode=1
+ [XSECTION] Cross section = 0.2175 [0.21747170107722058] fbridge_mode=1
 [UNWEIGHT] Wrote 1611 events (found 1616 events)
- [COUNTERS] PROGRAM TOTAL : 0.1817s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.1793s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0024s for 8192 events => throughput is 3.43E+06 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.1409s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.1391s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0018s for 8192 events => throughput is 4.62E+06 events/s
*** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
-OK! xsec from fortran (0.21747169064681776) and cpp (0.21747165593922979) differ by less than 4E-4 (1.5959588972602745e-07)
+OK! xsec from fortran (0.21747169064681787) and cpp (0.21747170107722058) differ by less than 4E-4 (4.7962117166733265e-08)
*** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
@@ -310,188 +310,40 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
0 ! Helicity Sum/event 0=exact
1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
--------------------
-Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x10_cudacpp > /tmp/valassia/output_eemumu_x10_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
 [NGOODHEL] ngoodhel/ncomb = 4/16
 [XSECTION] VECSIZE_USED = 8192
 [XSECTION] MultiChannel = TRUE
 [XSECTION] Configuration = 1
 [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 0.0915 [9.1501905316084181E-002] fbridge_mode=1
+ [XSECTION] Cross section = 0.0915 [9.1501924223714337E-002] fbridge_mode=1
 [UNWEIGHT] Wrote 1803 events (found 1808 events)
- [COUNTERS] PROGRAM TOTAL : 0.3770s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.3509s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0262s for 90112 events => throughput is 3.44E+06 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.2842s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.2648s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0194s for 90112 events => throughput is 4.64E+06 events/s
*** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
-OK! xsec from fortran (9.1501919904813669E-002) and cpp (9.1501905316084181E-002) differ by less than 4E-4 (1.5943632114545636e-07)
+OK! xsec from fortran (9.1501919904813683E-002) and cpp (9.1501924223714337E-002) differ by less than 4E-4 (4.7200109598577455e-08)
*** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
OK!
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.621496e+06 ) sec^-1 - -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.716307e+06 ) sec^-1 - -*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 4/16 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2175 [0.21747165593922979] fbridge_mode=1 - [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1846s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1824s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0023s for 8192 events => throughput is 3.63E+06 events/s - -*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (0.21747169064681776) and cpp (0.21747165593922979) differ by less than 4E-4 (1.5959588972602745e-07) - -*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical - -*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -81920 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 4/16 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.0915 [9.1501905316084181E-002] fbridge_mode=1 - [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.3795s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3546s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0248s for 90112 events => throughput is 3.63E+06 events/s - -*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (9.1501919904813669E-002) and cpp (9.1501905316084181E-002) differ by less than 4E-4 (1.5943632114545636e-07) - -*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical - -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.744053e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.839743e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.971296e+06 ) sec^-1 - -*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 4/16 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2175 [0.21747166446533123] fbridge_mode=1 - [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1822s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1798s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0024s for 8192 events => throughput is 3.36E+06 events/s - -*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (0.21747169064681776) and cpp (0.21747166446533123) differ by less than 4E-4 (1.2039032049049325e-07) - -*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.040701e+06 ) sec^-1 -*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -81920 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 4/16 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.0915 [9.1501908990866423E-002] fbridge_mode=1 - [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.3805s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3541s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0263s for 90112 events => throughput is 3.42E+06 events/s - -*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** +*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) *** -OK! xsec from fortran (9.1501919904813669E-002) and cpp (9.1501908990866423E-002) differ by less than 4E-4 (1.1927560927826875e-07) - -*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical - -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.615840e+06 ) sec^-1 - -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.903927e+06 ) sec^-1 +*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) *** *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -505,22 +357,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
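[Editorial illustration, not part of the patch] The '+' lines above replace the full 512y/512z runs of the old log with "WARNING! SKIP MADEVENT_CPP (... is not supported on this node)": the AMD EPYC host lacks the AVX-512 units those builds target. The actual skip logic lives in the tmad driver scripts and is not shown here; purely as an assumption-labeled illustration, a host-side capability check could be written with GCC/Clang's __builtin_cpu_supports:

#include <cstdio>

int main()
{
  // Hypothetical stand-in for the driver's capability check (the real
  // scripts may inspect /proc/cpuinfo or build metadata instead).
  if( !__builtin_cpu_supports( "avx512f" ) )
  {
    std::printf( "*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) ***\n" );
    std::printf( "*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) ***\n" );
    return 0;
  }
  std::printf( "AVX-512 available: run the 512y/512z builds\n" );
  return 0;
}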
-------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 4/16 +Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/valassia/input_eemumu_x1_cudacpp > /tmp/valassia/output_eemumu_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2175 [0.21747166823487174] fbridge_mode=1 + [XSECTION] Cross section = 0.2175 [0.21747166473699148] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.6073s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6068s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0005s for 8192 events => throughput is 1.61E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.4196s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4193s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0003s for 8192 events => throughput is 2.83E+07 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.21747169064681776) and cpp (0.21747166823487174) differ by less than 4E-4 (1.0305684361444634e-07) +OK! xsec from fortran (0.21747169064681787) and cpp (0.21747166473699148) differ by less than 4E-4 (1.191411457268643e-07) *** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -538,65 +390,65 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 4/16 +Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/valassia/input_eemumu_x10_cudacpp > /tmp/valassia/output_eemumu_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.0915 [9.1501910542849674E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.0915 [9.1501909133729534E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.7841s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7794s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0048s for 90112 events => throughput is 1.89E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.5481s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5450s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0030s for 90112 events => throughput is 2.97E+07 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.1501919904813669E-002) and cpp (9.1501910542849674E-002) differ by less than 4E-4 (1.0231439961927435e-07) +OK! xsec from fortran (9.1501919904813683E-002) and cpp (9.1501909133729534E-002) differ by less than 4E-4 (1.1771429675455636e-07) *** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.032746e+07 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.685608e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.810870e+08 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.836295e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.874936e+07 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.323104e+08 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.028452e+09 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.462917e+08 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.891915e+07 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.314000e+08 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.234607e+09 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.463292e+08 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.256002e+07 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.079480e+08 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK 
-EvtsPerSec[MECalcOnly] (3a) = ( 6.441320e+08 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.317614e+07 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt index f51b70af46..b6803d0924 100644 --- a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +Working directory (build): /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum CUDACPP_BUILDDIR='.' - - make USEBUILDDIR=1 AVX=none make USEBUILDDIR=1 AVX=sse4 +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' + make USEBUILDDIR=1 AVX=avx2 + make USEBUILDDIR=1 AVX=512y +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' -CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' -CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. 
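[Editorial illustration, not part of the patch] In the GCHECK invocations above, the -p arguments are <blocks> <threads> <iterations>, and the grid shape fixes the number of events per GPU launch: 256 x 32 = 8192, while the MAX, MAX128THR and MAX8THR configurations all cover 524288 events with different block shapes (16384 x 32, 4096 x 128, 65536 x 8). A small sketch of that arithmetic (the struct and names are illustrative, not the check/gcheck internals):

#include <cstdio>

struct GridConfig { int blocks, threads, iterations; };

int main()
{
  const GridConfig configs[] = {
    { 256, 32, 1 },   // GCHECK(8192)
    { 16384, 32, 1 }, // GCHECK(MAX)
    { 4096, 128, 1 }, // GCHECK(MAX128THR)
    { 65536, 8, 1 }   // GCHECK(MAX8THR)
  };
  for( const auto& c : configs )
  {
    const long nevt = (long)c.blocks * c.threads; // events per GPU iteration
    std::printf( "-p %d %d %d => %ld events/launch\n", c.blocks, c.threads, c.iterations, nevt );
  }
  return 0;
}

Since the three MAX* rows process the same 524288 events, comparing their EvtsPerSec figures effectively probes how throughput depends on threads-per-block at fixed workload.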
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' OMP_NUM_THREADS= -DATE: 2024-01-30_06:10:03 +DATE: 2024-01-31_15:12:19 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +Working directory (run): /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -50,18 +50,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/avalassi/output_eemumu_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./madevent_fortran < /tmp/valassia/input_eemumu_x1_fortran > /tmp/valassia/output_eemumu_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2175 [0.21747169064681776] fbridge_mode=0 + [XSECTION] Cross section = 0.2175 [0.21747169064681787] fbridge_mode=0 [UNWEIGHT] Wrote 3893 events (found 7395 events) - [COUNTERS] PROGRAM TOTAL : 0.6504s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6416s - [COUNTERS] Fortran MEs ( 1 ) : 0.0088s for 8192 events => throughput is 9.35E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4683s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4624s + [COUNTERS] Fortran MEs ( 1 ) : 0.0059s for 8192 events => throughput is 1.39E+06 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -75,18 +75,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/avalassi/output_eemumu_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./madevent_fortran < /tmp/valassia/input_eemumu_x1_fortran > /tmp/valassia/output_eemumu_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2175 [0.21747169064681776] fbridge_mode=0 + [XSECTION] Cross section = 0.2175 [0.21747169064681787] fbridge_mode=0 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1846s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1761s - [COUNTERS] Fortran MEs ( 1 ) : 0.0086s for 8192 events => throughput is 9.55E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.1407s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1349s + [COUNTERS] Fortran MEs ( 1 ) : 0.0059s for 8192 events => throughput is 1.39E+06 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -100,18 +100,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x10_fortran > /tmp/avalassi/output_eemumu_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./madevent_fortran < /tmp/valassia/input_eemumu_x10_fortran > /tmp/valassia/output_eemumu_x10_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.0915 [9.1501919904813669E-002] fbridge_mode=0 + [XSECTION] Cross section = 0.0915 [9.1501919904813683E-002] fbridge_mode=0 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.4424s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3488s - [COUNTERS] Fortran MEs ( 1 ) : 0.0936s for 90112 events => throughput is 9.63E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3253s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2628s + [COUNTERS] Fortran MEs ( 1 ) : 0.0625s for 90112 events => throughput is 1.44E+06 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -125,22 +125,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
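[Editorial illustration, not part of the patch] The [COUNTERS] lines decompose the wall time: PROGRAM TOTAL is the sum of the Fortran Overhead and MEs counters (e.g. 0.3253s = 0.2628s + 0.0625s in the x10 run above), and the quoted throughput is derived from the MEs slice alone. A minimal sketch of such an accumulator, with the values hardcoded from that run:

#include <cstdio>

struct Counters
{
  double overhead = 0; // counter 0: Fortran overhead
  double mes = 0;      // counter 1 or 2: matrix-element time
  double total() const { return overhead + mes; }
};

int main()
{
  Counters c;
  c.overhead = 0.2628; // seconds, x10 Fortran run above
  c.mes = 0.0625;
  const int nevt = 90112;
  std::printf( "TOTAL %.4fs = overhead %.4fs + MEs %.4fs => throughput %.2E events/s\n",
               c.total(), c.overhead, c.mes, nevt / c.mes );
  return 0;
}

Running this reproduces the 1.44E+06 events/s printed in the log.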
-------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x1_cudacpp > /tmp/valassia/output_eemumu_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2175 [0.21747169074211736] fbridge_mode=1 + [XSECTION] Cross section = 0.2175 [0.21747169074211725] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1932s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1857s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0075s for 8192 events => throughput is 1.09E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1503s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1443s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0060s for 8192 events => throughput is 1.37E+06 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.21747169064681776) and cpp (0.21747169074211736) differ by less than 2E-4 (4.3821613004979554e-10) +OK! xsec from fortran (0.21747169064681787) and cpp (0.21747169074211725) differ by less than 2E-4 (4.382150198267709e-10) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -158,36 +158,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x10_cudacpp > /tmp/valassia/output_eemumu_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.0915 [9.1501919915927155E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.0915 [9.1501919915927141E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.4421s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3600s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0820s for 90112 events => throughput is 1.10E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3366s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2711s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0654s for 90112 events => throughput is 1.38E+06 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.1501919904813669E-002) and cpp (9.1501919915927155E-002) differ by less than 2E-4 (1.214564004925478e-10) +OK! xsec from fortran (9.1501919904813683E-002) and cpp (9.1501919915927141E-002) differ by less than 2E-4 (1.2145595640333795e-10) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.101615e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.409008e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.125271e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.431496e+06 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -201,22 +201,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x1_cudacpp > /tmp/valassia/output_eemumu_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2175 [0.21747169074211734] fbridge_mode=1 + [XSECTION] Cross section = 0.2175 [0.21747169074211722] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1841s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1799s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0042s for 8192 events => throughput is 1.94E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1440s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1406s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0035s for 8192 events => throughput is 2.37E+06 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.21747169064681776) and cpp (0.21747169074211734) differ by less than 2E-4 (4.382159080051906e-10) +OK! xsec from fortran (0.21747169064681787) and cpp (0.21747169074211722) differ by less than 2E-4 (4.382150198267709e-10) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -234,36 +234,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
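[Editorial illustration, not part of the patch] Note that the tolerance tightens from 4E-4 in the single-precision ('f') log above to 2E-4 here in the mixed-precision ('m') log, while the observed deviations drop from about 1E-7 to about 1E-10: accumulating matrix elements in float rather than double costs roughly the gap between 24-bit and 53-bit mantissas. A tiny generic demonstration of that effect (unrelated to the actual ME code):

#include <cmath>
#include <cstdio>

int main()
{
  // Sum the same series in float and in double and compare the results.
  float sumF = 0.f;
  double sumD = 0.;
  for( int i = 1; i <= 100000; i++ )
  {
    sumF += 1.f / ( (float)i * i );
    sumD += 1. / ( (double)i * i );
  }
  std::printf( "float %.9f vs double %.15f, rel. diff %.1e\n",
               sumF, sumD, std::fabs( sumF - sumD ) / sumD );
  return 0;
}

The printed relative difference sits many orders of magnitude above double-precision rounding, which is why the 'f' logs need the looser 4E-4 acceptance band.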
-------------------- -Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x10_cudacpp > /tmp/valassia/output_eemumu_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.0915 [9.1501919915927155E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.0915 [9.1501919915927141E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.4024s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3561s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0464s for 90112 events => throughput is 1.94E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3065s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2686s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0379s for 90112 events => throughput is 2.38E+06 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.1501919904813669E-002) and cpp (9.1501919915927155E-002) differ by less than 2E-4 (1.214564004925478e-10) +OK! xsec from fortran (9.1501919904813683E-002) and cpp (9.1501919915927141E-002) differ by less than 2E-4 (1.2145595640333795e-10) *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.982524e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.495068e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.072335e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.513199e+06 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -277,22 +277,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x1_cudacpp > /tmp/valassia/output_eemumu_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2175 [0.21747169063975949] fbridge_mode=1 + [XSECTION] Cross section = 0.2175 [0.21747169063975919] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1864s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1831s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0033s for 8192 events => throughput is 2.46E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1429s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1404s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0025s for 8192 events => throughput is 3.21E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.21747169064681776) and cpp (0.21747169063975949) differ by less than 2E-4 (3.24560378572869e-11) +OK! xsec from fortran (0.21747169064681787) and cpp (0.21747169063975919) differ by less than 2E-4 (3.2457925236428764e-11) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -310,188 +310,40 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x10_cudacpp > /tmp/valassia/output_eemumu_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.0915 [9.1501919908700741E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.0915 [9.1501919908700713E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.3912s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3545s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0368s for 90112 events => throughput is 2.45E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3018s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2736s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0282s for 90112 events => throughput is 3.20E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.1501919904813669E-002) and cpp (9.1501919908700741E-002) differ by less than 2E-4 (4.248068563583729e-11) +OK! xsec from fortran (9.1501919904813683E-002) and cpp (9.1501919908700713E-002) differ by less than 2E-4 (4.248024154662744e-11) *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.431608e+06 ) sec^-1 - -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.633191e+06 ) sec^-1 - -*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 4/16 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2175 [0.21747169063975949] fbridge_mode=1 - [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1842s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1811s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0031s for 8192 events => throughput is 2.65E+06 events/s - -*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (0.21747169064681776) and cpp (0.21747169063975949) differ by less than 2E-4 (3.24560378572869e-11) - -*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical - -*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -81920 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 4/16 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.0915 [9.1501919908700741E-002] fbridge_mode=1 - [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.3888s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3546s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0343s for 90112 events => throughput is 2.63E+06 events/s - -*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (9.1501919904813669E-002) and cpp (9.1501919908700741E-002) differ by less than 2E-4 (4.248068563583729e-11) - -*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical - -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.754832e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.286014e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.841477e+06 ) sec^-1 - -*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 4/16 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2175 [0.21747169063975949] fbridge_mode=1 - [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1840s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1805s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0035s for 8192 events => throughput is 2.33E+06 events/s - -*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (0.21747169064681776) and cpp (0.21747169063975949) differ by less than 2E-4 (3.24560378572869e-11) - -*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.421261e+06 ) sec^-1 -*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -81920 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 4/16 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.0915 [9.1501919908700741E-002] fbridge_mode=1 - [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.3972s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3574s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0398s for 90112 events => throughput is 2.26E+06 events/s - -*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** +*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) *** -OK! xsec from fortran (9.1501919904813669E-002) and cpp (9.1501919908700741E-002) differ by less than 2E-4 (4.248068563583729e-11) - -*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical - -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.333320e+06 ) sec^-1 - -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.456277e+06 ) sec^-1 +*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) *** *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -505,22 +357,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 4/16 +Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/valassia/input_eemumu_x1_cudacpp > /tmp/valassia/output_eemumu_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2175 [0.21747169066587257] fbridge_mode=1 + [XSECTION] Cross section = 0.2175 [0.21747169066587291] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.6100s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6095s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0005s for 8192 events => throughput is 1.62E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.4169s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4165s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0004s for 8192 events => throughput is 2.01E+07 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.21747169064681776) and cpp (0.21747169066587257) differ by less than 2E-4 (8.761968928183705e-11) +OK! xsec from fortran (0.21747169064681787) and cpp (0.21747169066587291) differ by less than 2E-4 (8.762079950486168e-11) *** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -538,65 +390,65 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 4/16 +Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/valassia/input_eemumu_x10_cudacpp > /tmp/valassia/output_eemumu_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 + [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.0915 [9.1501919911173610E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.0915 [9.1501919911173651E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.7889s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7838s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0052s for 90112 events => throughput is 1.74E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.5500s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5456s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0044s for 90112 events => throughput is 2.03E+07 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.1501919904813669E-002) and cpp (9.1501919911173610E-002) differ by less than 2E-4 (6.95061785904727e-11) +OK! xsec from fortran (9.1501919904813683E-002) and cpp (9.1501919911173651E-002) differ by less than 2E-4 (6.950640063507763e-11) *** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.926990e+07 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.163304e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.883911e+08 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.502888e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.714682e+07 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.287328e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.463238e+08 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.861330e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.709935e+07 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.282882e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.999719e+08 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.943867e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.716961e+07 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.187137e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK 
-EvtsPerSec[MECalcOnly] (3a) = ( 2.154425e+08 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.597298e+07 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt index 6a2d60f404..b3de38a77a 100644 --- a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +Working directory (build): /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx CUDACPP_BUILDDIR='.' - - make USEBUILDDIR=1 AVX=none make USEBUILDDIR=1 AVX=sse4 +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512y +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' OMP_NUM_THREADS= -DATE: 2024-01-30_06:10:22 +DATE: 2024-01-31_15:12:39 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +Working directory (run): /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -50,18 +50,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/avalassi/output_ggtt_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./madevent_fortran < /tmp/valassia/input_ggtt_x1_fortran > /tmp/valassia/output_ggtt_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.69 [47.690708277600116] fbridge_mode=0 + [XSECTION] Cross section = 47.69 [47.690708277600123] fbridge_mode=0 [UNWEIGHT] Wrote 420 events (found 1577 events) - [COUNTERS] PROGRAM TOTAL : 0.4078s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3635s - [COUNTERS] Fortran MEs ( 1 ) : 0.0443s for 8192 events => throughput is 1.85E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3751s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3466s + [COUNTERS] Fortran MEs ( 1 ) : 0.0286s for 8192 events => throughput is 2.87E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -75,18 +75,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/avalassi/output_ggtt_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./madevent_fortran < /tmp/valassia/input_ggtt_x1_fortran > /tmp/valassia/output_ggtt_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.69 [47.690708277600116] fbridge_mode=0 + [XSECTION] Cross section = 47.69 [47.690708277600123] fbridge_mode=0 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.3357s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2918s - [COUNTERS] Fortran MEs ( 1 ) : 0.0439s for 8192 events => throughput is 1.87E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2547s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2263s + [COUNTERS] Fortran MEs ( 1 ) : 0.0284s for 8192 events => throughput is 2.88E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -100,18 +100,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x10_fortran > /tmp/avalassi/output_ggtt_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./madevent_fortran < /tmp/valassia/input_ggtt_x10_fortran > /tmp/valassia/output_ggtt_x10_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 46.22 [46.223782291775372] fbridge_mode=0 + [XSECTION] Cross section = 46.22 [46.223782291775386] fbridge_mode=0 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.8736s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3937s - [COUNTERS] Fortran MEs ( 1 ) : 0.4799s for 90112 events => throughput is 1.88E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.3324s + [COUNTERS] Fortran Overhead ( 0 ) : 1.0216s + [COUNTERS] Fortran MEs ( 1 ) : 0.3108s for 90112 events => throughput is 2.90E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -125,22 +125,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x1_cudacpp > /tmp/valassia/output_ggtt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.69 [47.690708277600102] fbridge_mode=1 + [XSECTION] Cross section = 47.69 [47.690708277600123] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.3701s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3307s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0394s for 8192 events => throughput is 2.08E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2945s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2625s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0320s for 8192 events => throughput is 2.56E+05 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.690708277600116) and cpp (47.690708277600102) differ by less than 2E-14 (3.3306690738754696e-16) +OK! xsec from fortran (47.690708277600123) and cpp (47.690708277600123) differ by less than 2E-14 (0.0) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -158,36 +158,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x10_cudacpp > /tmp/valassia/output_ggtt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 46.22 [46.223782291775379] fbridge_mode=1 + [XSECTION] Cross section = 46.22 [46.223782291775350] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.8532s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4213s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.4319s for 90112 events => throughput is 2.09E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.4059s + [COUNTERS] Fortran Overhead ( 0 ) : 1.0534s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3524s for 90112 events => throughput is 2.56E+05 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (46.223782291775372) and cpp (46.223782291775379) differ by less than 2E-14 (2.220446049250313e-16) +OK! xsec from fortran (46.223782291775386) and cpp (46.223782291775350) differ by less than 2E-14 (7.771561172376096e-16) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.120291e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.576965e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.118287e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.601266e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -201,22 +201,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x1_cudacpp > /tmp/valassia/output_ggtt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.69 [47.690708277600109] fbridge_mode=1 + [XSECTION] Cross section = 47.69 [47.690708277600123] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.3362s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3135s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0226s for 8192 events => throughput is 3.62E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2673s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2494s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0179s for 8192 events => throughput is 4.58E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.690708277600116) and cpp (47.690708277600109) differ by less than 2E-14 (1.1102230246251565e-16) +OK! xsec from fortran (47.690708277600123) and cpp (47.690708277600123) differ by less than 2E-14 (0.0) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -234,36 +234,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x10_cudacpp > /tmp/valassia/output_ggtt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 46.22 [46.223782291775379] fbridge_mode=1 + [XSECTION] Cross section = 46.22 [46.223782291775372] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.6510s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4023s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2487s for 90112 events => throughput is 3.62E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.2344s + [COUNTERS] Fortran Overhead ( 0 ) : 1.0399s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1945s for 90112 events => throughput is 4.63E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (46.223782291775372) and cpp (46.223782291775379) differ by less than 2E-14 (2.220446049250313e-16) +OK! xsec from fortran (46.223782291775386) and cpp (46.223782291775372) differ by less than 2E-14 (3.3306690738754696e-16) *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.657266e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.737407e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.745669e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.749563e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -277,22 +277,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x1_cudacpp > /tmp/valassia/output_ggtt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.69 [47.690708277600109] fbridge_mode=1 + [XSECTION] Cross section = 47.69 [47.690708277600123] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.3190s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3046s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0144s for 8192 events => throughput is 5.69E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2506s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2404s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0102s for 8192 events => throughput is 8.03E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.690708277600116) and cpp (47.690708277600109) differ by less than 2E-14 (1.1102230246251565e-16) +OK! xsec from fortran (47.690708277600123) and cpp (47.690708277600123) differ by less than 2E-14 (0.0) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -310,188 +310,40 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x10_cudacpp > /tmp/valassia/output_ggtt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 46.22 [46.223782291775393] fbridge_mode=1 + [XSECTION] Cross section = 46.22 [46.223782291775372] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.5517s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3928s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1589s for 90112 events => throughput is 5.67E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.1438s + [COUNTERS] Fortran Overhead ( 0 ) : 1.0314s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1124s for 90112 events => throughput is 8.02E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (46.223782291775372) and cpp (46.223782291775393) differ by less than 2E-14 (4.440892098500626e-16) +OK! xsec from fortran (46.223782291775386) and cpp (46.223782291775372) differ by less than 2E-14 (3.3306690738754696e-16) *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.690369e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.267343e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.998122e+05 ) sec^-1 - -*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/16 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.69 [47.690708277600109] fbridge_mode=1 - [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.3210s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3086s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0123s for 8192 events => throughput is 6.64E+05 events/s - -*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (47.690708277600116) and cpp (47.690708277600109) differ by less than 2E-14 (1.1102230246251565e-16) - -*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical - -*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -81920 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/16 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 46.22 [46.223782291775393] fbridge_mode=1 - [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.5235s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3878s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1357s for 90112 events => throughput is 6.64E+05 events/s +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.309996e+05 ) sec^-1 -*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** +*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) *** -OK! xsec from fortran (46.223782291775372) and cpp (46.223782291775393) differ by less than 2E-14 (4.440892098500626e-16) - -*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical - -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.740357e+05 ) sec^-1 - -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.934348e+05 ) sec^-1 - -*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/16 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.69 [47.690708277600109] fbridge_mode=1 - [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.3284s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3089s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0194s for 8192 events => throughput is 4.22E+05 events/s - -*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (47.690708277600116) and cpp (47.690708277600109) differ by less than 2E-14 (1.1102230246251565e-16) - -*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical - -*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -81920 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/16 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 46.22 [46.223782291775393] fbridge_mode=1 - [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.6116s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3942s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2174s for 90112 events => throughput is 4.15E+05 events/s - -*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (46.223782291775372) and cpp (46.223782291775393) differ by less than 2E-14 (4.440892098500626e-16) - -*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical - -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.190038e+05 ) sec^-1 - -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.219873e+05 ) sec^-1 +*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) *** *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -505,22 +357,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggtt_x1_cudacpp > /tmp/valassia/output_ggtt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.69 [47.690708277600109] fbridge_mode=1 + [XSECTION] Cross section = 47.69 [47.690708277600123] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.7212s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7206s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0006s for 8192 events => throughput is 1.39E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.5148s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5141s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0008s for 8192 events => throughput is 1.06E+07 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.690708277600116) and cpp (47.690708277600109) differ by less than 2E-14 (1.1102230246251565e-16) +OK! xsec from fortran (47.690708277600123) and cpp (47.690708277600123) differ by less than 2E-14 (0.0) *** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -538,65 +390,65 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggtt_x10_cudacpp > /tmp/valassia/output_ggtt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 46.22 [46.223782291775393] fbridge_mode=1 + [XSECTION] Cross section = 46.22 [46.223782291775379] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.8076s - [COUNTERS] Fortran Overhead ( 0 ) : 1.8005s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0071s for 90112 events => throughput is 1.27E+07 events/s + [COUNTERS] PROGRAM TOTAL : 1.3135s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3056s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0079s for 90112 events => throughput is 1.15E+07 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (46.223782291775372) and cpp (46.223782291775393) differ by less than 2E-14 (4.440892098500626e-16) +OK! xsec from fortran (46.223782291775386) and cpp (46.223782291775379) differ by less than 2E-14 (1.1102230246251565e-16) *** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.036838e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.274881e+06 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.660860e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.756760e+06 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.989378e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.762850e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.069849e+08 ) sec^-1 +Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.751326e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.996952e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.777828e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.150687e+08 ) sec^-1 +Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.947188e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.991689e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.738816e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.999027e+07 ) sec^-1 +Process = 
SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.161420e+07 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt index fe11b37e1c..787d8bcbcc 100644 --- a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +Working directory (build): /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx CUDACPP_BUILDDIR='.' - - make USEBUILDDIR=1 AVX=none make USEBUILDDIR=1 AVX=sse4 +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' + make USEBUILDDIR=1 AVX=avx2 + make USEBUILDDIR=1 AVX=512y +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' OMP_NUM_THREADS= -DATE: 2024-01-30_06:10:49 +DATE: 2024-01-31_15:13:04 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +Working directory (run): /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -50,18 +50,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/avalassi/output_ggtt_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./madevent_fortran < /tmp/valassia/input_ggtt_x1_fortran > /tmp/valassia/output_ggtt_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.69 [47.690708277600116] fbridge_mode=0 + [XSECTION] Cross section = 47.69 [47.690708277600123] fbridge_mode=0 [UNWEIGHT] Wrote 420 events (found 1577 events) - [COUNTERS] PROGRAM TOTAL : 0.3836s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3391s - [COUNTERS] Fortran MEs ( 1 ) : 0.0445s for 8192 events => throughput is 1.84E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2988s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2704s + [COUNTERS] Fortran MEs ( 1 ) : 0.0284s for 8192 events => throughput is 2.88E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -75,18 +75,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/avalassi/output_ggtt_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./madevent_fortran < /tmp/valassia/input_ggtt_x1_fortran > /tmp/valassia/output_ggtt_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.69 [47.690708277600116] fbridge_mode=0 + [XSECTION] Cross section = 47.69 [47.690708277600123] fbridge_mode=0 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.3381s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2936s - [COUNTERS] Fortran MEs ( 1 ) : 0.0445s for 8192 events => throughput is 1.84E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2566s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2282s + [COUNTERS] Fortran MEs ( 1 ) : 0.0284s for 8192 events => throughput is 2.88E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -100,18 +100,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x10_fortran > /tmp/avalassi/output_ggtt_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./madevent_fortran < /tmp/valassia/input_ggtt_x10_fortran > /tmp/valassia/output_ggtt_x10_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 46.22 [46.223782291775372] fbridge_mode=0 + [XSECTION] Cross section = 46.22 [46.223782291775386] fbridge_mode=0 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.8752s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3952s - [COUNTERS] Fortran MEs ( 1 ) : 0.4799s for 90112 events => throughput is 1.88E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.3327s + [COUNTERS] Fortran Overhead ( 0 ) : 1.0218s + [COUNTERS] Fortran MEs ( 1 ) : 0.3109s for 90112 events => throughput is 2.90E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -125,22 +125,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x1_cudacpp > /tmp/valassia/output_ggtt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.69 [47.690703999052587] fbridge_mode=1 + [XSECTION] Cross section = 47.69 [47.690704859565422] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.3627s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3261s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0366s for 8192 events => throughput is 2.24E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2856s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2581s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0275s for 8192 events => throughput is 2.98E+05 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.690708277600116) and cpp (47.690703999052587) differ by less than 4E-4 (8.971448917094449e-08) +OK! xsec from fortran (47.690708277600123) and cpp (47.690704859565422) differ by less than 4E-4 (7.167087312520692e-08) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -158,36 +158,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x10_cudacpp > /tmp/valassia/output_ggtt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 46.22 [46.223780103711483] fbridge_mode=1 + [XSECTION] Cross section = 46.22 [46.223780988783801] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.8092s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4081s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.4011s for 90112 events => throughput is 2.25E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.3514s + [COUNTERS] Fortran Overhead ( 0 ) : 1.0486s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3028s for 90112 events => throughput is 2.98E+05 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (46.223782291775372) and cpp (46.223780103711483) differ by less than 4E-4 (4.733632297249102e-08) +OK! xsec from fortran (46.223782291775386) and cpp (46.223780988783801) differ by less than 4E-4 (2.8188770428982934e-08) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.286506e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.092595e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.292834e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.100516e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -201,22 +201,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x1_cudacpp > /tmp/valassia/output_ggtt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.69 [47.690699958440689] fbridge_mode=1 + [XSECTION] Cross section = 47.69 [47.690703261737923] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.3194s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3043s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0151s for 8192 events => throughput is 5.42E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2593s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2465s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0128s for 8192 events => throughput is 6.38E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.690708277600116) and cpp (47.690699958440689) differ by less than 4E-4 (1.744398380187917e-07) +OK! xsec from fortran (47.690708277600123) and cpp (47.690703261737923) differ by less than 4E-4 (1.0517483139960149e-07) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -234,36 +234,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x10_cudacpp > /tmp/valassia/output_ggtt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 46.22 [46.223776162337749] fbridge_mode=1 + [XSECTION] Cross section = 46.22 [46.223779141681696] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.5555s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3898s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1657s for 90112 events => throughput is 5.44E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.1796s + [COUNTERS] Fortran Overhead ( 0 ) : 1.0385s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1411s for 90112 events => throughput is 6.38E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (46.223782291775372) and cpp (46.223776162337749) differ by less than 4E-4 (1.326035499182865e-07) +OK! xsec from fortran (46.223782291775386) and cpp (46.223779141681696) differ by less than 4E-4 (6.814876529759317e-08) *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.487444e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.726889e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.523247e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.730261e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -277,22 +277,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x1_cudacpp > /tmp/valassia/output_ggtt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.69 [47.690691653203835] fbridge_mode=1 + [XSECTION] Cross section = 47.69 [47.690694815027804] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.3066s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2985s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0082s for 8192 events => throughput is 1.00E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.2435s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2373s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0062s for 8192 events => throughput is 1.32E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.690708277600116) and cpp (47.690691653203835) differ by less than 4E-4 (3.48587741338946e-07) +OK! xsec from fortran (47.690708277600123) and cpp (47.690694815027804) differ by less than 4E-4 (2.8228920900819077e-07) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -310,188 +310,40 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x10_cudacpp > /tmp/valassia/output_ggtt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 46.22 [46.223773576247488] fbridge_mode=1 + [XSECTION] Cross section = 46.22 [46.223776468660184] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.4802s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3878s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0925s for 90112 events => throughput is 9.74E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.0950s + [COUNTERS] Fortran Overhead ( 0 ) : 1.0268s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0682s for 90112 events => throughput is 1.32E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (46.223782291775372) and cpp (46.223773576247488) differ by less than 4E-4 (1.885507298071687e-07) +OK! xsec from fortran (46.223782291775386) and cpp (46.223776468660184) differ by less than 4E-4 (1.2597660581370462e-07) *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.007163e+06 ) sec^-1 - -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.012972e+06 ) sec^-1 - -*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/16 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.69 [47.690691653203835] fbridge_mode=1 - [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.3042s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2966s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0076s for 8192 events => throughput is 1.08E+06 events/s - -*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (47.690708277600116) and cpp (47.690691653203835) differ by less than 4E-4 (3.48587741338946e-07) - -*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical - -*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -81920 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/16 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 46.22 [46.223773576247488] fbridge_mode=1 - [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.4699s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3862s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0836s for 90112 events => throughput is 1.08E+06 events/s - -*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (46.223782291775372) and cpp (46.223773576247488) differ by less than 4E-4 (1.885507298071687e-07) - -*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical - -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.018834e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.370439e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.101159e+06 ) sec^-1 - -*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/16 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.69 [47.690698822141186] fbridge_mode=1 - [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.3134s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3023s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0112s for 8192 events => throughput is 7.33E+05 events/s - -*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (47.690708277600116) and cpp (47.690698822141186) differ by less than 4E-4 (1.982662718447159e-07) - -*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.381975e+06 ) sec^-1 -OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical +*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) *** -*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -81920 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/16 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 46.22 [46.223780266165058] fbridge_mode=1 - [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.5088s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3876s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1212s for 90112 events => throughput is 7.43E+05 events/s - -*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (46.223782291775372) and cpp (46.223780266165058) differ by less than 4E-4 (4.382182106077437e-08) - -*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical - -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.590700e+05 ) sec^-1 - -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.687831e+05 ) sec^-1 +*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) *** *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -505,22 +357,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggtt_x1_cudacpp > /tmp/valassia/output_ggtt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.69 [47.690703397697987] fbridge_mode=1 + [XSECTION] Cross section = 47.69 [47.690697792016230] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.7336s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7331s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0006s for 8192 events => throughput is 1.45E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.5141s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5138s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0003s for 8192 events => throughput is 2.34E+07 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.690708277600116) and cpp (47.690703397697987) differ by less than 4E-4 (1.0232396008280631e-07) +OK! xsec from fortran (47.690708277600123) and cpp (47.690697792016230) differ by less than 4E-4 (2.198663905383924e-07) *** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -538,65 +390,65 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggtt_x10_cudacpp > /tmp/valassia/output_ggtt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 46.22 [46.223786763175951] fbridge_mode=1 + [XSECTION] Cross section = 46.22 [46.223779043453305] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.8220s - [COUNTERS] Fortran Overhead ( 0 ) : 1.8158s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0061s for 90112 events => throughput is 1.47E+07 events/s + [COUNTERS] PROGRAM TOTAL : 1.3113s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3073s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0040s for 90112 events => throughput is 2.26E+07 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (46.223782291775372) and cpp (46.223786763175951) differ by less than 4E-4 (9.673376677454826e-08) +OK! xsec from fortran (46.223782291775386) and cpp (46.223779043453305) differ by less than 4E-4 (7.027382697977202e-08) *** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.211265e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.948491e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.993599e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.013424e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.733080e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.066147e+08 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.767769e+08 ) sec^-1 +Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.908417e+08 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.726692e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.065652e+08 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.882266e+08 ) sec^-1 +Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.013454e+08 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.370639e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.070656e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.407782e+07 ) sec^-1 +Process = 
SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.176885e+07 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt index a855e5b8c2..55dc817a38 100644 --- a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +Working directory (build): /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx CUDACPP_BUILDDIR='.' - - make USEBUILDDIR=1 AVX=none + make USEBUILDDIR=1 AVX=sse4 +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 + make USEBUILDDIR=1 AVX=512y +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' -CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' OMP_NUM_THREADS= -DATE: 2024-01-30_06:11:16 +DATE: 2024-01-31_15:13:29 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +Working directory (run): /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -50,18 +50,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/avalassi/output_ggtt_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./madevent_fortran < /tmp/valassia/input_ggtt_x1_fortran > /tmp/valassia/output_ggtt_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.69 [47.690708277600116] fbridge_mode=0 + [XSECTION] Cross section = 47.69 [47.690708277600123] fbridge_mode=0 [UNWEIGHT] Wrote 420 events (found 1577 events) - [COUNTERS] PROGRAM TOTAL : 0.3775s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3336s - [COUNTERS] Fortran MEs ( 1 ) : 0.0439s for 8192 events => throughput is 1.87E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2863s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2579s + [COUNTERS] Fortran MEs ( 1 ) : 0.0284s for 8192 events => throughput is 2.88E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -75,18 +75,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/avalassi/output_ggtt_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./madevent_fortran < /tmp/valassia/input_ggtt_x1_fortran > /tmp/valassia/output_ggtt_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.69 [47.690708277600116] fbridge_mode=0 + [XSECTION] Cross section = 47.69 [47.690708277600123] fbridge_mode=0 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.3333s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2891s - [COUNTERS] Fortran MEs ( 1 ) : 0.0441s for 8192 events => throughput is 1.86E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2590s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2305s + [COUNTERS] Fortran MEs ( 1 ) : 0.0284s for 8192 events => throughput is 2.88E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -100,18 +100,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x10_fortran > /tmp/avalassi/output_ggtt_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./madevent_fortran < /tmp/valassia/input_ggtt_x10_fortran > /tmp/valassia/output_ggtt_x10_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 46.22 [46.223782291775372] fbridge_mode=0 + [XSECTION] Cross section = 46.22 [46.223782291775386] fbridge_mode=0 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.8813s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4017s - [COUNTERS] Fortran MEs ( 1 ) : 0.4796s for 90112 events => throughput is 1.88E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.3358s + [COUNTERS] Fortran Overhead ( 0 ) : 1.0250s + [COUNTERS] Fortran MEs ( 1 ) : 0.3108s for 90112 events => throughput is 2.90E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -125,22 +125,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x1_cudacpp > /tmp/valassia/output_ggtt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.69 [47.690709601032019] fbridge_mode=1 + [XSECTION] Cross section = 47.69 [47.690709601032033] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.3684s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3289s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0395s for 8192 events => throughput is 2.07E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2968s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2636s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0332s for 8192 events => throughput is 2.47E+05 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.690708277600116) and cpp (47.690709601032019) differ by less than 2E-4 (2.77503091616893e-08) +OK! xsec from fortran (47.690708277600123) and cpp (47.690709601032033) differ by less than 2E-4 (2.7750309383733907e-08) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -158,36 +158,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x10_cudacpp > /tmp/valassia/output_ggtt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 46.22 [46.223783635280974] fbridge_mode=1 + [XSECTION] Cross section = 46.22 [46.223783635280981] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.8620s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4222s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.4398s for 90112 events => throughput is 2.05E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.4150s + [COUNTERS] Fortran Overhead ( 0 ) : 1.0523s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3627s for 90112 events => throughput is 2.48E+05 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (46.223782291775372) and cpp (46.223783635280974) differ by less than 2E-4 (2.9065245987780486e-08) +OK! xsec from fortran (46.223782291775386) and cpp (46.223783635280981) differ by less than 2E-4 (2.906524576573588e-08) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.094601e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.551566e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.081772e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.553689e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -201,22 +201,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x1_cudacpp > /tmp/valassia/output_ggtt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.69 [47.690709601032026] fbridge_mode=1 + [XSECTION] Cross section = 47.69 [47.690709601032033] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.3338s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3117s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0221s for 8192 events => throughput is 3.70E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2651s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2478s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0173s for 8192 events => throughput is 4.75E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.690708277600116) and cpp (47.690709601032026) differ by less than 2E-4 (2.7750309383733907e-08) +OK! xsec from fortran (47.690708277600123) and cpp (47.690709601032033) differ by less than 2E-4 (2.7750309383733907e-08) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -234,8 +234,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x10_cudacpp > /tmp/valassia/output_ggtt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -243,27 +243,27 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 46.22 [46.223783635280974] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.6431s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3996s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2435s for 90112 events => throughput is 3.70E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.2268s + [COUNTERS] Fortran Overhead ( 0 ) : 1.0370s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1898s for 90112 events => throughput is 4.75E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (46.223782291775372) and cpp (46.223783635280974) differ by less than 2E-4 (2.9065245987780486e-08) +OK! xsec from fortran (46.223782291775386) and cpp (46.223783635280974) differ by less than 2E-4 (2.906524576573588e-08) *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.699859e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.772802e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.728578e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.772941e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -277,22 +277,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x1_cudacpp > /tmp/valassia/output_ggtt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.69 [47.690709643441508] fbridge_mode=1 + [XSECTION] Cross section = 47.69 [47.690709643441529] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.3207s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3066s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0141s for 8192 events => throughput is 5.81E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2517s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2417s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0100s for 8192 events => throughput is 8.17E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.690708277600116) and cpp (47.690709643441508) differ by less than 2E-4 (2.863957027088304e-08) +OK! xsec from fortran (47.690708277600123) and cpp (47.690709643441529) differ by less than 2E-4 (2.8639570492927646e-08) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -310,188 +310,40 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x10_cudacpp > /tmp/valassia/output_ggtt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 46.22 [46.223783660238851] fbridge_mode=1 + [XSECTION] Cross section = 46.22 [46.223783660238837] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.5524s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3953s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1572s for 90112 events => throughput is 5.73E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.1412s + [COUNTERS] Fortran Overhead ( 0 ) : 1.0313s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1099s for 90112 events => throughput is 8.20E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (46.223782291775372) and cpp (46.223783660238851) differ by less than 2E-4 (2.9605181861569463e-08) +OK! xsec from fortran (46.223782291775386) and cpp (46.223783660238837) differ by less than 2E-4 (2.9605181195435648e-08) *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.934952e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.452588e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.894708e+05 ) sec^-1 - -*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/16 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.69 [47.690709643441508] fbridge_mode=1 - [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.3189s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3068s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0122s for 8192 events => throughput is 6.74E+05 events/s +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.508771e+05 ) sec^-1 -*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (47.690708277600116) and cpp (47.690709643441508) differ by less than 2E-4 (2.863957027088304e-08) - -*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical - -*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -81920 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/16 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 46.22 [46.223783660238851] fbridge_mode=1 - [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.5411s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4039s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1372s for 90112 events => throughput is 6.57E+05 events/s - -*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (46.223782291775372) and cpp (46.223783660238851) differ by less than 2E-4 (2.9605181861569463e-08) - -*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical +*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) *** -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.847513e+05 ) sec^-1 - -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.835511e+05 ) sec^-1 - -*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/16 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.69 [47.690709643441508] fbridge_mode=1 - [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.3300s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3106s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0194s for 8192 events => throughput is 4.22E+05 events/s - -*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (47.690708277600116) and cpp (47.690709643441508) differ by less than 2E-4 (2.863957027088304e-08) - -*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical - -*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -81920 1 1 ! Number of events and max and min iterations -0.000001 ! 
Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/16 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 46.22 [46.223783660238851] fbridge_mode=1 - [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.6128s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4020s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2108s for 90112 events => throughput is 4.27E+05 events/s - -*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (46.223782291775372) and cpp (46.223783660238851) differ by less than 2E-4 (2.9605181861569463e-08) - -*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical - -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.284904e+05 ) sec^-1 - -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.343219e+05 ) sec^-1 +*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) *** *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -505,22 +357,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggtt_x1_cudacpp > /tmp/valassia/output_ggtt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.69 [47.690708266690706] fbridge_mode=1 + [XSECTION] Cross section = 47.69 [47.690708266690727] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.7257s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7251s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0006s for 8192 events => throughput is 1.43E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.5142s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5135s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0008s for 8192 events => throughput is 1.06E+07 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.690708277600116) and cpp (47.690708266690706) differ by less than 2E-4 (2.2875334959593374e-10) +OK! 
xsec from fortran (47.690708277600123) and cpp (47.690708266690727) differ by less than 2E-4 (2.2875312755132882e-10) *** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -538,65 +390,65 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggtt_x10_cudacpp > /tmp/valassia/output_ggtt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 46.22 [46.223782303744791] fbridge_mode=1 + [XSECTION] Cross section = 46.22 [46.223782303744805] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.8142s - [COUNTERS] Fortran Overhead ( 0 ) : 1.8072s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0070s for 90112 events => throughput is 1.29E+07 events/s + [COUNTERS] PROGRAM TOTAL : 1.3130s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3053s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0077s for 90112 events => throughput is 1.17E+07 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (46.223782291775372) and cpp (46.223782303744791) differ by less than 2E-4 (2.5894508759449764e-10) +OK! xsec from fortran (46.223782291775386) and cpp (46.223782303744805) differ by less than 2E-4 (2.5894508759449764e-10) *** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.009523e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.208836e+06 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.569379e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.819798e+06 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.989648e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.804520e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.069002e+08 ) sec^-1 +Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.803586e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.991680e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.807496e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.142771e+08 ) sec^-1 +Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.992431e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.993358e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.769915e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.008591e+07 ) sec^-1 +Process = 
SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.182699e+07 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt index ad1d0f839b..9533845a25 100644 --- a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +Working directory (build): /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg CUDACPP_BUILDDIR='.' - make USEBUILDDIR=1 AVX=none +make USEBUILDDIR=1 AVX=sse4 +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=avx2 -make USEBUILDDIR=1 AVX=sse4 + make USEBUILDDIR=1 AVX=512y +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' OMP_NUM_THREADS= -DATE: 2024-01-30_06:11:43 +DATE: 2024-01-31_15:13:54 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +Working directory (run): /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -50,18 +50,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/avalassi/output_ggttg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./madevent_fortran < /tmp/valassia/input_ggttg_x1_fortran > /tmp/valassia/output_ggttg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.0972 [9.7196357922470764E-002] fbridge_mode=0 + [XSECTION] Cross section = 0.0972 [9.7196702725954640E-002] fbridge_mode=0 [UNWEIGHT] Wrote 42 events (found 469 events) - [COUNTERS] PROGRAM TOTAL : 0.5934s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2449s - [COUNTERS] Fortran MEs ( 1 ) : 0.3485s for 8192 events => throughput is 2.35E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.4630s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2594s + [COUNTERS] Fortran MEs ( 1 ) : 0.2036s for 8192 events => throughput is 4.02E+04 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -75,18 +75,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/avalassi/output_ggttg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./madevent_fortran < /tmp/valassia/input_ggttg_x1_fortran > /tmp/valassia/output_ggttg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.0972 [9.7196357922470764E-002] fbridge_mode=0 + [XSECTION] Cross section = 0.0972 [9.7196702725954640E-002] fbridge_mode=0 [UNWEIGHT] Wrote 41 events (found 467 events) - [COUNTERS] PROGRAM TOTAL : 0.5871s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2379s - [COUNTERS] Fortran MEs ( 1 ) : 0.3492s for 8192 events => throughput is 2.35E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.3970s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1933s + [COUNTERS] Fortran MEs ( 1 ) : 0.2036s for 8192 events => throughput is 4.02E+04 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -100,18 +100,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x10_fortran > /tmp/avalassi/output_ggttg_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./madevent_fortran < /tmp/valassia/input_ggttg_x10_fortran > /tmp/valassia/output_ggttg_x10_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.08131 [8.1310872077655569E-002] fbridge_mode=0 + [XSECTION] Cross section = 0.08131 [8.1310271073909590E-002] fbridge_mode=0 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 5.3849s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5565s - [COUNTERS] Fortran MEs ( 1 ) : 3.8284s for 90112 events => throughput is 2.35E+04 events/s + [COUNTERS] PROGRAM TOTAL : 3.3928s + [COUNTERS] Fortran Overhead ( 0 ) : 1.1576s + [COUNTERS] Fortran MEs ( 1 ) : 2.2352s for 90112 events => throughput is 4.03E+04 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -125,22 +125,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x1_cudacpp > /tmp/valassia/output_ggttg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.0972 [9.7196357922470777E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.0972 [9.7196702725954626E-002] fbridge_mode=1 [UNWEIGHT] Wrote 41 events (found 467 events) - [COUNTERS] PROGRAM TOTAL : 0.9218s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5788s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3431s for 8192 events => throughput is 2.39E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.7532s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4711s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2822s for 8192 events => throughput is 2.90E+04 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.7196357922470764E-002) and cpp (9.7196357922470777E-002) differ by less than 2E-14 (2.220446049250313e-16) +OK! xsec from fortran (9.7196702725954640E-002) and cpp (9.7196702725954626E-002) differ by less than 2E-14 (1.1102230246251565e-16) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -158,36 +158,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x10_cudacpp > /tmp/valassia/output_ggttg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.08131 [8.1310872077655597E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.08131 [8.1310271073909604E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 5.7045s - [COUNTERS] Fortran Overhead ( 0 ) : 1.9313s - [COUNTERS] CudaCpp MEs ( 2 ) : 3.7731s for 90112 events => throughput is 2.39E+04 events/s + [COUNTERS] PROGRAM TOTAL : 4.5342s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4286s + [COUNTERS] CudaCpp MEs ( 2 ) : 3.1055s for 90112 events => throughput is 2.90E+04 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (8.1310872077655569E-002) and cpp (8.1310872077655597E-002) differ by less than 2E-14 (4.440892098500626e-16) +OK! xsec from fortran (8.1310271073909590E-002) and cpp (8.1310271073909604E-002) differ by less than 2E-14 (2.220446049250313e-16) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.459637e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.974124e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.457759e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.990607e+04 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -201,22 +201,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x1_cudacpp > /tmp/valassia/output_ggttg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.0972 [9.7196357922470777E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.0972 [9.7196702725954598E-002] fbridge_mode=1 [UNWEIGHT] Wrote 41 events (found 467 events) - [COUNTERS] PROGRAM TOTAL : 0.5930s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4166s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1764s for 8192 events => throughput is 4.64E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.4654s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3301s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1353s for 8192 events => throughput is 6.06E+04 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.7196357922470764E-002) and cpp (9.7196357922470777E-002) differ by less than 2E-14 (2.220446049250313e-16) +OK! xsec from fortran (9.7196702725954640E-002) and cpp (9.7196702725954598E-002) differ by less than 2E-14 (4.440892098500626e-16) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -234,36 +234,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x10_cudacpp > /tmp/valassia/output_ggttg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.08131 [8.1310872077655555E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.08131 [8.1310271073909618E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 3.7266s - [COUNTERS] Fortran Overhead ( 0 ) : 1.7761s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.9506s for 90112 events => throughput is 4.62E+04 events/s + [COUNTERS] PROGRAM TOTAL : 2.7790s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2917s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.4873s for 90112 events => throughput is 6.06E+04 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (8.1310872077655569E-002) and cpp (8.1310872077655555E-002) differ by less than 2E-14 (2.220446049250313e-16) +OK! xsec from fortran (8.1310271073909590E-002) and cpp (8.1310271073909618E-002) differ by less than 2E-14 (4.440892098500626e-16) *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.735934e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.140787e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.731299e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.147421e+04 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -277,22 +277,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x1_cudacpp > /tmp/valassia/output_ggttg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.0972 [9.7196357922470750E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.0972 [9.7196702725954640E-002] fbridge_mode=1 [UNWEIGHT] Wrote 41 events (found 467 events) - [COUNTERS] PROGRAM TOTAL : 0.4266s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3331s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0935s for 8192 events => throughput is 8.76E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.3276s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2608s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0668s for 8192 events => throughput is 1.23E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.7196357922470764E-002) and cpp (9.7196357922470750E-002) differ by less than 2E-14 (1.1102230246251565e-16) +OK! xsec from fortran (9.7196702725954640E-002) and cpp (9.7196702725954640E-002) differ by less than 2E-14 (0.0) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -310,188 +310,40 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x10_cudacpp > /tmp/valassia/output_ggttg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.08131 [8.1310872077655555E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.08131 [8.1310271073909576E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 2.6908s - [COUNTERS] Fortran Overhead ( 0 ) : 1.6878s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.0029s for 90112 events => throughput is 8.98E+04 events/s + [COUNTERS] PROGRAM TOTAL : 1.9583s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2225s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.7358s for 90112 events => throughput is 1.22E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (8.1310872077655569E-002) and cpp (8.1310872077655555E-002) differ by less than 2E-14 (2.220446049250313e-16) +OK! xsec from fortran (8.1310271073909590E-002) and cpp (8.1310271073909576E-002) differ by less than 2E-14 (2.220446049250313e-16) *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.260876e+04 ) sec^-1 - -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.205229e+04 ) sec^-1 - -*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 32/32 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.0972 [9.7196357922470750E-002] fbridge_mode=1 - [UNWEIGHT] Wrote 41 events (found 467 events) - [COUNTERS] PROGRAM TOTAL : 0.4089s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3259s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0830s for 8192 events => throughput is 9.87E+04 events/s - -*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (9.7196357922470764E-002) and cpp (9.7196357922470750E-002) differ by less than 2E-14 (1.1102230246251565e-16) - -*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical - -*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -81920 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 32/32 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.08131 [8.1310872077655555E-002] fbridge_mode=1 - [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 2.5356s - [COUNTERS] Fortran Overhead ( 0 ) : 1.6675s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.8681s for 90112 events => throughput is 1.04E+05 events/s - -*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (8.1310872077655569E-002) and cpp (8.1310872077655555E-002) differ by less than 2E-14 (2.220446049250313e-16) - -*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical - -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.066040e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.245428e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.071802e+05 ) sec^-1 - -*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 32/32 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.0972 [9.7196357922470750E-002] fbridge_mode=1 - [UNWEIGHT] Wrote 41 events (found 467 events) - [COUNTERS] PROGRAM TOTAL : 0.4633s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3521s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1112s for 8192 events => throughput is 7.37E+04 events/s - -*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** - -OK! 
xsec from fortran (9.7196357922470764E-002) and cpp (9.7196357922470750E-002) differ by less than 2E-14 (1.1102230246251565e-16) - -*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.274058e+05 ) sec^-1 -OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical +*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) *** -*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -81920 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 32/32 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.08131 [8.1310872077655555E-002] fbridge_mode=1 - [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 2.9267s - [COUNTERS] Fortran Overhead ( 0 ) : 1.7103s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.2164s for 90112 events => throughput is 7.41E+04 events/s - -*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (8.1310872077655569E-002) and cpp (8.1310872077655555E-002) differ by less than 2E-14 (2.220446049250313e-16) - -*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical - -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.521586e+04 ) sec^-1 - -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.533151e+04 ) sec^-1 +*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) *** *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -505,22 +357,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggttg_x1_cudacpp > /tmp/valassia/output_ggttg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.0972 [9.7196357922470764E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.0972 [9.7196702725954653E-002] fbridge_mode=1 [UNWEIGHT] Wrote 41 events (found 467 events) - [COUNTERS] PROGRAM TOTAL : 0.6854s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6800s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0055s for 8192 events => throughput is 1.50E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.5093s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5015s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0078s for 8192 events => throughput is 1.06E+06 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.7196357922470764E-002) and cpp (9.7196357922470764E-002) differ by less than 2E-14 (0.0) +OK! xsec from fortran (9.7196702725954640E-002) and cpp (9.7196702725954653E-002) differ by less than 2E-14 (2.220446049250313e-16) *** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -538,65 +390,65 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggttg_x10_cudacpp > /tmp/valassia/output_ggttg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.08131 [8.1310872077655610E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.08131 [8.1310271073909604E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 2.0439s - [COUNTERS] Fortran Overhead ( 0 ) : 2.0206s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0234s for 90112 events => throughput is 3.85E+06 events/s + [COUNTERS] PROGRAM TOTAL : 1.5423s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4570s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0852s for 90112 events => throughput is 1.06E+06 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (8.1310872077655569E-002) and cpp (8.1310872077655610E-002) differ by less than 2E-14 (4.440892098500626e-16) +OK! xsec from fortran (8.1310271073909590E-002) and cpp (8.1310271073909604E-002) differ by less than 2E-14 (2.220446049250313e-16) *** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.630499e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.099505e+06 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.083902e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.136233e+06 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.662154e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.679574e+06 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.243596e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.301174e+06 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.668083e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.679806e+06 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.255740e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.846610e+06 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.633959e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.661806e+06 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.773665e+06 ) sec^-1 
+Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.486117e+05 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt index c17be1788d..e544f39758 100644 --- a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +Working directory (build): /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg CUDACPP_BUILDDIR='.' - make USEBUILDDIR=1 AVX=none make USEBUILDDIR=1 AVX=sse4 +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=avx2 + make USEBUILDDIR=1 AVX=512y +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' OMP_NUM_THREADS= -DATE: 2024-01-30_06:12:28 +DATE: 2024-01-31_15:14:33 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +Working directory (run): /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -50,18 +50,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/avalassi/output_ggttg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./madevent_fortran < /tmp/valassia/input_ggttg_x1_fortran > /tmp/valassia/output_ggttg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.0972 [9.7196357922470764E-002] fbridge_mode=0 + [XSECTION] Cross section = 0.0972 [9.7196702725954640E-002] fbridge_mode=0 [UNWEIGHT] Wrote 42 events (found 469 events) - [COUNTERS] PROGRAM TOTAL : 0.5874s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2394s - [COUNTERS] Fortran MEs ( 1 ) : 0.3480s for 8192 events => throughput is 2.35E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.4007s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1973s + [COUNTERS] Fortran MEs ( 1 ) : 0.2035s for 8192 events => throughput is 4.03E+04 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -75,18 +75,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/avalassi/output_ggttg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./madevent_fortran < /tmp/valassia/input_ggttg_x1_fortran > /tmp/valassia/output_ggttg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.0972 [9.7196357922470764E-002] fbridge_mode=0 + [XSECTION] Cross section = 0.0972 [9.7196702725954640E-002] fbridge_mode=0 [UNWEIGHT] Wrote 41 events (found 467 events) - [COUNTERS] PROGRAM TOTAL : 0.5918s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2437s - [COUNTERS] Fortran MEs ( 1 ) : 0.3481s for 8192 events => throughput is 2.35E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.4001s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1965s + [COUNTERS] Fortran MEs ( 1 ) : 0.2036s for 8192 events => throughput is 4.02E+04 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -100,18 +100,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x10_fortran > /tmp/avalassi/output_ggttg_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./madevent_fortran < /tmp/valassia/input_ggttg_x10_fortran > /tmp/valassia/output_ggttg_x10_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.08131 [8.1310872077655569E-002] fbridge_mode=0 + [XSECTION] Cross section = 0.08131 [8.1310271073909590E-002] fbridge_mode=0 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 5.3901s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5607s - [COUNTERS] Fortran MEs ( 1 ) : 3.8294s for 90112 events => throughput is 2.35E+04 events/s + [COUNTERS] PROGRAM TOTAL : 3.4016s + [COUNTERS] Fortran Overhead ( 0 ) : 1.1641s + [COUNTERS] Fortran MEs ( 1 ) : 2.2376s for 90112 events => throughput is 4.03E+04 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -125,22 +125,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x1_cudacpp > /tmp/valassia/output_ggttg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.0972 [9.7196347758884971E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.0972 [9.7196694166750697E-002] fbridge_mode=1 [UNWEIGHT] Wrote 41 events (found 467 events) - [COUNTERS] PROGRAM TOTAL : 0.8754s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5552s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3203s for 8192 events => throughput is 2.56E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.7038s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4463s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2575s for 8192 events => throughput is 3.18E+04 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.7196357922470764E-002) and cpp (9.7196347758884971E-002) differ by less than 4E-4 (1.0456755794585604e-07) +OK! xsec from fortran (9.7196702725954640E-002) and cpp (9.7196694166750697E-002) differ by less than 4E-4 (8.806064100141953e-08) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -158,36 +158,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x10_cudacpp > /tmp/valassia/output_ggttg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.08131 [8.1310858119443913E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.08131 [8.1310258386649639E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 5.4534s - [COUNTERS] Fortran Overhead ( 0 ) : 1.9123s - [COUNTERS] CudaCpp MEs ( 2 ) : 3.5411s for 90112 events => throughput is 2.54E+04 events/s + [COUNTERS] PROGRAM TOTAL : 4.2441s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4085s + [COUNTERS] CudaCpp MEs ( 2 ) : 2.8356s for 90112 events => throughput is 3.18E+04 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (8.1310872077655569E-002) and cpp (8.1310858119443913E-002) differ by less than 4E-4 (1.7166476384833373e-07) +OK! xsec from fortran (8.1310271073909590E-002) and cpp (8.1310258386649639E-002) differ by less than 4E-4 (1.5603514513795602e-07) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.651171e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.298484e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.640512e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.293545e+04 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -201,22 +201,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x1_cudacpp > /tmp/valassia/output_ggttg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.0972 [9.7196323434217816E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.0972 [9.7196680760393742E-002] fbridge_mode=1 [UNWEIGHT] Wrote 41 events (found 467 events) - [COUNTERS] PROGRAM TOTAL : 0.4354s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3378s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0976s for 8192 events => throughput is 8.39E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.3535s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2781s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0754s for 8192 events => throughput is 1.09E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.7196357922470764E-002) and cpp (9.7196323434217816E-002) differ by less than 4E-4 (3.548307125900152e-07) +OK! xsec from fortran (9.7196702725954640E-002) and cpp (9.7196680760393742E-002) differ by less than 4E-4 (2.2599080296004104e-07) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -234,36 +234,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x10_cudacpp > /tmp/valassia/output_ggttg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.08131 [8.1310842598054087E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.08131 [8.1310249885719388E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 2.7743s - [COUNTERS] Fortran Overhead ( 0 ) : 1.6927s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.0816s for 90112 events => throughput is 8.33E+04 events/s + [COUNTERS] PROGRAM TOTAL : 2.0647s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2349s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.8298s for 90112 events => throughput is 1.09E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (8.1310872077655569E-002) and cpp (8.1310842598054087E-002) differ by less than 4E-4 (3.625542406293647e-07) +OK! xsec from fortran (8.1310271073909590E-002) and cpp (8.1310249885719388E-002) differ by less than 4E-4 (2.6058442459397924e-07) *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.607319e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.107341e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.623071e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.107225e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -277,22 +277,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x1_cudacpp > /tmp/valassia/output_ggttg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.0972 [9.7196325695161859E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.0972 [9.7196679618405488E-002] fbridge_mode=1 [UNWEIGHT] Wrote 41 events (found 467 events) - [COUNTERS] PROGRAM TOTAL : 0.3345s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2882s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0463s for 8192 events => throughput is 1.77E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2659s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2315s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0344s for 8192 events => throughput is 2.38E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.7196357922470764E-002) and cpp (9.7196325695161859E-002) differ by less than 4E-4 (3.3156909984288774e-07) +OK! xsec from fortran (9.7196702725954640E-002) and cpp (9.7196679618405488E-002) differ by less than 4E-4 (2.3774005186716352e-07) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -310,188 +310,40 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x10_cudacpp > /tmp/valassia/output_ggttg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.08131 [8.1310842393515825E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.08131 [8.1310249280068872E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 2.1519s - [COUNTERS] Fortran Overhead ( 0 ) : 1.6391s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.5128s for 90112 events => throughput is 1.76E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.5665s + [COUNTERS] Fortran Overhead ( 0 ) : 1.1912s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3753s for 90112 events => throughput is 2.40E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (8.1310872077655569E-002) and cpp (8.1310842393515825E-002) differ by less than 4E-4 (3.650697499857358e-07) +OK! xsec from fortran (8.1310271073909590E-002) and cpp (8.1310249280068872E-002) differ by less than 4E-4 (2.680330594140301e-07) *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.801449e+05 ) sec^-1 - -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.824776e+05 ) sec^-1 - -*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 32/32 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.0972 [9.7196325695161859E-002] fbridge_mode=1 - [UNWEIGHT] Wrote 41 events (found 467 events) - [COUNTERS] PROGRAM TOTAL : 0.3234s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2821s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0413s for 8192 events => throughput is 1.98E+05 events/s - -*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (9.7196357922470764E-002) and cpp (9.7196325695161859E-002) differ by less than 4E-4 (3.3156909984288774e-07) - -*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical - -*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -81920 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
---------------------
-Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
- [NGOODHEL] ngoodhel/ncomb = 32/32
- [XSECTION] VECSIZE_USED = 8192
- [XSECTION] MultiChannel = TRUE
- [XSECTION] Configuration = 1
- [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 0.08131 [8.1310842393515825E-002] fbridge_mode=1
- [UNWEIGHT] Wrote 679 events (found 1787 events)
- [COUNTERS] PROGRAM TOTAL : 2.0864s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.6356s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.4508s for 90112 events => throughput is 2.00E+05 events/s
-
-*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
-
-OK! xsec from fortran (8.1310872077655569E-002) and cpp (8.1310842393515825E-002) differ by less than 4E-4 (3.650697499857358e-07)
-
-*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
-
-OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
-
-*** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.066287e+05 ) sec^-1
+Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 2.454107e+05 ) sec^-1
 
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.073001e+05 ) sec^-1
-
-*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
---------------------
-CUDACPP_RUNTIME_FBRIDGEMODE = (not set)
-CUDACPP_RUNTIME_VECSIZEUSED = 8192
---------------------
-8192 1 1 ! Number of events and max and min iterations
-0.000001 ! Accuracy (ignored because max iterations = min iterations)
-0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present)
-1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement)
-0 ! Helicity Sum/event 0=exact
-1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
---------------------
-Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
- [NGOODHEL] ngoodhel/ncomb = 32/32
- [XSECTION] VECSIZE_USED = 8192
- [XSECTION] MultiChannel = TRUE
- [XSECTION] Configuration = 1
- [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 0.0972 [9.7196344080460087E-002] fbridge_mode=1
- [UNWEIGHT] Wrote 41 events (found 467 events)
- [COUNTERS] PROGRAM TOTAL : 0.3518s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.2972s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0546s for 8192 events => throughput is 1.50E+05 events/s
-
-*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
-
-OK! xsec from fortran (9.7196357922470764E-002) and cpp (9.7196344080460087E-002) differ by less than 4E-4 (1.4241285339888776e-07)
-
-*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
+Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 2.462442e+05 ) sec^-1
-OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical
+*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) ***
-*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) ***
---------------------
-CUDACPP_RUNTIME_FBRIDGEMODE = (not set)
-CUDACPP_RUNTIME_VECSIZEUSED = 8192
---------------------
-81920 1 1 ! Number of events and max and min iterations
-0.000001 ! Accuracy (ignored because max iterations = min iterations)
-0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present)
-1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement)
-0 ! Helicity Sum/event 0=exact
-1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
---------------------
-Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
- [NGOODHEL] ngoodhel/ncomb = 32/32
- [XSECTION] VECSIZE_USED = 8192
- [XSECTION] MultiChannel = TRUE
- [XSECTION] Configuration = 1
- [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 0.08131 [8.1310857813116089E-002] fbridge_mode=1
- [UNWEIGHT] Wrote 679 events (found 1787 events)
- [COUNTERS] PROGRAM TOTAL : 2.2586s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.6507s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.6080s for 90112 events => throughput is 1.48E+05 events/s
-
-*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
-
-OK! xsec from fortran (8.1310872077655569E-002) and cpp (8.1310857813116089E-002) differ by less than 4E-4 (1.754321300451167e-07)
-
-*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
-
-OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
-
-*** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.497722e+05 ) sec^-1
-
-*** EXECUTE CHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.492408e+05 ) sec^-1
+*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) ***
 
 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) ***
 --------------------
@@ -505,22 +357,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
 0 ! Helicity Sum/event 0=exact
 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
 --------------------
-Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggttg_x1_cudacpp > /tmp/valassia/output_ggttg_x1_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
  [NGOODHEL] ngoodhel/ncomb = 32/32
  [XSECTION] VECSIZE_USED = 8192
  [XSECTION] MultiChannel = TRUE
  [XSECTION] Configuration = 1
  [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 0.0972 [9.7196349366366022E-002] fbridge_mode=1
+ [XSECTION] Cross section = 0.0972 [9.7196692039411392E-002] fbridge_mode=1
  [UNWEIGHT] Wrote 41 events (found 467 events)
- [COUNTERS] PROGRAM TOTAL : 0.6751s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.6742s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0008s for 8192 events => throughput is 9.66E+06 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.4883s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.4862s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0021s for 8192 events => throughput is 3.93E+06 events/s
 
 *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec ***
 
-OK! xsec from fortran (9.7196357922470764E-002) and cpp (9.7196349366366022E-002) differ by less than 4E-4 (8.802906736882221e-08)
+OK! xsec from fortran (9.7196702725954640E-002) and cpp (9.7196692039411392E-002) differ by less than 4E-4 (1.0994759025440004e-07)
 
 *** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
 
@@ -538,65 +390,65 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
 0 ! Helicity Sum/event 0=exact
 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
 --------------------
-Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggttg_x10_cudacpp > /tmp/valassia/output_ggttg_x10_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
  [NGOODHEL] ngoodhel/ncomb = 32/32
  [XSECTION] VECSIZE_USED = 8192
  [XSECTION] MultiChannel = TRUE
  [XSECTION] Configuration = 1
  [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 0.08131 [8.1310864949473954E-002] fbridge_mode=1
+ [XSECTION] Cross section = 0.08131 [8.1310258751737655E-002] fbridge_mode=1
  [UNWEIGHT] Wrote 679 events (found 1787 events)
- [COUNTERS] PROGRAM TOTAL : 2.0322s
- [COUNTERS] Fortran Overhead ( 0 ) : 2.0221s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0102s for 90112 events => throughput is 8.88E+06 events/s
+ [COUNTERS] PROGRAM TOTAL : 1.4714s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.4486s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0228s for 90112 events => throughput is 3.96E+06 events/s
 
 *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec ***
 
-OK! xsec from fortran (8.1310872077655569E-002) and cpp (8.1310864949473954E-002) differ by less than 4E-4 (8.766578729613173e-08)
+OK! xsec from fortran (8.1310271073909590E-002) and cpp (8.1310258751737655E-002) differ by less than 4E-4 (1.5154508492543073e-07)
 
 *** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
 
 OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical
 
 *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.288695e+07 ) sec^-1
+Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 4.587112e+06 ) sec^-1
 
 *** EXECUTE GCHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.864373e+07 ) sec^-1
+Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 4.268423e+06 ) sec^-1
 
 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge ***
-Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.630957e+07 ) sec^-1
+Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 1.466901e+07 ) sec^-1
 
 *** EXECUTE GCHECK(MAX) -p 16384 32 1 ***
-Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 4.365812e+07 ) sec^-1
+Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 1.085456e+07 ) sec^-1
 
 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge ***
-Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.633946e+07 ) sec^-1
+Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 1.466932e+07 ) sec^-1
 
 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 ***
-Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 4.471906e+07 ) sec^-1
+Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 1.636891e+07 ) sec^-1
 
 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge ***
-Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.509519e+07 ) sec^-1
+Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 1.418919e+07 ) sec^-1
 
 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 ***
-Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.624050e+07 ) sec^-1
+Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 4.306800e+06 ) sec^-1
 
 TEST COMPLETED
diff --git a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt
index daa5ca9a3d..fb1abbbf81 100644
--- a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt
+++ b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt
@@ -1,42 +1,42 @@
-Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg
+Working directory (build): /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg
 CUDACPP_BUILDDIR='.'
-
-
 make USEBUILDDIR=1 AVX=none
 make USEBUILDDIR=1 AVX=sse4
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+ make USEBUILDDIR=1 AVX=avx2
+ make USEBUILDDIR=1 AVX=512y
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
 make USEBUILDDIR=1 AVX=512z
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
-CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0'
 CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0'
 CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0'
 CUDACPP_BUILDDIR='build.none_m_inl0_hrd0'
-CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0'
+CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg'
 
 OMP_NUM_THREADS=
 
-DATE: 2024-01-30_06:13:08
+DATE: 2024-01-31_15:15:08
 
-On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
-Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg
+On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]:
+Working directory (run): /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg
 
 *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) ***
 --------------------
@@ -50,18 +50,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
 0 ! Helicity Sum/event 0=exact
 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
 --------------------
-Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/avalassi/output_ggttg_x1_fortran'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./madevent_fortran < /tmp/valassia/input_ggttg_x1_fortran > /tmp/valassia/output_ggttg_x1_fortran'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
  [NGOODHEL] ngoodhel/ncomb = 32/32
  [XSECTION] VECSIZE_USED = 8192
  [XSECTION] MultiChannel = TRUE
  [XSECTION] Configuration = 1
  [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 0.0972 [9.7196357922470764E-002] fbridge_mode=0
+ [XSECTION] Cross section = 0.0972 [9.7196702725954640E-002] fbridge_mode=0
  [UNWEIGHT] Wrote 42 events (found 469 events)
- [COUNTERS] PROGRAM TOTAL : 0.5921s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.2413s
- [COUNTERS] Fortran MEs ( 1 ) : 0.3507s for 8192 events => throughput is 2.34E+04 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.4003s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.1966s
+ [COUNTERS] Fortran MEs ( 1 ) : 0.2037s for 8192 events => throughput is 4.02E+04 events/s
 
 *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) ***
 --------------------
@@ -75,18 +75,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
 0 ! Helicity Sum/event 0=exact
 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
 --------------------
-Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/avalassi/output_ggttg_x1_fortran'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./madevent_fortran < /tmp/valassia/input_ggttg_x1_fortran > /tmp/valassia/output_ggttg_x1_fortran'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
  [NGOODHEL] ngoodhel/ncomb = 32/32
  [XSECTION] VECSIZE_USED = 8192
  [XSECTION] MultiChannel = TRUE
  [XSECTION] Configuration = 1
  [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 0.0972 [9.7196357922470764E-002] fbridge_mode=0
+ [XSECTION] Cross section = 0.0972 [9.7196702725954640E-002] fbridge_mode=0
  [UNWEIGHT] Wrote 41 events (found 467 events)
- [COUNTERS] PROGRAM TOTAL : 0.5875s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.2393s
- [COUNTERS] Fortran MEs ( 1 ) : 0.3482s for 8192 events => throughput is 2.35E+04 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.4213s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.2174s
+ [COUNTERS] Fortran MEs ( 1 ) : 0.2039s for 8192 events => throughput is 4.02E+04 events/s
 
 *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) ***
 --------------------
@@ -100,18 +100,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
 0 ! Helicity Sum/event 0=exact
 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
 --------------------
-Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x10_fortran > /tmp/avalassi/output_ggttg_x10_fortran'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./madevent_fortran < /tmp/valassia/input_ggttg_x10_fortran > /tmp/valassia/output_ggttg_x10_fortran'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
  [NGOODHEL] ngoodhel/ncomb = 32/32
  [XSECTION] VECSIZE_USED = 8192
  [XSECTION] MultiChannel = TRUE
  [XSECTION] Configuration = 1
  [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 0.08131 [8.1310872077655569E-002] fbridge_mode=0
+ [XSECTION] Cross section = 0.08131 [8.1310271073909590E-002] fbridge_mode=0
  [UNWEIGHT] Wrote 679 events (found 1787 events)
- [COUNTERS] PROGRAM TOTAL : 5.3863s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.5579s
- [COUNTERS] Fortran MEs ( 1 ) : 3.8284s for 90112 events => throughput is 2.35E+04 events/s
+ [COUNTERS] PROGRAM TOTAL : 3.3970s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.1592s
+ [COUNTERS] Fortran MEs ( 1 ) : 2.2378s for 90112 events => throughput is 4.03E+04 events/s
 
 *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -125,22 +125,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
 0 ! Helicity Sum/event 0=exact
 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
 --------------------
-Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x1_cudacpp > /tmp/valassia/output_ggttg_x1_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
  [NGOODHEL] ngoodhel/ncomb = 32/32
  [XSECTION] VECSIZE_USED = 8192
  [XSECTION] MultiChannel = TRUE
  [XSECTION] Configuration = 1
  [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 0.0972 [9.7196358763382021E-002] fbridge_mode=1
+ [XSECTION] Cross section = 0.0972 [9.7196703561337638E-002] fbridge_mode=1
  [UNWEIGHT] Wrote 41 events (found 467 events)
- [COUNTERS] PROGRAM TOTAL : 0.9324s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.5834s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.3490s for 8192 events => throughput is 2.35E+04 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.7658s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.4782s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.2876s for 8192 events => throughput is 2.85E+04 events/s
 
 *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
 
-OK! xsec from fortran (9.7196357922470764E-002) and cpp (9.7196358763382021E-002) differ by less than 2E-4 (8.651674487936134e-09)
+OK! xsec from fortran (9.7196702725954640E-002) and cpp (9.7196703561337638E-002) differ by less than 2E-4 (8.594766898184503e-09)
 
 *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
 
@@ -158,36 +158,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
 0 ! Helicity Sum/event 0=exact
 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
 --------------------
-Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x10_cudacpp > /tmp/valassia/output_ggttg_x10_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
  [NGOODHEL] ngoodhel/ncomb = 32/32
  [XSECTION] VECSIZE_USED = 8192
  [XSECTION] MultiChannel = TRUE
  [XSECTION] Configuration = 1
  [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 0.08131 [8.1310872835011053E-002] fbridge_mode=1
+ [XSECTION] Cross section = 0.08131 [8.1310271828760453E-002] fbridge_mode=1
  [UNWEIGHT] Wrote 679 events (found 1787 events)
- [COUNTERS] PROGRAM TOTAL : 5.7762s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.9305s
- [COUNTERS] CudaCpp MEs ( 2 ) : 3.8457s for 90112 events => throughput is 2.34E+04 events/s
+ [COUNTERS] PROGRAM TOTAL : 4.6035s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.4383s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 3.1652s for 90112 events => throughput is 2.85E+04 events/s
 
 *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
 
-OK! xsec from fortran (8.1310872077655569E-002) and cpp (8.1310872835011053E-002) differ by less than 2E-4 (9.314319981967856e-09)
+OK! xsec from fortran (8.1310271073909590E-002) and cpp (8.1310271828760453E-002) differ by less than 2E-4 (9.283585677977158e-09)
 
 *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
 
 OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.405521e+04 ) sec^-1
+Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 2.911261e+04 ) sec^-1
 
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.412039e+04 ) sec^-1
+Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 2.920789e+04 ) sec^-1
 
 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -201,22 +201,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
 0 ! Helicity Sum/event 0=exact
 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
 --------------------
-Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x1_cudacpp > /tmp/valassia/output_ggttg_x1_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
  [NGOODHEL] ngoodhel/ncomb = 32/32
  [XSECTION] VECSIZE_USED = 8192
  [XSECTION] MultiChannel = TRUE
  [XSECTION] Configuration = 1
  [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 0.0972 [9.7196358804670424E-002] fbridge_mode=1
+ [XSECTION] Cross section = 0.0972 [9.7196703601584347E-002] fbridge_mode=1
  [UNWEIGHT] Wrote 41 events (found 467 events)
- [COUNTERS] PROGRAM TOTAL : 0.5895s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.4134s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.1762s for 8192 events => throughput is 4.65E+04 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.4641s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.3283s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.1358s for 8192 events => throughput is 6.03E+04 events/s
 
 *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
 
-OK! xsec from fortran (9.7196357922470764E-002) and cpp (9.7196358804670424E-002) differ by less than 2E-4 (9.076468243662816e-09)
+OK! xsec from fortran (9.7196702725954640E-002) and cpp (9.7196703601584347E-002) differ by less than 2E-4 (9.008841672653034e-09)
 
 *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
 
@@ -234,36 +234,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
 0 ! Helicity Sum/event 0=exact
 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
 --------------------
-Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x10_cudacpp > /tmp/valassia/output_ggttg_x10_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
  [NGOODHEL] ngoodhel/ncomb = 32/32
  [XSECTION] VECSIZE_USED = 8192
  [XSECTION] MultiChannel = TRUE
  [XSECTION] Configuration = 1
  [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 0.08131 [8.1310872836789727E-002] fbridge_mode=1
+ [XSECTION] Cross section = 0.08131 [8.1310271831113598E-002] fbridge_mode=1
  [UNWEIGHT] Wrote 679 events (found 1787 events)
- [COUNTERS] PROGRAM TOTAL : 3.7571s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.7661s
- [COUNTERS] CudaCpp MEs ( 2 ) : 1.9910s for 90112 events => throughput is 4.53E+04 events/s
+ [COUNTERS] PROGRAM TOTAL : 2.7822s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.2904s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 1.4918s for 90112 events => throughput is 6.04E+04 events/s
 
 *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
 
-OK! xsec from fortran (8.1310872077655569E-002) and cpp (8.1310872836789727E-002) differ by less than 2E-4 (9.33619492826665e-09)
+OK! xsec from fortran (8.1310271073909590E-002) and cpp (8.1310271831113598E-002) differ by less than 2E-4 (9.312525861560061e-09)
 
 *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
 
 OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 4.812264e+04 ) sec^-1
+Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 6.206115e+04 ) sec^-1
 
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 4.790866e+04 ) sec^-1
+Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 6.194509e+04 ) sec^-1
 
 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -277,22 +277,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
 0 ! Helicity Sum/event 0=exact
 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
 --------------------
-Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x1_cudacpp > /tmp/valassia/output_ggttg_x1_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
  [NGOODHEL] ngoodhel/ncomb = 32/32
  [XSECTION] VECSIZE_USED = 8192
  [XSECTION] MultiChannel = TRUE
  [XSECTION] Configuration = 1
  [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 0.0972 [9.7196358586501386E-002] fbridge_mode=1
+ [XSECTION] Cross section = 0.0972 [9.7196703386139241E-002] fbridge_mode=1
  [UNWEIGHT] Wrote 41 events (found 467 events)
- [COUNTERS] PROGRAM TOTAL : 0.4175s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.3288s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0887s for 8192 events => throughput is 9.23E+04 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.3267s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.2609s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0658s for 8192 events => throughput is 1.25E+05 events/s
 
 *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
 
-OK! xsec from fortran (9.7196357922470764E-002) and cpp (9.7196358586501386E-002) differ by less than 2E-4 (6.831846643962081e-09)
+OK! xsec from fortran (9.7196702725954640E-002) and cpp (9.7196703386139241E-002) differ by less than 2E-4 (6.792252982279479e-09)
 
 *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
 
@@ -310,188 +310,40 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
 0 ! Helicity Sum/event 0=exact
 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
 --------------------
-Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x10_cudacpp > /tmp/valassia/output_ggttg_x10_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
  [NGOODHEL] ngoodhel/ncomb = 32/32
  [XSECTION] VECSIZE_USED = 8192
  [XSECTION] MultiChannel = TRUE
  [XSECTION] Configuration = 1
  [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 0.08131 [8.1310872708918305E-002] fbridge_mode=1
+ [XSECTION] Cross section = 0.08131 [8.1310271701289558E-002] fbridge_mode=1
  [UNWEIGHT] Wrote 679 events (found 1787 events)
- [COUNTERS] PROGRAM TOTAL : 2.6655s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.6760s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.9895s for 90112 events => throughput is 9.11E+04 events/s
+ [COUNTERS] PROGRAM TOTAL : 1.9463s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.2225s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.7238s for 90112 events => throughput is 1.24E+05 events/s
 
 *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
 
-OK! xsec from fortran (8.1310872077655569E-002) and cpp (8.1310872708918305E-002) differ by less than 2E-4 (7.763571119312473e-09)
+OK! xsec from fortran (8.1310271073909590E-002) and cpp (8.1310271701289558E-002) differ by less than 2E-4 (7.715875938174577e-09)
 
 *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
 
 OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 9.369009e+04 ) sec^-1
-
-*** EXECUTE CHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 9.305282e+04 ) sec^-1
-
-*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
---------------------
-CUDACPP_RUNTIME_FBRIDGEMODE = (not set)
-CUDACPP_RUNTIME_VECSIZEUSED = 8192
---------------------
-8192 1 1 ! Number of events and max and min iterations
-0.000001 ! Accuracy (ignored because max iterations = min iterations)
-0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present)
-1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement)
-0 ! Helicity Sum/event 0=exact
-1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
---------------------
-Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
- [NGOODHEL] ngoodhel/ncomb = 32/32
- [XSECTION] VECSIZE_USED = 8192
- [XSECTION] MultiChannel = TRUE
- [XSECTION] Configuration = 1
- [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 0.0972 [9.7196358586501386E-002] fbridge_mode=1
- [UNWEIGHT] Wrote 41 events (found 467 events)
- [COUNTERS] PROGRAM TOTAL : 0.3954s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.3173s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0781s for 8192 events => throughput is 1.05E+05 events/s
-
-*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
-
-OK! xsec from fortran (9.7196357922470764E-002) and cpp (9.7196358586501386E-002) differ by less than 2E-4 (6.831846643962081e-09)
-
-*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
-
-OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical
-
-*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) ***
---------------------
-CUDACPP_RUNTIME_FBRIDGEMODE = (not set)
-CUDACPP_RUNTIME_VECSIZEUSED = 8192
---------------------
-81920 1 1 ! Number of events and max and min iterations
-0.000001 ! Accuracy (ignored because max iterations = min iterations)
-0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present)
-1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement)
-0 ! Helicity Sum/event 0=exact
-1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
---------------------
-Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
- [NGOODHEL] ngoodhel/ncomb = 32/32
- [XSECTION] VECSIZE_USED = 8192
- [XSECTION] MultiChannel = TRUE
- [XSECTION] Configuration = 1
- [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 0.08131 [8.1310872708918305E-002] fbridge_mode=1
- [UNWEIGHT] Wrote 679 events (found 1787 events)
- [COUNTERS] PROGRAM TOTAL : 2.5272s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.6689s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.8582s for 90112 events => throughput is 1.05E+05 events/s
-
-*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
-
-OK! xsec from fortran (8.1310872077655569E-002) and cpp (8.1310872708918305E-002) differ by less than 2E-4 (7.763571119312473e-09)
-
-*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
-
-OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
-
-*** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.086568e+05 ) sec^-1
+Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 1.278290e+05 ) sec^-1
 
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.099441e+05 ) sec^-1
+Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 1.278214e+05 ) sec^-1
-*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
---------------------
-CUDACPP_RUNTIME_FBRIDGEMODE = (not set)
-CUDACPP_RUNTIME_VECSIZEUSED = 8192
---------------------
-8192 1 1 ! Number of events and max and min iterations
-0.000001 ! Accuracy (ignored because max iterations = min iterations)
-0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present)
-1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement)
-0 ! Helicity Sum/event 0=exact
-1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
---------------------
-Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
- [NGOODHEL] ngoodhel/ncomb = 32/32
- [XSECTION] VECSIZE_USED = 8192
- [XSECTION] MultiChannel = TRUE
- [XSECTION] Configuration = 1
- [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 0.0972 [9.7196358586501386E-002] fbridge_mode=1
- [UNWEIGHT] Wrote 41 events (found 467 events)
- [COUNTERS] PROGRAM TOTAL : 0.4677s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.3542s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.1134s for 8192 events => throughput is 7.22E+04 events/s
-
-*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
-
-OK! xsec from fortran (9.7196357922470764E-002) and cpp (9.7196358586501386E-002) differ by less than 2E-4 (6.831846643962081e-09)
-
-*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
-
-OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical
+*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) ***
-*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) ***
---------------------
-CUDACPP_RUNTIME_FBRIDGEMODE = (not set)
-CUDACPP_RUNTIME_VECSIZEUSED = 8192
---------------------
-81920 1 1 ! Number of events and max and min iterations
-0.000001 ! Accuracy (ignored because max iterations = min iterations)
-0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present)
-1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement)
-0 ! Helicity Sum/event 0=exact
-1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
---------------------
-Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
- [NGOODHEL] ngoodhel/ncomb = 32/32
- [XSECTION] VECSIZE_USED = 8192
- [XSECTION] MultiChannel = TRUE
- [XSECTION] Configuration = 1
- [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 0.08131 [8.1310872708918305E-002] fbridge_mode=1
- [UNWEIGHT] Wrote 679 events (found 1787 events)
- [COUNTERS] PROGRAM TOTAL : 2.9514s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.7052s
- [COUNTERS] CudaCpp MEs ( 2 ) : 1.2463s for 90112 events => throughput is 7.23E+04 events/s
-
-*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
-
-OK! xsec from fortran (8.1310872077655569E-002) and cpp (8.1310872708918305E-002) differ by less than 2E-4 (7.763571119312473e-09)
-
-*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
-
-OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
-
-*** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 7.364043e+04 ) sec^-1
-
-*** EXECUTE CHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 7.347069e+04 ) sec^-1
+*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) ***
 
 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) ***
 --------------------
@@ -505,22 +357,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
 0 ! Helicity Sum/event 0=exact
 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
 --------------------
-Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggttg_x1_cudacpp > /tmp/valassia/output_ggttg_x1_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
  [NGOODHEL] ngoodhel/ncomb = 32/32
  [XSECTION] VECSIZE_USED = 8192
  [XSECTION] MultiChannel = TRUE
  [XSECTION] Configuration = 1
  [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 0.0972 [9.7196358102981231E-002] fbridge_mode=1
+ [XSECTION] Cross section = 0.0972 [9.7196702904173926E-002] fbridge_mode=1
  [UNWEIGHT] Wrote 41 events (found 467 events)
- [COUNTERS] PROGRAM TOTAL : 0.6813s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.6759s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0054s for 8192 events => throughput is 1.52E+06 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.5030s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.4952s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0078s for 8192 events => throughput is 1.05E+06 events/s
 
 *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec ***
 
-OK! xsec from fortran (9.7196357922470764E-002) and cpp (9.7196358102981231E-002) differ by less than 2E-4 (1.8571730819871846e-09)
+OK! xsec from fortran (9.7196702725954640E-002) and cpp (9.7196702904173926E-002) differ by less than 2E-4 (1.8335939433455906e-09)
 
 *** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
 
@@ -538,65 +390,65 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
 0 ! Helicity Sum/event 0=exact
 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
 --------------------
-Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggttg_x10_cudacpp > /tmp/valassia/output_ggttg_x10_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
  [NGOODHEL] ngoodhel/ncomb = 32/32
  [XSECTION] VECSIZE_USED = 8192
  [XSECTION] MultiChannel = TRUE
  [XSECTION] Configuration = 1
  [XSECTION] ChannelId = 1
- [XSECTION] Cross section = 0.08131 [8.1310872068634160E-002] fbridge_mode=1
+ [XSECTION] Cross section = 0.08131 [8.1310271062722053E-002] fbridge_mode=1
  [UNWEIGHT] Wrote 679 events (found 1787 events)
- [COUNTERS] PROGRAM TOTAL : 2.0449s
- [COUNTERS] Fortran Overhead ( 0 ) : 2.0215s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0233s for 90112 events => throughput is 3.86E+06 events/s
+ [COUNTERS] PROGRAM TOTAL : 1.5406s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.4551s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0855s for 90112 events => throughput is 1.05E+06 events/s
 
 *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec ***
 
-OK! xsec from fortran (8.1310872077655569E-002) and cpp (8.1310872068634160E-002) differ by less than 2E-4 (1.109495828544027e-10)
+OK! xsec from fortran (8.1310271073909590E-002) and cpp (8.1310271062722053E-002) differ by less than 2E-4 (1.375907165979129e-10)
 
 *** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
 
 OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical
 
 *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.624283e+06 ) sec^-1
+Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 1.099800e+06 ) sec^-1
 
 *** EXECUTE GCHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 8.218126e+06 ) sec^-1
+Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = HIP:MIX+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 1.115814e+06 ) sec^-1
 
 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge ***
-Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 9.599538e+06 ) sec^-1
+Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 3.678377e+06 ) sec^-1
 
 *** EXECUTE GCHECK(MAX) -p 16384 32 1 ***
-Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.232652e+07 ) sec^-1
+Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = HIP:MIX+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 2.299070e+06 ) sec^-1
 
 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge ***
-Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 9.616286e+06 ) sec^-1
+Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 3.677334e+06 ) sec^-1
 
 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 ***
-Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.243703e+07 ) sec^-1
+Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = HIP:MIX+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 3.848766e+06 ) sec^-1
 
 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge ***
-Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 9.609732e+06 ) sec^-1
+Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 3.659540e+06 ) sec^-1
 
 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 ***
-Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.728637e+06 ) sec^-1
+Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = HIP:MIX+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 9.479459e+05 ) sec^-1
 
 TEST COMPLETED
diff --git a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt
index 930476d789..5ebf048e8c 100644
--- a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt
+++ b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt
@@ -1,42 +1,42 @@
-Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg
+Working directory (build): /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg
 CUDACPP_BUILDDIR='.'
-
-
-
 make USEBUILDDIR=1 AVX=none
+ make USEBUILDDIR=1 AVX=sse4
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+ make USEBUILDDIR=1 AVX=avx2
+ make USEBUILDDIR=1 AVX=512y
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 make USEBUILDDIR=1 AVX=512z
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 CUDACPP_BUILDDIR='build.none_d_inl0_hrd0'
-CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0'
 CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0'
 CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0'
-make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0'
 CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Nothing to be done for 'all'.
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 
 OMP_NUM_THREADS=
 
-DATE: 2024-01-30_06:13:52
+DATE: 2024-01-31_15:15:47
 
-On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
-Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg
+On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]:
+Working directory (run): /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg
 
 *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) ***
 --------------------
@@ -50,18 +50,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
 0 ! Helicity Sum/event 0=exact
 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
 --------------------
-Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/avalassi/output_ggttgg_x1_fortran'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./madevent_fortran < /tmp/valassia/input_ggttgg_x1_fortran > /tmp/valassia/output_ggttgg_x1_fortran'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
  [NGOODHEL] ngoodhel/ncomb = 64/64
  [XSECTION] VECSIZE_USED = 8192
  [XSECTION] MultiChannel = TRUE
  [XSECTION] Configuration = 1
  [XSECTION] ChannelId = 2
- [XSECTION] Cross section = 0.0003628 [3.6277277311352982E-004] fbridge_mode=0
+ [XSECTION] Cross section = 0.0003629 [3.6287295813310820E-004] fbridge_mode=0
  [UNWEIGHT] Wrote 48 events (found 439 events)
- [COUNTERS] PROGRAM TOTAL : 4.7127s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.3035s
- [COUNTERS] Fortran MEs ( 1 ) : 4.4093s for 8192 events => throughput is 1.86E+03 events/s
+ [COUNTERS] PROGRAM TOTAL : 2.8125s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.3130s
+ [COUNTERS] Fortran MEs ( 1 ) : 2.4995s for 8192 events => throughput is 3.28E+03 events/s
 
 *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) ***
 --------------------
@@ -75,18 +75,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
 0 ! Helicity Sum/event 0=exact
 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
 --------------------
-Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/avalassi/output_ggttgg_x1_fortran'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./madevent_fortran < /tmp/valassia/input_ggttgg_x1_fortran > /tmp/valassia/output_ggttgg_x1_fortran'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
  [NGOODHEL] ngoodhel/ncomb = 64/64
  [XSECTION] VECSIZE_USED = 8192
  [XSECTION] MultiChannel = TRUE
  [XSECTION] Configuration = 1
  [XSECTION] ChannelId = 2
- [XSECTION] Cross section = 0.0003628 [3.6277277311352982E-004] fbridge_mode=0
+ [XSECTION] Cross section = 0.0003629 [3.6287295813310820E-004] fbridge_mode=0
  [UNWEIGHT] Wrote 59 events (found 420 events)
- [COUNTERS] PROGRAM TOTAL : 4.7196s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.2934s
- [COUNTERS] Fortran MEs ( 1 ) : 4.4261s for 8192 events => throughput is 1.85E+03 events/s
+ [COUNTERS] PROGRAM TOTAL : 2.7290s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.2303s
+ [COUNTERS] Fortran MEs ( 1 ) : 2.4987s for 8192 events => throughput is 3.28E+03 events/s
 
 *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) ***
 --------------------
@@ -100,18 +100,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
 0 ! Helicity Sum/event 0=exact
 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
 --------------------
-Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x10_fortran > /tmp/avalassi/output_ggttgg_x10_fortran'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./madevent_fortran < /tmp/valassia/input_ggttgg_x10_fortran > /tmp/valassia/output_ggttgg_x10_fortran'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
  [NGOODHEL] ngoodhel/ncomb = 64/64
  [XSECTION] VECSIZE_USED = 8192
  [XSECTION] MultiChannel = TRUE
  [XSECTION] Configuration = 1
  [XSECTION] ChannelId = 2
- [XSECTION] Cross section = 0.000158 [1.5803725748421158E-004] fbridge_mode=0
+ [XSECTION] Cross section = 0.000158 [1.5803455880587238E-004] fbridge_mode=0
  [UNWEIGHT] Wrote 207 events (found 1235 events)
- [COUNTERS] PROGRAM TOTAL : 50.8146s
- [COUNTERS] Fortran Overhead ( 0 ) : 2.0980s
- [COUNTERS] Fortran MEs ( 1 ) : 48.7166s for 90112 events => throughput is 1.85E+03 events/s
+ [COUNTERS] PROGRAM TOTAL : 29.0084s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.4985s
+ [COUNTERS] Fortran MEs ( 1 ) : 27.5099s for 90112 events => throughput is 3.28E+03 events/s
 
 *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -125,22 +125,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
 0 ! Helicity Sum/event 0=exact
 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
-------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x1_cudacpp > /tmp/valassia/output_ggttgg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0003628 [3.6277277311352993E-004] fbridge_mode=1 + [XSECTION] Cross section = 0.0003629 [3.6287295813310842E-004] fbridge_mode=1 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 9.5114s - [COUNTERS] Fortran Overhead ( 0 ) : 4.8392s - [COUNTERS] CudaCpp MEs ( 2 ) : 4.6722s for 8192 events => throughput is 1.75E+03 events/s + [COUNTERS] PROGRAM TOTAL : 7.6700s + [COUNTERS] Fortran Overhead ( 0 ) : 3.8930s + [COUNTERS] CudaCpp MEs ( 2 ) : 3.7770s for 8192 events => throughput is 2.17E+03 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (3.6277277311352982E-004) and cpp (3.6277277311352993E-004) differ by less than 2E-14 (2.220446049250313e-16) +OK! xsec from fortran (3.6287295813310820E-004) and cpp (3.6287295813310842E-004) differ by less than 2E-14 (6.661338147750939e-16) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -158,36 +158,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x10_cudacpp > /tmp/valassia/output_ggttgg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000158 [1.5803725748421150E-004] fbridge_mode=1 + [XSECTION] Cross section = 0.000158 [1.5803455880587238E-004] fbridge_mode=1 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 57.8440s - [COUNTERS] Fortran Overhead ( 0 ) : 6.6165s - [COUNTERS] CudaCpp MEs ( 2 ) : 51.2275s for 90112 events => throughput is 1.76E+03 events/s + [COUNTERS] PROGRAM TOTAL : 46.7287s + [COUNTERS] Fortran Overhead ( 0 ) : 5.1782s + [COUNTERS] CudaCpp MEs ( 2 ) : 41.5505s for 90112 events => throughput is 2.17E+03 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.5803725748421158E-004) and cpp (1.5803725748421150E-004) differ by less than 2E-14 (5.551115123125783e-16) +OK! xsec from fortran (1.5803455880587238E-004) and cpp (1.5803455880587238E-004) differ by less than 2E-14 (0.0) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
 
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.804400e+03 ) sec^-1
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 2.244212e+03 ) sec^-1
 
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.805886e+03 ) sec^-1
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 2.246522e+03 ) sec^-1
 
 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -201,22 +201,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
 0 ! Helicity Sum/event 0=exact
 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
 --------------------
-Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x1_cudacpp > /tmp/valassia/output_ggttgg_x1_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
  [NGOODHEL] ngoodhel/ncomb = 64/64
  [XSECTION] VECSIZE_USED = 8192
  [XSECTION] MultiChannel = TRUE
  [XSECTION] Configuration = 1
  [XSECTION] ChannelId = 2
- [XSECTION] Cross section = 0.0003628 [3.6277277311352998E-004] fbridge_mode=1
+ [XSECTION] Cross section = 0.0003629 [3.6287295813310842E-004] fbridge_mode=1
  [UNWEIGHT] Wrote 59 events (found 420 events)
- [COUNTERS] PROGRAM TOTAL : 5.0600s
- [COUNTERS] Fortran Overhead ( 0 ) : 2.6542s
- [COUNTERS] CudaCpp MEs ( 2 ) : 2.4058s for 8192 events => throughput is 3.41E+03 events/s
+ [COUNTERS] PROGRAM TOTAL : 3.5212s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.8575s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 1.6637s for 8192 events => throughput is 4.92E+03 events/s
 
 *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
 
-OK! xsec from fortran (3.6277277311352982E-004) and cpp (3.6277277311352998E-004) differ by less than 2E-14 (4.440892098500626e-16)
+OK! xsec from fortran (3.6287295813310820E-004) and cpp (3.6287295813310842E-004) differ by less than 2E-14 (6.661338147750939e-16)
 
 *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
 
@@ -234,36 +234,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
 0 ! Helicity Sum/event 0=exact
 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
 --------------------
-Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x10_cudacpp > /tmp/valassia/output_ggttgg_x10_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
  [NGOODHEL] ngoodhel/ncomb = 64/64
  [XSECTION] VECSIZE_USED = 8192
  [XSECTION] MultiChannel = TRUE
  [XSECTION] Configuration = 1
  [XSECTION] ChannelId = 2
- [XSECTION] Cross section = 0.000158 [1.5803725748421156E-004] fbridge_mode=1
+ [XSECTION] Cross section = 0.000158 [1.5803455880587233E-004] fbridge_mode=1
  [UNWEIGHT] Wrote 207 events (found 1235 events)
- [COUNTERS] PROGRAM TOTAL : 30.6354s
- [COUNTERS] Fortran Overhead ( 0 ) : 4.3949s
- [COUNTERS] CudaCpp MEs ( 2 ) : 26.2405s for 90112 events => throughput is 3.43E+03 events/s
+ [COUNTERS] PROGRAM TOTAL : 21.4014s
+ [COUNTERS] Fortran Overhead ( 0 ) : 3.1434s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 18.2579s for 90112 events => throughput is 4.94E+03 events/s
 
 *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
 
-OK! xsec from fortran (1.5803725748421158E-004) and cpp (1.5803725748421156E-004) differ by less than 2E-14 (2.220446049250313e-16)
+OK! xsec from fortran (1.5803455880587238E-004) and cpp (1.5803455880587233E-004) differ by less than 2E-14 (3.3306690738754696e-16)
 
 *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
 
 OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.603718e+03 ) sec^-1
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 5.075868e+03 ) sec^-1
 
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.615187e+03 ) sec^-1
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 5.072770e+03 ) sec^-1
 
 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -277,22 +277,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
 0 ! Helicity Sum/event 0=exact
 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
 --------------------
-Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x1_cudacpp > /tmp/valassia/output_ggttgg_x1_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
  [NGOODHEL] ngoodhel/ncomb = 64/64
  [XSECTION] VECSIZE_USED = 8192
  [XSECTION] MultiChannel = TRUE
  [XSECTION] Configuration = 1
  [XSECTION] ChannelId = 2
- [XSECTION] Cross section = 0.0003628 [3.6277277311353009E-004] fbridge_mode=1
+ [XSECTION] Cross section = 0.0003629 [3.6287295813310837E-004] fbridge_mode=1
  [UNWEIGHT] Wrote 59 events (found 420 events)
- [COUNTERS] PROGRAM TOTAL : 2.3469s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.3045s
- [COUNTERS] CudaCpp MEs ( 2 ) : 1.0424s for 8192 events => throughput is 7.86E+03 events/s
+ [COUNTERS] PROGRAM TOTAL : 1.5929s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.9056s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.6873s for 8192 events => throughput is 1.19E+04 events/s
 
 *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
 
-OK! xsec from fortran (3.6277277311352982E-004) and cpp (3.6277277311353009E-004) differ by less than 2E-14 (6.661338147750939e-16)
+OK! xsec from fortran (3.6287295813310820E-004) and cpp (3.6287295813310837E-004) differ by less than 2E-14 (4.440892098500626e-16)
 
 *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
 
@@ -310,188 +310,40 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
 0 ! Helicity Sum/event 0=exact
 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
 --------------------
-Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x10_cudacpp > /tmp/valassia/output_ggttgg_x10_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
  [NGOODHEL] ngoodhel/ncomb = 64/64
  [XSECTION] VECSIZE_USED = 8192
  [XSECTION] MultiChannel = TRUE
  [XSECTION] Configuration = 1
  [XSECTION] ChannelId = 2
- [XSECTION] Cross section = 0.000158 [1.5803725748421158E-004] fbridge_mode=1
+ [XSECTION] Cross section = 0.000158 [1.5803455880587241E-004] fbridge_mode=1
  [UNWEIGHT] Wrote 207 events (found 1235 events)
- [COUNTERS] PROGRAM TOTAL : 14.6028s
- [COUNTERS] Fortran Overhead ( 0 ) : 3.0873s
- [COUNTERS] CudaCpp MEs ( 2 ) : 11.5154s for 90112 events => throughput is 7.83E+03 events/s
+ [COUNTERS] PROGRAM TOTAL : 9.7834s
+ [COUNTERS] Fortran Overhead ( 0 ) : 2.1783s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 7.6051s for 90112 events => throughput is 1.18E+04 events/s
 
 *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
 
-OK! xsec from fortran (1.5803725748421158E-004) and cpp (1.5803725748421158E-004) differ by less than 2E-14 (0.0)
+OK! xsec from fortran (1.5803455880587238E-004) and cpp (1.5803455880587241E-004) differ by less than 2E-14 (2.220446049250313e-16)
 
 *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
 
 OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 8.099934e+03 ) sec^-1
-
-*** EXECUTE CHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 8.056733e+03 ) sec^-1
-
-*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
---------------------
-CUDACPP_RUNTIME_FBRIDGEMODE = (not set)
-CUDACPP_RUNTIME_VECSIZEUSED = 8192
---------------------
-8192 1 1 ! Number of events and max and min iterations
-0.000001 ! Accuracy (ignored because max iterations = min iterations)
-0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present)
-1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement)
-0 ! Helicity Sum/event 0=exact
-1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
---------------------
-Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
- [NGOODHEL] ngoodhel/ncomb = 64/64
- [XSECTION] VECSIZE_USED = 8192
- [XSECTION] MultiChannel = TRUE
- [XSECTION] Configuration = 1
- [XSECTION] ChannelId = 2
- [XSECTION] Cross section = 0.0003628 [3.6277277311353009E-004] fbridge_mode=1
- [UNWEIGHT] Wrote 59 events (found 420 events)
- [COUNTERS] PROGRAM TOTAL : 2.1181s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.1916s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.9265s for 8192 events => throughput is 8.84E+03 events/s
-
-*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
-
-OK! xsec from fortran (3.6277277311352982E-004) and cpp (3.6277277311353009E-004) differ by less than 2E-14 (6.661338147750939e-16)
-
-*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
-
-OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical
-
-*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) ***
---------------------
-CUDACPP_RUNTIME_FBRIDGEMODE = (not set)
-CUDACPP_RUNTIME_VECSIZEUSED = 8192
---------------------
-81920 1 1 ! Number of events and max and min iterations
-0.000001 ! Accuracy (ignored because max iterations = min iterations)
-0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present)
-1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement)
-0 ! Helicity Sum/event 0=exact
-1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
---------------------
-Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
- [NGOODHEL] ngoodhel/ncomb = 64/64
- [XSECTION] VECSIZE_USED = 8192
- [XSECTION] MultiChannel = TRUE
- [XSECTION] Configuration = 1
- [XSECTION] ChannelId = 2
- [XSECTION] Cross section = 0.000158 [1.5803725748421158E-004] fbridge_mode=1
- [UNWEIGHT] Wrote 207 events (found 1235 events)
- [COUNTERS] PROGRAM TOTAL : 13.1007s
- [COUNTERS] Fortran Overhead ( 0 ) : 2.9669s
- [COUNTERS] CudaCpp MEs ( 2 ) : 10.1338s for 90112 events => throughput is 8.89E+03 events/s
-
-*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
-
-OK! xsec from fortran (1.5803725748421158E-004) and cpp (1.5803725748421158E-004) differ by less than 2E-14 (0.0)
-
-*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
-
-OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
-
-*** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 9.168199e+03 ) sec^-1
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 1.219218e+04 ) sec^-1
 
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 9.198606e+03 ) sec^-1
-
-*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
---------------------
-CUDACPP_RUNTIME_FBRIDGEMODE = (not set)
-CUDACPP_RUNTIME_VECSIZEUSED = 8192
---------------------
-8192 1 1 ! Number of events and max and min iterations
-0.000001 ! Accuracy (ignored because max iterations = min iterations)
-0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present)
-1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement)
-0 ! Helicity Sum/event 0=exact
-1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
---------------------
-Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
- [NGOODHEL] ngoodhel/ncomb = 64/64
- [XSECTION] VECSIZE_USED = 8192
- [XSECTION] MultiChannel = TRUE
- [XSECTION] Configuration = 1
- [XSECTION] ChannelId = 2
- [XSECTION] Cross section = 0.0003628 [3.6277277311353009E-004] fbridge_mode=1
- [UNWEIGHT] Wrote 59 events (found 420 events)
- [COUNTERS] PROGRAM TOTAL : 2.6312s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.4569s
- [COUNTERS] CudaCpp MEs ( 2 ) : 1.1743s for 8192 events => throughput is 6.98E+03 events/s
-
-*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
-
-OK! xsec from fortran (3.6277277311352982E-004) and cpp (3.6277277311353009E-004) differ by less than 2E-14 (6.661338147750939e-16)
-
-*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 1.219478e+04 ) sec^-1
-OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical
+*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) ***
-*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) ***
---------------------
-CUDACPP_RUNTIME_FBRIDGEMODE = (not set)
-CUDACPP_RUNTIME_VECSIZEUSED = 8192
---------------------
-81920 1 1 ! Number of events and max and min iterations
-0.000001 ! Accuracy (ignored because max iterations = min iterations)
-0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present)
-1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement)
-0 ! Helicity Sum/event 0=exact
-1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
---------------------
-Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
- [NGOODHEL] ngoodhel/ncomb = 64/64
- [XSECTION] VECSIZE_USED = 8192
- [XSECTION] MultiChannel = TRUE
- [XSECTION] Configuration = 1
- [XSECTION] ChannelId = 2
- [XSECTION] Cross section = 0.000158 [1.5803725748421158E-004] fbridge_mode=1
- [UNWEIGHT] Wrote 207 events (found 1235 events)
- [COUNTERS] PROGRAM TOTAL : 16.1783s
- [COUNTERS] Fortran Overhead ( 0 ) : 3.2363s
- [COUNTERS] CudaCpp MEs ( 2 ) : 12.9420s for 90112 events => throughput is 6.96E+03 events/s
-
-*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
-
-OK! xsec from fortran (1.5803725748421158E-004) and cpp (1.5803725748421158E-004) differ by less than 2E-14 (0.0)
-
-*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
-
-OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
-
-*** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 7.039911e+03 ) sec^-1
-
-*** EXECUTE CHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 7.063323e+03 ) sec^-1
+*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) ***
 
 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) ***
 --------------------
@@ -505,22 +357,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
 0 ! Helicity Sum/event 0=exact
 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
 --------------------
-Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggttgg_x1_cudacpp > /tmp/valassia/output_ggttgg_x1_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
  [NGOODHEL] ngoodhel/ncomb = 64/64
  [XSECTION] VECSIZE_USED = 8192
  [XSECTION] MultiChannel = TRUE
  [XSECTION] Configuration = 1
  [XSECTION] ChannelId = 2
- [XSECTION] Cross section = 0.0003628 [3.6277277311352998E-004] fbridge_mode=1
+ [XSECTION] Cross section = 0.0003629 [3.6287295813310831E-004] fbridge_mode=1
  [UNWEIGHT] Wrote 59 events (found 420 events)
- [COUNTERS] PROGRAM TOTAL : 0.8345s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.8014s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0331s for 8192 events => throughput is 2.47E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.8403s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.7248s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.1155s for 8192 events => throughput is 7.09E+04 events/s
 
 *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec ***
 
-OK! xsec from fortran (3.6277277311352982E-004) and cpp (3.6277277311352998E-004) differ by less than 2E-14 (4.440892098500626e-16)
+OK! xsec from fortran (3.6287295813310820E-004) and cpp (3.6287295813310831E-004) differ by less than 2E-14 (2.220446049250313e-16)
 
 *** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
 
@@ -538,65 +390,65 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
 0 ! Helicity Sum/event 0=exact
 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
 --------------------
-Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggttgg_x10_cudacpp > /tmp/valassia/output_ggttgg_x10_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
  [NGOODHEL] ngoodhel/ncomb = 64/64
  [XSECTION] VECSIZE_USED = 8192
  [XSECTION] MultiChannel = TRUE
  [XSECTION] Configuration = 1
  [XSECTION] ChannelId = 2
- [XSECTION] Cross section = 0.000158 [1.5803725748421166E-004] fbridge_mode=1
+ [XSECTION] Cross section = 0.000158 [1.5803455880587238E-004] fbridge_mode=1
  [UNWEIGHT] Wrote 207 events (found 1235 events)
- [COUNTERS] PROGRAM TOTAL : 2.9375s
- [COUNTERS] Fortran Overhead ( 0 ) : 2.5725s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.3650s for 90112 events => throughput is 2.47E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 3.2629s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.9956s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 1.2672s for 90112 events => throughput is 7.11E+04 events/s
 
 *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec ***
 
-OK! xsec from fortran (1.5803725748421158E-004) and cpp (1.5803725748421166E-004) differ by less than 2E-14 (4.440892098500626e-16)
+OK! xsec from fortran (1.5803455880587238E-004) and cpp (1.5803455880587238E-004) differ by less than 2E-14 (0.0)
 
 *** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
 
 OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical
 
 *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.275540e+05 ) sec^-1
+Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 7.179211e+04 ) sec^-1
 
 *** EXECUTE GCHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.510631e+05 ) sec^-1
+Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 5.457940e+04 ) sec^-1
 
 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge ***
-Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 4.114953e+05 ) sec^-1
+Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 1.240898e+05 ) sec^-1
 
 *** EXECUTE GCHECK(MAX) -p 16384 32 1 ***
-Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 4.167963e+05 ) sec^-1
+Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 8.033520e+04 ) sec^-1
 
 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge ***
-Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 4.105704e+05 ) sec^-1
+Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 1.233836e+05 ) sec^-1
 
 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 ***
-Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 4.168745e+05 ) sec^-1
+Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 1.225572e+05 ) sec^-1
 
 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge ***
-Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 4.099883e+05 ) sec^-1
+Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 1.240859e+05 ) sec^-1
 
 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 ***
-Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.425014e+05 ) sec^-1
+Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 3.392782e+04 ) sec^-1
 
 TEST COMPLETED
diff --git a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt
index 5e8ad575df..223f61cd19 100644
--- a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt
+++ b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt
@@ -1,42 +1,42 @@
-Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg
+Working directory (build): /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg
 CUDACPP_BUILDDIR='.'
- make USEBUILDDIR=1 AVX=none
-make USEBUILDDIR=1 AVX=sse4
+make USEBUILDDIR=1 AVX=sse4
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 make USEBUILDDIR=1 AVX=avx2
+
 make USEBUILDDIR=1 AVX=512y
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 make USEBUILDDIR=1 AVX=512z
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
-make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+CUDACPP_BUILDDIR='build.none_f_inl0_hrd0'
 CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0'
 CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0'
-CUDACPP_BUILDDIR='build.none_f_inl0_hrd0'
+CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0'
 CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
-CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 
 OMP_NUM_THREADS=
 
-DATE: 2024-01-30_06:18:23
+DATE: 2024-01-31_15:19:35
 
-On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
-Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg
+On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]:
+Working directory (run): /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg
 
 *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) ***
 --------------------
@@ -50,18 +50,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
 0 ! Helicity Sum/event 0=exact
 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
 --------------------
-Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/avalassi/output_ggttgg_x1_fortran'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./madevent_fortran < /tmp/valassia/input_ggttgg_x1_fortran > /tmp/valassia/output_ggttgg_x1_fortran'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
  [NGOODHEL] ngoodhel/ncomb = 64/64
  [XSECTION] VECSIZE_USED = 8192
  [XSECTION] MultiChannel = TRUE
  [XSECTION] Configuration = 1
  [XSECTION] ChannelId = 2
- [XSECTION] Cross section = 0.0003628 [3.6277277311352982E-004] fbridge_mode=0
+ [XSECTION] Cross section = 0.0003629 [3.6287295813310820E-004] fbridge_mode=0
  [UNWEIGHT] Wrote 48 events (found 439 events)
- [COUNTERS] PROGRAM TOTAL : 4.7226s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.2990s
- [COUNTERS] Fortran MEs ( 1 ) : 4.4236s for 8192 events => throughput is 1.85E+03 events/s
+ [COUNTERS] PROGRAM TOTAL : 2.7309s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.2329s
+ [COUNTERS] Fortran MEs ( 1 ) : 2.4980s for 8192 events => throughput is 3.28E+03 events/s
 
 *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) ***
 --------------------
@@ -75,18 +75,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
 0 ! Helicity Sum/event 0=exact
 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
 --------------------
-Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/avalassi/output_ggttgg_x1_fortran'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./madevent_fortran < /tmp/valassia/input_ggttgg_x1_fortran > /tmp/valassia/output_ggttgg_x1_fortran'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
  [NGOODHEL] ngoodhel/ncomb = 64/64
  [XSECTION] VECSIZE_USED = 8192
  [XSECTION] MultiChannel = TRUE
  [XSECTION] Configuration = 1
  [XSECTION] ChannelId = 2
- [XSECTION] Cross section = 0.0003628 [3.6277277311352982E-004] fbridge_mode=0
+ [XSECTION] Cross section = 0.0003629 [3.6287295813310820E-004] fbridge_mode=0
  [UNWEIGHT] Wrote 59 events (found 420 events)
- [COUNTERS] PROGRAM TOTAL : 4.7231s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.2979s
- [COUNTERS] Fortran MEs ( 1 ) : 4.4253s for 8192 events => throughput is 1.85E+03 events/s
+ [COUNTERS] PROGRAM TOTAL : 2.7301s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.2310s
+ [COUNTERS] Fortran MEs ( 1 ) : 2.4991s for 8192 events => throughput is 3.28E+03 events/s
 
 *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) ***
 --------------------
@@ -100,18 +100,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
 0 ! Helicity Sum/event 0=exact
 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
 --------------------
-Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x10_fortran > /tmp/avalassi/output_ggttgg_x10_fortran'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./madevent_fortran < /tmp/valassia/input_ggttgg_x10_fortran > /tmp/valassia/output_ggttgg_x10_fortran'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
  [NGOODHEL] ngoodhel/ncomb = 64/64
  [XSECTION] VECSIZE_USED = 8192
  [XSECTION] MultiChannel = TRUE
  [XSECTION] Configuration = 1
  [XSECTION] ChannelId = 2
- [XSECTION] Cross section = 0.000158 [1.5803725748421158E-004] fbridge_mode=0
+ [XSECTION] Cross section = 0.000158 [1.5803455880587238E-004] fbridge_mode=0
  [UNWEIGHT] Wrote 207 events (found 1235 events)
- [COUNTERS] PROGRAM TOTAL : 50.8315s
- [COUNTERS] Fortran Overhead ( 0 ) : 2.1047s
- [COUNTERS] Fortran MEs ( 1 ) : 48.7267s for 90112 events => throughput is 1.85E+03 events/s
+ [COUNTERS] PROGRAM TOTAL : 28.9804s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.4991s
+ [COUNTERS] Fortran MEs ( 1 ) : 27.4813s for 90112 events => throughput is 3.28E+03 events/s
 
 *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -125,22 +125,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
 0 ! Helicity Sum/event 0=exact
 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
 --------------------
-Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x1_cudacpp > /tmp/valassia/output_ggttgg_x1_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
  [NGOODHEL] ngoodhel/ncomb = 64/64
  [XSECTION] VECSIZE_USED = 8192
  [XSECTION] MultiChannel = TRUE
  [XSECTION] Configuration = 1
  [XSECTION] ChannelId = 2
- [XSECTION] Cross section = 0.0003628 [3.6277396352122325E-004] fbridge_mode=1
+ [XSECTION] Cross section = 0.0003629 [3.6287415397046849E-004] fbridge_mode=1
  [UNWEIGHT] Wrote 59 events (found 420 events)
- [COUNTERS] PROGRAM TOTAL : 8.7124s
- [COUNTERS] Fortran Overhead ( 0 ) : 4.4408s
- [COUNTERS] CudaCpp MEs ( 2 ) : 4.2715s for 8192 events => throughput is 1.92E+03 events/s
+ [COUNTERS] PROGRAM TOTAL : 6.9110s
+ [COUNTERS] Fortran Overhead ( 0 ) : 3.5360s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 3.3750s for 8192 events => throughput is 2.43E+03 events/s
 
 *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
 
-OK! xsec from fortran (3.6277277311352982E-004) and cpp (3.6277396352122325E-004) differ by less than 4E-4 (3.2814141017745158e-06)
+OK! xsec from fortran (3.6287295813310820E-004) and cpp (3.6287415397046849E-004) differ by less than 4E-4 (3.295471137976236e-06)
 
 *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
 
@@ -158,36 +158,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
 0 ! Helicity Sum/event 0=exact
 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
 --------------------
-Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x10_cudacpp > /tmp/valassia/output_ggttgg_x10_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
  [NGOODHEL] ngoodhel/ncomb = 64/64
  [XSECTION] VECSIZE_USED = 8192
  [XSECTION] MultiChannel = TRUE
  [XSECTION] Configuration = 1
  [XSECTION] ChannelId = 2
- [XSECTION] Cross section = 0.000158 [1.5803774048965294E-004] fbridge_mode=1
+ [XSECTION] Cross section = 0.000158 [1.5803504352744863E-004] fbridge_mode=1
  [UNWEIGHT] Wrote 207 events (found 1235 events)
- [COUNTERS] PROGRAM TOTAL : 53.4678s
- [COUNTERS] Fortran Overhead ( 0 ) : 6.2349s
- [COUNTERS] CudaCpp MEs ( 2 ) : 47.2329s for 90112 events => throughput is 1.91E+03 events/s
+ [COUNTERS] PROGRAM TOTAL : 41.9127s
+ [COUNTERS] Fortran Overhead ( 0 ) : 4.8008s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 37.1120s for 90112 events => throughput is 2.43E+03 events/s
 
 *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
 
-OK! xsec from fortran (1.5803725748421158E-004) and cpp (1.5803774048965294E-004) differ by less than 4E-4 (3.056275773571926e-06)
+OK! xsec from fortran (1.5803455880587238E-004) and cpp (1.5803504352744863E-004) differ by less than 4E-4 (3.0671872019993884e-06)
 
 *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
 
 OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.973797e+03 ) sec^-1
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 2.486541e+03 ) sec^-1
 
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.974372e+03 ) sec^-1
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 2.486031e+03 ) sec^-1
 
 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -201,22 +201,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
 0 ! Helicity Sum/event 0=exact
 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
 --------------------
-Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x1_cudacpp > /tmp/valassia/output_ggttgg_x1_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
  [NGOODHEL] ngoodhel/ncomb = 64/64
  [XSECTION] VECSIZE_USED = 8192
  [XSECTION] MultiChannel = TRUE
  [XSECTION] Configuration = 1
  [XSECTION] ChannelId = 2
- [XSECTION] Cross section = 0.0003628 [3.6277387698033752E-004] fbridge_mode=1
+ [XSECTION] Cross section = 0.0003629 [3.6287408821488631E-004] fbridge_mode=1
  [UNWEIGHT] Wrote 59 events (found 420 events)
- [COUNTERS] PROGRAM TOTAL : 2.6573s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.4642s
- [COUNTERS] CudaCpp MEs ( 2 ) : 1.1930s for 8192 events => throughput is 6.87E+03 events/s
+ [COUNTERS] PROGRAM TOTAL : 1.8917s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.0538s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.8378s for 8192 events => throughput is 9.78E+03 events/s
 
 *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
 
-OK! xsec from fortran (3.6277277311352982E-004) and cpp (3.6277387698033752E-004) differ by less than 4E-4 (3.0428601303089664e-06)
+OK! xsec from fortran (3.6287295813310820E-004) and cpp (3.6287408821488631E-004) differ by less than 4E-4 (3.1142628647007342e-06)
 
 *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
 
@@ -234,36 +234,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
 0 ! Helicity Sum/event 0=exact
 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
 --------------------
-Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x10_cudacpp > /tmp/valassia/output_ggttgg_x10_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
  [NGOODHEL] ngoodhel/ncomb = 64/64
  [XSECTION] VECSIZE_USED = 8192
  [XSECTION] MultiChannel = TRUE
  [XSECTION] Configuration = 1
  [XSECTION] ChannelId = 2
- [XSECTION] Cross section = 0.000158 [1.5803770691658365E-004] fbridge_mode=1
+ [XSECTION] Cross section = 0.000158 [1.5803502293058018E-004] fbridge_mode=1
  [UNWEIGHT] Wrote 207 events (found 1235 events)
- [COUNTERS] PROGRAM TOTAL : 16.3220s
- [COUNTERS] Fortran Overhead ( 0 ) : 3.2267s
- [COUNTERS] CudaCpp MEs ( 2 ) : 13.0952s for 90112 events => throughput is 6.88E+03 events/s
+ [COUNTERS] PROGRAM TOTAL : 11.5963s
+ [COUNTERS] Fortran Overhead ( 0 ) : 2.3323s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 9.2641s for 90112 events => throughput is 9.73E+03 events/s
 
 *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
 
-OK! xsec from fortran (1.5803725748421158E-004) and cpp (1.5803770691658365E-004) differ by less than 4E-4 (2.8438380874629132e-06)
+OK! xsec from fortran (1.5803455880587238E-004) and cpp (1.5803502293058018E-004) differ by less than 4E-4 (2.936855782120773e-06)
 
 *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
 
 OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 7.126754e+03 ) sec^-1
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 9.977944e+03 ) sec^-1
 
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 7.170734e+03 ) sec^-1
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 1.003051e+04 ) sec^-1
 
 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -277,22 +277,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
 0 ! Helicity Sum/event 0=exact
 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
 --------------------
-Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x1_cudacpp > /tmp/valassia/output_ggttgg_x1_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
  [NGOODHEL] ngoodhel/ncomb = 64/64
  [XSECTION] VECSIZE_USED = 8192
  [XSECTION] MultiChannel = TRUE
  [XSECTION] Configuration = 1
  [XSECTION] ChannelId = 2
- [XSECTION] Cross section = 0.0003628 [3.6277388844638422E-004] fbridge_mode=1
+ [XSECTION] Cross section = 0.0003629 [3.6287410019363280E-004] fbridge_mode=1
  [UNWEIGHT] Wrote 59 events (found 420 events)
- [COUNTERS] PROGRAM TOTAL : 1.3306s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.8070s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.5236s for 8192 events => throughput is 1.56E+04 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.9306s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.5773s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.3532s for 8192 events => throughput is 2.32E+04 events/s
 
 *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
 
-OK! xsec from fortran (3.6277277311352982E-004) and cpp (3.6277388844638422E-004) differ by less than 4E-4 (3.074466820685018e-06)
+OK! xsec from fortran (3.6287295813310820E-004) and cpp (3.6287410019363280E-004) differ by less than 4E-4 (3.147273719417143e-06)
 
 *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
 
@@ -310,188 +310,40 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
 0 ! Helicity Sum/event 0=exact
 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
 --------------------
-Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x10_cudacpp > /tmp/valassia/output_ggttgg_x10_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
  [NGOODHEL] ngoodhel/ncomb = 64/64
  [XSECTION] VECSIZE_USED = 8192
  [XSECTION] MultiChannel = TRUE
  [XSECTION] Configuration = 1
  [XSECTION] ChannelId = 2
- [XSECTION] Cross section = 0.000158 [1.5803773310773457E-004] fbridge_mode=1
+ [XSECTION] Cross section = 0.000158 [1.5803505046780143E-004] fbridge_mode=1
  [UNWEIGHT] Wrote 207 events (found 1235 events)
- [COUNTERS] PROGRAM TOTAL : 8.3793s
- [COUNTERS] Fortran Overhead ( 0 ) : 2.5829s
- [COUNTERS] CudaCpp MEs ( 2 ) : 5.7964s for 90112 events => throughput is 1.55E+04 events/s
+ [COUNTERS] PROGRAM TOTAL : 5.7205s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.8432s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 3.8773s for 90112 events => throughput is 2.32E+04 events/s
 
 *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
 
-OK! xsec from fortran (1.5803725748421158E-004) and cpp (1.5803773310773457E-004) differ by less than 4E-4 (3.0095657856943347e-06)
+OK! xsec from fortran (1.5803455880587238E-004) and cpp (1.5803505046780143E-004) differ by less than 4E-4 (3.111103879849253e-06)
 
 *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
 
 OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
 
 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.587083e+04 ) sec^-1
-
-*** EXECUTE CHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.594558e+04 ) sec^-1
-
-*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
---------------------
-CUDACPP_RUNTIME_FBRIDGEMODE = (not set)
-CUDACPP_RUNTIME_VECSIZEUSED = 8192
---------------------
-8192 1 1 ! Number of events and max and min iterations
-0.000001 ! Accuracy (ignored because max iterations = min iterations)
-0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present)
-1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement)
-0 ! Helicity Sum/event 0=exact
-1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
---------------------
-Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
- [NGOODHEL] ngoodhel/ncomb = 64/64
- [XSECTION] VECSIZE_USED = 8192
- [XSECTION] MultiChannel = TRUE
- [XSECTION] Configuration = 1
- [XSECTION] ChannelId = 2
- [XSECTION] Cross section = 0.0003628 [3.6277388844638422E-004] fbridge_mode=1
- [UNWEIGHT] Wrote 59 events (found 420 events)
- [COUNTERS] PROGRAM TOTAL : 1.2103s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.7457s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.4646s for 8192 events => throughput is 1.76E+04 events/s
-
-*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
-
-OK! xsec from fortran (3.6277277311352982E-004) and cpp (3.6277388844638422E-004) differ by less than 4E-4 (3.074466820685018e-06)
-
-*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
-
-OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical
-
-*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) ***
---------------------
-CUDACPP_RUNTIME_FBRIDGEMODE = (not set)
-CUDACPP_RUNTIME_VECSIZEUSED = 8192
---------------------
-81920 1 1 ! Number of events and max and min iterations
-0.000001 ! Accuracy (ignored because max iterations = min iterations)
-0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present)
-1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement)
-0 ! Helicity Sum/event 0=exact
-1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
---------------------
-Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
- [NGOODHEL] ngoodhel/ncomb = 64/64
- [XSECTION] VECSIZE_USED = 8192
- [XSECTION] MultiChannel = TRUE
- [XSECTION] Configuration = 1
- [XSECTION] ChannelId = 2
- [XSECTION] Cross section = 0.000158 [1.5803773310773457E-004] fbridge_mode=1
- [UNWEIGHT] Wrote 207 events (found 1235 events)
- [COUNTERS] PROGRAM TOTAL : 7.6352s
- [COUNTERS] Fortran Overhead ( 0 ) : 2.5218s
- [COUNTERS] CudaCpp MEs ( 2 ) : 5.1134s for 90112 events => throughput is 1.76E+04 events/s
-
-*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
-
-OK! xsec from fortran (1.5803725748421158E-004) and cpp (1.5803773310773457E-004) differ by less than 4E-4 (3.0095657856943347e-06)
-
-*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
-
-OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
-
-*** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.751909e+04 ) sec^-1
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 2.403600e+04 ) sec^-1
 
 *** EXECUTE CHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.760230e+04 ) sec^-1
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 2.397579e+04 ) sec^-1
-*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
---------------------
-CUDACPP_RUNTIME_FBRIDGEMODE = (not set)
-CUDACPP_RUNTIME_VECSIZEUSED = 8192
---------------------
-8192 1 1 ! Number of events and max and min iterations
-0.000001 ! Accuracy (ignored because max iterations = min iterations)
-0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present)
-1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement)
-0 ! Helicity Sum/event 0=exact
-1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
---------------------
-Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
- [NGOODHEL] ngoodhel/ncomb = 64/64
- [XSECTION] VECSIZE_USED = 8192
- [XSECTION] MultiChannel = TRUE
- [XSECTION] Configuration = 1
- [XSECTION] ChannelId = 2
- [XSECTION] Cross section = 0.0003628 [3.6277396133530942E-004] fbridge_mode=1
- [UNWEIGHT] Wrote 59 events (found 420 events)
- [COUNTERS] PROGRAM TOTAL : 1.4750s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.8777s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.5973s for 8192 events => throughput is 1.37E+04 events/s
-
-*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
-
-OK! xsec from fortran (3.6277277311352982E-004) and cpp (3.6277396133530942E-004) differ by less than 4E-4 (3.2753885288450135e-06)
-
-*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
-
-OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical
+*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) ***
-*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) ***
---------------------
-CUDACPP_RUNTIME_FBRIDGEMODE = (not set)
-CUDACPP_RUNTIME_VECSIZEUSED = 8192
---------------------
-81920 1 1 ! Number of events and max and min iterations
-0.000001 ! Accuracy (ignored because max iterations = min iterations)
-0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present)
-1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement)
-0 ! Helicity Sum/event 0=exact
-1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
---------------------
-Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
- [NGOODHEL] ngoodhel/ncomb = 64/64
- [XSECTION] VECSIZE_USED = 8192
- [XSECTION] MultiChannel = TRUE
- [XSECTION] Configuration = 1
- [XSECTION] ChannelId = 2
- [XSECTION] Cross section = 0.000158 [1.5803777739454609E-004] fbridge_mode=1
- [UNWEIGHT] Wrote 207 events (found 1235 events)
- [COUNTERS] PROGRAM TOTAL : 9.1796s
- [COUNTERS] Fortran Overhead ( 0 ) : 2.7011s
- [COUNTERS] CudaCpp MEs ( 2 ) : 6.4785s for 90112 events => throughput is 1.39E+04 events/s
-
-*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
-
-OK! xsec from fortran (1.5803725748421158E-004) and cpp (1.5803777739454609E-004) differ by less than 4E-4 (3.2897959809652377e-06)
-
-*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
-
-OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
-
-*** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.411903e+04 ) sec^-1
-
-*** EXECUTE CHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.410632e+04 ) sec^-1
+*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) ***
 
 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) ***
 --------------------
@@ -505,22 +357,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
 0 ! Helicity Sum/event 0=exact
 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
 --------------------
-Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggttgg_x1_cudacpp > /tmp/valassia/output_ggttgg_x1_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
  [NGOODHEL] ngoodhel/ncomb = 64/64
  [XSECTION] VECSIZE_USED = 8192
  [XSECTION] MultiChannel = TRUE
  [XSECTION] Configuration = 1
  [XSECTION] ChannelId = 2
- [XSECTION] Cross section = 0.0003628 [3.6277400478491265E-004] fbridge_mode=1
+ [XSECTION] Cross section = 0.0003629 [3.6287414523737644E-004] fbridge_mode=1
  [UNWEIGHT] Wrote 59 events (found 420 events)
- [COUNTERS] PROGRAM TOTAL : 0.7967s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.7754s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0214s for 8192 events => throughput is 3.83E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.7020s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.6445s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0575s for 8192 events => throughput is 1.43E+05 events/s
 
 *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec ***
 
-OK! xsec from fortran (3.6277277311352982E-004) and cpp (3.6277400478491265E-004) differ by less than 4E-4 (3.395159378305479e-06)
+OK! xsec from fortran (3.6287295813310820E-004) and cpp (3.6287414523737644E-004) differ by less than 4E-4 (3.271404610538653e-06)
 
 *** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
 
@@ -538,65 +390,65 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
 0 ! Helicity Sum/event 0=exact
 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
 --------------------
-Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/4
+Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggttgg_x10_cudacpp > /tmp/valassia/output_ggttgg_x10_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/128
  [NGOODHEL] ngoodhel/ncomb = 64/64
  [XSECTION] VECSIZE_USED = 8192
  [XSECTION] MultiChannel = TRUE
  [XSECTION] Configuration = 1
  [XSECTION] ChannelId = 2
- [XSECTION] Cross section = 0.000158 [1.5803779990154892E-004] fbridge_mode=1
+ [XSECTION] Cross section = 0.000158 [1.5803508418967395E-004] fbridge_mode=1
  [UNWEIGHT] Wrote 207 events (found 1235 events)
- [COUNTERS] PROGRAM TOTAL : 2.7835s
- [COUNTERS] Fortran Overhead ( 0 ) : 2.5473s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.2361s for 90112 events => throughput is 3.82E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 2.5342s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.9016s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.6326s for 90112 events => throughput is 1.42E+05 events/s
 
 *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec ***
 
-OK! xsec from fortran (1.5803725748421158E-004) and cpp (1.5803779990154892E-004) differ by less than 4E-4 (3.432211783227501e-06)
+OK! xsec from fortran (1.5803455880587238E-004) and cpp (1.5803508418967395E-004) differ by less than 4E-4 (3.324486780309499e-06)
 
 *** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
 
 OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.582485e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.439273e+05 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.942798e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.523220e+04 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.492976e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.697708e+05 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.638150e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.275257e+05 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.493239e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.699598e+05 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.638925e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.973303e+05 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.453709e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.680513e+05 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.527726e+05 
) sec^-1 +Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.315750e+04 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt index a372850ebe..9c049e812e 100644 --- a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +Working directory (build): /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg CUDACPP_BUILDDIR='.' - - - make USEBUILDDIR=1 AVX=none + make USEBUILDDIR=1 AVX=sse4 +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' + make USEBUILDDIR=1 AVX=avx2 + make USEBUILDDIR=1 AVX=512y +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' OMP_NUM_THREADS= -DATE: 2024-01-30_06:21:53 +DATE: 2024-01-31_15:22:43 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +Working directory (run): /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -50,18 +50,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/avalassi/output_ggttgg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./madevent_fortran < /tmp/valassia/input_ggttgg_x1_fortran > /tmp/valassia/output_ggttgg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0003628 [3.6277277311352982E-004] fbridge_mode=0 + [XSECTION] Cross section = 0.0003629 [3.6287295813310820E-004] fbridge_mode=0 [UNWEIGHT] Wrote 48 events (found 439 events) - [COUNTERS] PROGRAM TOTAL : 4.7321s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3006s - [COUNTERS] Fortran MEs ( 1 ) : 4.4315s for 8192 events => throughput is 1.85E+03 events/s + [COUNTERS] PROGRAM TOTAL : 2.7330s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2344s + [COUNTERS] Fortran MEs ( 1 ) : 2.4986s for 8192 events => throughput is 3.28E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -75,18 +75,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
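
Note: the [COUNTERS] lines above derive throughput as the number of events divided by the time spent in the matrix element calls; for example 8192 events in 2.4986s gives the quoted 3.28E+03 events/s. A small sketch of that arithmetic (variable names are illustrative):

  #include <cstdio>

  int main()
  {
    const int nevt = 8192;           // events per run ([XSECTION] VECSIZE_USED)
    const double meSeconds = 2.4986; // [COUNTERS] Fortran MEs time for this run
    std::printf( "throughput is %.2E events/s\n", nevt / meSeconds ); // prints 3.28E+03
    return 0;
  }
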
-------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/avalassi/output_ggttgg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./madevent_fortran < /tmp/valassia/input_ggttgg_x1_fortran > /tmp/valassia/output_ggttgg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0003628 [3.6277277311352982E-004] fbridge_mode=0 + [XSECTION] Cross section = 0.0003629 [3.6287295813310820E-004] fbridge_mode=0 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 4.7365s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2954s - [COUNTERS] Fortran MEs ( 1 ) : 4.4410s for 8192 events => throughput is 1.84E+03 events/s + [COUNTERS] PROGRAM TOTAL : 2.7304s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2320s + [COUNTERS] Fortran MEs ( 1 ) : 2.4984s for 8192 events => throughput is 3.28E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -100,18 +100,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x10_fortran > /tmp/avalassi/output_ggttgg_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./madevent_fortran < /tmp/valassia/input_ggttgg_x10_fortran > /tmp/valassia/output_ggttgg_x10_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000158 [1.5803725748421158E-004] fbridge_mode=0 + [XSECTION] Cross section = 0.000158 [1.5803455880587238E-004] fbridge_mode=0 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 50.8810s - [COUNTERS] Fortran Overhead ( 0 ) : 2.1085s - [COUNTERS] Fortran MEs ( 1 ) : 48.7725s for 90112 events => throughput is 1.85E+03 events/s + [COUNTERS] PROGRAM TOTAL : 28.9955s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4995s + [COUNTERS] Fortran MEs ( 1 ) : 27.4960s for 90112 events => throughput is 3.28E+03 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -125,22 +125,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x1_cudacpp > /tmp/valassia/output_ggttgg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0003628 [3.6277277432965013E-004] fbridge_mode=1 + [XSECTION] Cross section = 0.0003629 [3.6287295930626011E-004] fbridge_mode=1 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 9.6340s - [COUNTERS] Fortran Overhead ( 0 ) : 4.9027s - [COUNTERS] CudaCpp MEs ( 2 ) : 4.7313s for 8192 events => throughput is 1.73E+03 events/s + [COUNTERS] PROGRAM TOTAL : 7.7609s + [COUNTERS] Fortran Overhead ( 0 ) : 3.9574s + [COUNTERS] CudaCpp MEs ( 2 ) : 3.8034s for 8192 events => throughput is 2.15E+03 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (3.6277277311352982E-004) and cpp (3.6277277432965013E-004) differ by less than 2E-4 (3.352291999547674e-09) +OK! xsec from fortran (3.6287295813310820E-004) and cpp (3.6287295930626011E-004) differ by less than 2E-4 (3.232954792764531e-09) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -158,36 +158,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x10_cudacpp > /tmp/valassia/output_ggttgg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000158 [1.5803725813026107E-004] fbridge_mode=1 + [XSECTION] Cross section = 0.000158 [1.5803455945000286E-004] fbridge_mode=1 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 58.7075s - [COUNTERS] Fortran Overhead ( 0 ) : 6.6716s - [COUNTERS] CudaCpp MEs ( 2 ) : 52.0359s for 90112 events => throughput is 1.73E+03 events/s + [COUNTERS] PROGRAM TOTAL : 47.1027s + [COUNTERS] Fortran Overhead ( 0 ) : 5.2266s + [COUNTERS] CudaCpp MEs ( 2 ) : 41.8761s for 90112 events => throughput is 2.15E+03 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.5803725748421158E-004) and cpp (1.5803725813026107E-004) differ by less than 2E-4 (4.087956861908992e-09) +OK! xsec from fortran (1.5803455880587238E-004) and cpp (1.5803455945000286E-004) differ by less than 2E-4 (4.075883630605404e-09) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.784786e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.202379e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.785841e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.202036e+03 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -201,22 +201,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x1_cudacpp > /tmp/valassia/output_ggttgg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0003628 [3.6277277430934459E-004] fbridge_mode=1 + [XSECTION] Cross section = 0.0003629 [3.6287295929360709E-004] fbridge_mode=1 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 5.0010s - [COUNTERS] Fortran Overhead ( 0 ) : 2.6173s - [COUNTERS] CudaCpp MEs ( 2 ) : 2.3837s for 8192 events => throughput is 3.44E+03 events/s + [COUNTERS] PROGRAM TOTAL : 3.4788s + [COUNTERS] Fortran Overhead ( 0 ) : 1.8371s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.6417s for 8192 events => throughput is 4.99E+03 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (3.6277277311352982E-004) and cpp (3.6277277430934459E-004) differ by less than 2E-4 (3.296318995538172e-09) +OK! xsec from fortran (3.6287295813310820E-004) and cpp (3.6287295929360709E-004) differ by less than 2E-4 (3.1980857961855236e-09) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -234,36 +234,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
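
Note: in the CHECK(8192) commands above, "-p 256 32 1" sets the grid as blocks, threads per block and iterations, so a single iteration covers 256 x 32 = 8192 events; the GCHECK(MAX) variants elsewhere in this log scale the same product up to 16384 x 32 = 524288. A sketch of the implied sizing (hypothetical names, not the repository's parser):

  #include <cassert>

  int main()
  {
    const int nBlocks = 256, nThreads = 32;          // plus 1 iteration, as in '-p 256 32 1'
    const int nevtPerIteration = nBlocks * nThreads; // 8192, i.e. CHECK(8192)
    assert( nevtPerIteration == 8192 );
    assert( 16384 * 32 == 524288 );                  // the GCHECK(MAX) grid
    return 0;
  }
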
-------------------- -Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x10_cudacpp > /tmp/valassia/output_ggttgg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000158 [1.5803725816246315E-004] fbridge_mode=1 + [XSECTION] Cross section = 0.000158 [1.5803455948191442E-004] fbridge_mode=1 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 30.6721s - [COUNTERS] Fortran Overhead ( 0 ) : 4.4448s - [COUNTERS] CudaCpp MEs ( 2 ) : 26.2273s for 90112 events => throughput is 3.44E+03 events/s + [COUNTERS] PROGRAM TOTAL : 21.1084s + [COUNTERS] Fortran Overhead ( 0 ) : 3.1042s + [COUNTERS] CudaCpp MEs ( 2 ) : 18.0042s for 90112 events => throughput is 5.01E+03 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.5803725748421158E-004) and cpp (1.5803725816246315E-004) differ by less than 2E-4 (4.291719424287521e-09) +OK! xsec from fortran (1.5803455880587238E-004) and cpp (1.5803455948191442E-004) differ by less than 2E-4 (4.277811438413437e-09) *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.519557e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.131114e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.539213e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.127519e+03 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -277,22 +277,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x1_cudacpp > /tmp/valassia/output_ggttgg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0003628 [3.6277277419683297E-004] fbridge_mode=1 + [XSECTION] Cross section = 0.0003629 [3.6287295916873888E-004] fbridge_mode=1 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 2.3362s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3065s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.0298s for 8192 events => throughput is 7.96E+03 events/s + [COUNTERS] PROGRAM TOTAL : 1.5800s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8982s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.6819s for 8192 events => throughput is 1.20E+04 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (3.6277277311352982E-004) and cpp (3.6277277419683297E-004) differ by less than 2E-4 (2.9861755290738756e-09) +OK! xsec from fortran (3.6287295813310820E-004) and cpp (3.6287295916873888E-004) differ by less than 2E-4 (2.85397594446124e-09) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -310,188 +310,40 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x10_cudacpp > /tmp/valassia/output_ggttgg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000158 [1.5803725810769321E-004] fbridge_mode=1 + [XSECTION] Cross section = 0.000158 [1.5803455942593628E-004] fbridge_mode=1 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 14.5206s - [COUNTERS] Fortran Overhead ( 0 ) : 3.0822s - [COUNTERS] CudaCpp MEs ( 2 ) : 11.4384s for 90112 events => throughput is 7.88E+03 events/s + [COUNTERS] PROGRAM TOTAL : 9.6911s + [COUNTERS] Fortran Overhead ( 0 ) : 2.1649s + [COUNTERS] CudaCpp MEs ( 2 ) : 7.5262s for 90112 events => throughput is 1.20E+04 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.5803725748421158E-004) and cpp (1.5803725810769321E-004) differ by less than 2E-4 (3.945155979678816e-09) +OK! xsec from fortran (1.5803455880587238E-004) and cpp (1.5803455942593628E-004) differ by less than 2E-4 (3.923596780808225e-09) *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.081577e+03 ) sec^-1 - -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.110336e+03 ) sec^-1 - -*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 64/64 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0003628 [3.6277277419683297E-004] fbridge_mode=1 - [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 2.0953s - [COUNTERS] Fortran Overhead ( 0 ) : 1.1807s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.9146s for 8192 events => throughput is 8.96E+03 events/s - -*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (3.6277277311352982E-004) and cpp (3.6277277419683297E-004) differ by less than 2E-4 (2.9861755290738756e-09) - -*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical - -*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -81920 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 64/64 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000158 [1.5803725810769321E-004] fbridge_mode=1 - [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 13.0092s - [COUNTERS] Fortran Overhead ( 0 ) : 2.9623s - [COUNTERS] CudaCpp MEs ( 2 ) : 10.0469s for 90112 events => throughput is 8.97E+03 events/s - -*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (1.5803725748421158E-004) and cpp (1.5803725810769321E-004) differ by less than 2E-4 (3.945155979678816e-09) - -*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical - -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.305280e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.236690e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.278381e+03 ) sec^-1 - -*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 64/64 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0003628 [3.6277277419683297E-004] fbridge_mode=1 - [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 2.6707s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4755s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.1952s for 8192 events => throughput is 6.85E+03 events/s - -*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** - -OK! 
xsec from fortran (3.6277277311352982E-004) and cpp (3.6277277419683297E-004) differ by less than 2E-4 (2.9861755290738756e-09) - -*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.234370e+04 ) sec^-1 -OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical +*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) *** -*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -81920 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 64/64 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000158 [1.5803725810769321E-004] fbridge_mode=1 - [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 16.3107s - [COUNTERS] Fortran Overhead ( 0 ) : 3.2346s - [COUNTERS] CudaCpp MEs ( 2 ) : 13.0761s for 90112 events => throughput is 6.89E+03 events/s - -*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (1.5803725748421158E-004) and cpp (1.5803725810769321E-004) differ by less than 2E-4 (3.945155979678816e-09) - -*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical - -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.979300e+03 ) sec^-1 - -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.981216e+03 ) sec^-1 +*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) *** *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -505,22 +357,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
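
Note: the 512y and 512z builds are skipped above because this node's AMD EPYC CPU does not advertise AVX-512 (the none, sse4 and avx2 builds all run). A minimal sketch of the kind of runtime CPUID test behind such a skip, assuming the gcc/clang __builtin_cpu_supports builtin; this is not the repository's actual dispatch code:

  #include <cstdio>

  int main()
  {
    if( !__builtin_cpu_supports( "avx512f" ) ) // queries CPUID for AVX-512 Foundation
      std::printf( "WARNING! SKIP (512y/512z not supported on this node)\n" );
    return 0;
  }
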
-------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggttgg_x1_cudacpp > /tmp/valassia/output_ggttgg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0003628 [3.6277277293084701E-004] fbridge_mode=1 + [XSECTION] Cross section = 0.0003629 [3.6287295792920187E-004] fbridge_mode=1 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 0.8344s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8014s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0331s for 8192 events => throughput is 2.48E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.8446s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7292s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1153s for 8192 events => throughput is 7.10E+04 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (3.6277277311352982E-004) and cpp (3.6277277293084701E-004) differ by less than 2E-4 (5.03573627241849e-10) +OK! xsec from fortran (3.6287295813310820E-004) and cpp (3.6287295792920187E-004) differ by less than 2E-4 (5.619220644348388e-10) *** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -538,65 +390,65 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggttgg_x10_cudacpp > /tmp/valassia/output_ggttgg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000158 [1.5803725738731039E-004] fbridge_mode=1 + [XSECTION] Cross section = 0.000158 [1.5803455870960301E-004] fbridge_mode=1 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 2.9283s - [COUNTERS] Fortran Overhead ( 0 ) : 2.5651s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3632s for 90112 events => throughput is 2.48E+05 events/s + [COUNTERS] PROGRAM TOTAL : 3.2641s + [COUNTERS] Fortran Overhead ( 0 ) : 1.9918s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.2723s for 90112 events => throughput is 7.08E+04 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.5803725748421158E-004) and cpp (1.5803725738731039E-004) differ by less than 2E-4 (6.131540830622839e-10) +OK! xsec from fortran (1.5803455880587238E-004) and cpp (1.5803455870960301E-004) differ by less than 2E-4 (6.091666060470402e-10) *** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.286633e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.186070e+04 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.522365e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.461379e+04 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.122442e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.243473e+05 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.148040e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.016222e+04 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.112221e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.240703e+05 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.164785e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.229491e+05 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.108117e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.241630e+05 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.430780e+05 
) sec^-1 +Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.381724e+04 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt index bc47a109df..fac9a60d3b 100644 --- a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +Working directory (build): /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg CUDACPP_BUILDDIR='.' - make USEBUILDDIR=1 AVX=none - make USEBUILDDIR=1 AVX=sse4 +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' + make USEBUILDDIR=1 AVX=avx2 + make USEBUILDDIR=1 AVX=512y +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' OMP_NUM_THREADS= -DATE: 2024-01-30_06:27:59 +DATE: 2024-01-31_15:27:32 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +Working directory (run): /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -50,18 +50,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/avalassi/output_ggttggg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./madevent_fortran < /tmp/valassia/input_ggttggg_x1_fortran > /tmp/valassia/output_ggttggg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.169e-06 [1.1693100945435806E-006] fbridge_mode=0 + [XSECTION] Cross section = 1.169e-06 [1.1692567358747583E-006] fbridge_mode=0 [UNWEIGHT] Wrote 1 events (found 166 events) - [COUNTERS] PROGRAM TOTAL : 101.9143s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4823s - [COUNTERS] Fortran MEs ( 1 ) : 101.4320s for 8192 events => throughput is 8.08E+01 events/s + [COUNTERS] PROGRAM TOTAL : 54.8219s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4154s + [COUNTERS] Fortran MEs ( 1 ) : 54.4066s for 8192 events => throughput is 1.51E+02 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -75,18 +75,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/avalassi/output_ggttggg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./madevent_fortran < /tmp/valassia/input_ggttggg_x1_fortran > /tmp/valassia/output_ggttggg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.169e-06 [1.1693100945435806E-006] fbridge_mode=0 + [XSECTION] Cross section = 1.169e-06 [1.1692567358747583E-006] fbridge_mode=0 [UNWEIGHT] Wrote 15 events (found 163 events) - [COUNTERS] PROGRAM TOTAL : 101.8572s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4740s - [COUNTERS] Fortran MEs ( 1 ) : 101.3832s for 8192 events => throughput is 8.08E+01 events/s + [COUNTERS] PROGRAM TOTAL : 54.7889s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3599s + [COUNTERS] Fortran MEs ( 1 ) : 54.4290s for 8192 events => throughput is 1.51E+02 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -100,18 +100,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x10_fortran > /tmp/avalassi/output_ggttggg_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./madevent_fortran < /tmp/valassia/input_ggttggg_x10_fortran > /tmp/valassia/output_ggttggg_x10_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.136e-07 [2.1358436158813979E-007] fbridge_mode=0 + [XSECTION] Cross section = 2.136e-07 [2.1358347510913632E-007] fbridge_mode=0 [UNWEIGHT] Wrote 84 events (found 808 events) - [COUNTERS] PROGRAM TOTAL : 1118.0575s - [COUNTERS] Fortran Overhead ( 0 ) : 4.3730s - [COUNTERS] Fortran MEs ( 1 ) : 1113.6844s for 90112 events => throughput is 8.09E+01 events/s + [COUNTERS] PROGRAM TOTAL : 600.8898s + [COUNTERS] Fortran Overhead ( 0 ) : 2.9391s + [COUNTERS] Fortran MEs ( 1 ) : 597.9506s for 90112 events => throughput is 1.51E+02 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -125,22 +125,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_x1_cudacpp > /tmp/valassia/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.169e-06 [1.1693100945435831E-006] fbridge_mode=1 + [XSECTION] Cross section = 1.169e-06 [1.1692567358747608E-006] fbridge_mode=1 [UNWEIGHT] Wrote 15 events (found 163 events) - [COUNTERS] PROGRAM TOTAL : 222.3358s - [COUNTERS] Fortran Overhead ( 0 ) : 102.7450s - [COUNTERS] CudaCpp MEs ( 2 ) : 119.5908s for 8192 events => throughput is 6.85E+01 events/s + [COUNTERS] PROGRAM TOTAL : 174.2219s + [COUNTERS] Fortran Overhead ( 0 ) : 79.6854s + [COUNTERS] CudaCpp MEs ( 2 ) : 94.5365s for 8192 events => throughput is 8.67E+01 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.1693100945435806E-006) and cpp (1.1693100945435831E-006) differ by less than 2E-14 (2.220446049250313e-15) +OK! xsec from fortran (1.1692567358747583E-006) and cpp (1.1692567358747608E-006) differ by less than 2E-14 (2.220446049250313e-15) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -158,36 +158,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_x10_cudacpp > /tmp/valassia/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.136e-07 [2.1358436158813950E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.136e-07 [2.1358347510913627E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 808 events) - [COUNTERS] PROGRAM TOTAL : 1439.6055s - [COUNTERS] Fortran Overhead ( 0 ) : 107.7197s - [COUNTERS] CudaCpp MEs ( 2 ) : 1331.8857s for 90112 events => throughput is 6.77E+01 events/s + [COUNTERS] PROGRAM TOTAL : 1125.3822s + [COUNTERS] Fortran Overhead ( 0 ) : 82.5529s + [COUNTERS] CudaCpp MEs ( 2 ) : 1042.8293s for 90112 events => throughput is 8.64E+01 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.1358436158813979E-007) and cpp (2.1358436158813950E-007) differ by less than 2E-14 (1.3322676295501878e-15) +OK! xsec from fortran (2.1358347510913632E-007) and cpp (2.1358347510913627E-007) differ by less than 2E-14 (2.220446049250313e-16) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.948640e+01 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.034374e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.570768e+01 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.030138e+02 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -201,22 +201,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_x1_cudacpp > /tmp/valassia/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.169e-06 [1.1693100945435831E-006] fbridge_mode=1 + [XSECTION] Cross section = 1.169e-06 [1.1692567358747610E-006] fbridge_mode=1 [UNWEIGHT] Wrote 15 events (found 163 events) - [COUNTERS] PROGRAM TOTAL : 115.4155s - [COUNTERS] Fortran Overhead ( 0 ) : 52.8722s - [COUNTERS] CudaCpp MEs ( 2 ) : 62.5433s for 8192 events => throughput is 1.31E+02 events/s + [COUNTERS] PROGRAM TOTAL : 81.7577s + [COUNTERS] Fortran Overhead ( 0 ) : 36.8116s + [COUNTERS] CudaCpp MEs ( 2 ) : 44.9462s for 8192 events => throughput is 1.82E+02 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.1693100945435806E-006) and cpp (1.1693100945435831E-006) differ by less than 2E-14 (2.220446049250313e-15) +OK! xsec from fortran (1.1692567358747583E-006) and cpp (1.1692567358747610E-006) differ by less than 2E-14 (2.4424906541753444e-15) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -234,36 +234,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_x10_cudacpp > /tmp/valassia/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.136e-07 [2.1358436158813958E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.136e-07 [2.1358347510913637E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 808 events) - [COUNTERS] PROGRAM TOTAL : 742.6112s - [COUNTERS] Fortran Overhead ( 0 ) : 56.7177s - [COUNTERS] CudaCpp MEs ( 2 ) : 685.8936s for 90112 events => throughput is 1.31E+02 events/s + [COUNTERS] PROGRAM TOTAL : 534.2423s + [COUNTERS] Fortran Overhead ( 0 ) : 39.3232s + [COUNTERS] CudaCpp MEs ( 2 ) : 494.9191s for 90112 events => throughput is 1.82E+02 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.1358436158813979E-007) and cpp (2.1358436158813958E-007) differ by less than 2E-14 (9.992007221626409e-16) +OK! xsec from fortran (2.1358347510913632E-007) and cpp (2.1358347510913637E-007) differ by less than 2E-14 (2.220446049250313e-16) *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.569503e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.255815e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.568080e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.262018e+02 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -277,22 +277,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
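
Note: for these double precision builds the residuals printed above (2.220446049250313e-16, 9.992007221626409e-16, ...) are small multiples of double precision machine epsilon, far below the 2E-14 tolerance. A quick illustrative check:

  #include <cstdio>
  #include <limits>

  int main()
  {
    // Machine epsilon for IEEE-754 double, the granularity seen in the log residuals
    std::printf( "%.15e\n", std::numeric_limits<double>::epsilon() ); // 2.220446049250313e-16
    return 0;
  }
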
-------------------- -Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_x1_cudacpp > /tmp/valassia/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.169e-06 [1.1693100945435827E-006] fbridge_mode=1 + [XSECTION] Cross section = 1.169e-06 [1.1692567358747600E-006] fbridge_mode=1 [UNWEIGHT] Wrote 15 events (found 163 events) - [COUNTERS] PROGRAM TOTAL : 53.0772s - [COUNTERS] Fortran Overhead ( 0 ) : 24.7915s - [COUNTERS] CudaCpp MEs ( 2 ) : 28.2857s for 8192 events => throughput is 2.90E+02 events/s + [COUNTERS] PROGRAM TOTAL : 34.9158s + [COUNTERS] Fortran Overhead ( 0 ) : 15.8355s + [COUNTERS] CudaCpp MEs ( 2 ) : 19.0803s for 8192 events => throughput is 4.29E+02 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.1693100945435806E-006) and cpp (1.1693100945435827E-006) differ by less than 2E-14 (1.7763568394002505e-15) +OK! xsec from fortran (1.1692567358747583E-006) and cpp (1.1692567358747600E-006) differ by less than 2E-14 (1.5543122344752192e-15) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -310,188 +310,40 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_x10_cudacpp > /tmp/valassia/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.136e-07 [2.1358436158813958E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.136e-07 [2.1358347510913632E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 808 events) - [COUNTERS] PROGRAM TOTAL : 340.2848s - [COUNTERS] Fortran Overhead ( 0 ) : 28.6537s - [COUNTERS] CudaCpp MEs ( 2 ) : 311.6311s for 90112 events => throughput is 2.89E+02 events/s + [COUNTERS] PROGRAM TOTAL : 228.0263s + [COUNTERS] Fortran Overhead ( 0 ) : 18.3642s + [COUNTERS] CudaCpp MEs ( 2 ) : 209.6621s for 90112 events => throughput is 4.30E+02 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.1358436158813979E-007) and cpp (2.1358436158813958E-007) differ by less than 2E-14 (9.992007221626409e-16) +OK! xsec from fortran (2.1358347510913632E-007) and cpp (2.1358347510913632E-007) differ by less than 2E-14 (0.0) *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.398486e+02 ) sec^-1 - -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.399363e+02 ) sec^-1 - -*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 128/128 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.169e-06 [1.1693100945435827E-006] fbridge_mode=1 - [UNWEIGHT] Wrote 15 events (found 163 events) - [COUNTERS] PROGRAM TOTAL : 46.8647s - [COUNTERS] Fortran Overhead ( 0 ) : 21.6683s - [COUNTERS] CudaCpp MEs ( 2 ) : 25.1965s for 8192 events => throughput is 3.25E+02 events/s - -*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (1.1693100945435806E-006) and cpp (1.1693100945435827E-006) differ by less than 2E-14 (1.7763568394002505e-15) - -*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical - -*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -81920 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 128/128 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.136e-07 [2.1358436158813958E-007] fbridge_mode=1 - [UNWEIGHT] Wrote 84 events (found 808 events) - [COUNTERS] PROGRAM TOTAL : 302.3706s - [COUNTERS] Fortran Overhead ( 0 ) : 25.4811s - [COUNTERS] CudaCpp MEs ( 2 ) : 276.8895s for 90112 events => throughput is 3.25E+02 events/s - -*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (2.1358436158813979E-007) and cpp (2.1358436158813958E-007) differ by less than 2E-14 (9.992007221626409e-16) - -*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical - -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.882449e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.298576e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.889860e+02 ) sec^-1 - -*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 128/128 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.169e-06 [1.1693100945435827E-006] fbridge_mode=1 - [UNWEIGHT] Wrote 15 events (found 163 events) - [COUNTERS] PROGRAM TOTAL : 50.0231s - [COUNTERS] Fortran Overhead ( 0 ) : 24.6335s - [COUNTERS] CudaCpp MEs ( 2 ) : 25.3895s for 8192 events => throughput is 3.23E+02 events/s +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.317212e+02 ) sec^-1 -*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** +*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) *** -OK! 
xsec from fortran (1.1693100945435806E-006) and cpp (1.1693100945435827E-006) differ by less than 2E-14 (1.7763568394002505e-15) - -*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical - -*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -81920 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 128/128 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.136e-07 [2.1358436158813958E-007] fbridge_mode=1 - [UNWEIGHT] Wrote 84 events (found 808 events) - [COUNTERS] PROGRAM TOTAL : 308.5181s - [COUNTERS] Fortran Overhead ( 0 ) : 28.4720s - [COUNTERS] CudaCpp MEs ( 2 ) : 280.0461s for 90112 events => throughput is 3.22E+02 events/s - -*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (2.1358436158813979E-007) and cpp (2.1358436158813958E-007) differ by less than 2E-14 (9.992007221626409e-16) - -*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical - -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.386684e+02 ) sec^-1 - -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.385729e+02 ) sec^-1 +*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) *** *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -505,22 +357,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggttggg_x1_cudacpp > /tmp/valassia/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.169e-06 [1.1693100945435829E-006] fbridge_mode=1 + [XSECTION] Cross section = 1.169e-06 [1.1692567358747604E-006] fbridge_mode=1 [UNWEIGHT] Wrote 15 events (found 163 events) - [COUNTERS] PROGRAM TOTAL : 4.2467s - [COUNTERS] Fortran Overhead ( 0 ) : 3.1625s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.0842s for 8192 events => throughput is 7.56E+03 events/s + [COUNTERS] PROGRAM TOTAL : 11.6796s + [COUNTERS] Fortran Overhead ( 0 ) : 7.7273s + [COUNTERS] CudaCpp MEs ( 2 ) : 3.9522s for 8192 events => throughput is 2.07E+03 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.1693100945435806E-006) and cpp (1.1693100945435829E-006) differ by less than 2E-14 (1.9984014443252818e-15) +OK! xsec from fortran (1.1692567358747583E-006) and cpp (1.1692567358747604E-006) differ by less than 2E-14 (1.7763568394002505e-15) *** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -538,65 +390,65 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggttggg_x10_cudacpp > /tmp/valassia/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.136e-07 [2.1358436158813960E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.136e-07 [2.1358347510913632E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 808 events) - [COUNTERS] PROGRAM TOTAL : 18.9518s - [COUNTERS] Fortran Overhead ( 0 ) : 7.0338s - [COUNTERS] CudaCpp MEs ( 2 ) : 11.9179s for 90112 events => throughput is 7.56E+03 events/s + [COUNTERS] PROGRAM TOTAL : 53.7849s + [COUNTERS] Fortran Overhead ( 0 ) : 10.3304s + [COUNTERS] CudaCpp MEs ( 2 ) : 43.4545s for 90112 events => throughput is 2.07E+03 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.1358436158813979E-007) and cpp (2.1358436158813960E-007) differ by less than 2E-14 (8.881784197001252e-16) +OK! xsec from fortran (2.1358347510913632E-007) and cpp (2.1358347510913632E-007) differ by less than 2E-14 (0.0) *** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.528868e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.097186e+03 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.249701e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.160043e+03 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 512 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.231891e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.470954e+03 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 512 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.557033e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.436635e+03 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 128 128 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.244700e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.490950e+03 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 128 128 1 *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.446855e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.435708e+03 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.214530e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.482049e+03 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 
3.244468e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.114797e+03 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt index c35aa0a017..e876937058 100644 --- a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +Working directory (build): /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg CUDACPP_BUILDDIR='.' - - make USEBUILDDIR=1 AVX=none make USEBUILDDIR=1 AVX=sse4 +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' + make USEBUILDDIR=1 AVX=avx2 + make USEBUILDDIR=1 AVX=512y +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' OMP_NUM_THREADS= -DATE: 2024-01-30_07:59:04 +DATE: 2024-01-31_16:22:53 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +Working directory (run): /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -50,18 +50,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/avalassi/output_ggttggg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./madevent_fortran < /tmp/valassia/input_ggttggg_x1_fortran > /tmp/valassia/output_ggttggg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.169e-06 [1.1693100945435806E-006] fbridge_mode=0 + [XSECTION] Cross section = 1.169e-06 [1.1692567358747583E-006] fbridge_mode=0 [UNWEIGHT] Wrote 1 events (found 166 events) - [COUNTERS] PROGRAM TOTAL : 101.8466s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4779s - [COUNTERS] Fortran MEs ( 1 ) : 101.3687s for 8192 events => throughput is 8.08E+01 events/s + [COUNTERS] PROGRAM TOTAL : 54.7916s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3438s + [COUNTERS] Fortran MEs ( 1 ) : 54.4478s for 8192 events => throughput is 1.50E+02 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -75,18 +75,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/avalassi/output_ggttggg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./madevent_fortran < /tmp/valassia/input_ggttggg_x1_fortran > /tmp/valassia/output_ggttggg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.169e-06 [1.1693100945435806E-006] fbridge_mode=0 + [XSECTION] Cross section = 1.169e-06 [1.1692567358747583E-006] fbridge_mode=0 [UNWEIGHT] Wrote 15 events (found 163 events) - [COUNTERS] PROGRAM TOTAL : 101.7818s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4752s - [COUNTERS] Fortran MEs ( 1 ) : 101.3066s for 8192 events => throughput is 8.09E+01 events/s + [COUNTERS] PROGRAM TOTAL : 54.7291s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3431s + [COUNTERS] Fortran MEs ( 1 ) : 54.3860s for 8192 events => throughput is 1.51E+02 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -100,18 +100,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x10_fortran > /tmp/avalassi/output_ggttggg_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./madevent_fortran < /tmp/valassia/input_ggttggg_x10_fortran > /tmp/valassia/output_ggttggg_x10_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.136e-07 [2.1358436158813979E-007] fbridge_mode=0 + [XSECTION] Cross section = 2.136e-07 [2.1358347510913632E-007] fbridge_mode=0 [UNWEIGHT] Wrote 84 events (found 808 events) - [COUNTERS] PROGRAM TOTAL : 1118.6550s - [COUNTERS] Fortran Overhead ( 0 ) : 4.3767s - [COUNTERS] Fortran MEs ( 1 ) : 1114.2783s for 90112 events => throughput is 8.09E+01 events/s + [COUNTERS] PROGRAM TOTAL : 601.1562s + [COUNTERS] Fortran Overhead ( 0 ) : 2.9495s + [COUNTERS] Fortran MEs ( 1 ) : 598.2067s for 90112 events => throughput is 1.51E+02 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -125,22 +125,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_x1_cudacpp > /tmp/valassia/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.169e-06 [1.1694768374083672E-006] fbridge_mode=1 + [XSECTION] Cross section = 1.169e-06 [1.1694234561464155E-006] fbridge_mode=1 [UNWEIGHT] Wrote 15 events (found 163 events) - [COUNTERS] PROGRAM TOTAL : 205.5797s - [COUNTERS] Fortran Overhead ( 0 ) : 95.6059s - [COUNTERS] CudaCpp MEs ( 2 ) : 109.9738s for 8192 events => throughput is 7.45E+01 events/s + [COUNTERS] PROGRAM TOTAL : 162.5170s + [COUNTERS] Fortran Overhead ( 0 ) : 74.5337s + [COUNTERS] CudaCpp MEs ( 2 ) : 87.9834s for 8192 events => throughput is 9.31E+01 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.1693100945435806E-006) and cpp (1.1694768374083672E-006) differ by less than 4E-4 (0.00014259935458071915) +OK! xsec from fortran (1.1692567358747583E-006) and cpp (1.1694234561464155E-006) differ by less than 4E-4 (0.00014258653941601196) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -158,36 +158,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_x10_cudacpp > /tmp/valassia/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.136e-07 [2.1361435710758843E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.136e-07 [2.1361347375693870E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 808 events) - [COUNTERS] PROGRAM TOTAL : 1305.3326s - [COUNTERS] Fortran Overhead ( 0 ) : 98.5377s - [COUNTERS] CudaCpp MEs ( 2 ) : 1206.7949s for 90112 events => throughput is 7.47E+01 events/s + [COUNTERS] PROGRAM TOTAL : 1043.3068s + [COUNTERS] Fortran Overhead ( 0 ) : 77.0455s + [COUNTERS] CudaCpp MEs ( 2 ) : 966.2612s for 90112 events => throughput is 9.33E+01 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.1358436158813979E-007) and cpp (2.1361435710758843E-007) differ by less than 4E-4 (0.0001404387438554977) +OK! xsec from fortran (2.1358347510913632E-007) and cpp (2.1361347375693870E-007) differ by less than 4E-4 (0.0001404539737310806) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.692219e+01 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.108680e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.699275e+01 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.111948e+02 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -201,22 +201,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_x1_cudacpp > /tmp/valassia/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.169e-06 [1.1694765360831655E-006] fbridge_mode=1 + [XSECTION] Cross section = 1.169e-06 [1.1694233472680930E-006] fbridge_mode=1 [UNWEIGHT] Wrote 15 events (found 163 events) - [COUNTERS] PROGRAM TOTAL : 52.0497s - [COUNTERS] Fortran Overhead ( 0 ) : 24.6638s - [COUNTERS] CudaCpp MEs ( 2 ) : 27.3859s for 8192 events => throughput is 2.99E+02 events/s + [COUNTERS] PROGRAM TOTAL : 39.5436s + [COUNTERS] Fortran Overhead ( 0 ) : 18.1607s + [COUNTERS] CudaCpp MEs ( 2 ) : 21.3829s for 8192 events => throughput is 3.83E+02 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.1693100945435806E-006) and cpp (1.1694765360831655E-006) differ by less than 4E-4 (0.00014234165972015766) +OK! xsec from fortran (1.1692567358747583E-006) and cpp (1.1694233472680930E-006) differ by less than 4E-4 (0.0001424934218661189) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -234,36 +234,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_x10_cudacpp > /tmp/valassia/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.136e-07 [2.1361429212586563E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.136e-07 [2.1361343132853083E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 808 events) - [COUNTERS] PROGRAM TOTAL : 336.4854s - [COUNTERS] Fortran Overhead ( 0 ) : 29.3396s - [COUNTERS] CudaCpp MEs ( 2 ) : 307.1459s for 90112 events => throughput is 2.93E+02 events/s + [COUNTERS] PROGRAM TOTAL : 255.6894s + [COUNTERS] Fortran Overhead ( 0 ) : 20.7025s + [COUNTERS] CudaCpp MEs ( 2 ) : 234.9869s for 90112 events => throughput is 3.83E+02 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.1358436158813979E-007) and cpp (2.1361429212586563E-007) differ by less than 4E-4 (0.00014013450003202976) +OK! xsec from fortran (2.1358347510913632E-007) and cpp (2.1361343132853083E-007) differ by less than 4E-4 (0.00014025532349459802) *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.371429e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.624456e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.391230e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.616503e+02 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -277,22 +277,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_x1_cudacpp > /tmp/valassia/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.169e-06 [1.1694764906356561E-006] fbridge_mode=1 + [XSECTION] Cross section = 1.169e-06 [1.1694232419162335E-006] fbridge_mode=1 [UNWEIGHT] Wrote 15 events (found 163 events) - [COUNTERS] PROGRAM TOTAL : 27.0721s - [COUNTERS] Fortran Overhead ( 0 ) : 12.6637s - [COUNTERS] CudaCpp MEs ( 2 ) : 14.4085s for 8192 events => throughput is 5.69E+02 events/s + [COUNTERS] PROGRAM TOTAL : 17.8468s + [COUNTERS] Fortran Overhead ( 0 ) : 8.2769s + [COUNTERS] CudaCpp MEs ( 2 ) : 9.5699s for 8192 events => throughput is 8.56E+02 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.1693100945435806E-006) and cpp (1.1694764906356561E-006) differ by less than 4E-4 (0.0001423027927767162) +OK! xsec from fortran (1.1692567358747583E-006) and cpp (1.1694232419162335E-006) differ by less than 4E-4 (0.0001424033203030195) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -310,188 +310,40 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_x10_cudacpp > /tmp/valassia/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.136e-07 [2.1361429111797059E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.136e-07 [2.1361342605756045E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 808 events) - [COUNTERS] PROGRAM TOTAL : 174.3265s - [COUNTERS] Fortran Overhead ( 0 ) : 16.5080s - [COUNTERS] CudaCpp MEs ( 2 ) : 157.8185s for 90112 events => throughput is 5.71E+02 events/s + [COUNTERS] PROGRAM TOTAL : 116.5354s + [COUNTERS] Fortran Overhead ( 0 ) : 10.8269s + [COUNTERS] CudaCpp MEs ( 2 ) : 105.7085s for 90112 events => throughput is 8.52E+02 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.1358436158813979E-007) and cpp (2.1361429111797059E-007) differ by less than 4E-4 (0.00014012978107680318) +OK! xsec from fortran (2.1358347510913632E-007) and cpp (2.1361342605756045E-007) differ by less than 4E-4 (0.0001402306447575441) *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.735262e+02 ) sec^-1 - -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.739370e+02 ) sec^-1 - -*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 128/128 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.169e-06 [1.1694764906356561E-006] fbridge_mode=1 - [UNWEIGHT] Wrote 15 events (found 163 events) - [COUNTERS] PROGRAM TOTAL : 23.8611s - [COUNTERS] Fortran Overhead ( 0 ) : 11.1738s - [COUNTERS] CudaCpp MEs ( 2 ) : 12.6873s for 8192 events => throughput is 6.46E+02 events/s - -*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (1.1693100945435806E-006) and cpp (1.1694764906356561E-006) differ by less than 4E-4 (0.0001423027927767162) - -*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical - -*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -81920 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 128/128 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.136e-07 [2.1361429111797059E-007] fbridge_mode=1 - [UNWEIGHT] Wrote 84 events (found 808 events) - [COUNTERS] PROGRAM TOTAL : 154.4819s - [COUNTERS] Fortran Overhead ( 0 ) : 15.1638s - [COUNTERS] CudaCpp MEs ( 2 ) : 139.3180s for 90112 events => throughput is 6.47E+02 events/s - -*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (2.1358436158813979E-007) and cpp (2.1361429111797059E-007) differ by less than 4E-4 (0.00014012978107680318) - -*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical - -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.672552e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.043447e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.678478e+02 ) sec^-1 - -*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 128/128 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.169e-06 [1.1694768276769753E-006] fbridge_mode=1 - [UNWEIGHT] Wrote 15 events (found 163 events) - [COUNTERS] PROGRAM TOTAL : 25.4092s - [COUNTERS] Fortran Overhead ( 0 ) : 12.6834s - [COUNTERS] CudaCpp MEs ( 2 ) : 12.7257s for 8192 events => throughput is 6.44E+02 events/s +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.039749e+03 ) sec^-1 -*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** +*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) *** -OK! 
xsec from fortran (1.1693100945435806E-006) and cpp (1.1694768276769753E-006) differ by less than 4E-4 (0.00014259103224434355) - -*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical - -*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -81920 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 128/128 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.136e-07 [2.1361435948756818E-007] fbridge_mode=1 - [UNWEIGHT] Wrote 84 events (found 808 events) - [COUNTERS] PROGRAM TOTAL : 156.3015s - [COUNTERS] Fortran Overhead ( 0 ) : 16.4754s - [COUNTERS] CudaCpp MEs ( 2 ) : 139.8261s for 90112 events => throughput is 6.44E+02 events/s - -*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (2.1358436158813979E-007) and cpp (2.1361435948756818E-007) differ by less than 4E-4 (0.00014044988689865257) - -*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical - -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.776081e+02 ) sec^-1 - -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.750726e+02 ) sec^-1 +*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) *** *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -505,22 +357,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggttggg_x1_cudacpp > /tmp/valassia/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.169e-06 [1.1694770708194997E-006] fbridge_mode=1 + [XSECTION] Cross section = 1.169e-06 [1.1694234612933678E-006] fbridge_mode=1 [UNWEIGHT] Wrote 15 events (found 163 events) - [COUNTERS] PROGRAM TOTAL : 2.5003s - [COUNTERS] Fortran Overhead ( 0 ) : 2.0019s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.4984s for 8192 events => throughput is 1.64E+04 events/s + [COUNTERS] PROGRAM TOTAL : 6.2810s + [COUNTERS] Fortran Overhead ( 0 ) : 4.3959s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.8851s for 8192 events => throughput is 4.35E+03 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.1693100945435806E-006) and cpp (1.1694770708194997E-006) differ by less than 4E-4 (0.00014279896898039546) +OK! xsec from fortran (1.1692567358747583E-006) and cpp (1.1694234612933678E-006) differ by less than 4E-4 (0.0001425909413168558) *** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -538,65 +390,65 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggttggg_x10_cudacpp > /tmp/valassia/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.136e-07 [2.1361443477565656E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.136e-07 [2.1361349638985098E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 808 events) - [COUNTERS] PROGRAM TOTAL : 11.2881s - [COUNTERS] Fortran Overhead ( 0 ) : 5.8695s - [COUNTERS] CudaCpp MEs ( 2 ) : 5.4186s for 90112 events => throughput is 1.66E+04 events/s + [COUNTERS] PROGRAM TOTAL : 27.6605s + [COUNTERS] Fortran Overhead ( 0 ) : 6.8371s + [COUNTERS] CudaCpp MEs ( 2 ) : 20.8234s for 90112 events => throughput is 4.33E+03 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.1358436158813979E-007) and cpp (2.1361443477565656E-007) differ by less than 4E-4 (0.00014080238503022535) +OK! xsec from fortran (2.1358347510913632E-007) and cpp (2.1361349638985098E-007) differ by less than 4E-4 (0.00014055994125627969) *** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.635547e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.339238e+03 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.633264e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.447652e+03 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 512 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.309560e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.122381e+03 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 512 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.405493e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.385965e+03 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 128 128 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.341562e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.151108e+03 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 128 128 1 *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.341458e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.855986e+03 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.336833e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.113295e+03 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 
6.413620e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.045829e+03 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt index b9faa14c51..8e208f24ab 100644 --- a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +Working directory (build): /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg CUDACPP_BUILDDIR='.' - - make USEBUILDDIR=1 AVX=none + make USEBUILDDIR=1 AVX=sse4 +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=avx2 + make USEBUILDDIR=1 AVX=512y +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' -CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' OMP_NUM_THREADS= -DATE: 2024-01-30_09:07:55 +DATE: 2024-01-31_17:07:16 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +Working directory (run): /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -50,18 +50,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/avalassi/output_ggttggg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./madevent_fortran < /tmp/valassia/input_ggttggg_x1_fortran > /tmp/valassia/output_ggttggg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.169e-06 [1.1693100945435806E-006] fbridge_mode=0 + [XSECTION] Cross section = 1.169e-06 [1.1692567358747583E-006] fbridge_mode=0 [UNWEIGHT] Wrote 1 events (found 166 events) - [COUNTERS] PROGRAM TOTAL : 101.9697s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4771s - [COUNTERS] Fortran MEs ( 1 ) : 101.4926s for 8192 events => throughput is 8.07E+01 events/s + [COUNTERS] PROGRAM TOTAL : 54.7288s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3430s + [COUNTERS] Fortran MEs ( 1 ) : 54.3859s for 8192 events => throughput is 1.51E+02 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -75,18 +75,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/avalassi/output_ggttggg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./madevent_fortran < /tmp/valassia/input_ggttggg_x1_fortran > /tmp/valassia/output_ggttggg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.169e-06 [1.1693100945435806E-006] fbridge_mode=0 + [XSECTION] Cross section = 1.169e-06 [1.1692567358747583E-006] fbridge_mode=0 [UNWEIGHT] Wrote 15 events (found 163 events) - [COUNTERS] PROGRAM TOTAL : 101.6914s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4764s - [COUNTERS] Fortran MEs ( 1 ) : 101.2150s for 8192 events => throughput is 8.09E+01 events/s + [COUNTERS] PROGRAM TOTAL : 54.7250s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3459s + [COUNTERS] Fortran MEs ( 1 ) : 54.3790s for 8192 events => throughput is 1.51E+02 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -100,18 +100,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x10_fortran > /tmp/avalassi/output_ggttggg_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./madevent_fortran < /tmp/valassia/input_ggttggg_x10_fortran > /tmp/valassia/output_ggttggg_x10_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.136e-07 [2.1358436158813979E-007] fbridge_mode=0 + [XSECTION] Cross section = 2.136e-07 [2.1358347510913632E-007] fbridge_mode=0 [UNWEIGHT] Wrote 84 events (found 808 events) - [COUNTERS] PROGRAM TOTAL : 1118.2550s - [COUNTERS] Fortran Overhead ( 0 ) : 4.3831s - [COUNTERS] Fortran MEs ( 1 ) : 1113.8719s for 90112 events => throughput is 8.09E+01 events/s + [COUNTERS] PROGRAM TOTAL : 601.1975s + [COUNTERS] Fortran Overhead ( 0 ) : 2.9597s + [COUNTERS] Fortran MEs ( 1 ) : 598.2379s for 90112 events => throughput is 1.51E+02 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -125,22 +125,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_x1_cudacpp > /tmp/valassia/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.169e-06 [1.1693101016896844E-006] fbridge_mode=1 + [XSECTION] Cross section = 1.169e-06 [1.1692567430116567E-006] fbridge_mode=1 [UNWEIGHT] Wrote 15 events (found 163 events) - [COUNTERS] PROGRAM TOTAL : 224.2502s - [COUNTERS] Fortran Overhead ( 0 ) : 103.5045s - [COUNTERS] CudaCpp MEs ( 2 ) : 120.7457s for 8192 events => throughput is 6.78E+01 events/s + [COUNTERS] PROGRAM TOTAL : 175.6033s + [COUNTERS] Fortran Overhead ( 0 ) : 80.7142s + [COUNTERS] CudaCpp MEs ( 2 ) : 94.8891s for 8192 events => throughput is 8.63E+01 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.1693100945435806E-006) and cpp (1.1693101016896844E-006) differ by less than 2E-4 (6.1113847316107694e-09) +OK! xsec from fortran (1.1692567358747583E-006) and cpp (1.1692567430116567E-006) differ by less than 2E-4 (6.103790806122333e-09) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -158,36 +158,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_x10_cudacpp > /tmp/valassia/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.136e-07 [2.1358436275882778E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.136e-07 [2.1358347627977553E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 808 events) - [COUNTERS] PROGRAM TOTAL : 1439.9019s - [COUNTERS] Fortran Overhead ( 0 ) : 107.2148s - [COUNTERS] CudaCpp MEs ( 2 ) : 1332.6870s for 90112 events => throughput is 6.76E+01 events/s + [COUNTERS] PROGRAM TOTAL : 1120.6920s + [COUNTERS] Fortran Overhead ( 0 ) : 82.8854s + [COUNTERS] CudaCpp MEs ( 2 ) : 1037.8066s for 90112 events => throughput is 8.68E+01 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.1358436158813979E-007) and cpp (2.1358436275882778E-007) differ by less than 2E-4 (5.48115042242614e-09) +OK! xsec from fortran (2.1358347510913632E-007) and cpp (2.1358347627977553E-007) differ by less than 2E-4 (5.480944587077374e-09) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.977492e+01 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.026980e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.962894e+01 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.025112e+02 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -201,22 +201,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_x1_cudacpp > /tmp/valassia/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.169e-06 [1.1693101020910778E-006] fbridge_mode=1 + [XSECTION] Cross section = 1.169e-06 [1.1692567434129498E-006] fbridge_mode=1 [UNWEIGHT] Wrote 15 events (found 163 events) - [COUNTERS] PROGRAM TOTAL : 114.2875s - [COUNTERS] Fortran Overhead ( 0 ) : 54.1504s - [COUNTERS] CudaCpp MEs ( 2 ) : 60.1370s for 8192 events => throughput is 1.36E+02 events/s + [COUNTERS] PROGRAM TOTAL : 78.7358s + [COUNTERS] Fortran Overhead ( 0 ) : 35.0931s + [COUNTERS] CudaCpp MEs ( 2 ) : 43.6427s for 8192 events => throughput is 1.88E+02 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.1693100945435806E-006) and cpp (1.1693101020910778E-006) differ by less than 2E-4 (6.454658363352905e-09) +OK! xsec from fortran (1.1692567358747583E-006) and cpp (1.1692567434129498E-006) differ by less than 2E-4 (6.446994271769313e-09) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -234,36 +234,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_x10_cudacpp > /tmp/valassia/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.136e-07 [2.1358436284111587E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.136e-07 [2.1358347636244846E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 808 events) - [COUNTERS] PROGRAM TOTAL : 713.5498s - [COUNTERS] Fortran Overhead ( 0 ) : 58.1042s - [COUNTERS] CudaCpp MEs ( 2 ) : 655.4456s for 90112 events => throughput is 1.37E+02 events/s + [COUNTERS] PROGRAM TOTAL : 518.4869s + [COUNTERS] Fortran Overhead ( 0 ) : 37.5672s + [COUNTERS] CudaCpp MEs ( 2 ) : 480.9197s for 90112 events => throughput is 1.87E+02 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.1358436158813979E-007) and cpp (2.1358436284111587E-007) differ by less than 2E-4 (5.866422458922216e-09) +OK! xsec from fortran (2.1358347510913632E-007) and cpp (2.1358347636244846E-007) differ by less than 2E-4 (5.868020069854651e-09) *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.524488e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.361474e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.529646e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.358615e+02 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -277,22 +277,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_x1_cudacpp > /tmp/valassia/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.169e-06 [1.1693101021831069E-006] fbridge_mode=1 + [XSECTION] Cross section = 1.169e-06 [1.1692567435042426E-006] fbridge_mode=1 [UNWEIGHT] Wrote 15 events (found 163 events) - [COUNTERS] PROGRAM TOTAL : 50.7719s - [COUNTERS] Fortran Overhead ( 0 ) : 23.4236s - [COUNTERS] CudaCpp MEs ( 2 ) : 27.3483s for 8192 events => throughput is 3.00E+02 events/s + [COUNTERS] PROGRAM TOTAL : 34.2555s + [COUNTERS] Fortran Overhead ( 0 ) : 15.3193s + [COUNTERS] CudaCpp MEs ( 2 ) : 18.9362s for 8192 events => throughput is 4.33E+02 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.1693100945435806E-006) and cpp (1.1693101021831069E-006) differ by less than 2E-4 (6.533362073568583e-09) +OK! xsec from fortran (1.1692567358747583E-006) and cpp (1.1692567435042426E-006) differ by less than 2E-4 (6.525072038243707e-09) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -310,188 +310,40 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_x10_cudacpp > /tmp/valassia/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.136e-07 [2.1358436281462147E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.136e-07 [2.1358347633600335E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 808 events) - [COUNTERS] PROGRAM TOTAL : 326.3386s - [COUNTERS] Fortran Overhead ( 0 ) : 27.1544s - [COUNTERS] CudaCpp MEs ( 2 ) : 299.1842s for 90112 events => throughput is 3.01E+02 events/s + [COUNTERS] PROGRAM TOTAL : 225.2695s + [COUNTERS] Fortran Overhead ( 0 ) : 17.7559s + [COUNTERS] CudaCpp MEs ( 2 ) : 207.5136s for 90112 events => throughput is 4.34E+02 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.1358436158813979E-007) and cpp (2.1358436281462147E-007) differ by less than 2E-4 (5.7423759081132175e-09) +OK! xsec from fortran (2.1358347510913632E-007) and cpp (2.1358347633600335E-007) differ by less than 2E-4 (5.744204001345565e-09) *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.567666e+02 ) sec^-1 - -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.597479e+02 ) sec^-1 - -*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 128/128 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.169e-06 [1.1693101021831069E-006] fbridge_mode=1 - [UNWEIGHT] Wrote 15 events (found 163 events) - [COUNTERS] PROGRAM TOTAL : 45.3921s - [COUNTERS] Fortran Overhead ( 0 ) : 20.5216s - [COUNTERS] CudaCpp MEs ( 2 ) : 24.8705s for 8192 events => throughput is 3.29E+02 events/s - -*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (1.1693100945435806E-006) and cpp (1.1693101021831069E-006) differ by less than 2E-4 (6.533362073568583e-09) - -*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical - -*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -81920 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 128/128 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.136e-07 [2.1358436281462147E-007] fbridge_mode=1 - [UNWEIGHT] Wrote 84 events (found 808 events) - [COUNTERS] PROGRAM TOTAL : 301.9509s - [COUNTERS] Fortran Overhead ( 0 ) : 24.4062s - [COUNTERS] CudaCpp MEs ( 2 ) : 277.5446s for 90112 events => throughput is 3.25E+02 events/s - -*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (2.1358436158813979E-007) and cpp (2.1358436281462147E-007) differ by less than 2E-4 (5.7423759081132175e-09) - -*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical - -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.101718e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.547813e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.081457e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.519857e+02 ) sec^-1 -*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 128/128 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.169e-06 [1.1693101021831069E-006] fbridge_mode=1 - [UNWEIGHT] Wrote 15 events (found 163 events) - [COUNTERS] PROGRAM TOTAL : 48.9503s - [COUNTERS] Fortran Overhead ( 0 ) : 23.9070s - [COUNTERS] CudaCpp MEs ( 2 ) : 25.0433s for 8192 events => throughput is 3.27E+02 events/s - -*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** - -OK! 
xsec from fortran (1.1693100945435806E-006) and cpp (1.1693101021831069E-006) differ by less than 2E-4 (6.533362073568583e-09) - -*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical +*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) *** -*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -81920 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 128/128 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.136e-07 [2.1358436281462147E-007] fbridge_mode=1 - [UNWEIGHT] Wrote 84 events (found 808 events) - [COUNTERS] PROGRAM TOTAL : 301.8849s - [COUNTERS] Fortran Overhead ( 0 ) : 27.8392s - [COUNTERS] CudaCpp MEs ( 2 ) : 274.0457s for 90112 events => throughput is 3.29E+02 events/s - -*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (2.1358436158813979E-007) and cpp (2.1358436281462147E-007) differ by less than 2E-4 (5.7423759081132175e-09) - -*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical - -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.501339e+02 ) sec^-1 - -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.509355e+02 ) sec^-1 +*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) *** *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -505,22 +357,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggttggg_x1_cudacpp > /tmp/valassia/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.169e-06 [1.1693100942770687E-006] fbridge_mode=1 + [XSECTION] Cross section = 1.169e-06 [1.1692567356511786E-006] fbridge_mode=1 [UNWEIGHT] Wrote 15 events (found 163 events) - [COUNTERS] PROGRAM TOTAL : 3.5767s - [COUNTERS] Fortran Overhead ( 0 ) : 2.7138s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.8630s for 8192 events => throughput is 9.49E+03 events/s + [COUNTERS] PROGRAM TOTAL : 12.6233s + [COUNTERS] Fortran Overhead ( 0 ) : 8.2752s + [COUNTERS] CudaCpp MEs ( 2 ) : 4.3480s for 8192 events => throughput is 1.88E+03 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.1693100945435806E-006) and cpp (1.1693100942770687E-006) differ by less than 2E-4 (2.279223476620018e-10) +OK! xsec from fortran (1.1692567358747583E-006) and cpp (1.1692567356511786E-006) differ by less than 2E-4 (1.9121515482112272e-10) *** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -538,65 +390,65 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggttggg_x10_cudacpp > /tmp/valassia/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.136e-07 [2.1358436157495368E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.136e-07 [2.1358347509627304E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 808 events) - [COUNTERS] PROGRAM TOTAL : 16.0811s - [COUNTERS] Fortran Overhead ( 0 ) : 6.5876s - [COUNTERS] CudaCpp MEs ( 2 ) : 9.4935s for 90112 events => throughput is 9.49E+03 events/s + [COUNTERS] PROGRAM TOTAL : 58.4466s + [COUNTERS] Fortran Overhead ( 0 ) : 10.6616s + [COUNTERS] CudaCpp MEs ( 2 ) : 47.7850s for 90112 events => throughput is 1.89E+03 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.1358436158813979E-007) and cpp (2.1358436157495368E-007) differ by less than 2E-4 (6.173717093105324e-11) +OK! xsec from fortran (2.1358347510913632E-007) and cpp (2.1358347509627304E-007) differ by less than 2E-4 (6.022604637223594e-11) *** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.427839e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.904439e+03 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.087147e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.931601e+03 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 512 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.109838e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.220929e+03 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 512 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.157967e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.368320e+03 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 128 128 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.106789e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.250028e+03 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 128 128 1 *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.114403e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.156635e+03 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.111816e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.216963e+03 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 
3.650481e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = HIP:MIX+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.043067e+03 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt index 1fb13570ed..aefc17f4c0 100644 --- a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu +Working directory (build): /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu CUDACPP_BUILDDIR='.' - make USEBUILDDIR=1 AVX=none -make USEBUILDDIR=1 AVX=sse4 +make USEBUILDDIR=1 AVX=sse4 +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make USEBUILDDIR=1 AVX=avx2 + make USEBUILDDIR=1 AVX=512y +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. 
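The "OK!" cross-section checks in these logs quote a relative difference between the Fortran and cudacpp results; one formulation consistent with every printed value is |1 - xsec_cpp/xsec_fortran|, compared against 2E-4 for the mixed-precision (_m_) runs above and 2E-14 for the double-precision (_d_) runs below. A sketch assuming that formulation (the helper "rel_diff" is hypothetical, not the actual tmad script code):

# Relative-difference check as printed in the "OK! xsec from fortran (...) and
# cpp (...) differ by less than ..." lines; rel_diff is an illustrative helper.
def rel_diff(a, b):
    return abs(1.0 - b / a)

xsec_fortran = 0.26049452069554319  # values from the gqttq double-precision log below
xsec_cpp     = 0.26049452069554313
d = rel_diff(xsec_fortran, xsec_cpp)
print(d, "OK!" if d < 2e-14 else "FAIL")  # 2.220446049250313e-16, OK!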
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' OMP_NUM_THREADS= -DATE: 2024-01-30_06:26:25 +DATE: 2024-01-31_15:26:32 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu +On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +Working directory (run): /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -50,18 +50,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/avalassi/output_gqttq_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./madevent_fortran < /tmp/valassia/input_gqttq_x1_fortran > /tmp/valassia/output_gqttq_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2605 [0.26050333309703716] fbridge_mode=0 + [XSECTION] Cross section = 0.2605 [0.26049452069554319] fbridge_mode=0 [UNWEIGHT] Wrote 78 events (found 561 events) - [COUNTERS] PROGRAM TOTAL : 0.3322s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2581s - [COUNTERS] Fortran MEs ( 1 ) : 0.0741s for 8192 events => throughput is 1.11E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3177s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2695s + [COUNTERS] Fortran MEs ( 1 ) : 0.0483s for 8192 events => throughput is 1.70E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -75,18 +75,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/avalassi/output_gqttq_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./madevent_fortran < /tmp/valassia/input_gqttq_x1_fortran > /tmp/valassia/output_gqttq_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2605 [0.26050333309703716] fbridge_mode=0 + [XSECTION] Cross section = 0.2605 [0.26049452069554319] fbridge_mode=0 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.3239s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2500s - [COUNTERS] Fortran MEs ( 1 ) : 0.0739s for 8192 events => throughput is 1.11E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2503s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2021s + [COUNTERS] Fortran MEs ( 1 ) : 0.0482s for 8192 events => throughput is 1.70E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -100,18 +100,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x10_fortran > /tmp/avalassi/output_gqttq_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./madevent_fortran < /tmp/valassia/input_gqttq_x10_fortran > /tmp/valassia/output_gqttq_x10_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.218 [0.21801182648615872] fbridge_mode=0 + [XSECTION] Cross section = 0.218 [0.21801276513379142] fbridge_mode=0 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 2.4169s - [COUNTERS] Fortran Overhead ( 0 ) : 1.6004s - [COUNTERS] Fortran MEs ( 1 ) : 0.8166s for 90112 events => throughput is 1.10E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.6969s + [COUNTERS] Fortran Overhead ( 0 ) : 1.1709s + [COUNTERS] Fortran MEs ( 1 ) : 0.5261s for 90112 events => throughput is 1.71E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -125,22 +125,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x1_cudacpp > /tmp/valassia/output_gqttq_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2605 [0.26050333309703710] fbridge_mode=1 + [XSECTION] Cross section = 0.2605 [0.26049452069554313] fbridge_mode=1 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.4205s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3380s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0825s for 8192 events => throughput is 9.92E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.3419s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2721s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0698s for 8192 events => throughput is 1.17E+05 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.26050333309703716) and cpp (0.26050333309703710) differ by less than 2E-14 (2.220446049250313e-16) +OK! xsec from fortran (0.26049452069554319) and cpp (0.26049452069554313) differ by less than 2E-14 (2.220446049250313e-16) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -158,36 +158,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x10_cudacpp > /tmp/valassia/output_gqttq_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.218 [0.21801182648615872] fbridge_mode=1 + [XSECTION] Cross section = 0.218 [0.21801276513379142] fbridge_mode=1 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 2.6157s - [COUNTERS] Fortran Overhead ( 0 ) : 1.7094s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.9063s for 90112 events => throughput is 9.94E+04 events/s + [COUNTERS] PROGRAM TOTAL : 2.0132s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2460s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.7672s for 90112 events => throughput is 1.17E+05 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.21801182648615872) and cpp (0.21801182648615872) differ by less than 2E-14 (0.0) +OK! xsec from fortran (0.21801276513379142) and cpp (0.21801276513379142) differ by less than 2E-14 (0.0) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.011343e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.199571e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.009603e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.201530e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -201,22 +201,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x1_cudacpp > /tmp/valassia/output_gqttq_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2605 [0.26050333309703727] fbridge_mode=1 + [XSECTION] Cross section = 0.2605 [0.26049452069554313] fbridge_mode=1 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.3388s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2965s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0423s for 8192 events => throughput is 1.94E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2704s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2370s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0334s for 8192 events => throughput is 2.46E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.26050333309703716) and cpp (0.26050333309703727) differ by less than 2E-14 (4.440892098500626e-16) +OK! xsec from fortran (0.26049452069554319) and cpp (0.26049452069554313) differ by less than 2E-14 (2.220446049250313e-16) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -234,36 +234,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x10_cudacpp > /tmp/valassia/output_gqttq_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.218 [0.21801182648615872] fbridge_mode=1 + [XSECTION] Cross section = 0.218 [0.21801276513379142] fbridge_mode=1 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 2.1314s - [COUNTERS] Fortran Overhead ( 0 ) : 1.6642s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.4672s for 90112 events => throughput is 1.93E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.5755s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2083s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3672s for 90112 events => throughput is 2.45E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.21801182648615872) and cpp (0.21801182648615872) differ by less than 2E-14 (0.0) +OK! xsec from fortran (0.21801276513379142) and cpp (0.21801276513379142) differ by less than 2E-14 (0.0) *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.951695e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.481551e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.960178e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.483003e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -277,22 +277,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x1_cudacpp > /tmp/valassia/output_gqttq_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2605 [0.26050333309703727] fbridge_mode=1 + [XSECTION] Cross section = 0.2605 [0.26049452069554313] fbridge_mode=1 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.3038s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2788s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0250s for 8192 events => throughput is 3.28E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2382s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2210s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0173s for 8192 events => throughput is 4.74E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.26050333309703716) and cpp (0.26050333309703727) differ by less than 2E-14 (4.440892098500626e-16) +OK! xsec from fortran (0.26049452069554319) and cpp (0.26049452069554313) differ by less than 2E-14 (2.220446049250313e-16) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -310,188 +310,40 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x10_cudacpp > /tmp/valassia/output_gqttq_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.218 [0.21801182648615869] fbridge_mode=1 + [XSECTION] Cross section = 0.218 [0.21801276513379150] fbridge_mode=1 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 1.9222s - [COUNTERS] Fortran Overhead ( 0 ) : 1.6468s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2754s for 90112 events => throughput is 3.27E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.3855s + [COUNTERS] Fortran Overhead ( 0 ) : 1.1955s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1900s for 90112 events => throughput is 4.74E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.21801182648615872) and cpp (0.21801182648615869) differ by less than 2E-14 (1.1102230246251565e-16) +OK! xsec from fortran (0.21801276513379142) and cpp (0.21801276513379150) differ by less than 2E-14 (4.440892098500626e-16) *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.305728e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.783829e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.308522e+05 ) sec^-1 - -*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/32 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2605 [0.26050333309703727] fbridge_mode=1 - [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.2978s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2759s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0219s for 8192 events => throughput is 3.74E+05 events/s - -*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (0.26050333309703716) and cpp (0.26050333309703727) differ by less than 2E-14 (4.440892098500626e-16) - -*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical - -*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -81920 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/32 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.218 [0.21801182648615869] fbridge_mode=1 - [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 1.8887s - [COUNTERS] Fortran Overhead ( 0 ) : 1.6471s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2415s for 90112 events => throughput is 3.73E+05 events/s - -*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.868821e+05 ) sec^-1 -OK! xsec from fortran (0.21801182648615872) and cpp (0.21801182648615869) differ by less than 2E-14 (1.1102230246251565e-16) +*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) *** -*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical - -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.792718e+05 ) sec^-1 - -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.861917e+05 ) sec^-1 - -*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/32 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2605 [0.26050333309703727] fbridge_mode=1 - [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.3207s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2877s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0330s for 8192 events => throughput is 2.48E+05 events/s - -*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (0.26050333309703716) and cpp (0.26050333309703727) differ by less than 2E-14 (4.440892098500626e-16) - -*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical - -*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -81920 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/32 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.218 [0.21801182648615869] fbridge_mode=1 - [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 2.0189s - [COUNTERS] Fortran Overhead ( 0 ) : 1.6567s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3622s for 90112 events => throughput is 2.49E+05 events/s - -*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (0.21801182648615872) and cpp (0.21801182648615869) differ by less than 2E-14 (1.1102230246251565e-16) - -*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical - -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.535117e+05 ) sec^-1 - -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.517520e+05 ) sec^-1 +*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) *** *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -505,98 +357,15 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/32 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2605 [0.26050333309703733] fbridge_mode=1 - [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.6879s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6872s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.21E+07 events/s - -*** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** - -OK! 
xsec from fortran (0.26050333309703716) and cpp (0.26050333309703733) differ by less than 2E-14 (6.661338147750939e-16) - -*** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical - -*** (3) EXECUTE MADEVENT_CUDA x10 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -81920 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/32 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.218 [0.21801182648615869] fbridge_mode=1 - [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 2.0638s - [COUNTERS] Fortran Overhead ( 0 ) : 2.0555s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0083s for 90112 events => throughput is 1.08E+07 events/s - -*** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (0.21801182648615872) and cpp (0.21801182648615869) differ by less than 2E-14 (1.1102230246251565e-16) - -*** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical - -*** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.535376e+07 ) sec^-1 - -*** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.131781e+07 ) sec^-1 - -*** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.380880e+07 ) sec^-1 - -*** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.511409e+07 ) sec^-1 - -*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.374806e+07 ) sec^-1 - -*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.787335e+07 ) sec^-1 - -*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.380768e+07 ) sec^-1 +Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/valassia/input_gqttq_x1_cudacpp > /tmp/valassia/output_gqttq_x1_cudacpp' +ERROR! ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/valassia/input_gqttq_x1_cudacpp > /tmp/valassia/output_gqttq_x1_cudacpp' failed + PDF set = nn23lo1 + alpha_s(Mz)= 0.1300 running at 2 loops. + alpha_s(Mz)= 0.1300 running at 2 loops. + Renormalization scale set on event-by-event basis + Factorization scale set on event-by-event basis -*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.782273e+07 ) sec^-1 -TEST COMPLETED + getting user params +Enter number of events and max and min iterations: + Number of events and iterations 8192 1 1 diff --git a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt index 4985f151b2..e3c6b9eae7 100644 --- a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu +Working directory (build): /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu CUDACPP_BUILDDIR='.' 
- make USEBUILDDIR=1 AVX=none -make USEBUILDDIR=1 AVX=sse4 +make USEBUILDDIR=1 AVX=sse4 +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make USEBUILDDIR=1 AVX=avx2 + make USEBUILDDIR=1 AVX=512y +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' -make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' OMP_NUM_THREADS= -DATE: 2024-01-30_06:26:56 +DATE: 2024-01-31_15:26:53 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu +On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +Working directory (run): /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -50,18 +50,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/avalassi/output_gqttq_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./madevent_fortran < /tmp/valassia/input_gqttq_x1_fortran > /tmp/valassia/output_gqttq_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2605 [0.26050333309703716] fbridge_mode=0 + [XSECTION] Cross section = 0.2605 [0.26049452069554319] fbridge_mode=0 [UNWEIGHT] Wrote 78 events (found 561 events) - [COUNTERS] PROGRAM TOTAL : 0.3277s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2529s - [COUNTERS] Fortran MEs ( 1 ) : 0.0748s for 8192 events => throughput is 1.10E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2535s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2053s + [COUNTERS] Fortran MEs ( 1 ) : 0.0482s for 8192 events => throughput is 1.70E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -75,18 +75,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/avalassi/output_gqttq_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./madevent_fortran < /tmp/valassia/input_gqttq_x1_fortran > /tmp/valassia/output_gqttq_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2605 [0.26050333309703716] fbridge_mode=0 + [XSECTION] Cross section = 0.2605 [0.26049452069554319] fbridge_mode=0 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.3225s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2485s - [COUNTERS] Fortran MEs ( 1 ) : 0.0740s for 8192 events => throughput is 1.11E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2506s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2024s + [COUNTERS] Fortran MEs ( 1 ) : 0.0482s for 8192 events => throughput is 1.70E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -100,18 +100,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! 
Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x10_fortran > /tmp/avalassi/output_gqttq_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./madevent_fortran < /tmp/valassia/input_gqttq_x10_fortran > /tmp/valassia/output_gqttq_x10_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.218 [0.21801182648615872] fbridge_mode=0 + [XSECTION] Cross section = 0.218 [0.21801276513379142] fbridge_mode=0 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 2.4071s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5926s - [COUNTERS] Fortran MEs ( 1 ) : 0.8145s for 90112 events => throughput is 1.11E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.6972s + [COUNTERS] Fortran Overhead ( 0 ) : 1.1710s + [COUNTERS] Fortran MEs ( 1 ) : 0.5262s for 90112 events => throughput is 1.71E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -125,22 +125,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x1_cudacpp > /tmp/valassia/output_gqttq_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2605 [0.26050314903825744] fbridge_mode=1 + [XSECTION] Cross section = 0.2605 [0.26049433846970949] fbridge_mode=1 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.4015s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3277s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0738s for 8192 events => throughput is 1.11E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3163s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2596s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0567s for 8192 events => throughput is 1.45E+05 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.26050333309703716) and cpp (0.26050314903825744) differ by less than 4E-4 (7.065505747139156e-07) +OK! xsec from fortran (0.26049452069554319) and cpp (0.26049433846970949) differ by less than 4E-4 (6.995380679164498e-07) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -158,36 +158,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x10_cudacpp > /tmp/valassia/output_gqttq_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.218 [0.21801181770186087] fbridge_mode=1 + [XSECTION] Cross section = 0.218 [0.21801276051306751] fbridge_mode=1 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 2.5139s - [COUNTERS] Fortran Overhead ( 0 ) : 1.7000s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.8139s for 90112 events => throughput is 1.11E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.8565s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2329s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.6236s for 90112 events => throughput is 1.44E+05 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.21801182648615872) and cpp (0.21801181770186087) differ by less than 4E-4 (4.0292758352045155e-08) +OK! xsec from fortran (0.21801276513379142) and cpp (0.21801276051306751) differ by less than 4E-4 (2.119474018513756e-08) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.131056e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.470346e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.131141e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.471180e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -201,22 +201,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x1_cudacpp > /tmp/valassia/output_gqttq_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2605 [0.26050310835231938] fbridge_mode=1 + [XSECTION] Cross section = 0.2605 [0.26049432213942514] fbridge_mode=1 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.3069s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2800s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0269s for 8192 events => throughput is 3.04E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2433s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2233s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0201s for 8192 events => throughput is 4.08E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.26050333309703716) and cpp (0.26050310835231938) differ by less than 4E-4 (8.627325996934943e-07) +OK! xsec from fortran (0.26049452069554319) and cpp (0.26049432213942514) differ by less than 4E-4 (7.622276181340482e-07) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -234,36 +234,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x10_cudacpp > /tmp/valassia/output_gqttq_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.218 [0.21801177817838580] fbridge_mode=1 + [XSECTION] Cross section = 0.218 [0.21801274044068764] fbridge_mode=1 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 1.9246s - [COUNTERS] Fortran Overhead ( 0 ) : 1.6468s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2778s for 90112 events => throughput is 3.24E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.4175s + [COUNTERS] Fortran Overhead ( 0 ) : 1.1970s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2206s for 90112 events => throughput is 4.09E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.21801182648615872) and cpp (0.21801177817838580) differ by less than 4E-4 (2.2158326773435988e-07) +OK! xsec from fortran (0.21801276513379142) and cpp (0.21801274044068764) differ by less than 4E-4 (1.1326448601245431e-07) *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.299610e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.277540e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.290596e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.287798e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -277,22 +277,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x1_cudacpp > /tmp/valassia/output_gqttq_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2605 [0.26050310803492405] fbridge_mode=1 + [XSECTION] Cross section = 0.2605 [0.26049432091919483] fbridge_mode=1 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.2803s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2672s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0131s for 8192 events => throughput is 6.25E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2232s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2135s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0096s for 8192 events => throughput is 8.52E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.26050333309703716) and cpp (0.26050310803492405) differ by less than 4E-4 (8.639509921914978e-07) +OK! xsec from fortran (0.26049452069554319) and cpp (0.26049432091919483) differ by less than 4E-4 (7.66911902094769e-07) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -310,188 +310,40 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x10_cudacpp > /tmp/valassia/output_gqttq_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.218 [0.21801177493542723] fbridge_mode=1 + [XSECTION] Cross section = 0.218 [0.21801273719964992] fbridge_mode=1 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 1.7784s - [COUNTERS] Fortran Overhead ( 0 ) : 1.6325s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1459s for 90112 events => throughput is 6.18E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.2900s + [COUNTERS] Fortran Overhead ( 0 ) : 1.1843s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1057s for 90112 events => throughput is 8.53E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.21801182648615872) and cpp (0.21801177493542723) differ by less than 4E-4 (2.364584175129636e-07) +OK! xsec from fortran (0.21801276513379142) and cpp (0.21801273719964992) differ by less than 4E-4 (1.2813076100126608e-07) *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.309270e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.719343e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.331612e+05 ) sec^-1 - -*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/32 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2605 [0.26050310803492405] fbridge_mode=1 - [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.2783s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2663s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0120s for 8192 events => throughput is 6.84E+05 events/s - -*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (0.26050333309703716) and cpp (0.26050310803492405) differ by less than 4E-4 (8.639509921914978e-07) - -*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical - -*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -81920 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/32 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.218 [0.21801177493542723] fbridge_mode=1 - [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 1.7659s - [COUNTERS] Fortran Overhead ( 0 ) : 1.6337s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1322s for 90112 events => throughput is 6.82E+05 events/s - -*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.796795e+05 ) sec^-1 -OK! xsec from fortran (0.21801182648615872) and cpp (0.21801177493542723) differ by less than 4E-4 (2.364584175129636e-07) +*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) *** -*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical - -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.987405e+05 ) sec^-1 - -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.079276e+05 ) sec^-1 - -*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/32 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2605 [0.26050317064561834] fbridge_mode=1 - [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.2869s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2697s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0173s for 8192 events => throughput is 4.74E+05 events/s - -*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (0.26050333309703716) and cpp (0.26050317064561834) differ by less than 4E-4 (6.236059127973093e-07) - -*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical - -*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -81920 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/32 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.218 [0.21801182143140752] fbridge_mode=1 - [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 1.8266s - [COUNTERS] Fortran Overhead ( 0 ) : 1.6389s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1877s for 90112 events => throughput is 4.80E+05 events/s - -*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (0.21801182648615872) and cpp (0.21801182143140752) differ by less than 4E-4 (2.3185674269399215e-08) - -*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical - -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.948471e+05 ) sec^-1 - -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.943070e+05 ) sec^-1 +*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) *** *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -505,98 +357,15 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/32 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2605 [0.26050319131407651] fbridge_mode=1 - [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.6865s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6860s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0005s for 8192 events => throughput is 1.55E+07 events/s - -*** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (0.26050333309703716) and cpp (0.26050319131407651) differ by less than 4E-4 (5.442654378295941e-07) - -*** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical - -*** (3) EXECUTE MADEVENT_CUDA x10 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -81920 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! 
Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/32 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.218 [0.21801186038252196] fbridge_mode=1 - [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 2.0580s - [COUNTERS] Fortran Overhead ( 0 ) : 2.0515s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0065s for 90112 events => throughput is 1.38E+07 events/s - -*** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (0.21801182648615872) and cpp (0.21801186038252196) differ by less than 4E-4 (1.5547946996541384e-07) - -*** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical - -*** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.646730e+07 ) sec^-1 - -*** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.486298e+07 ) sec^-1 - -*** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.789487e+07 ) sec^-1 - -*** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.699254e+08 ) sec^-1 - -*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.776807e+07 ) sec^-1 - -*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.780761e+08 ) sec^-1 - -*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.361160e+07 ) sec^-1 +Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/valassia/input_gqttq_x1_cudacpp > /tmp/valassia/output_gqttq_x1_cudacpp' +ERROR! ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/valassia/input_gqttq_x1_cudacpp > /tmp/valassia/output_gqttq_x1_cudacpp' failed + PDF set = nn23lo1 + alpha_s(Mz)= 0.1300 running at 2 loops. + alpha_s(Mz)= 0.1300 running at 2 loops. 
+ Renormalization scale set on event-by-event basis + Factorization scale set on event-by-event basis -*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.995376e+07 ) sec^-1 -TEST COMPLETED + getting user params +Enter number of events and max and min iterations: + Number of events and iterations 8192 1 1 diff --git a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt index 44df8a9e3d..ccfe354c14 100644 --- a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu +Working directory (build): /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu CUDACPP_BUILDDIR='.' - make USEBUILDDIR=1 AVX=none make USEBUILDDIR=1 AVX=sse4 +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make USEBUILDDIR=1 AVX=avx2 + make USEBUILDDIR=1 AVX=512y +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' -CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' OMP_NUM_THREADS= -DATE: 2024-01-30_06:27:27 +DATE: 2024-01-31_15:27:12 -On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: -Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu +On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +Working directory (run): /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -50,18 +50,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/avalassi/output_gqttq_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./madevent_fortran < /tmp/valassia/input_gqttq_x1_fortran > /tmp/valassia/output_gqttq_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2605 [0.26050333309703716] fbridge_mode=0 + [XSECTION] Cross section = 0.2605 [0.26049452069554319] fbridge_mode=0 [UNWEIGHT] Wrote 78 events (found 561 events) - [COUNTERS] PROGRAM TOTAL : 0.3260s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2519s - [COUNTERS] Fortran MEs ( 1 ) : 0.0740s for 8192 events => throughput is 1.11E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2533s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2051s + [COUNTERS] Fortran MEs ( 1 ) : 0.0482s for 8192 events => throughput is 1.70E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -75,18 +75,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/avalassi/output_gqttq_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./madevent_fortran < /tmp/valassia/input_gqttq_x1_fortran > /tmp/valassia/output_gqttq_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2605 [0.26050333309703716] fbridge_mode=0 + [XSECTION] Cross section = 0.2605 [0.26049452069554319] fbridge_mode=0 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.3236s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2490s - [COUNTERS] Fortran MEs ( 1 ) : 0.0746s for 8192 events => throughput is 1.10E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2507s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2026s + [COUNTERS] Fortran MEs ( 1 ) : 0.0482s for 8192 events => throughput is 1.70E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -100,18 +100,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x10_fortran > /tmp/avalassi/output_gqttq_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./madevent_fortran < /tmp/valassia/input_gqttq_x10_fortran > /tmp/valassia/output_gqttq_x10_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.218 [0.21801182648615872] fbridge_mode=0 + [XSECTION] Cross section = 0.218 [0.21801276513379142] fbridge_mode=0 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 2.4069s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5940s - [COUNTERS] Fortran MEs ( 1 ) : 0.8130s for 90112 events => throughput is 1.11E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.6990s + [COUNTERS] Fortran Overhead ( 0 ) : 1.1727s + [COUNTERS] Fortran MEs ( 1 ) : 0.5263s for 90112 events => throughput is 1.71E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -125,22 +125,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x1_cudacpp > /tmp/valassia/output_gqttq_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2605 [0.26050333282657206] fbridge_mode=1 + [XSECTION] Cross section = 0.2605 [0.26049452042320692] fbridge_mode=1 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.4196s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3367s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0829s for 8192 events => throughput is 9.88E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.3441s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2747s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0695s for 8192 events => throughput is 1.18E+05 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.26050333309703716) and cpp (0.26050333282657206) differ by less than 2E-4 (1.0382404935782574e-09) +OK! xsec from fortran (0.26049452069554319) and cpp (0.26049452042320692) differ by less than 2E-4 (1.0454587195951603e-09) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -158,36 +158,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x10_cudacpp > /tmp/valassia/output_gqttq_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.218 [0.21801182636608801] fbridge_mode=1 + [XSECTION] Cross section = 0.218 [0.21801276501483957] fbridge_mode=1 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 2.6225s - [COUNTERS] Fortran Overhead ( 0 ) : 1.7110s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.9116s for 90112 events => throughput is 9.89E+04 events/s + [COUNTERS] PROGRAM TOTAL : 2.0473s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2845s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.7628s for 90112 events => throughput is 1.18E+05 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.21801182648615872) and cpp (0.21801182636608801) differ by less than 2E-4 (5.507531097848073e-10) +OK! xsec from fortran (0.21801276513379142) and cpp (0.21801276501483957) differ by less than 2E-4 (5.456187723851258e-10) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.000214e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.196605e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.992747e+04 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.195912e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -201,22 +201,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x1_cudacpp > /tmp/valassia/output_gqttq_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2605 [0.26050333282657212] fbridge_mode=1 + [XSECTION] Cross section = 0.2605 [0.26049452042320692] fbridge_mode=1 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.3393s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2968s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0425s for 8192 events => throughput is 1.93E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2698s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2370s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0328s for 8192 events => throughput is 2.50E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.26050333309703716) and cpp (0.26050333282657212) differ by less than 2E-4 (1.0382402715336525e-09) +OK! xsec from fortran (0.26049452069554319) and cpp (0.26049452042320692) differ by less than 2E-4 (1.0454587195951603e-09) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -234,36 +234,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x10_cudacpp > /tmp/valassia/output_gqttq_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.218 [0.21801182636608804] fbridge_mode=1 + [XSECTION] Cross section = 0.218 [0.21801276501483957] fbridge_mode=1 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 2.1463s - [COUNTERS] Fortran Overhead ( 0 ) : 1.6727s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.4735s for 90112 events => throughput is 1.90E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.5686s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2079s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3607s for 90112 events => throughput is 2.50E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.21801182648615872) and cpp (0.21801182636608804) differ by less than 2E-4 (5.507529987625048e-10) +OK! xsec from fortran (0.21801276513379142) and cpp (0.21801276501483957) differ by less than 2E-4 (5.456187723851258e-10) *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.936317e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.500614e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.928004e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.499482e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -277,22 +277,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x1_cudacpp > /tmp/valassia/output_gqttq_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2605 [0.26050333291481387] fbridge_mode=1 + [XSECTION] Cross section = 0.2605 [0.26049452049989580] fbridge_mode=1 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.3040s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2793s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0246s for 8192 events => throughput is 3.33E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2381s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2209s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0172s for 8192 events => throughput is 4.77E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.26050333309703716) and cpp (0.26050333291481387) differ by less than 2E-4 (6.99504676404672e-10) +OK! xsec from fortran (0.26049452069554319) and cpp (0.26049452049989580) differ by less than 2E-4 (7.510614352668199e-10) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -310,188 +310,40 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 +Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x10_cudacpp > /tmp/valassia/output_gqttq_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/128 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.218 [0.21801182638680733] fbridge_mode=1 + [XSECTION] Cross section = 0.218 [0.21801276503688793] fbridge_mode=1 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 1.9238s - [COUNTERS] Fortran Overhead ( 0 ) : 1.6513s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2724s for 90112 events => throughput is 3.31E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.3765s + [COUNTERS] Fortran Overhead ( 0 ) : 1.1872s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1893s for 90112 events => throughput is 4.76E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.21801182648615872) and cpp (0.21801182638680733) differ by less than 2E-4 (4.557155763862397e-10) +OK! xsec from fortran (0.21801276513379142) and cpp (0.21801276503688793) differ by less than 2E-4 (4.4448533742524887e-10) *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.358469e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.826787e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.409112e+05 ) sec^-1 - -*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/32 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2605 [0.26050333291481387] fbridge_mode=1 - [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.2986s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2774s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0212s for 8192 events => throughput is 3.86E+05 events/s - -*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (0.26050333309703716) and cpp (0.26050333291481387) differ by less than 2E-4 (6.99504676404672e-10) - -*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.857511e+05 ) sec^-1 -*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -81920 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
--------------------- -Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/32 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.218 [0.21801182638680733] fbridge_mode=1 - [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 1.8897s - [COUNTERS] Fortran Overhead ( 0 ) : 1.6536s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2361s for 90112 events => throughput is 3.82E+05 events/s - -*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (0.21801182648615872) and cpp (0.21801182638680733) differ by less than 2E-4 (4.557155763862397e-10) - -*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical - -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.861136e+05 ) sec^-1 - -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.873456e+05 ) sec^-1 - -*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -8192 1 1 ! Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/32 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2605 [0.26050333291481387] fbridge_mode=1 - [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.3231s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2893s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0338s for 8192 events => throughput is 2.42E+05 events/s +*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) *** -*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (0.26050333309703716) and cpp (0.26050333291481387) differ by less than 2E-4 (6.99504676404672e-10) - -*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical - -*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -81920 1 1 ! Number of events and max and min iterations -0.000001 ! 
Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/32 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.218 [0.21801182638680733] fbridge_mode=1 - [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 2.0365s - [COUNTERS] Fortran Overhead ( 0 ) : 1.6643s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3722s for 90112 events => throughput is 2.42E+05 events/s - -*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (0.21801182648615872) and cpp (0.21801182638680733) differ by less than 2E-4 (4.557155763862397e-10) - -*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical - -*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.448669e+05 ) sec^-1 - -*** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.422091e+05 ) sec^-1 +*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) *** *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -505,98 +357,15 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/32 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2605 [0.26050333301029699] fbridge_mode=1 - [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.6889s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6882s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.21E+07 events/s - -*** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (0.26050333309703716) and cpp (0.26050333301029699) differ by less than 2E-4 (3.329714282074292e-10) - -*** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical - -*** (3) EXECUTE MADEVENT_CUDA x10 (create events.lhe) *** --------------------- -CUDACPP_RUNTIME_FBRIDGEMODE = (not set) -CUDACPP_RUNTIME_VECSIZEUSED = 8192 --------------------- -81920 1 1 ! 
Number of events and max and min iterations -0.000001 ! Accuracy (ignored because max iterations = min iterations) -0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) -1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) -0 ! Helicity Sum/event 0=exact -1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) --------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/4 - [NGOODHEL] ngoodhel/ncomb = 16/32 - [XSECTION] VECSIZE_USED = 8192 - [XSECTION] MultiChannel = TRUE - [XSECTION] Configuration = 1 - [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.218 [0.21801182637219937] fbridge_mode=1 - [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 2.0663s - [COUNTERS] Fortran Overhead ( 0 ) : 2.0581s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0082s for 90112 events => throughput is 1.10E+07 events/s - -*** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** - -OK! xsec from fortran (0.21801182648615872) and cpp (0.21801182637219937) differ by less than 2E-4 (5.227208665914418e-10) - -*** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** - -OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical - -*** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.534715e+07 ) sec^-1 - -*** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.123919e+07 ) sec^-1 - -*** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.382422e+07 ) sec^-1 - -*** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.503129e+07 ) sec^-1 - -*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.385930e+07 ) sec^-1 - -*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.826918e+07 ) sec^-1 - -*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.379565e+07 ) sec^-1 +Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/valassia/input_gqttq_x1_cudacpp > /tmp/valassia/output_gqttq_x1_cudacpp' +ERROR! 
' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/valassia/input_gqttq_x1_cudacpp > /tmp/valassia/output_gqttq_x1_cudacpp' failed + PDF set = nn23lo1 + alpha_s(Mz)= 0.1300 running at 2 loops. + alpha_s(Mz)= 0.1300 running at 2 loops. + Renormalization scale set on event-by-event basis + Factorization scale set on event-by-event basis -*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.789199e+07 ) sec^-1 -TEST COMPLETED + getting user params +Enter number of events and max and min iterations: + Number of events and iterations 8192 1 1 From 60299b79cdd8ac7bd6213be1d4331db807519fa9 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Wed, 31 Jan 2024 18:34:24 +0200 Subject: [PATCH 96/96] [jt774] ** COMPLETE JT774 ** go back to tput and tmad logs from itscrd90 before merging to master --- .../log_eemumu_mad_d_inl0_hrd0.txt | 422 ++++++++++------ .../log_eemumu_mad_f_inl0_hrd0.txt | 424 +++++++++++------ .../log_eemumu_mad_m_inl0_hrd0.txt | 424 +++++++++++------ .../log_ggtt_mad_d_inl0_hrd0.txt | 422 ++++++++++------ .../log_ggtt_mad_f_inl0_hrd0.txt | 422 ++++++++++------ .../log_ggtt_mad_m_inl0_hrd0.txt | 418 ++++++++++------ .../log_ggttg_mad_d_inl0_hrd0.txt | 422 ++++++++++------ .../log_ggttg_mad_f_inl0_hrd0.txt | 422 ++++++++++------ .../log_ggttg_mad_m_inl0_hrd0.txt | 420 ++++++++++------ .../log_ggttgg_mad_d_inl0_hrd0.txt | 422 ++++++++++------ .../log_ggttgg_mad_f_inl0_hrd0.txt | 420 ++++++++++------ .../log_ggttgg_mad_m_inl0_hrd0.txt | 422 ++++++++++------ .../log_ggttggg_mad_d_inl0_hrd0.txt | 424 +++++++++++------ .../log_ggttggg_mad_f_inl0_hrd0.txt | 424 +++++++++++------ .../log_ggttggg_mad_m_inl0_hrd0.txt | 420 ++++++++++------ .../log_gqttq_mad_d_inl0_hrd0.txt | 449 +++++++++++++----- .../log_gqttq_mad_f_inl0_hrd0.txt | 449 +++++++++++++----- .../log_gqttq_mad_m_inl0_hrd0.txt | 443 ++++++++++++----- .../log_eemumu_mad_d_inl0_hrd0.txt | 227 +++++---- .../log_eemumu_mad_d_inl0_hrd0_bridge.txt | 234 +++++---- .../log_eemumu_mad_d_inl0_hrd0_common.txt | 213 +++++---- .../log_eemumu_mad_d_inl0_hrd0_curhst.txt | 210 +++++--- .../log_eemumu_mad_d_inl0_hrd0_rmbhst.txt | 229 +++++---- .../log_eemumu_mad_d_inl0_hrd1.txt | 227 +++++---- .../log_eemumu_mad_d_inl1_hrd0.txt | 225 +++++---- .../log_eemumu_mad_d_inl1_hrd1.txt | 225 +++++---- .../log_eemumu_mad_f_inl0_hrd0.txt | 239 ++++++---- .../log_eemumu_mad_f_inl0_hrd0_bridge.txt | 246 ++++++---- .../log_eemumu_mad_f_inl0_hrd0_common.txt | 227 +++++---- .../log_eemumu_mad_f_inl0_hrd0_curhst.txt | 222 ++++++--- .../log_eemumu_mad_f_inl0_hrd0_rmbhst.txt | 241 ++++++---- .../log_eemumu_mad_f_inl0_hrd1.txt | 239 ++++++---- .../log_eemumu_mad_f_inl1_hrd0.txt | 237 +++++---- .../log_eemumu_mad_f_inl1_hrd1.txt | 237 +++++---- .../log_eemumu_mad_m_inl0_hrd0.txt | 227 +++++---- .../log_eemumu_mad_m_inl0_hrd1.txt | 227 +++++---- .../log_ggtt_mad_d_inl0_hrd0.txt | 227 +++++---- .../log_ggtt_mad_d_inl0_hrd0_bridge.txt | 234 +++++---- .../log_ggtt_mad_d_inl0_hrd0_common.txt | 213 +++++---- .../log_ggtt_mad_d_inl0_hrd0_curhst.txt | 210 +++++--- .../log_ggtt_mad_d_inl0_hrd0_rmbhst.txt | 229 +++++---- .../log_ggtt_mad_d_inl0_hrd1.txt | 227 +++++---- .../log_ggtt_mad_d_inl1_hrd0.txt | 225 +++++---- .../log_ggtt_mad_d_inl1_hrd1.txt | 225 +++++---- .../log_ggtt_mad_f_inl0_hrd0.txt | 245 ++++++---- .../log_ggtt_mad_f_inl0_hrd0_bridge.txt | 252 ++++++---- 
.../log_ggtt_mad_f_inl0_hrd0_common.txt | 239 ++++++---- .../log_ggtt_mad_f_inl0_hrd0_curhst.txt | 228 ++++++--- .../log_ggtt_mad_f_inl0_hrd0_rmbhst.txt | 247 ++++++---- .../log_ggtt_mad_f_inl0_hrd1.txt | 245 ++++++---- .../log_ggtt_mad_f_inl1_hrd0.txt | 239 ++++++---- .../log_ggtt_mad_f_inl1_hrd1.txt | 239 ++++++---- .../log_ggtt_mad_m_inl0_hrd0.txt | 225 +++++---- .../log_ggtt_mad_m_inl0_hrd1.txt | 225 +++++---- .../log_ggttg_mad_d_inl0_hrd0.txt | 250 ++++++---- .../log_ggttg_mad_d_inl0_hrd0_bridge.txt | 258 ++++++---- .../log_ggttg_mad_d_inl0_hrd1.txt | 250 ++++++---- .../log_ggttg_mad_f_inl0_hrd0.txt | 264 +++++----- .../log_ggttg_mad_f_inl0_hrd0_bridge.txt | 272 ++++++----- .../log_ggttg_mad_f_inl0_hrd1.txt | 264 +++++----- .../log_ggttg_mad_m_inl0_hrd0.txt | 250 ++++++---- .../log_ggttg_mad_m_inl0_hrd1.txt | 250 ++++++---- .../log_ggttgg_mad_d_inl0_hrd0.txt | 250 ++++++---- .../log_ggttgg_mad_d_inl0_hrd0_bridge.txt | 258 ++++++---- .../log_ggttgg_mad_d_inl0_hrd0_common.txt | 234 +++++---- .../log_ggttgg_mad_d_inl0_hrd0_curhst.txt | 228 ++++++--- .../log_ggttgg_mad_d_inl0_hrd0_rmbhst.txt | 253 ++++++---- .../log_ggttgg_mad_d_inl0_hrd1.txt | 250 ++++++---- .../log_ggttgg_mad_d_inl1_hrd0.txt | 252 ++++++---- .../log_ggttgg_mad_d_inl1_hrd1.txt | 252 ++++++---- .../log_ggttgg_mad_f_inl0_hrd0.txt | 266 ++++++----- .../log_ggttgg_mad_f_inl0_hrd0_bridge.txt | 274 ++++++----- .../log_ggttgg_mad_f_inl0_hrd0_common.txt | 258 +++++----- .../log_ggttgg_mad_f_inl0_hrd0_curhst.txt | 244 ++++++---- .../log_ggttgg_mad_f_inl0_hrd0_rmbhst.txt | 269 ++++++----- .../log_ggttgg_mad_f_inl0_hrd1.txt | 266 ++++++----- .../log_ggttgg_mad_f_inl1_hrd0.txt | 270 ++++++----- .../log_ggttgg_mad_f_inl1_hrd1.txt | 270 ++++++----- .../log_ggttgg_mad_m_inl0_hrd0.txt | 246 ++++++---- .../log_ggttgg_mad_m_inl0_hrd1.txt | 246 ++++++---- .../log_ggttggg_mad_d_inl0_hrd0.txt | 250 ++++++---- .../log_ggttggg_mad_d_inl0_hrd0_bridge.txt | 258 ++++++---- .../log_ggttggg_mad_d_inl0_hrd1.txt | 250 ++++++---- .../log_ggttggg_mad_f_inl0_hrd0.txt | 266 ++++++----- .../log_ggttggg_mad_f_inl0_hrd0_bridge.txt | 274 ++++++----- .../log_ggttggg_mad_f_inl0_hrd1.txt | 266 ++++++----- .../log_ggttggg_mad_m_inl0_hrd0.txt | 250 ++++++---- .../log_ggttggg_mad_m_inl0_hrd1.txt | 250 ++++++---- .../log_gqttq_mad_d_inl0_hrd0.txt | 253 +++++++--- .../log_gqttq_mad_d_inl0_hrd0_bridge.txt | 263 +++++++--- .../log_gqttq_mad_d_inl0_hrd1.txt | 253 +++++++--- .../log_gqttq_mad_f_inl0_hrd0.txt | 253 +++++++--- .../log_gqttq_mad_f_inl0_hrd0_bridge.txt | 263 +++++++--- .../log_gqttq_mad_f_inl0_hrd1.txt | 253 +++++++--- .../log_gqttq_mad_m_inl0_hrd0.txt | 253 +++++++--- .../log_gqttq_mad_m_inl0_hrd1.txt | 253 +++++++--- 96 files changed, 16959 insertions(+), 9768 deletions(-) diff --git a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt index f1aae0ab32..459e70d382 100644 --- a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum CUDACPP_BUILDDIR='.' 
make USEBUILDDIR=1 AVX=none make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make USEBUILDDIR=1 AVX=avx2 +make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' OMP_NUM_THREADS= -DATE: 2024-01-31_15:11:39 +DATE: 2024-01-30_06:09:28 -On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: -Working directory (run): /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -50,18 +50,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_eemumu_x1_fortran > /tmp/valassia/output_eemumu_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/avalassi/output_eemumu_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2175 [0.21747169064681787] fbridge_mode=0 + [XSECTION] Cross section = 0.2175 [0.21747169064681776] fbridge_mode=0 [UNWEIGHT] Wrote 3893 events (found 7395 events) - [COUNTERS] PROGRAM TOTAL : 0.5205s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5147s - [COUNTERS] Fortran MEs ( 1 ) : 0.0059s for 8192 events => throughput is 1.40E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.6491s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6403s + [COUNTERS] Fortran MEs ( 1 ) : 0.0088s for 8192 events => throughput is 9.31E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -75,18 +75,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_eemumu_x1_fortran > /tmp/valassia/output_eemumu_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/avalassi/output_eemumu_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2175 [0.21747169064681787] fbridge_mode=0 + [XSECTION] Cross section = 0.2175 [0.21747169064681776] fbridge_mode=0 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1389s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1329s - [COUNTERS] Fortran MEs ( 1 ) : 0.0059s for 8192 events => throughput is 1.38E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1850s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1762s + [COUNTERS] Fortran MEs ( 1 ) : 0.0089s for 8192 events => throughput is 9.22E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -100,18 +100,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! 
Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_eemumu_x10_fortran > /tmp/valassia/output_eemumu_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x10_fortran > /tmp/avalassi/output_eemumu_x10_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.0915 [9.1501919904813683E-002] fbridge_mode=0 + [XSECTION] Cross section = 0.0915 [9.1501919904813669E-002] fbridge_mode=0 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.3234s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2608s - [COUNTERS] Fortran MEs ( 1 ) : 0.0625s for 90112 events => throughput is 1.44E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.4417s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3482s + [COUNTERS] Fortran MEs ( 1 ) : 0.0935s for 90112 events => throughput is 9.64E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -125,22 +125,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x1_cudacpp > /tmp/valassia/output_eemumu_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2175 [0.21747169064681790] fbridge_mode=1 + [XSECTION] Cross section = 0.2175 [0.21747169064681779] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1496s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1436s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0059s for 8192 events => throughput is 1.38E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1896s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1825s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0071s for 8192 events => throughput is 1.15E+06 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.21747169064681787) and cpp (0.21747169064681790) differ by less than 2E-14 (2.220446049250313e-16) +OK! xsec from fortran (0.21747169064681776) and cpp (0.21747169064681779) differ by less than 2E-14 (2.220446049250313e-16) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -158,8 +158,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x10_cudacpp > /tmp/valassia/output_eemumu_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -167,27 +167,27 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501919904813669E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.3369s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2716s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0652s for 90112 events => throughput is 1.38E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.4377s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3566s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0811s for 90112 events => throughput is 1.11E+06 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.1501919904813683E-002) and cpp (9.1501919904813669E-002) differ by less than 2E-14 (1.1102230246251565e-16) +OK! xsec from fortran (9.1501919904813669E-002) and cpp (9.1501919904813669E-002) differ by less than 2E-14 (0.0) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.419345e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.115404e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.429602e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.135181e+06 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -201,22 +201,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x1_cudacpp > /tmp/valassia/output_eemumu_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2175 [0.21747169064681787] fbridge_mode=1 + [XSECTION] Cross section = 0.2175 [0.21747169064681779] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1461s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1426s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0035s for 8192 events => throughput is 2.33E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1854s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1812s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0042s for 8192 events => throughput is 1.96E+06 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.21747169064681787) and cpp (0.21747169064681787) differ by less than 2E-14 (0.0) +OK! xsec from fortran (0.21747169064681776) and cpp (0.21747169064681779) differ by less than 2E-14 (2.220446049250313e-16) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -234,36 +234,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x10_cudacpp > /tmp/valassia/output_eemumu_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.0915 [9.1501919904813683E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.0915 [9.1501919904813656E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.3081s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2697s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0384s for 90112 events => throughput is 2.35E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.4018s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3542s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0476s for 90112 events => throughput is 1.89E+06 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.1501919904813683E-002) and cpp (9.1501919904813683E-002) differ by less than 2E-14 (0.0) +OK! xsec from fortran (9.1501919904813669E-002) and cpp (9.1501919904813656E-002) differ by less than 2E-14 (1.1102230246251565e-16) *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.383624e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.873422e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.448451e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.997339e+06 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -277,22 +277,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x1_cudacpp > /tmp/valassia/output_eemumu_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2175 [0.21747169064681787] fbridge_mode=1 + [XSECTION] Cross section = 0.2175 [0.21747169064681776] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1436s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1411s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0025s for 8192 events => throughput is 3.31E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1841s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1809s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0031s for 8192 events => throughput is 2.62E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.21747169064681787) and cpp (0.21747169064681787) differ by less than 2E-14 (0.0) +OK! xsec from fortran (0.21747169064681776) and cpp (0.21747169064681776) differ by less than 2E-14 (0.0) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -310,40 +310,188 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x10_cudacpp > /tmp/valassia/output_eemumu_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.0915 [9.1501919904813683E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.0915 [9.1501919904813669E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.2943s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2673s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0270s for 90112 events => throughput is 3.34E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3915s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3558s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0357s for 90112 events => throughput is 2.53E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.1501919904813683E-002) and cpp (9.1501919904813683E-002) differ by less than 2E-14 (0.0) +OK! xsec from fortran (9.1501919904813669E-002) and cpp (9.1501919904813669E-002) differ by less than 2E-14 (0.0) *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.415643e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.563581e+06 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.701936e+06 ) sec^-1 + +*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 4/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.2175 [0.21747169064681776] fbridge_mode=1 + [UNWEIGHT] Wrote 1611 events (found 1616 events) + [COUNTERS] PROGRAM TOTAL : 0.1820s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1791s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0030s for 8192 events => throughput is 2.77E+06 events/s + +*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (0.21747169064681776) and cpp (0.21747169064681776) differ by less than 2E-14 (0.0) + +*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 4/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.0915 [9.1501919904813669E-002] fbridge_mode=1 + [UNWEIGHT] Wrote 1803 events (found 1808 events) + [COUNTERS] PROGRAM TOTAL : 0.3886s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3550s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0336s for 90112 events => throughput is 2.68E+06 events/s + +*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (9.1501919904813669E-002) and cpp (9.1501919904813669E-002) differ by less than 2E-14 (0.0) + +*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.733041e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.512459e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.837193e+06 ) sec^-1 + +*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 4/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.2175 [0.21747169064681776] fbridge_mode=1 + [UNWEIGHT] Wrote 1611 events (found 1616 events) + [COUNTERS] PROGRAM TOTAL : 0.1852s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1819s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0033s for 8192 events => throughput is 2.50E+06 events/s + +*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (0.21747169064681776) and cpp (0.21747169064681776) differ by less than 2E-14 (0.0) + +*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 4/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.0915 [9.1501919904813669E-002] fbridge_mode=1 + [UNWEIGHT] Wrote 1803 events (found 1808 events) + [COUNTERS] PROGRAM TOTAL : 0.3986s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3589s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0397s for 90112 events => throughput is 2.27E+06 events/s + +*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (9.1501919904813669E-002) and cpp (9.1501919904813669E-002) differ by less than 2E-14 (0.0) + +*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** -*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) *** +OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.277912e+06 ) sec^-1 -*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) *** +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.388817e+06 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -357,22 +505,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/valassia/input_eemumu_x1_cudacpp > /tmp/valassia/output_eemumu_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 - [NGOODHEL] ngoodhel/ncomb = 16/16 +Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2175 [0.21747169064681787] fbridge_mode=1 + [XSECTION] Cross section = 0.2175 [0.21747169064681776] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.4152s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4147s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0004s for 8192 events => throughput is 1.96E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.6311s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6306s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0005s for 8192 events => throughput is 1.64E+07 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.21747169064681787) and cpp (0.21747169064681787) differ by less than 2E-14 (0.0) +OK! 
xsec from fortran (0.21747169064681776) and cpp (0.21747169064681776) differ by less than 2E-14 (0.0) *** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -390,65 +538,65 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/valassia/input_eemumu_x10_cudacpp > /tmp/valassia/output_eemumu_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 - [NGOODHEL] ngoodhel/ncomb = 16/16 +Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.0915 [9.1501919904813669E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.0915 [9.1501919904813656E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.5465s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5421s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0044s for 90112 events => throughput is 2.03E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.7893s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7841s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0052s for 90112 events => throughput is 1.73E+07 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.1501919904813683E-002) and cpp (9.1501919904813669E-002) differ by less than 2E-14 (1.1102230246251565e-16) +OK! xsec from fortran (9.1501919904813669E-002) and cpp (9.1501919904813656E-002) differ by less than 2E-14 (1.1102230246251565e-16) *** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.179667e+07 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.924722e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.504851e+07 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.934037e+08 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.224816e+07 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.691813e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.876504e+07 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.449601e+08 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.241030e+07 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.683159e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.934736e+07 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.033957e+08 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.196667e+07 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.708340e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK 
-EvtsPerSec[MECalcOnly] (3a) = ( 2.594732e+07 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.130631e+08 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt index 41d31a1a79..161c62cc9b 100644 --- a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum CUDACPP_BUILDDIR='.' + make USEBUILDDIR=1 AVX=none -make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make USEBUILDDIR=1 AVX=sse4 make USEBUILDDIR=1 AVX=avx2 - make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' OMP_NUM_THREADS= -DATE: 2024-01-31_15:11:59 +DATE: 2024-01-30_06:09:46 -On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: -Working directory (run): /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -50,18 +50,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_eemumu_x1_fortran > /tmp/valassia/output_eemumu_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/avalassi/output_eemumu_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2175 [0.21747169064681787] fbridge_mode=0 + [XSECTION] Cross section = 0.2175 [0.21747169064681776] fbridge_mode=0 [UNWEIGHT] Wrote 3893 events (found 7395 events) - [COUNTERS] PROGRAM TOTAL : 0.4682s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4623s - [COUNTERS] Fortran MEs ( 1 ) : 0.0059s for 8192 events => throughput is 1.39E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.6495s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6408s + [COUNTERS] Fortran MEs ( 1 ) : 0.0086s for 8192 events => throughput is 9.47E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -75,18 +75,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_eemumu_x1_fortran > /tmp/valassia/output_eemumu_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/avalassi/output_eemumu_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2175 [0.21747169064681787] fbridge_mode=0 + [XSECTION] Cross section = 0.2175 [0.21747169064681776] fbridge_mode=0 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1412s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1353s - [COUNTERS] Fortran MEs ( 1 ) : 0.0059s for 8192 events => throughput is 1.39E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1848s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1762s + [COUNTERS] Fortran MEs ( 1 ) : 0.0085s for 8192 events => throughput is 9.59E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -100,18 +100,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_eemumu_x10_fortran > /tmp/valassia/output_eemumu_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x10_fortran > /tmp/avalassi/output_eemumu_x10_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.0915 [9.1501919904813683E-002] fbridge_mode=0 + [XSECTION] Cross section = 0.0915 [9.1501919904813669E-002] fbridge_mode=0 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.3253s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2628s - [COUNTERS] Fortran MEs ( 1 ) : 0.0625s for 90112 events => throughput is 1.44E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.4421s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3487s + [COUNTERS] Fortran MEs ( 1 ) : 0.0934s for 90112 events => throughput is 9.65E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -125,22 +125,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x1_cudacpp > /tmp/valassia/output_eemumu_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2175 [0.21747165804194701] fbridge_mode=1 + [XSECTION] Cross section = 0.2175 [0.21747165492032638] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1463s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1411s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0052s for 8192 events => throughput is 1.59E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1909s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1840s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0069s for 8192 events => throughput is 1.19E+06 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.21747169064681787) and cpp (0.21747165804194701) differ by less than 4E-4 (1.4992696639737346e-07) +OK! xsec from fortran (0.21747169064681776) and cpp (0.21747165492032638) differ by less than 4E-4 (1.6428111293542713e-07) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -158,36 +158,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x10_cudacpp > /tmp/valassia/output_eemumu_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.0915 [9.1501906417651019E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.0915 [9.1501905274264717E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.3262s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2697s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0565s for 90112 events => throughput is 1.59E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.4341s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3569s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0773s for 90112 events => throughput is 1.17E+06 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.1501919904813683E-002) and cpp (9.1501906417651019E-002) differ by less than 4E-4 (1.473975921317816e-07) +OK! xsec from fortran (9.1501919904813669E-002) and cpp (9.1501905274264717E-002) differ by less than 4E-4 (1.5989335488963974e-07) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.648536e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.185144e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.666313e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.215006e+06 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -201,22 +201,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x1_cudacpp > /tmp/valassia/output_eemumu_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2175 [0.21747170102104563] fbridge_mode=1 + [XSECTION] Cross section = 0.2175 [0.21747165570339780] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1412s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1390s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0021s for 8192 events => throughput is 3.85E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1811s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1786s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0025s for 8192 events => throughput is 3.30E+06 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.21747169064681787) and cpp (0.21747170102104563) differ by less than 4E-4 (4.770380779284267e-08) +OK! xsec from fortran (0.21747169064681776) and cpp (0.21747165570339780) differ by less than 4E-4 (1.6068031594151932e-07) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -234,36 +234,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x10_cudacpp > /tmp/valassia/output_eemumu_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.0915 [9.1501924220365086E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.0915 [9.1501905322826635E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.2893s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2660s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0233s for 90112 events => throughput is 3.87E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3824s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3531s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0293s for 90112 events => throughput is 3.08E+06 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.1501919904813683E-002) and cpp (9.1501924220365086E-002) differ by less than 4E-4 (4.716350665567859e-08) +OK! xsec from fortran (9.1501919904813669E-002) and cpp (9.1501905322826635E-002) differ by less than 4E-4 (1.5936263464411127e-07) *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.091663e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.133589e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.169619e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.329458e+06 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -277,22 +277,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x1_cudacpp > /tmp/valassia/output_eemumu_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2175 [0.21747170107722058] fbridge_mode=1 + [XSECTION] Cross section = 0.2175 [0.21747165593922979] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1409s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1391s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0018s for 8192 events => throughput is 4.62E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1817s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1793s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0024s for 8192 events => throughput is 3.43E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.21747169064681787) and cpp (0.21747170107722058) differ by less than 4E-4 (4.7962117166733265e-08) +OK! xsec from fortran (0.21747169064681776) and cpp (0.21747165593922979) differ by less than 4E-4 (1.5959588972602745e-07) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -310,40 +310,188 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x10_cudacpp > /tmp/valassia/output_eemumu_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.0915 [9.1501924223714337E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.0915 [9.1501905316084181E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.2842s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2648s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0194s for 90112 events => throughput is 4.64E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3770s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3509s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0262s for 90112 events => throughput is 3.44E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.1501919904813683E-002) and cpp (9.1501924223714337E-002) differ by less than 4E-4 (4.7200109598577455e-08) +OK! xsec from fortran (9.1501919904813669E-002) and cpp (9.1501905316084181E-002) differ by less than 4E-4 (1.5943632114545636e-07) *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.839743e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.621496e+06 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.716307e+06 ) sec^-1 + +*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 4/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.2175 [0.21747165593922979] fbridge_mode=1 + [UNWEIGHT] Wrote 1611 events (found 1616 events) + [COUNTERS] PROGRAM TOTAL : 0.1846s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1824s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0023s for 8192 events => throughput is 3.63E+06 events/s + +*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (0.21747169064681776) and cpp (0.21747165593922979) differ by less than 4E-4 (1.5959588972602745e-07) + +*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 4/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.0915 [9.1501905316084181E-002] fbridge_mode=1 + [UNWEIGHT] Wrote 1803 events (found 1808 events) + [COUNTERS] PROGRAM TOTAL : 0.3795s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3546s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0248s for 90112 events => throughput is 3.63E+06 events/s + +*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (9.1501919904813669E-002) and cpp (9.1501905316084181E-002) differ by less than 4E-4 (1.5943632114545636e-07) + +*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.744053e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.040701e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.971296e+06 ) sec^-1 + +*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 4/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.2175 [0.21747166446533123] fbridge_mode=1 + [UNWEIGHT] Wrote 1611 events (found 1616 events) + [COUNTERS] PROGRAM TOTAL : 0.1822s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1798s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0024s for 8192 events => throughput is 3.36E+06 events/s + +*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (0.21747169064681776) and cpp (0.21747166446533123) differ by less than 4E-4 (1.2039032049049325e-07) + +*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical -*** (2-512y) WARNING! 
SKIP MADEVENT_CPP (512y is not supported on this node) *** +*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 4/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.0915 [9.1501908990866423E-002] fbridge_mode=1 + [UNWEIGHT] Wrote 1803 events (found 1808 events) + [COUNTERS] PROGRAM TOTAL : 0.3805s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3541s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0263s for 90112 events => throughput is 3.42E+06 events/s + +*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) *** +OK! xsec from fortran (9.1501919904813669E-002) and cpp (9.1501908990866423E-002) differ by less than 4E-4 (1.1927560927826875e-07) + +*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.615840e+06 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.903927e+06 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -357,22 +505,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/valassia/input_eemumu_x1_cudacpp > /tmp/valassia/output_eemumu_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 - [NGOODHEL] ngoodhel/ncomb = 16/16 +Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2175 [0.21747166473699148] fbridge_mode=1 + [XSECTION] Cross section = 0.2175 [0.21747166823487174] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.4196s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4193s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0003s for 8192 events => throughput is 2.83E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.6073s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6068s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0005s for 8192 events => throughput is 1.61E+07 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.21747169064681787) and cpp (0.21747166473699148) differ by less than 4E-4 (1.191411457268643e-07) +OK! xsec from fortran (0.21747169064681776) and cpp (0.21747166823487174) differ by less than 4E-4 (1.0305684361444634e-07) *** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -390,65 +538,65 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/valassia/input_eemumu_x10_cudacpp > /tmp/valassia/output_eemumu_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 - [NGOODHEL] ngoodhel/ncomb = 16/16 +Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.0915 [9.1501909133729534E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.0915 [9.1501910542849674E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.5481s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5450s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0030s for 90112 events => throughput is 2.97E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.7841s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7794s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0048s for 90112 events => throughput is 1.89E+07 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.1501919904813683E-002) and cpp (9.1501909133729534E-002) differ by less than 4E-4 (1.1771429675455636e-07) +OK! xsec from fortran (9.1501919904813669E-002) and cpp (9.1501910542849674E-002) differ by less than 4E-4 (1.0231439961927435e-07) *** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.685608e+07 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.032746e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.836295e+07 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.810870e+08 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.323104e+08 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.874936e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.462917e+08 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.028452e+09 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.314000e+08 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.891915e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.463292e+08 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.234607e+09 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.079480e+08 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.256002e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK 
-EvtsPerSec[MECalcOnly] (3a) = ( 9.317614e+07 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.441320e+08 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt index b6803d0924..f51b70af46 100644 --- a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum CUDACPP_BUILDDIR='.' + + make USEBUILDDIR=1 AVX=none make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' - make USEBUILDDIR=1 AVX=avx2 - make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' -CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' -CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' OMP_NUM_THREADS= -DATE: 2024-01-31_15:12:19 +DATE: 2024-01-30_06:10:03 -On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: -Working directory (run): /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -50,18 +50,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_eemumu_x1_fortran > /tmp/valassia/output_eemumu_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/avalassi/output_eemumu_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2175 [0.21747169064681787] fbridge_mode=0 + [XSECTION] Cross section = 0.2175 [0.21747169064681776] fbridge_mode=0 [UNWEIGHT] Wrote 3893 events (found 7395 events) - [COUNTERS] PROGRAM TOTAL : 0.4683s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4624s - [COUNTERS] Fortran MEs ( 1 ) : 0.0059s for 8192 events => throughput is 1.39E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.6504s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6416s + [COUNTERS] Fortran MEs ( 1 ) : 0.0088s for 8192 events => throughput is 9.35E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -75,18 +75,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_eemumu_x1_fortran > /tmp/valassia/output_eemumu_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/avalassi/output_eemumu_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2175 [0.21747169064681787] fbridge_mode=0 + [XSECTION] Cross section = 0.2175 [0.21747169064681776] fbridge_mode=0 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1407s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1349s - [COUNTERS] Fortran MEs ( 1 ) : 0.0059s for 8192 events => throughput is 1.39E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1846s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1761s + [COUNTERS] Fortran MEs ( 1 ) : 0.0086s for 8192 events => throughput is 9.55E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -100,18 +100,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_eemumu_x10_fortran > /tmp/valassia/output_eemumu_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x10_fortran > /tmp/avalassi/output_eemumu_x10_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.0915 [9.1501919904813683E-002] fbridge_mode=0 + [XSECTION] Cross section = 0.0915 [9.1501919904813669E-002] fbridge_mode=0 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.3253s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2628s - [COUNTERS] Fortran MEs ( 1 ) : 0.0625s for 90112 events => throughput is 1.44E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.4424s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3488s + [COUNTERS] Fortran MEs ( 1 ) : 0.0936s for 90112 events => throughput is 9.63E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -125,22 +125,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x1_cudacpp > /tmp/valassia/output_eemumu_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2175 [0.21747169074211725] fbridge_mode=1 + [XSECTION] Cross section = 0.2175 [0.21747169074211736] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1503s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1443s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0060s for 8192 events => throughput is 1.37E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1932s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1857s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0075s for 8192 events => throughput is 1.09E+06 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.21747169064681787) and cpp (0.21747169074211725) differ by less than 2E-4 (4.382150198267709e-10) +OK! xsec from fortran (0.21747169064681776) and cpp (0.21747169074211736) differ by less than 2E-4 (4.3821613004979554e-10) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -158,36 +158,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x10_cudacpp > /tmp/valassia/output_eemumu_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.0915 [9.1501919915927141E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.0915 [9.1501919915927155E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.3366s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2711s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0654s for 90112 events => throughput is 1.38E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.4421s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3600s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0820s for 90112 events => throughput is 1.10E+06 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.1501919904813683E-002) and cpp (9.1501919915927141E-002) differ by less than 2E-4 (1.2145595640333795e-10) +OK! xsec from fortran (9.1501919904813669E-002) and cpp (9.1501919915927155E-002) differ by less than 2E-4 (1.214564004925478e-10) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.409008e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.101615e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.431496e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.125271e+06 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -201,22 +201,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x1_cudacpp > /tmp/valassia/output_eemumu_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2175 [0.21747169074211722] fbridge_mode=1 + [XSECTION] Cross section = 0.2175 [0.21747169074211734] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1440s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1406s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0035s for 8192 events => throughput is 2.37E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1841s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1799s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0042s for 8192 events => throughput is 1.94E+06 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.21747169064681787) and cpp (0.21747169074211722) differ by less than 2E-4 (4.382150198267709e-10) +OK! xsec from fortran (0.21747169064681776) and cpp (0.21747169074211734) differ by less than 2E-4 (4.382159080051906e-10) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -234,36 +234,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x10_cudacpp > /tmp/valassia/output_eemumu_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.0915 [9.1501919915927141E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.0915 [9.1501919915927155E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.3065s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2686s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0379s for 90112 events => throughput is 2.38E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.4024s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3561s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0464s for 90112 events => throughput is 1.94E+06 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.1501919904813683E-002) and cpp (9.1501919915927141E-002) differ by less than 2E-4 (1.2145595640333795e-10) +OK! xsec from fortran (9.1501919904813669E-002) and cpp (9.1501919915927155E-002) differ by less than 2E-4 (1.214564004925478e-10) *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.495068e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.982524e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.513199e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.072335e+06 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -277,22 +277,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x1_cudacpp > /tmp/valassia/output_eemumu_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2175 [0.21747169063975919] fbridge_mode=1 + [XSECTION] Cross section = 0.2175 [0.21747169063975949] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1429s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1404s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0025s for 8192 events => throughput is 3.21E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1864s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1831s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0033s for 8192 events => throughput is 2.46E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.21747169064681787) and cpp (0.21747169063975919) differ by less than 2E-4 (3.2457925236428764e-11) +OK! xsec from fortran (0.21747169064681776) and cpp (0.21747169063975949) differ by less than 2E-4 (3.24560378572869e-11) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -310,40 +310,188 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_eemumu_x10_cudacpp > /tmp/valassia/output_eemumu_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.0915 [9.1501919908700713E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.0915 [9.1501919908700741E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.3018s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2736s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0282s for 90112 events => throughput is 3.20E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3912s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3545s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0368s for 90112 events => throughput is 2.45E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.1501919904813683E-002) and cpp (9.1501919908700713E-002) differ by less than 2E-4 (4.248024154662744e-11) +OK! xsec from fortran (9.1501919904813669E-002) and cpp (9.1501919908700741E-002) differ by less than 2E-4 (4.248068563583729e-11) *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.286014e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.431608e+06 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.633191e+06 ) sec^-1 + +*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 4/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.2175 [0.21747169063975949] fbridge_mode=1 + [UNWEIGHT] Wrote 1611 events (found 1616 events) + [COUNTERS] PROGRAM TOTAL : 0.1842s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1811s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0031s for 8192 events => throughput is 2.65E+06 events/s + +*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (0.21747169064681776) and cpp (0.21747169063975949) differ by less than 2E-4 (3.24560378572869e-11) + +*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 4/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.0915 [9.1501919908700741E-002] fbridge_mode=1 + [UNWEIGHT] Wrote 1803 events (found 1808 events) + [COUNTERS] PROGRAM TOTAL : 0.3888s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3546s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0343s for 90112 events => throughput is 2.63E+06 events/s + +*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (9.1501919904813669E-002) and cpp (9.1501919908700741E-002) differ by less than 2E-4 (4.248068563583729e-11) + +*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.754832e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.421261e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.841477e+06 ) sec^-1 + +*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 4/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.2175 [0.21747169063975949] fbridge_mode=1 + [UNWEIGHT] Wrote 1611 events (found 1616 events) + [COUNTERS] PROGRAM TOTAL : 0.1840s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1805s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0035s for 8192 events => throughput is 2.33E+06 events/s + +*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (0.21747169064681776) and cpp (0.21747169063975949) differ by less than 2E-4 (3.24560378572869e-11) + +*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical -*** (2-512y) WARNING! 
SKIP MADEVENT_CPP (512y is not supported on this node) *** +*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 4/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.0915 [9.1501919908700741E-002] fbridge_mode=1 + [UNWEIGHT] Wrote 1803 events (found 1808 events) + [COUNTERS] PROGRAM TOTAL : 0.3972s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3574s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0398s for 90112 events => throughput is 2.26E+06 events/s + +*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) *** +OK! xsec from fortran (9.1501919904813669E-002) and cpp (9.1501919908700741E-002) differ by less than 2E-4 (4.248068563583729e-11) + +*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.333320e+06 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.456277e+06 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -357,22 +505,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/valassia/input_eemumu_x1_cudacpp > /tmp/valassia/output_eemumu_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 - [NGOODHEL] ngoodhel/ncomb = 16/16 +Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2175 [0.21747169066587291] fbridge_mode=1 + [XSECTION] Cross section = 0.2175 [0.21747169066587257] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.4169s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4165s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0004s for 8192 events => throughput is 2.01E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.6100s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6095s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0005s for 8192 events => throughput is 1.62E+07 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.21747169064681787) and cpp (0.21747169066587291) differ by less than 2E-4 (8.762079950486168e-11) +OK! xsec from fortran (0.21747169064681776) and cpp (0.21747169066587257) differ by less than 2E-4 (8.761968928183705e-11) *** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -390,65 +538,65 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/valassia/input_eemumu_x10_cudacpp > /tmp/valassia/output_eemumu_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 - [NGOODHEL] ngoodhel/ncomb = 16/16 +Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.0915 [9.1501919911173651E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.0915 [9.1501919911173610E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.5500s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5456s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0044s for 90112 events => throughput is 2.03E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.7889s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7838s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0052s for 90112 events => throughput is 1.74E+07 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.1501919904813683E-002) and cpp (9.1501919911173651E-002) differ by less than 2E-4 (6.950640063507763e-11) +OK! xsec from fortran (9.1501919904813669E-002) and cpp (9.1501919911173610E-002) differ by less than 2E-4 (6.95061785904727e-11) *** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.163304e+07 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.926990e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.502888e+07 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.883911e+08 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.287328e+07 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.714682e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.861330e+07 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.463238e+08 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.282882e+07 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.709935e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.943867e+07 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.999719e+08 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.187137e+07 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.716961e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK 
-EvtsPerSec[MECalcOnly] (3a) = ( 2.597298e+07 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.154425e+08 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt index b3de38a77a..6a2d60f404 100644 --- a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx CUDACPP_BUILDDIR='.' + + make USEBUILDDIR=1 AVX=none make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' - make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' - make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' OMP_NUM_THREADS= -DATE: 2024-01-31_15:12:39 +DATE: 2024-01-30_06:10:22 -On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: -Working directory (run): /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -50,18 +50,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggtt_x1_fortran > /tmp/valassia/output_ggtt_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/avalassi/output_ggtt_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.69 [47.690708277600123] fbridge_mode=0 + [XSECTION] Cross section = 47.69 [47.690708277600116] fbridge_mode=0 [UNWEIGHT] Wrote 420 events (found 1577 events) - [COUNTERS] PROGRAM TOTAL : 0.3751s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3466s - [COUNTERS] Fortran MEs ( 1 ) : 0.0286s for 8192 events => throughput is 2.87E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4078s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3635s + [COUNTERS] Fortran MEs ( 1 ) : 0.0443s for 8192 events => throughput is 1.85E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -75,18 +75,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggtt_x1_fortran > /tmp/valassia/output_ggtt_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/avalassi/output_ggtt_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.69 [47.690708277600123] fbridge_mode=0 + [XSECTION] Cross section = 47.69 [47.690708277600116] fbridge_mode=0 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.2547s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2263s - [COUNTERS] Fortran MEs ( 1 ) : 0.0284s for 8192 events => throughput is 2.88E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3357s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2918s + [COUNTERS] Fortran MEs ( 1 ) : 0.0439s for 8192 events => throughput is 1.87E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -100,18 +100,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggtt_x10_fortran > /tmp/valassia/output_ggtt_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x10_fortran > /tmp/avalassi/output_ggtt_x10_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 46.22 [46.223782291775386] fbridge_mode=0 + [XSECTION] Cross section = 46.22 [46.223782291775372] fbridge_mode=0 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.3324s - [COUNTERS] Fortran Overhead ( 0 ) : 1.0216s - [COUNTERS] Fortran MEs ( 1 ) : 0.3108s for 90112 events => throughput is 2.90E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.8736s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3937s + [COUNTERS] Fortran MEs ( 1 ) : 0.4799s for 90112 events => throughput is 1.88E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -125,22 +125,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x1_cudacpp > /tmp/valassia/output_ggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.69 [47.690708277600123] fbridge_mode=1 + [XSECTION] Cross section = 47.69 [47.690708277600102] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.2945s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2625s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0320s for 8192 events => throughput is 2.56E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3701s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3307s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0394s for 8192 events => throughput is 2.08E+05 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.690708277600123) and cpp (47.690708277600123) differ by less than 2E-14 (0.0) +OK! xsec from fortran (47.690708277600116) and cpp (47.690708277600102) differ by less than 2E-14 (3.3306690738754696e-16) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -158,36 +158,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x10_cudacpp > /tmp/valassia/output_ggtt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 46.22 [46.223782291775350] fbridge_mode=1 + [XSECTION] Cross section = 46.22 [46.223782291775379] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.4059s - [COUNTERS] Fortran Overhead ( 0 ) : 1.0534s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3524s for 90112 events => throughput is 2.56E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.8532s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4213s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.4319s for 90112 events => throughput is 2.09E+05 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (46.223782291775386) and cpp (46.223782291775350) differ by less than 2E-14 (7.771561172376096e-16) +OK! xsec from fortran (46.223782291775372) and cpp (46.223782291775379) differ by less than 2E-14 (2.220446049250313e-16) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.576965e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.120291e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.601266e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.118287e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -201,22 +201,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x1_cudacpp > /tmp/valassia/output_ggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.69 [47.690708277600123] fbridge_mode=1 + [XSECTION] Cross section = 47.69 [47.690708277600109] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.2673s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2494s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0179s for 8192 events => throughput is 4.58E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3362s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3135s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0226s for 8192 events => throughput is 3.62E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.690708277600123) and cpp (47.690708277600123) differ by less than 2E-14 (0.0) +OK! xsec from fortran (47.690708277600116) and cpp (47.690708277600109) differ by less than 2E-14 (1.1102230246251565e-16) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -234,36 +234,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x10_cudacpp > /tmp/valassia/output_ggtt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 46.22 [46.223782291775372] fbridge_mode=1 + [XSECTION] Cross section = 46.22 [46.223782291775379] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.2344s - [COUNTERS] Fortran Overhead ( 0 ) : 1.0399s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1945s for 90112 events => throughput is 4.63E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.6510s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4023s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2487s for 90112 events => throughput is 3.62E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (46.223782291775386) and cpp (46.223782291775372) differ by less than 2E-14 (3.3306690738754696e-16) +OK! xsec from fortran (46.223782291775372) and cpp (46.223782291775379) differ by less than 2E-14 (2.220446049250313e-16) *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.737407e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.657266e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.749563e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.745669e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -277,22 +277,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x1_cudacpp > /tmp/valassia/output_ggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.69 [47.690708277600123] fbridge_mode=1 + [XSECTION] Cross section = 47.69 [47.690708277600109] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.2506s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2404s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0102s for 8192 events => throughput is 8.03E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3190s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3046s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0144s for 8192 events => throughput is 5.69E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.690708277600123) and cpp (47.690708277600123) differ by less than 2E-14 (0.0) +OK! xsec from fortran (47.690708277600116) and cpp (47.690708277600109) differ by less than 2E-14 (1.1102230246251565e-16) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -310,40 +310,188 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x10_cudacpp > /tmp/valassia/output_ggtt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 46.22 [46.223782291775372] fbridge_mode=1 + [XSECTION] Cross section = 46.22 [46.223782291775393] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.1438s - [COUNTERS] Fortran Overhead ( 0 ) : 1.0314s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1124s for 90112 events => throughput is 8.02E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.5517s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3928s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1589s for 90112 events => throughput is 5.67E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (46.223782291775386) and cpp (46.223782291775372) differ by less than 2E-14 (3.3306690738754696e-16) +OK! xsec from fortran (46.223782291775372) and cpp (46.223782291775393) differ by less than 2E-14 (4.440892098500626e-16) *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.267343e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.690369e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.309996e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.998122e+05 ) sec^-1 + +*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 47.69 [47.690708277600109] fbridge_mode=1 + [UNWEIGHT] Wrote 434 events (found 1125 events) + [COUNTERS] PROGRAM TOTAL : 0.3210s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3086s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0123s for 8192 events => throughput is 6.64E+05 events/s + +*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (47.690708277600116) and cpp (47.690708277600109) differ by less than 2E-14 (1.1102230246251565e-16) + +*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 46.22 [46.223782291775393] fbridge_mode=1 + [UNWEIGHT] Wrote 1727 events (found 1732 events) + [COUNTERS] PROGRAM TOTAL : 1.5235s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3878s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1357s for 90112 events => throughput is 6.64E+05 events/s -*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) *** +*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) *** +OK! xsec from fortran (46.223782291775372) and cpp (46.223782291775393) differ by less than 2E-14 (4.440892098500626e-16) + +*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.740357e+05 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.934348e+05 ) sec^-1 + +*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 47.69 [47.690708277600109] fbridge_mode=1 + [UNWEIGHT] Wrote 434 events (found 1125 events) + [COUNTERS] PROGRAM TOTAL : 0.3284s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3089s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0194s for 8192 events => throughput is 4.22E+05 events/s + +*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (47.690708277600116) and cpp (47.690708277600109) differ by less than 2E-14 (1.1102230246251565e-16) + +*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 46.22 [46.223782291775393] fbridge_mode=1 + [UNWEIGHT] Wrote 1727 events (found 1732 events) + [COUNTERS] PROGRAM TOTAL : 1.6116s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3942s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2174s for 90112 events => throughput is 4.15E+05 events/s + +*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (46.223782291775372) and cpp (46.223782291775393) differ by less than 2E-14 (4.440892098500626e-16) + +*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.190038e+05 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.219873e+05 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -357,22 +505,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggtt_x1_cudacpp > /tmp/valassia/output_ggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.69 [47.690708277600123] fbridge_mode=1 + [XSECTION] Cross section = 47.69 [47.690708277600109] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.5148s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5141s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0008s for 8192 events => throughput is 1.06E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.7212s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7206s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0006s for 8192 events => throughput is 1.39E+07 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.690708277600123) and cpp (47.690708277600123) differ by less than 2E-14 (0.0) +OK! xsec from fortran (47.690708277600116) and cpp (47.690708277600109) differ by less than 2E-14 (1.1102230246251565e-16) *** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -390,65 +538,65 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggtt_x10_cudacpp > /tmp/valassia/output_ggtt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 46.22 [46.223782291775379] fbridge_mode=1 + [XSECTION] Cross section = 46.22 [46.223782291775393] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.3135s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3056s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0079s for 90112 events => throughput is 1.15E+07 events/s + [COUNTERS] PROGRAM TOTAL : 1.8076s + [COUNTERS] Fortran Overhead ( 0 ) : 1.8005s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0071s for 90112 events => throughput is 1.27E+07 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (46.223782291775386) and cpp (46.223782291775379) differ by less than 2E-14 (1.1102230246251565e-16) +OK! xsec from fortran (46.223782291775372) and cpp (46.223782291775393) differ by less than 2E-14 (4.440892098500626e-16) *** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.274881e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.036838e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.756760e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.660860e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.762850e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.989378e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.751326e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.069849e+08 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.777828e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.996952e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.947188e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.150687e+08 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.738816e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.991689e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.161420e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CUDA 
[nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.999027e+07 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt index 787d8bcbcc..fe11b37e1c 100644 --- a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx CUDACPP_BUILDDIR='.' + + make USEBUILDDIR=1 AVX=none make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' - make USEBUILDDIR=1 AVX=avx2 - make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' OMP_NUM_THREADS= -DATE: 2024-01-31_15:13:04 +DATE: 2024-01-30_06:10:49 -On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: -Working directory (run): /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -50,18 +50,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggtt_x1_fortran > /tmp/valassia/output_ggtt_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/avalassi/output_ggtt_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.69 [47.690708277600123] fbridge_mode=0 + [XSECTION] Cross section = 47.69 [47.690708277600116] fbridge_mode=0 [UNWEIGHT] Wrote 420 events (found 1577 events) - [COUNTERS] PROGRAM TOTAL : 0.2988s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2704s - [COUNTERS] Fortran MEs ( 1 ) : 0.0284s for 8192 events => throughput is 2.88E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3836s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3391s + [COUNTERS] Fortran MEs ( 1 ) : 0.0445s for 8192 events => throughput is 1.84E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -75,18 +75,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggtt_x1_fortran > /tmp/valassia/output_ggtt_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/avalassi/output_ggtt_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.69 [47.690708277600123] fbridge_mode=0 + [XSECTION] Cross section = 47.69 [47.690708277600116] fbridge_mode=0 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.2566s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2282s - [COUNTERS] Fortran MEs ( 1 ) : 0.0284s for 8192 events => throughput is 2.88E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3381s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2936s + [COUNTERS] Fortran MEs ( 1 ) : 0.0445s for 8192 events => throughput is 1.84E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -100,18 +100,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggtt_x10_fortran > /tmp/valassia/output_ggtt_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x10_fortran > /tmp/avalassi/output_ggtt_x10_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 46.22 [46.223782291775386] fbridge_mode=0 + [XSECTION] Cross section = 46.22 [46.223782291775372] fbridge_mode=0 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.3327s - [COUNTERS] Fortran Overhead ( 0 ) : 1.0218s - [COUNTERS] Fortran MEs ( 1 ) : 0.3109s for 90112 events => throughput is 2.90E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.8752s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3952s + [COUNTERS] Fortran MEs ( 1 ) : 0.4799s for 90112 events => throughput is 1.88E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -125,22 +125,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x1_cudacpp > /tmp/valassia/output_ggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.69 [47.690704859565422] fbridge_mode=1 + [XSECTION] Cross section = 47.69 [47.690703999052587] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.2856s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2581s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0275s for 8192 events => throughput is 2.98E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3627s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3261s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0366s for 8192 events => throughput is 2.24E+05 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.690708277600123) and cpp (47.690704859565422) differ by less than 4E-4 (7.167087312520692e-08) +OK! xsec from fortran (47.690708277600116) and cpp (47.690703999052587) differ by less than 4E-4 (8.971448917094449e-08) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -158,36 +158,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x10_cudacpp > /tmp/valassia/output_ggtt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 46.22 [46.223780988783801] fbridge_mode=1 + [XSECTION] Cross section = 46.22 [46.223780103711483] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.3514s - [COUNTERS] Fortran Overhead ( 0 ) : 1.0486s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3028s for 90112 events => throughput is 2.98E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.8092s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4081s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.4011s for 90112 events => throughput is 2.25E+05 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (46.223782291775386) and cpp (46.223780988783801) differ by less than 4E-4 (2.8188770428982934e-08) +OK! xsec from fortran (46.223782291775372) and cpp (46.223780103711483) differ by less than 4E-4 (4.733632297249102e-08) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.092595e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.286506e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.100516e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.292834e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -201,22 +201,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x1_cudacpp > /tmp/valassia/output_ggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.69 [47.690703261737923] fbridge_mode=1 + [XSECTION] Cross section = 47.69 [47.690699958440689] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.2593s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2465s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0128s for 8192 events => throughput is 6.38E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3194s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3043s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0151s for 8192 events => throughput is 5.42E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.690708277600123) and cpp (47.690703261737923) differ by less than 4E-4 (1.0517483139960149e-07) +OK! xsec from fortran (47.690708277600116) and cpp (47.690699958440689) differ by less than 4E-4 (1.744398380187917e-07) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -234,36 +234,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x10_cudacpp > /tmp/valassia/output_ggtt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 46.22 [46.223779141681696] fbridge_mode=1 + [XSECTION] Cross section = 46.22 [46.223776162337749] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.1796s - [COUNTERS] Fortran Overhead ( 0 ) : 1.0385s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1411s for 90112 events => throughput is 6.38E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.5555s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3898s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1657s for 90112 events => throughput is 5.44E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (46.223782291775386) and cpp (46.223779141681696) differ by less than 4E-4 (6.814876529759317e-08) +OK! xsec from fortran (46.223782291775372) and cpp (46.223776162337749) differ by less than 4E-4 (1.326035499182865e-07) *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.726889e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.487444e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.730261e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.523247e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -277,22 +277,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x1_cudacpp > /tmp/valassia/output_ggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.69 [47.690694815027804] fbridge_mode=1 + [XSECTION] Cross section = 47.69 [47.690691653203835] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.2435s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2373s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0062s for 8192 events => throughput is 1.32E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3066s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2985s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0082s for 8192 events => throughput is 1.00E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.690708277600123) and cpp (47.690694815027804) differ by less than 4E-4 (2.8228920900819077e-07) +OK! xsec from fortran (47.690708277600116) and cpp (47.690691653203835) differ by less than 4E-4 (3.48587741338946e-07) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -310,40 +310,188 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x10_cudacpp > /tmp/valassia/output_ggtt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 46.22 [46.223776468660184] fbridge_mode=1 + [XSECTION] Cross section = 46.22 [46.223773576247488] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.0950s - [COUNTERS] Fortran Overhead ( 0 ) : 1.0268s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0682s for 90112 events => throughput is 1.32E+06 events/s + [COUNTERS] PROGRAM TOTAL : 1.4802s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3878s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0925s for 90112 events => throughput is 9.74E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (46.223782291775386) and cpp (46.223776468660184) differ by less than 4E-4 (1.2597660581370462e-07) +OK! xsec from fortran (46.223782291775372) and cpp (46.223773576247488) differ by less than 4E-4 (1.885507298071687e-07) *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.370439e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.007163e+06 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.012972e+06 ) sec^-1 + +*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 47.69 [47.690691653203835] fbridge_mode=1 + [UNWEIGHT] Wrote 434 events (found 1125 events) + [COUNTERS] PROGRAM TOTAL : 0.3042s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2966s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0076s for 8192 events => throughput is 1.08E+06 events/s + +*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (47.690708277600116) and cpp (47.690691653203835) differ by less than 4E-4 (3.48587741338946e-07) + +*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 46.22 [46.223773576247488] fbridge_mode=1 + [UNWEIGHT] Wrote 1727 events (found 1732 events) + [COUNTERS] PROGRAM TOTAL : 1.4699s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3862s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0836s for 90112 events => throughput is 1.08E+06 events/s + +*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (46.223782291775372) and cpp (46.223773576247488) differ by less than 4E-4 (1.885507298071687e-07) + +*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.018834e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.381975e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.101159e+06 ) sec^-1 + +*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 47.69 [47.690698822141186] fbridge_mode=1 + [UNWEIGHT] Wrote 434 events (found 1125 events) + [COUNTERS] PROGRAM TOTAL : 0.3134s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3023s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0112s for 8192 events => throughput is 7.33E+05 events/s + +*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (47.690708277600116) and cpp (47.690698822141186) differ by less than 4E-4 (1.982662718447159e-07) + +*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** -*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) *** +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical -*** (2-512z) WARNING! 
SKIP MADEVENT_CPP (512z is not supported on this node) *** +*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 46.22 [46.223780266165058] fbridge_mode=1 + [UNWEIGHT] Wrote 1727 events (found 1732 events) + [COUNTERS] PROGRAM TOTAL : 1.5088s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3876s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1212s for 90112 events => throughput is 7.43E+05 events/s + +*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (46.223782291775372) and cpp (46.223780266165058) differ by less than 4E-4 (4.382182106077437e-08) + +*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.590700e+05 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.687831e+05 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -357,22 +505,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggtt_x1_cudacpp > /tmp/valassia/output_ggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.69 [47.690697792016230] fbridge_mode=1 + [XSECTION] Cross section = 47.69 [47.690703397697987] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.5141s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5138s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0003s for 8192 events => throughput is 2.34E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.7336s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7331s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0006s for 8192 events => throughput is 1.45E+07 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.690708277600123) and cpp (47.690697792016230) differ by less than 4E-4 (2.198663905383924e-07) +OK! xsec from fortran (47.690708277600116) and cpp (47.690703397697987) differ by less than 4E-4 (1.0232396008280631e-07) *** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -390,65 +538,65 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggtt_x10_cudacpp > /tmp/valassia/output_ggtt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 46.22 [46.223779043453305] fbridge_mode=1 + [XSECTION] Cross section = 46.22 [46.223786763175951] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.3113s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3073s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0040s for 90112 events => throughput is 2.26E+07 events/s + [COUNTERS] PROGRAM TOTAL : 1.8220s + [COUNTERS] Fortran Overhead ( 0 ) : 1.8158s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0061s for 90112 events => throughput is 1.47E+07 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (46.223782291775386) and cpp (46.223779043453305) differ by less than 4E-4 (7.027382697977202e-08) +OK! xsec from fortran (46.223782291775372) and cpp (46.223786763175951) differ by less than 4E-4 (9.673376677454826e-08) *** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.948491e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.211265e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.013424e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.993599e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.066147e+08 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.733080e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.908417e+08 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.767769e+08 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.065652e+08 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.726692e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.013454e+08 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.882266e+08 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.070656e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.370639e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.176885e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CUDA 
[nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.407782e+07 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt index 55dc817a38..a855e5b8c2 100644 --- a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx CUDACPP_BUILDDIR='.' -make USEBUILDDIR=1 AVX=none + +make USEBUILDDIR=1 AVX=none make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 - make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' -CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' OMP_NUM_THREADS= -DATE: 2024-01-31_15:13:29 +DATE: 2024-01-30_06:11:16 -On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: -Working directory (run): /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -50,18 +50,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggtt_x1_fortran > /tmp/valassia/output_ggtt_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/avalassi/output_ggtt_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.69 [47.690708277600123] fbridge_mode=0 + [XSECTION] Cross section = 47.69 [47.690708277600116] fbridge_mode=0 [UNWEIGHT] Wrote 420 events (found 1577 events) - [COUNTERS] PROGRAM TOTAL : 0.2863s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2579s - [COUNTERS] Fortran MEs ( 1 ) : 0.0284s for 8192 events => throughput is 2.88E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3775s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3336s + [COUNTERS] Fortran MEs ( 1 ) : 0.0439s for 8192 events => throughput is 1.87E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -75,18 +75,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggtt_x1_fortran > /tmp/valassia/output_ggtt_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/avalassi/output_ggtt_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.69 [47.690708277600123] fbridge_mode=0 + [XSECTION] Cross section = 47.69 [47.690708277600116] fbridge_mode=0 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.2590s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2305s - [COUNTERS] Fortran MEs ( 1 ) : 0.0284s for 8192 events => throughput is 2.88E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3333s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2891s + [COUNTERS] Fortran MEs ( 1 ) : 0.0441s for 8192 events => throughput is 1.86E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -100,18 +100,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggtt_x10_fortran > /tmp/valassia/output_ggtt_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x10_fortran > /tmp/avalassi/output_ggtt_x10_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 46.22 [46.223782291775386] fbridge_mode=0 + [XSECTION] Cross section = 46.22 [46.223782291775372] fbridge_mode=0 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.3358s - [COUNTERS] Fortran Overhead ( 0 ) : 1.0250s - [COUNTERS] Fortran MEs ( 1 ) : 0.3108s for 90112 events => throughput is 2.90E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.8813s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4017s + [COUNTERS] Fortran MEs ( 1 ) : 0.4796s for 90112 events => throughput is 1.88E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -125,22 +125,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x1_cudacpp > /tmp/valassia/output_ggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.69 [47.690709601032033] fbridge_mode=1 + [XSECTION] Cross section = 47.69 [47.690709601032019] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.2968s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2636s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0332s for 8192 events => throughput is 2.47E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3684s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3289s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0395s for 8192 events => throughput is 2.07E+05 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.690708277600123) and cpp (47.690709601032033) differ by less than 2E-4 (2.7750309383733907e-08) +OK! xsec from fortran (47.690708277600116) and cpp (47.690709601032019) differ by less than 2E-4 (2.77503091616893e-08) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -158,36 +158,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x10_cudacpp > /tmp/valassia/output_ggtt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 46.22 [46.223783635280981] fbridge_mode=1 + [XSECTION] Cross section = 46.22 [46.223783635280974] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.4150s - [COUNTERS] Fortran Overhead ( 0 ) : 1.0523s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3627s for 90112 events => throughput is 2.48E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.8620s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4222s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.4398s for 90112 events => throughput is 2.05E+05 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (46.223782291775386) and cpp (46.223783635280981) differ by less than 2E-4 (2.906524576573588e-08) +OK! xsec from fortran (46.223782291775372) and cpp (46.223783635280974) differ by less than 2E-4 (2.9065245987780486e-08) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.551566e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.094601e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.553689e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.081772e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -201,22 +201,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x1_cudacpp > /tmp/valassia/output_ggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.69 [47.690709601032033] fbridge_mode=1 + [XSECTION] Cross section = 47.69 [47.690709601032026] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.2651s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2478s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0173s for 8192 events => throughput is 4.75E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3338s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3117s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0221s for 8192 events => throughput is 3.70E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.690708277600123) and cpp (47.690709601032033) differ by less than 2E-4 (2.7750309383733907e-08) +OK! xsec from fortran (47.690708277600116) and cpp (47.690709601032026) differ by less than 2E-4 (2.7750309383733907e-08) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -234,8 +234,8 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x10_cudacpp > /tmp/valassia/output_ggtt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -243,27 +243,27 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 46.22 [46.223783635280974] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.2268s - [COUNTERS] Fortran Overhead ( 0 ) : 1.0370s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1898s for 90112 events => throughput is 4.75E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.6431s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3996s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2435s for 90112 events => throughput is 3.70E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (46.223782291775386) and cpp (46.223783635280974) differ by less than 2E-4 (2.906524576573588e-08) +OK! xsec from fortran (46.223782291775372) and cpp (46.223783635280974) differ by less than 2E-4 (2.9065245987780486e-08) *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.772802e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.699859e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.772941e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.728578e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -277,22 +277,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x1_cudacpp > /tmp/valassia/output_ggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.69 [47.690709643441529] fbridge_mode=1 + [XSECTION] Cross section = 47.69 [47.690709643441508] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.2517s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2417s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0100s for 8192 events => throughput is 8.17E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3207s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3066s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0141s for 8192 events => throughput is 5.81E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.690708277600123) and cpp (47.690709643441529) differ by less than 2E-4 (2.8639570492927646e-08) +OK! xsec from fortran (47.690708277600116) and cpp (47.690709643441508) differ by less than 2E-4 (2.863957027088304e-08) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -310,40 +310,188 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggtt_x10_cudacpp > /tmp/valassia/output_ggtt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 46.22 [46.223783660238837] fbridge_mode=1 + [XSECTION] Cross section = 46.22 [46.223783660238851] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.1412s - [COUNTERS] Fortran Overhead ( 0 ) : 1.0313s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1099s for 90112 events => throughput is 8.20E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.5524s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3953s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1572s for 90112 events => throughput is 5.73E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (46.223782291775386) and cpp (46.223783660238837) differ by less than 2E-4 (2.9605181195435648e-08) +OK! xsec from fortran (46.223782291775372) and cpp (46.223783660238851) differ by less than 2E-4 (2.9605181861569463e-08) *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.452588e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.934952e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.508771e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.894708e+05 ) sec^-1 + +*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 47.69 [47.690709643441508] fbridge_mode=1 + [UNWEIGHT] Wrote 434 events (found 1125 events) + [COUNTERS] PROGRAM TOTAL : 0.3189s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3068s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0122s for 8192 events => throughput is 6.74E+05 events/s -*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) *** +*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (47.690708277600116) and cpp (47.690709643441508) differ by less than 2E-4 (2.863957027088304e-08) + +*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 46.22 [46.223783660238851] fbridge_mode=1 + [UNWEIGHT] Wrote 1727 events (found 1732 events) + [COUNTERS] PROGRAM TOTAL : 1.5411s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4039s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1372s for 90112 events => throughput is 6.57E+05 events/s + +*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (46.223782291775372) and cpp (46.223783660238851) differ by less than 2E-4 (2.9605181861569463e-08) + +*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical -*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) *** +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.847513e+05 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.835511e+05 ) sec^-1 + +*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 47.69 [47.690709643441508] fbridge_mode=1 + [UNWEIGHT] Wrote 434 events (found 1125 events) + [COUNTERS] PROGRAM TOTAL : 0.3300s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3106s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0194s for 8192 events => throughput is 4.22E+05 events/s + +*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (47.690708277600116) and cpp (47.690709643441508) differ by less than 2E-4 (2.863957027088304e-08) + +*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! 
Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 46.22 [46.223783660238851] fbridge_mode=1 + [UNWEIGHT] Wrote 1727 events (found 1732 events) + [COUNTERS] PROGRAM TOTAL : 1.6128s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4020s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2108s for 90112 events => throughput is 4.27E+05 events/s + +*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (46.223782291775372) and cpp (46.223783660238851) differ by less than 2E-4 (2.9605181861569463e-08) + +*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.284904e+05 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.343219e+05 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -357,22 +505,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggtt_x1_cudacpp > /tmp/valassia/output_ggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.69 [47.690708266690727] fbridge_mode=1 + [XSECTION] Cross section = 47.69 [47.690708266690706] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.5142s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5135s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0008s for 8192 events => throughput is 1.06E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.7257s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7251s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0006s for 8192 events => throughput is 1.43E+07 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.690708277600123) and cpp (47.690708266690727) differ by less than 2E-4 (2.2875312755132882e-10) +OK! 
xsec from fortran (47.690708277600116) and cpp (47.690708266690706) differ by less than 2E-4 (2.2875334959593374e-10) *** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -390,65 +538,65 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggtt_x10_cudacpp > /tmp/valassia/output_ggtt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 46.22 [46.223782303744805] fbridge_mode=1 + [XSECTION] Cross section = 46.22 [46.223782303744791] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.3130s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3053s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0077s for 90112 events => throughput is 1.17E+07 events/s + [COUNTERS] PROGRAM TOTAL : 1.8142s + [COUNTERS] Fortran Overhead ( 0 ) : 1.8072s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0070s for 90112 events => throughput is 1.29E+07 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (46.223782291775386) and cpp (46.223782303744805) differ by less than 2E-4 (2.5894508759449764e-10) +OK! xsec from fortran (46.223782291775372) and cpp (46.223782303744791) differ by less than 2E-4 (2.5894508759449764e-10) *** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.208836e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.009523e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.819798e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.569379e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.804520e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.989648e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.803586e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.069002e+08 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.807496e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.991680e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.992431e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.142771e+08 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.769915e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.993358e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.182699e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CUDA 
[nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.008591e+07 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt index 9533845a25..ad1d0f839b 100644 --- a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg CUDACPP_BUILDDIR='.' + make USEBUILDDIR=1 AVX=none -make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=avx2 - +make USEBUILDDIR=1 AVX=sse4 make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' OMP_NUM_THREADS= -DATE: 2024-01-31_15:13:54 +DATE: 2024-01-30_06:11:43 -On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: -Working directory (run): /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -50,18 +50,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggttg_x1_fortran > /tmp/valassia/output_ggttg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/avalassi/output_ggttg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.0972 [9.7196702725954640E-002] fbridge_mode=0 + [XSECTION] Cross section = 0.0972 [9.7196357922470764E-002] fbridge_mode=0 [UNWEIGHT] Wrote 42 events (found 469 events) - [COUNTERS] PROGRAM TOTAL : 0.4630s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2594s - [COUNTERS] Fortran MEs ( 1 ) : 0.2036s for 8192 events => throughput is 4.02E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.5934s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2449s + [COUNTERS] Fortran MEs ( 1 ) : 0.3485s for 8192 events => throughput is 2.35E+04 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -75,18 +75,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggttg_x1_fortran > /tmp/valassia/output_ggttg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/avalassi/output_ggttg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.0972 [9.7196702725954640E-002] fbridge_mode=0 + [XSECTION] Cross section = 0.0972 [9.7196357922470764E-002] fbridge_mode=0 [UNWEIGHT] Wrote 41 events (found 467 events) - [COUNTERS] PROGRAM TOTAL : 0.3970s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1933s - [COUNTERS] Fortran MEs ( 1 ) : 0.2036s for 8192 events => throughput is 4.02E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.5871s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2379s + [COUNTERS] Fortran MEs ( 1 ) : 0.3492s for 8192 events => throughput is 2.35E+04 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -100,18 +100,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggttg_x10_fortran > /tmp/valassia/output_ggttg_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x10_fortran > /tmp/avalassi/output_ggttg_x10_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.08131 [8.1310271073909590E-002] fbridge_mode=0 + [XSECTION] Cross section = 0.08131 [8.1310872077655569E-002] fbridge_mode=0 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 3.3928s - [COUNTERS] Fortran Overhead ( 0 ) : 1.1576s - [COUNTERS] Fortran MEs ( 1 ) : 2.2352s for 90112 events => throughput is 4.03E+04 events/s + [COUNTERS] PROGRAM TOTAL : 5.3849s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5565s + [COUNTERS] Fortran MEs ( 1 ) : 3.8284s for 90112 events => throughput is 2.35E+04 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -125,22 +125,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x1_cudacpp > /tmp/valassia/output_ggttg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.0972 [9.7196702725954626E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.0972 [9.7196357922470777E-002] fbridge_mode=1 [UNWEIGHT] Wrote 41 events (found 467 events) - [COUNTERS] PROGRAM TOTAL : 0.7532s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4711s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2822s for 8192 events => throughput is 2.90E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.9218s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5788s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3431s for 8192 events => throughput is 2.39E+04 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.7196702725954640E-002) and cpp (9.7196702725954626E-002) differ by less than 2E-14 (1.1102230246251565e-16) +OK! xsec from fortran (9.7196357922470764E-002) and cpp (9.7196357922470777E-002) differ by less than 2E-14 (2.220446049250313e-16) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -158,36 +158,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x10_cudacpp > /tmp/valassia/output_ggttg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.08131 [8.1310271073909604E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.08131 [8.1310872077655597E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 4.5342s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4286s - [COUNTERS] CudaCpp MEs ( 2 ) : 3.1055s for 90112 events => throughput is 2.90E+04 events/s + [COUNTERS] PROGRAM TOTAL : 5.7045s + [COUNTERS] Fortran Overhead ( 0 ) : 1.9313s + [COUNTERS] CudaCpp MEs ( 2 ) : 3.7731s for 90112 events => throughput is 2.39E+04 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (8.1310271073909590E-002) and cpp (8.1310271073909604E-002) differ by less than 2E-14 (2.220446049250313e-16) +OK! xsec from fortran (8.1310872077655569E-002) and cpp (8.1310872077655597E-002) differ by less than 2E-14 (4.440892098500626e-16) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.974124e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.459637e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.990607e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.457759e+04 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -201,22 +201,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x1_cudacpp > /tmp/valassia/output_ggttg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.0972 [9.7196702725954598E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.0972 [9.7196357922470777E-002] fbridge_mode=1 [UNWEIGHT] Wrote 41 events (found 467 events) - [COUNTERS] PROGRAM TOTAL : 0.4654s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3301s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1353s for 8192 events => throughput is 6.06E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.5930s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4166s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1764s for 8192 events => throughput is 4.64E+04 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.7196702725954640E-002) and cpp (9.7196702725954598E-002) differ by less than 2E-14 (4.440892098500626e-16) +OK! xsec from fortran (9.7196357922470764E-002) and cpp (9.7196357922470777E-002) differ by less than 2E-14 (2.220446049250313e-16) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -234,36 +234,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x10_cudacpp > /tmp/valassia/output_ggttg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.08131 [8.1310271073909618E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.08131 [8.1310872077655555E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 2.7790s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2917s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.4873s for 90112 events => throughput is 6.06E+04 events/s + [COUNTERS] PROGRAM TOTAL : 3.7266s + [COUNTERS] Fortran Overhead ( 0 ) : 1.7761s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.9506s for 90112 events => throughput is 4.62E+04 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (8.1310271073909590E-002) and cpp (8.1310271073909618E-002) differ by less than 2E-14 (4.440892098500626e-16) +OK! xsec from fortran (8.1310872077655569E-002) and cpp (8.1310872077655555E-002) differ by less than 2E-14 (2.220446049250313e-16) *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.140787e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.735934e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.147421e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.731299e+04 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -277,22 +277,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x1_cudacpp > /tmp/valassia/output_ggttg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.0972 [9.7196702725954640E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.0972 [9.7196357922470750E-002] fbridge_mode=1 [UNWEIGHT] Wrote 41 events (found 467 events) - [COUNTERS] PROGRAM TOTAL : 0.3276s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2608s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0668s for 8192 events => throughput is 1.23E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4266s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3331s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0935s for 8192 events => throughput is 8.76E+04 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.7196702725954640E-002) and cpp (9.7196702725954640E-002) differ by less than 2E-14 (0.0) +OK! xsec from fortran (9.7196357922470764E-002) and cpp (9.7196357922470750E-002) differ by less than 2E-14 (1.1102230246251565e-16) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -310,40 +310,188 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x10_cudacpp > /tmp/valassia/output_ggttg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.08131 [8.1310271073909576E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.08131 [8.1310872077655555E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 1.9583s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2225s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.7358s for 90112 events => throughput is 1.22E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.6908s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6878s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.0029s for 90112 events => throughput is 8.98E+04 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (8.1310271073909590E-002) and cpp (8.1310271073909576E-002) differ by less than 2E-14 (2.220446049250313e-16) +OK! xsec from fortran (8.1310872077655569E-002) and cpp (8.1310872077655555E-002) differ by less than 2E-14 (2.220446049250313e-16) *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.245428e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.260876e+04 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.205229e+04 ) sec^-1 + +*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 32/32 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.0972 [9.7196357922470750E-002] fbridge_mode=1 + [UNWEIGHT] Wrote 41 events (found 467 events) + [COUNTERS] PROGRAM TOTAL : 0.4089s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3259s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0830s for 8192 events => throughput is 9.87E+04 events/s + +*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (9.7196357922470764E-002) and cpp (9.7196357922470750E-002) differ by less than 2E-14 (1.1102230246251565e-16) + +*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 32/32 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.08131 [8.1310872077655555E-002] fbridge_mode=1 + [UNWEIGHT] Wrote 679 events (found 1787 events) + [COUNTERS] PROGRAM TOTAL : 2.5356s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6675s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.8681s for 90112 events => throughput is 1.04E+05 events/s + +*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (8.1310872077655569E-002) and cpp (8.1310872077655555E-002) differ by less than 2E-14 (2.220446049250313e-16) + +*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.066040e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.274058e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.071802e+05 ) sec^-1 + +*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 32/32 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.0972 [9.7196357922470750E-002] fbridge_mode=1 + [UNWEIGHT] Wrote 41 events (found 467 events) + [COUNTERS] PROGRAM TOTAL : 0.4633s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3521s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1112s for 8192 events => throughput is 7.37E+04 events/s + +*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (9.7196357922470764E-002) and cpp (9.7196357922470750E-002) differ by less than 2E-14 (1.1102230246251565e-16) + +*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** -*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) *** +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical -*** (2-512z) WARNING! 
SKIP MADEVENT_CPP (512z is not supported on this node) *** +*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 32/32 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.08131 [8.1310872077655555E-002] fbridge_mode=1 + [UNWEIGHT] Wrote 679 events (found 1787 events) + [COUNTERS] PROGRAM TOTAL : 2.9267s + [COUNTERS] Fortran Overhead ( 0 ) : 1.7103s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.2164s for 90112 events => throughput is 7.41E+04 events/s + +*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (8.1310872077655569E-002) and cpp (8.1310872077655555E-002) differ by less than 2E-14 (2.220446049250313e-16) + +*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.521586e+04 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.533151e+04 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -357,22 +505,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggttg_x1_cudacpp > /tmp/valassia/output_ggttg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.0972 [9.7196702725954653E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.0972 [9.7196357922470764E-002] fbridge_mode=1 [UNWEIGHT] Wrote 41 events (found 467 events) - [COUNTERS] PROGRAM TOTAL : 0.5093s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5015s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0078s for 8192 events => throughput is 1.06E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.6854s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6800s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0055s for 8192 events => throughput is 1.50E+06 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.7196702725954640E-002) and cpp (9.7196702725954653E-002) differ by less than 2E-14 (2.220446049250313e-16) +OK! xsec from fortran (9.7196357922470764E-002) and cpp (9.7196357922470764E-002) differ by less than 2E-14 (0.0) *** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -390,65 +538,65 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggttg_x10_cudacpp > /tmp/valassia/output_ggttg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.08131 [8.1310271073909604E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.08131 [8.1310872077655610E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 1.5423s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4570s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0852s for 90112 events => throughput is 1.06E+06 events/s + [COUNTERS] PROGRAM TOTAL : 2.0439s + [COUNTERS] Fortran Overhead ( 0 ) : 2.0206s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0234s for 90112 events => throughput is 3.85E+06 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (8.1310271073909590E-002) and cpp (8.1310271073909604E-002) differ by less than 2E-14 (2.220446049250313e-16) +OK! xsec from fortran (8.1310872077655569E-002) and cpp (8.1310872077655610E-002) differ by less than 2E-14 (4.440892098500626e-16) *** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.099505e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.630499e+06 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.136233e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.083902e+06 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.679574e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.662154e+06 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.301174e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.243596e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.679806e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.668083e+06 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.846610e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.255740e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.661806e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.633959e+06 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.486117e+05 ) sec^-1 +Process = 
SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.773665e+06 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt index e544f39758..c17be1788d 100644 --- a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg CUDACPP_BUILDDIR='.' + make USEBUILDDIR=1 AVX=none make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=avx2 - make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' OMP_NUM_THREADS= -DATE: 2024-01-31_15:14:33 +DATE: 2024-01-30_06:12:28 -On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: -Working directory (run): /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -50,18 +50,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggttg_x1_fortran > /tmp/valassia/output_ggttg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/avalassi/output_ggttg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.0972 [9.7196702725954640E-002] fbridge_mode=0 + [XSECTION] Cross section = 0.0972 [9.7196357922470764E-002] fbridge_mode=0 [UNWEIGHT] Wrote 42 events (found 469 events) - [COUNTERS] PROGRAM TOTAL : 0.4007s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1973s - [COUNTERS] Fortran MEs ( 1 ) : 0.2035s for 8192 events => throughput is 4.03E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.5874s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2394s + [COUNTERS] Fortran MEs ( 1 ) : 0.3480s for 8192 events => throughput is 2.35E+04 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -75,18 +75,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggttg_x1_fortran > /tmp/valassia/output_ggttg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/avalassi/output_ggttg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.0972 [9.7196702725954640E-002] fbridge_mode=0 + [XSECTION] Cross section = 0.0972 [9.7196357922470764E-002] fbridge_mode=0 [UNWEIGHT] Wrote 41 events (found 467 events) - [COUNTERS] PROGRAM TOTAL : 0.4001s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1965s - [COUNTERS] Fortran MEs ( 1 ) : 0.2036s for 8192 events => throughput is 4.02E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.5918s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2437s + [COUNTERS] Fortran MEs ( 1 ) : 0.3481s for 8192 events => throughput is 2.35E+04 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -100,18 +100,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggttg_x10_fortran > /tmp/valassia/output_ggttg_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x10_fortran > /tmp/avalassi/output_ggttg_x10_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.08131 [8.1310271073909590E-002] fbridge_mode=0 + [XSECTION] Cross section = 0.08131 [8.1310872077655569E-002] fbridge_mode=0 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 3.4016s - [COUNTERS] Fortran Overhead ( 0 ) : 1.1641s - [COUNTERS] Fortran MEs ( 1 ) : 2.2376s for 90112 events => throughput is 4.03E+04 events/s + [COUNTERS] PROGRAM TOTAL : 5.3901s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5607s + [COUNTERS] Fortran MEs ( 1 ) : 3.8294s for 90112 events => throughput is 2.35E+04 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -125,22 +125,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x1_cudacpp > /tmp/valassia/output_ggttg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.0972 [9.7196694166750697E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.0972 [9.7196347758884971E-002] fbridge_mode=1 [UNWEIGHT] Wrote 41 events (found 467 events) - [COUNTERS] PROGRAM TOTAL : 0.7038s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4463s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2575s for 8192 events => throughput is 3.18E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.8754s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5552s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3203s for 8192 events => throughput is 2.56E+04 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.7196702725954640E-002) and cpp (9.7196694166750697E-002) differ by less than 4E-4 (8.806064100141953e-08) +OK! xsec from fortran (9.7196357922470764E-002) and cpp (9.7196347758884971E-002) differ by less than 4E-4 (1.0456755794585604e-07) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -158,36 +158,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x10_cudacpp > /tmp/valassia/output_ggttg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.08131 [8.1310258386649639E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.08131 [8.1310858119443913E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 4.2441s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4085s - [COUNTERS] CudaCpp MEs ( 2 ) : 2.8356s for 90112 events => throughput is 3.18E+04 events/s + [COUNTERS] PROGRAM TOTAL : 5.4534s + [COUNTERS] Fortran Overhead ( 0 ) : 1.9123s + [COUNTERS] CudaCpp MEs ( 2 ) : 3.5411s for 90112 events => throughput is 2.54E+04 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (8.1310271073909590E-002) and cpp (8.1310258386649639E-002) differ by less than 4E-4 (1.5603514513795602e-07) +OK! xsec from fortran (8.1310872077655569E-002) and cpp (8.1310858119443913E-002) differ by less than 4E-4 (1.7166476384833373e-07) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.298484e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.651171e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.293545e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.640512e+04 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -201,22 +201,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x1_cudacpp > /tmp/valassia/output_ggttg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.0972 [9.7196680760393742E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.0972 [9.7196323434217816E-002] fbridge_mode=1 [UNWEIGHT] Wrote 41 events (found 467 events) - [COUNTERS] PROGRAM TOTAL : 0.3535s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2781s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0754s for 8192 events => throughput is 1.09E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4354s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3378s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0976s for 8192 events => throughput is 8.39E+04 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.7196702725954640E-002) and cpp (9.7196680760393742E-002) differ by less than 4E-4 (2.2599080296004104e-07) +OK! xsec from fortran (9.7196357922470764E-002) and cpp (9.7196323434217816E-002) differ by less than 4E-4 (3.548307125900152e-07) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -234,36 +234,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x10_cudacpp > /tmp/valassia/output_ggttg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.08131 [8.1310249885719388E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.08131 [8.1310842598054087E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 2.0647s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2349s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.8298s for 90112 events => throughput is 1.09E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.7743s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6927s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.0816s for 90112 events => throughput is 8.33E+04 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (8.1310271073909590E-002) and cpp (8.1310249885719388E-002) differ by less than 4E-4 (2.6058442459397924e-07) +OK! xsec from fortran (8.1310872077655569E-002) and cpp (8.1310842598054087E-002) differ by less than 4E-4 (3.625542406293647e-07) *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.107341e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.607319e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.107225e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.623071e+04 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -277,22 +277,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x1_cudacpp > /tmp/valassia/output_ggttg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.0972 [9.7196679618405488E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.0972 [9.7196325695161859E-002] fbridge_mode=1 [UNWEIGHT] Wrote 41 events (found 467 events) - [COUNTERS] PROGRAM TOTAL : 0.2659s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2315s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0344s for 8192 events => throughput is 2.38E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3345s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2882s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0463s for 8192 events => throughput is 1.77E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.7196702725954640E-002) and cpp (9.7196679618405488E-002) differ by less than 4E-4 (2.3774005186716352e-07) +OK! xsec from fortran (9.7196357922470764E-002) and cpp (9.7196325695161859E-002) differ by less than 4E-4 (3.3156909984288774e-07) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -310,40 +310,188 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x10_cudacpp > /tmp/valassia/output_ggttg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.08131 [8.1310249280068872E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.08131 [8.1310842393515825E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 1.5665s - [COUNTERS] Fortran Overhead ( 0 ) : 1.1912s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3753s for 90112 events => throughput is 2.40E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.1519s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6391s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.5128s for 90112 events => throughput is 1.76E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (8.1310271073909590E-002) and cpp (8.1310249280068872E-002) differ by less than 4E-4 (2.680330594140301e-07) +OK! xsec from fortran (8.1310872077655569E-002) and cpp (8.1310842393515825E-002) differ by less than 4E-4 (3.650697499857358e-07) *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.454107e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.801449e+05 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.824776e+05 ) sec^-1 + +*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 32/32 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.0972 [9.7196325695161859E-002] fbridge_mode=1 + [UNWEIGHT] Wrote 41 events (found 467 events) + [COUNTERS] PROGRAM TOTAL : 0.3234s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2821s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0413s for 8192 events => throughput is 1.98E+05 events/s + +*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (9.7196357922470764E-002) and cpp (9.7196325695161859E-002) differ by less than 4E-4 (3.3156909984288774e-07) + +*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 32/32 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.08131 [8.1310842393515825E-002] fbridge_mode=1 + [UNWEIGHT] Wrote 679 events (found 1787 events) + [COUNTERS] PROGRAM TOTAL : 2.0864s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6356s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.4508s for 90112 events => throughput is 2.00E+05 events/s + +*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (8.1310872077655569E-002) and cpp (8.1310842393515825E-002) differ by less than 4E-4 (3.650697499857358e-07) + +*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.066287e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.462442e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.073001e+05 ) sec^-1 + +*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 32/32 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.0972 [9.7196344080460087E-002] fbridge_mode=1 + [UNWEIGHT] Wrote 41 events (found 467 events) + [COUNTERS] PROGRAM TOTAL : 0.3518s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2972s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0546s for 8192 events => throughput is 1.50E+05 events/s + +*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (9.7196357922470764E-002) and cpp (9.7196344080460087E-002) differ by less than 4E-4 (1.4241285339888776e-07) + +*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** -*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) *** +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical -*** (2-512z) WARNING! 
SKIP MADEVENT_CPP (512z is not supported on this node) *** +*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 32/32 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.08131 [8.1310857813116089E-002] fbridge_mode=1 + [UNWEIGHT] Wrote 679 events (found 1787 events) + [COUNTERS] PROGRAM TOTAL : 2.2586s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6507s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.6080s for 90112 events => throughput is 1.48E+05 events/s + +*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (8.1310872077655569E-002) and cpp (8.1310857813116089E-002) differ by less than 4E-4 (1.754321300451167e-07) + +*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.497722e+05 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.492408e+05 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -357,22 +505,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggttg_x1_cudacpp > /tmp/valassia/output_ggttg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.0972 [9.7196692039411392E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.0972 [9.7196349366366022E-002] fbridge_mode=1 [UNWEIGHT] Wrote 41 events (found 467 events) - [COUNTERS] PROGRAM TOTAL : 0.4883s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4862s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0021s for 8192 events => throughput is 3.93E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.6751s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6742s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0008s for 8192 events => throughput is 9.66E+06 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.7196702725954640E-002) and cpp (9.7196692039411392E-002) differ by less than 4E-4 (1.0994759025440004e-07) +OK! xsec from fortran (9.7196357922470764E-002) and cpp (9.7196349366366022E-002) differ by less than 4E-4 (8.802906736882221e-08) *** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -390,65 +538,65 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggttg_x10_cudacpp > /tmp/valassia/output_ggttg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.08131 [8.1310258751737655E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.08131 [8.1310864949473954E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 1.4714s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4486s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0228s for 90112 events => throughput is 3.96E+06 events/s + [COUNTERS] PROGRAM TOTAL : 2.0322s + [COUNTERS] Fortran Overhead ( 0 ) : 2.0221s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0102s for 90112 events => throughput is 8.88E+06 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (8.1310271073909590E-002) and cpp (8.1310258751737655E-002) differ by less than 4E-4 (1.5154508492543073e-07) +OK! xsec from fortran (8.1310872077655569E-002) and cpp (8.1310864949473954E-002) differ by less than 4E-4 (8.766578729613173e-08) *** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.587112e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.288695e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.268423e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.864373e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.466901e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.630957e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.085456e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.365812e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.466932e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.633946e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.636891e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.471906e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.418919e+07 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.509519e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.306800e+06 ) sec^-1 +Process = 
SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.624050e+07 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt index fb1abbbf81..daa5ca9a3d 100644 --- a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg CUDACPP_BUILDDIR='.' + + make USEBUILDDIR=1 AVX=none make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' - make USEBUILDDIR=1 AVX=avx2 - make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' -CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' OMP_NUM_THREADS= -DATE: 2024-01-31_15:15:08 +DATE: 2024-01-30_06:13:08 -On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: -Working directory (run): /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -50,18 +50,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggttg_x1_fortran > /tmp/valassia/output_ggttg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/avalassi/output_ggttg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.0972 [9.7196702725954640E-002] fbridge_mode=0 + [XSECTION] Cross section = 0.0972 [9.7196357922470764E-002] fbridge_mode=0 [UNWEIGHT] Wrote 42 events (found 469 events) - [COUNTERS] PROGRAM TOTAL : 0.4003s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1966s - [COUNTERS] Fortran MEs ( 1 ) : 0.2037s for 8192 events => throughput is 4.02E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.5921s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2413s + [COUNTERS] Fortran MEs ( 1 ) : 0.3507s for 8192 events => throughput is 2.34E+04 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -75,18 +75,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggttg_x1_fortran > /tmp/valassia/output_ggttg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/avalassi/output_ggttg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.0972 [9.7196702725954640E-002] fbridge_mode=0 + [XSECTION] Cross section = 0.0972 [9.7196357922470764E-002] fbridge_mode=0 [UNWEIGHT] Wrote 41 events (found 467 events) - [COUNTERS] PROGRAM TOTAL : 0.4213s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2174s - [COUNTERS] Fortran MEs ( 1 ) : 0.2039s for 8192 events => throughput is 4.02E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.5875s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2393s + [COUNTERS] Fortran MEs ( 1 ) : 0.3482s for 8192 events => throughput is 2.35E+04 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -100,18 +100,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggttg_x10_fortran > /tmp/valassia/output_ggttg_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x10_fortran > /tmp/avalassi/output_ggttg_x10_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.08131 [8.1310271073909590E-002] fbridge_mode=0 + [XSECTION] Cross section = 0.08131 [8.1310872077655569E-002] fbridge_mode=0 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 3.3970s - [COUNTERS] Fortran Overhead ( 0 ) : 1.1592s - [COUNTERS] Fortran MEs ( 1 ) : 2.2378s for 90112 events => throughput is 4.03E+04 events/s + [COUNTERS] PROGRAM TOTAL : 5.3863s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5579s + [COUNTERS] Fortran MEs ( 1 ) : 3.8284s for 90112 events => throughput is 2.35E+04 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -125,22 +125,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x1_cudacpp > /tmp/valassia/output_ggttg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.0972 [9.7196703561337638E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.0972 [9.7196358763382021E-002] fbridge_mode=1 [UNWEIGHT] Wrote 41 events (found 467 events) - [COUNTERS] PROGRAM TOTAL : 0.7658s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4782s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2876s for 8192 events => throughput is 2.85E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.9324s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5834s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3490s for 8192 events => throughput is 2.35E+04 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.7196702725954640E-002) and cpp (9.7196703561337638E-002) differ by less than 2E-4 (8.594766898184503e-09) +OK! xsec from fortran (9.7196357922470764E-002) and cpp (9.7196358763382021E-002) differ by less than 2E-4 (8.651674487936134e-09) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -158,36 +158,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x10_cudacpp > /tmp/valassia/output_ggttg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.08131 [8.1310271828760453E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.08131 [8.1310872835011053E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 4.6035s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4383s - [COUNTERS] CudaCpp MEs ( 2 ) : 3.1652s for 90112 events => throughput is 2.85E+04 events/s + [COUNTERS] PROGRAM TOTAL : 5.7762s + [COUNTERS] Fortran Overhead ( 0 ) : 1.9305s + [COUNTERS] CudaCpp MEs ( 2 ) : 3.8457s for 90112 events => throughput is 2.34E+04 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (8.1310271073909590E-002) and cpp (8.1310271828760453E-002) differ by less than 2E-4 (9.283585677977158e-09) +OK! xsec from fortran (8.1310872077655569E-002) and cpp (8.1310872835011053E-002) differ by less than 2E-4 (9.314319981967856e-09) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.911261e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.405521e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.920789e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.412039e+04 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -201,22 +201,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x1_cudacpp > /tmp/valassia/output_ggttg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.0972 [9.7196703601584347E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.0972 [9.7196358804670424E-002] fbridge_mode=1 [UNWEIGHT] Wrote 41 events (found 467 events) - [COUNTERS] PROGRAM TOTAL : 0.4641s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3283s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1358s for 8192 events => throughput is 6.03E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.5895s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4134s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1762s for 8192 events => throughput is 4.65E+04 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.7196702725954640E-002) and cpp (9.7196703601584347E-002) differ by less than 2E-4 (9.008841672653034e-09) +OK! xsec from fortran (9.7196357922470764E-002) and cpp (9.7196358804670424E-002) differ by less than 2E-4 (9.076468243662816e-09) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -234,36 +234,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x10_cudacpp > /tmp/valassia/output_ggttg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.08131 [8.1310271831113598E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.08131 [8.1310872836789727E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 2.7822s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2904s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.4918s for 90112 events => throughput is 6.04E+04 events/s + [COUNTERS] PROGRAM TOTAL : 3.7571s + [COUNTERS] Fortran Overhead ( 0 ) : 1.7661s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.9910s for 90112 events => throughput is 4.53E+04 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (8.1310271073909590E-002) and cpp (8.1310271831113598E-002) differ by less than 2E-4 (9.312525861560061e-09) +OK! xsec from fortran (8.1310872077655569E-002) and cpp (8.1310872836789727E-002) differ by less than 2E-4 (9.33619492826665e-09) *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.206115e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.812264e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.194509e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.790866e+04 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -277,22 +277,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x1_cudacpp > /tmp/valassia/output_ggttg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.0972 [9.7196703386139241E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.0972 [9.7196358586501386E-002] fbridge_mode=1 [UNWEIGHT] Wrote 41 events (found 467 events) - [COUNTERS] PROGRAM TOTAL : 0.3267s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2609s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0658s for 8192 events => throughput is 1.25E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4175s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3288s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0887s for 8192 events => throughput is 9.23E+04 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.7196702725954640E-002) and cpp (9.7196703386139241E-002) differ by less than 2E-4 (6.792252982279479e-09) +OK! xsec from fortran (9.7196357922470764E-002) and cpp (9.7196358586501386E-002) differ by less than 2E-4 (6.831846643962081e-09) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -310,40 +310,188 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttg_x10_cudacpp > /tmp/valassia/output_ggttg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.08131 [8.1310271701289558E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.08131 [8.1310872708918305E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 1.9463s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2225s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.7238s for 90112 events => throughput is 1.24E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.6655s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6760s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.9895s for 90112 events => throughput is 9.11E+04 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (8.1310271073909590E-002) and cpp (8.1310271701289558E-002) differ by less than 2E-4 (7.715875938174577e-09) +OK! xsec from fortran (8.1310872077655569E-002) and cpp (8.1310872708918305E-002) differ by less than 2E-4 (7.763571119312473e-09) *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.278290e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.369009e+04 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.305282e+04 ) sec^-1 + +*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 32/32 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.0972 [9.7196358586501386E-002] fbridge_mode=1 + [UNWEIGHT] Wrote 41 events (found 467 events) + [COUNTERS] PROGRAM TOTAL : 0.3954s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3173s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0781s for 8192 events => throughput is 1.05E+05 events/s + +*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (9.7196357922470764E-002) and cpp (9.7196358586501386E-002) differ by less than 2E-4 (6.831846643962081e-09) + +*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 32/32 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.08131 [8.1310872708918305E-002] fbridge_mode=1 + [UNWEIGHT] Wrote 679 events (found 1787 events) + [COUNTERS] PROGRAM TOTAL : 2.5272s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6689s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.8582s for 90112 events => throughput is 1.05E+05 events/s + +*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (8.1310872077655569E-002) and cpp (8.1310872708918305E-002) differ by less than 2E-4 (7.763571119312473e-09) + +*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.086568e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.278214e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.099441e+05 ) sec^-1 -*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) *** +*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 32/32 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.0972 [9.7196358586501386E-002] fbridge_mode=1 + [UNWEIGHT] Wrote 41 events (found 467 events) + [COUNTERS] PROGRAM TOTAL : 0.4677s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3542s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1134s for 8192 events => throughput is 7.22E+04 events/s + +*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (9.7196357922470764E-002) and cpp (9.7196358586501386E-002) differ by less than 2E-4 (6.831846643962081e-09) + +*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical -*** (2-512z) WARNING! 
SKIP MADEVENT_CPP (512z is not supported on this node) *** +*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 32/32 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.08131 [8.1310872708918305E-002] fbridge_mode=1 + [UNWEIGHT] Wrote 679 events (found 1787 events) + [COUNTERS] PROGRAM TOTAL : 2.9514s + [COUNTERS] Fortran Overhead ( 0 ) : 1.7052s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.2463s for 90112 events => throughput is 7.23E+04 events/s + +*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (8.1310872077655569E-002) and cpp (8.1310872708918305E-002) differ by less than 2E-4 (7.763571119312473e-09) + +*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.364043e+04 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.347069e+04 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -357,22 +505,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggttg_x1_cudacpp > /tmp/valassia/output_ggttg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.0972 [9.7196702904173926E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.0972 [9.7196358102981231E-002] fbridge_mode=1 [UNWEIGHT] Wrote 41 events (found 467 events) - [COUNTERS] PROGRAM TOTAL : 0.5030s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4952s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0078s for 8192 events => throughput is 1.05E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.6813s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6759s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0054s for 8192 events => throughput is 1.52E+06 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.7196702725954640E-002) and cpp (9.7196702904173926E-002) differ by less than 2E-4 (1.8335939433455906e-09) +OK! xsec from fortran (9.7196357922470764E-002) and cpp (9.7196358102981231E-002) differ by less than 2E-4 (1.8571730819871846e-09) *** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -390,65 +538,65 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggttg_x10_cudacpp > /tmp/valassia/output_ggttg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.08131 [8.1310271062722053E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.08131 [8.1310872068634160E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 1.5406s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4551s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0855s for 90112 events => throughput is 1.05E+06 events/s + [COUNTERS] PROGRAM TOTAL : 2.0449s + [COUNTERS] Fortran Overhead ( 0 ) : 2.0215s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0233s for 90112 events => throughput is 3.86E+06 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (8.1310271073909590E-002) and cpp (8.1310271062722053E-002) differ by less than 2E-4 (1.375907165979129e-10) +OK! xsec from fortran (8.1310872077655569E-002) and cpp (8.1310872068634160E-002) differ by less than 2E-4 (1.109495828544027e-10) *** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.099800e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.624283e+06 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.115814e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.218126e+06 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.678377e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.599538e+06 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.299070e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.232652e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.677334e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.616286e+06 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.848766e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.243703e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.659540e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.609732e+06 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.479459e+05 ) sec^-1 +Process = 
SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.728637e+06 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt index 5ebf048e8c..930476d789 100644 --- a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg CUDACPP_BUILDDIR='.' -make USEBUILDDIR=1 AVX=none -make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make USEBUILDDIR=1 AVX=avx2 +make USEBUILDDIR=1 AVX=none +make USEBUILDDIR=1 AVX=sse4 +make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' OMP_NUM_THREADS= -DATE: 2024-01-31_15:15:47 +DATE: 2024-01-30_06:13:52 -On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: -Working directory (run): /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -50,18 +50,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggttgg_x1_fortran > /tmp/valassia/output_ggttgg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/avalassi/output_ggttgg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0003629 [3.6287295813310820E-004] fbridge_mode=0 + [XSECTION] Cross section = 0.0003628 [3.6277277311352982E-004] fbridge_mode=0 [UNWEIGHT] Wrote 48 events (found 439 events) - [COUNTERS] PROGRAM TOTAL : 2.8125s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3130s - [COUNTERS] Fortran MEs ( 1 ) : 2.4995s for 8192 events => throughput is 3.28E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.7127s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3035s + [COUNTERS] Fortran MEs ( 1 ) : 4.4093s for 8192 events => throughput is 1.86E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -75,18 +75,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggttgg_x1_fortran > /tmp/valassia/output_ggttgg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/avalassi/output_ggttgg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0003629 [3.6287295813310820E-004] fbridge_mode=0 + [XSECTION] Cross section = 0.0003628 [3.6277277311352982E-004] fbridge_mode=0 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 2.7290s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2303s - [COUNTERS] Fortran MEs ( 1 ) : 2.4987s for 8192 events => throughput is 3.28E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.7196s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2934s + [COUNTERS] Fortran MEs ( 1 ) : 4.4261s for 8192 events => throughput is 1.85E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -100,18 +100,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggttgg_x10_fortran > /tmp/valassia/output_ggttgg_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x10_fortran > /tmp/avalassi/output_ggttgg_x10_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000158 [1.5803455880587238E-004] fbridge_mode=0 + [XSECTION] Cross section = 0.000158 [1.5803725748421158E-004] fbridge_mode=0 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 29.0084s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4985s - [COUNTERS] Fortran MEs ( 1 ) : 27.5099s for 90112 events => throughput is 3.28E+03 events/s + [COUNTERS] PROGRAM TOTAL : 50.8146s + [COUNTERS] Fortran Overhead ( 0 ) : 2.0980s + [COUNTERS] Fortran MEs ( 1 ) : 48.7166s for 90112 events => throughput is 1.85E+03 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -125,22 +125,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x1_cudacpp > /tmp/valassia/output_ggttgg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0003629 [3.6287295813310842E-004] fbridge_mode=1 + [XSECTION] Cross section = 0.0003628 [3.6277277311352993E-004] fbridge_mode=1 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 7.6700s - [COUNTERS] Fortran Overhead ( 0 ) : 3.8930s - [COUNTERS] CudaCpp MEs ( 2 ) : 3.7770s for 8192 events => throughput is 2.17E+03 events/s + [COUNTERS] PROGRAM TOTAL : 9.5114s + [COUNTERS] Fortran Overhead ( 0 ) : 4.8392s + [COUNTERS] CudaCpp MEs ( 2 ) : 4.6722s for 8192 events => throughput is 1.75E+03 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (3.6287295813310820E-004) and cpp (3.6287295813310842E-004) differ by less than 2E-14 (6.661338147750939e-16) +OK! xsec from fortran (3.6277277311352982E-004) and cpp (3.6277277311352993E-004) differ by less than 2E-14 (2.220446049250313e-16) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -158,36 +158,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x10_cudacpp > /tmp/valassia/output_ggttgg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000158 [1.5803455880587238E-004] fbridge_mode=1 + [XSECTION] Cross section = 0.000158 [1.5803725748421150E-004] fbridge_mode=1 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 46.7287s - [COUNTERS] Fortran Overhead ( 0 ) : 5.1782s - [COUNTERS] CudaCpp MEs ( 2 ) : 41.5505s for 90112 events => throughput is 2.17E+03 events/s + [COUNTERS] PROGRAM TOTAL : 57.8440s + [COUNTERS] Fortran Overhead ( 0 ) : 6.6165s + [COUNTERS] CudaCpp MEs ( 2 ) : 51.2275s for 90112 events => throughput is 1.76E+03 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.5803455880587238E-004) and cpp (1.5803455880587238E-004) differ by less than 2E-14 (0.0) +OK! xsec from fortran (1.5803725748421158E-004) and cpp (1.5803725748421150E-004) differ by less than 2E-14 (5.551115123125783e-16) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.244212e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.804400e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.246522e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.805886e+03 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -201,22 +201,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x1_cudacpp > /tmp/valassia/output_ggttgg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0003629 [3.6287295813310842E-004] fbridge_mode=1 + [XSECTION] Cross section = 0.0003628 [3.6277277311352998E-004] fbridge_mode=1 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 3.5212s - [COUNTERS] Fortran Overhead ( 0 ) : 1.8575s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.6637s for 8192 events => throughput is 4.92E+03 events/s + [COUNTERS] PROGRAM TOTAL : 5.0600s + [COUNTERS] Fortran Overhead ( 0 ) : 2.6542s + [COUNTERS] CudaCpp MEs ( 2 ) : 2.4058s for 8192 events => throughput is 3.41E+03 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (3.6287295813310820E-004) and cpp (3.6287295813310842E-004) differ by less than 2E-14 (6.661338147750939e-16) +OK! xsec from fortran (3.6277277311352982E-004) and cpp (3.6277277311352998E-004) differ by less than 2E-14 (4.440892098500626e-16) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -234,36 +234,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x10_cudacpp > /tmp/valassia/output_ggttgg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000158 [1.5803455880587233E-004] fbridge_mode=1 + [XSECTION] Cross section = 0.000158 [1.5803725748421156E-004] fbridge_mode=1 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 21.4014s - [COUNTERS] Fortran Overhead ( 0 ) : 3.1434s - [COUNTERS] CudaCpp MEs ( 2 ) : 18.2579s for 90112 events => throughput is 4.94E+03 events/s + [COUNTERS] PROGRAM TOTAL : 30.6354s + [COUNTERS] Fortran Overhead ( 0 ) : 4.3949s + [COUNTERS] CudaCpp MEs ( 2 ) : 26.2405s for 90112 events => throughput is 3.43E+03 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.5803455880587238E-004) and cpp (1.5803455880587233E-004) differ by less than 2E-14 (3.3306690738754696e-16) +OK! xsec from fortran (1.5803725748421158E-004) and cpp (1.5803725748421156E-004) differ by less than 2E-14 (2.220446049250313e-16) *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.075868e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.603718e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.072770e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.615187e+03 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -277,22 +277,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x1_cudacpp > /tmp/valassia/output_ggttgg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0003629 [3.6287295813310837E-004] fbridge_mode=1 + [XSECTION] Cross section = 0.0003628 [3.6277277311353009E-004] fbridge_mode=1 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 1.5929s - [COUNTERS] Fortran Overhead ( 0 ) : 0.9056s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.6873s for 8192 events => throughput is 1.19E+04 events/s + [COUNTERS] PROGRAM TOTAL : 2.3469s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3045s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.0424s for 8192 events => throughput is 7.86E+03 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (3.6287295813310820E-004) and cpp (3.6287295813310837E-004) differ by less than 2E-14 (4.440892098500626e-16) +OK! xsec from fortran (3.6277277311352982E-004) and cpp (3.6277277311353009E-004) differ by less than 2E-14 (6.661338147750939e-16) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -310,40 +310,188 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x10_cudacpp > /tmp/valassia/output_ggttgg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000158 [1.5803455880587241E-004] fbridge_mode=1 + [XSECTION] Cross section = 0.000158 [1.5803725748421158E-004] fbridge_mode=1 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 9.7834s - [COUNTERS] Fortran Overhead ( 0 ) : 2.1783s - [COUNTERS] CudaCpp MEs ( 2 ) : 7.6051s for 90112 events => throughput is 1.18E+04 events/s + [COUNTERS] PROGRAM TOTAL : 14.6028s + [COUNTERS] Fortran Overhead ( 0 ) : 3.0873s + [COUNTERS] CudaCpp MEs ( 2 ) : 11.5154s for 90112 events => throughput is 7.83E+03 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.5803455880587238E-004) and cpp (1.5803455880587241E-004) differ by less than 2E-14 (2.220446049250313e-16) +OK! xsec from fortran (1.5803725748421158E-004) and cpp (1.5803725748421158E-004) differ by less than 2E-14 (0.0) *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.219218e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.099934e+03 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.056733e+03 ) sec^-1 + +*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 64/64 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 2 + [XSECTION] Cross section = 0.0003628 [3.6277277311353009E-004] fbridge_mode=1 + [UNWEIGHT] Wrote 59 events (found 420 events) + [COUNTERS] PROGRAM TOTAL : 2.1181s + [COUNTERS] Fortran Overhead ( 0 ) : 1.1916s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.9265s for 8192 events => throughput is 8.84E+03 events/s + +*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (3.6277277311352982E-004) and cpp (3.6277277311353009E-004) differ by less than 2E-14 (6.661338147750939e-16) + +*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 64/64 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 2 + [XSECTION] Cross section = 0.000158 [1.5803725748421158E-004] fbridge_mode=1 + [UNWEIGHT] Wrote 207 events (found 1235 events) + [COUNTERS] PROGRAM TOTAL : 13.1007s + [COUNTERS] Fortran Overhead ( 0 ) : 2.9669s + [COUNTERS] CudaCpp MEs ( 2 ) : 10.1338s for 90112 events => throughput is 8.89E+03 events/s + +*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (1.5803725748421158E-004) and cpp (1.5803725748421158E-004) differ by less than 2E-14 (0.0) + +*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.168199e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.219478e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.198606e+03 ) sec^-1 + +*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 64/64 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 2 + [XSECTION] Cross section = 0.0003628 [3.6277277311353009E-004] fbridge_mode=1 + [UNWEIGHT] Wrote 59 events (found 420 events) + [COUNTERS] PROGRAM TOTAL : 2.6312s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4569s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.1743s for 8192 events => throughput is 6.98E+03 events/s + +*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (3.6277277311352982E-004) and cpp (3.6277277311353009E-004) differ by less than 2E-14 (6.661338147750939e-16) + +*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** -*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) *** +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical -*** (2-512z) WARNING! 
SKIP MADEVENT_CPP (512z is not supported on this node) *** +*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 64/64 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 2 + [XSECTION] Cross section = 0.000158 [1.5803725748421158E-004] fbridge_mode=1 + [UNWEIGHT] Wrote 207 events (found 1235 events) + [COUNTERS] PROGRAM TOTAL : 16.1783s + [COUNTERS] Fortran Overhead ( 0 ) : 3.2363s + [COUNTERS] CudaCpp MEs ( 2 ) : 12.9420s for 90112 events => throughput is 6.96E+03 events/s + +*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (1.5803725748421158E-004) and cpp (1.5803725748421158E-004) differ by less than 2E-14 (0.0) + +*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.039911e+03 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.063323e+03 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -357,22 +505,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggttgg_x1_cudacpp > /tmp/valassia/output_ggttgg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0003629 [3.6287295813310831E-004] fbridge_mode=1 + [XSECTION] Cross section = 0.0003628 [3.6277277311352998E-004] fbridge_mode=1 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 0.8403s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7248s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1155s for 8192 events => throughput is 7.09E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.8345s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8014s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0331s for 8192 events => throughput is 2.47E+05 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (3.6287295813310820E-004) and cpp (3.6287295813310831E-004) differ by less than 2E-14 (2.220446049250313e-16) +OK! xsec from fortran (3.6277277311352982E-004) and cpp (3.6277277311352998E-004) differ by less than 2E-14 (4.440892098500626e-16) *** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -390,65 +538,65 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggttgg_x10_cudacpp > /tmp/valassia/output_ggttgg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000158 [1.5803455880587238E-004] fbridge_mode=1 + [XSECTION] Cross section = 0.000158 [1.5803725748421166E-004] fbridge_mode=1 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 3.2629s - [COUNTERS] Fortran Overhead ( 0 ) : 1.9956s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.2672s for 90112 events => throughput is 7.11E+04 events/s + [COUNTERS] PROGRAM TOTAL : 2.9375s + [COUNTERS] Fortran Overhead ( 0 ) : 2.5725s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3650s for 90112 events => throughput is 2.47E+05 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.5803455880587238E-004) and cpp (1.5803455880587238E-004) differ by less than 2E-14 (0.0) +OK! xsec from fortran (1.5803725748421158E-004) and cpp (1.5803725748421166E-004) differ by less than 2E-14 (4.440892098500626e-16) *** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.179211e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.275540e+05 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.457940e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.510631e+05 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.240898e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.114953e+05 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.033520e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.167963e+05 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.233836e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.105704e+05 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.225572e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.168745e+05 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.240859e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.099883e+05 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.392782e+04 ) sec^-1 
+Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.425014e+05 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt index 223f61cd19..5e8ad575df 100644 --- a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg CUDACPP_BUILDDIR='.' -make USEBUILDDIR=1 AVX=none +make USEBUILDDIR=1 AVX=none make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make USEBUILDDIR=1 AVX=avx2 +make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' OMP_NUM_THREADS= -DATE: 2024-01-31_15:19:35 +DATE: 2024-01-30_06:18:23 -On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: -Working directory (run): /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -50,18 +50,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggttgg_x1_fortran > /tmp/valassia/output_ggttgg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/avalassi/output_ggttgg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0003629 [3.6287295813310820E-004] fbridge_mode=0 + [XSECTION] Cross section = 0.0003628 [3.6277277311352982E-004] fbridge_mode=0 [UNWEIGHT] Wrote 48 events (found 439 events) - [COUNTERS] PROGRAM TOTAL : 2.7309s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2329s - [COUNTERS] Fortran MEs ( 1 ) : 2.4980s for 8192 events => throughput is 3.28E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.7226s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2990s + [COUNTERS] Fortran MEs ( 1 ) : 4.4236s for 8192 events => throughput is 1.85E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -75,18 +75,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
 --------------------
-Executing ' ./madevent_fortran < /tmp/valassia/input_ggttgg_x1_fortran > /tmp/valassia/output_ggttgg_x1_fortran'
- [OPENMPTH] omp_get_max_threads/nproc = 1/128
+Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/avalassi/output_ggttgg_x1_fortran'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/4
  [NGOODHEL] ngoodhel/ncomb = 64/64
  [XSECTION] VECSIZE_USED = 8192
  [XSECTION] MultiChannel = TRUE
  [XSECTION] Configuration = 1
  [XSECTION] ChannelId = 2
- [XSECTION] Cross section = 0.0003629 [3.6287295813310820E-004] fbridge_mode=0
+ [XSECTION] Cross section = 0.0003628 [3.6277277311352982E-004] fbridge_mode=0
  [UNWEIGHT] Wrote 59 events (found 420 events)
- [COUNTERS] PROGRAM TOTAL : 2.7301s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.2310s
- [COUNTERS] Fortran MEs ( 1 ) : 2.4991s for 8192 events => throughput is 3.28E+03 events/s
+ [COUNTERS] PROGRAM TOTAL : 4.7231s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.2979s
+ [COUNTERS] Fortran MEs ( 1 ) : 4.4253s for 8192 events => throughput is 1.85E+03 events/s

 *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) ***
 --------------------
@@ -100,18 +100,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
 0 ! Helicity Sum/event 0=exact
 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
 --------------------
-Executing ' ./madevent_fortran < /tmp/valassia/input_ggttgg_x10_fortran > /tmp/valassia/output_ggttgg_x10_fortran'
- [OPENMPTH] omp_get_max_threads/nproc = 1/128
+Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x10_fortran > /tmp/avalassi/output_ggttgg_x10_fortran'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/4
  [NGOODHEL] ngoodhel/ncomb = 64/64
  [XSECTION] VECSIZE_USED = 8192
  [XSECTION] MultiChannel = TRUE
  [XSECTION] Configuration = 1
  [XSECTION] ChannelId = 2
- [XSECTION] Cross section = 0.000158 [1.5803455880587238E-004] fbridge_mode=0
+ [XSECTION] Cross section = 0.000158 [1.5803725748421158E-004] fbridge_mode=0
  [UNWEIGHT] Wrote 207 events (found 1235 events)
- [COUNTERS] PROGRAM TOTAL : 28.9804s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.4991s
- [COUNTERS] Fortran MEs ( 1 ) : 27.4813s for 90112 events => throughput is 3.28E+03 events/s
+ [COUNTERS] PROGRAM TOTAL : 50.8315s
+ [COUNTERS] Fortran Overhead ( 0 ) : 2.1047s
+ [COUNTERS] Fortran MEs ( 1 ) : 48.7267s for 90112 events => throughput is 1.85E+03 events/s

 *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -125,22 +125,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
 0 ! Helicity Sum/event 0=exact
 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
 --------------------
-Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x1_cudacpp > /tmp/valassia/output_ggttgg_x1_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/128
+Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/4
  [NGOODHEL] ngoodhel/ncomb = 64/64
  [XSECTION] VECSIZE_USED = 8192
  [XSECTION] MultiChannel = TRUE
  [XSECTION] Configuration = 1
  [XSECTION] ChannelId = 2
- [XSECTION] Cross section = 0.0003629 [3.6287415397046849E-004] fbridge_mode=1
+ [XSECTION] Cross section = 0.0003628 [3.6277396352122325E-004] fbridge_mode=1
  [UNWEIGHT] Wrote 59 events (found 420 events)
- [COUNTERS] PROGRAM TOTAL : 6.9110s
- [COUNTERS] Fortran Overhead ( 0 ) : 3.5360s
- [COUNTERS] CudaCpp MEs ( 2 ) : 3.3750s for 8192 events => throughput is 2.43E+03 events/s
+ [COUNTERS] PROGRAM TOTAL : 8.7124s
+ [COUNTERS] Fortran Overhead ( 0 ) : 4.4408s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 4.2715s for 8192 events => throughput is 1.92E+03 events/s

 *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***

-OK! xsec from fortran (3.6287295813310820E-004) and cpp (3.6287415397046849E-004) differ by less than 4E-4 (3.295471137976236e-06)
+OK! xsec from fortran (3.6277277311352982E-004) and cpp (3.6277396352122325E-004) differ by less than 4E-4 (3.2814141017745158e-06)

 *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
@@ -158,36 +158,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
 0 ! Helicity Sum/event 0=exact
 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
 --------------------
-Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x10_cudacpp > /tmp/valassia/output_ggttgg_x10_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/128
+Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/4
  [NGOODHEL] ngoodhel/ncomb = 64/64
  [XSECTION] VECSIZE_USED = 8192
  [XSECTION] MultiChannel = TRUE
  [XSECTION] Configuration = 1
  [XSECTION] ChannelId = 2
- [XSECTION] Cross section = 0.000158 [1.5803504352744863E-004] fbridge_mode=1
+ [XSECTION] Cross section = 0.000158 [1.5803774048965294E-004] fbridge_mode=1
  [UNWEIGHT] Wrote 207 events (found 1235 events)
- [COUNTERS] PROGRAM TOTAL : 41.9127s
- [COUNTERS] Fortran Overhead ( 0 ) : 4.8008s
- [COUNTERS] CudaCpp MEs ( 2 ) : 37.1120s for 90112 events => throughput is 2.43E+03 events/s
+ [COUNTERS] PROGRAM TOTAL : 53.4678s
+ [COUNTERS] Fortran Overhead ( 0 ) : 6.2349s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 47.2329s for 90112 events => throughput is 1.91E+03 events/s

 *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***

-OK! xsec from fortran (1.5803455880587238E-004) and cpp (1.5803504352744863E-004) differ by less than 4E-4 (3.0671872019993884e-06)
+OK! xsec from fortran (1.5803725748421158E-004) and cpp (1.5803774048965294E-004) differ by less than 4E-4 (3.056275773571926e-06)

 *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***

 OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical

 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.486541e+03 ) sec^-1
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 1.973797e+03 ) sec^-1

 *** EXECUTE CHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.486031e+03 ) sec^-1
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 1.974372e+03 ) sec^-1

 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -201,22 +201,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
 0 ! Helicity Sum/event 0=exact
 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
 --------------------
-Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x1_cudacpp > /tmp/valassia/output_ggttgg_x1_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/128
+Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/4
  [NGOODHEL] ngoodhel/ncomb = 64/64
  [XSECTION] VECSIZE_USED = 8192
  [XSECTION] MultiChannel = TRUE
  [XSECTION] Configuration = 1
  [XSECTION] ChannelId = 2
- [XSECTION] Cross section = 0.0003629 [3.6287408821488631E-004] fbridge_mode=1
+ [XSECTION] Cross section = 0.0003628 [3.6277387698033752E-004] fbridge_mode=1
  [UNWEIGHT] Wrote 59 events (found 420 events)
- [COUNTERS] PROGRAM TOTAL : 1.8917s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.0538s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.8378s for 8192 events => throughput is 9.78E+03 events/s
+ [COUNTERS] PROGRAM TOTAL : 2.6573s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.4642s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 1.1930s for 8192 events => throughput is 6.87E+03 events/s

 *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***

-OK! xsec from fortran (3.6287295813310820E-004) and cpp (3.6287408821488631E-004) differ by less than 4E-4 (3.1142628647007342e-06)
+OK! xsec from fortran (3.6277277311352982E-004) and cpp (3.6277387698033752E-004) differ by less than 4E-4 (3.0428601303089664e-06)

 *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
@@ -234,36 +234,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
 0 ! Helicity Sum/event 0=exact
 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
 --------------------
-Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x10_cudacpp > /tmp/valassia/output_ggttgg_x10_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/128
+Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/4
  [NGOODHEL] ngoodhel/ncomb = 64/64
  [XSECTION] VECSIZE_USED = 8192
  [XSECTION] MultiChannel = TRUE
  [XSECTION] Configuration = 1
  [XSECTION] ChannelId = 2
- [XSECTION] Cross section = 0.000158 [1.5803502293058018E-004] fbridge_mode=1
+ [XSECTION] Cross section = 0.000158 [1.5803770691658365E-004] fbridge_mode=1
  [UNWEIGHT] Wrote 207 events (found 1235 events)
- [COUNTERS] PROGRAM TOTAL : 11.5963s
- [COUNTERS] Fortran Overhead ( 0 ) : 2.3323s
- [COUNTERS] CudaCpp MEs ( 2 ) : 9.2641s for 90112 events => throughput is 9.73E+03 events/s
+ [COUNTERS] PROGRAM TOTAL : 16.3220s
+ [COUNTERS] Fortran Overhead ( 0 ) : 3.2267s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 13.0952s for 90112 events => throughput is 6.88E+03 events/s

 *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***

-OK! xsec from fortran (1.5803455880587238E-004) and cpp (1.5803502293058018E-004) differ by less than 4E-4 (2.936855782120773e-06)
+OK! xsec from fortran (1.5803725748421158E-004) and cpp (1.5803770691658365E-004) differ by less than 4E-4 (2.8438380874629132e-06)

 *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***

 OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical

 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 9.977944e+03 ) sec^-1
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 7.126754e+03 ) sec^-1

 *** EXECUTE CHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.003051e+04 ) sec^-1
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 7.170734e+03 ) sec^-1

 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -277,22 +277,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
 0 ! Helicity Sum/event 0=exact
 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
 --------------------
-Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x1_cudacpp > /tmp/valassia/output_ggttgg_x1_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/128
+Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/4
  [NGOODHEL] ngoodhel/ncomb = 64/64
  [XSECTION] VECSIZE_USED = 8192
  [XSECTION] MultiChannel = TRUE
  [XSECTION] Configuration = 1
  [XSECTION] ChannelId = 2
- [XSECTION] Cross section = 0.0003629 [3.6287410019363280E-004] fbridge_mode=1
+ [XSECTION] Cross section = 0.0003628 [3.6277388844638422E-004] fbridge_mode=1
  [UNWEIGHT] Wrote 59 events (found 420 events)
- [COUNTERS] PROGRAM TOTAL : 0.9306s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.5773s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.3532s for 8192 events => throughput is 2.32E+04 events/s
+ [COUNTERS] PROGRAM TOTAL : 1.3306s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.8070s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.5236s for 8192 events => throughput is 1.56E+04 events/s

 *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***

-OK! xsec from fortran (3.6287295813310820E-004) and cpp (3.6287410019363280E-004) differ by less than 4E-4 (3.147273719417143e-06)
+OK! xsec from fortran (3.6277277311352982E-004) and cpp (3.6277388844638422E-004) differ by less than 4E-4 (3.074466820685018e-06)

 *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
@@ -310,40 +310,188 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
 0 ! Helicity Sum/event 0=exact
 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
 --------------------
-Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x10_cudacpp > /tmp/valassia/output_ggttgg_x10_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/128
+Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/4
  [NGOODHEL] ngoodhel/ncomb = 64/64
  [XSECTION] VECSIZE_USED = 8192
  [XSECTION] MultiChannel = TRUE
  [XSECTION] Configuration = 1
  [XSECTION] ChannelId = 2
- [XSECTION] Cross section = 0.000158 [1.5803505046780143E-004] fbridge_mode=1
+ [XSECTION] Cross section = 0.000158 [1.5803773310773457E-004] fbridge_mode=1
  [UNWEIGHT] Wrote 207 events (found 1235 events)
- [COUNTERS] PROGRAM TOTAL : 5.7205s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.8432s
- [COUNTERS] CudaCpp MEs ( 2 ) : 3.8773s for 90112 events => throughput is 2.32E+04 events/s
+ [COUNTERS] PROGRAM TOTAL : 8.3793s
+ [COUNTERS] Fortran Overhead ( 0 ) : 2.5829s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 5.7964s for 90112 events => throughput is 1.55E+04 events/s

 *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***

-OK! xsec from fortran (1.5803455880587238E-004) and cpp (1.5803505046780143E-004) differ by less than 4E-4 (3.111103879849253e-06)
+OK! xsec from fortran (1.5803725748421158E-004) and cpp (1.5803773310773457E-004) differ by less than 4E-4 (3.0095657856943347e-06)

 *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***

 OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical

 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.403600e+04 ) sec^-1
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 1.587083e+04 ) sec^-1
+
+*** EXECUTE CHECK(8192) -p 256 32 1 ***
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 1.594558e+04 ) sec^-1
+
+*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
+--------------------
+CUDACPP_RUNTIME_FBRIDGEMODE = (not set)
+CUDACPP_RUNTIME_VECSIZEUSED = 8192
+--------------------
+8192 1 1 ! Number of events and max and min iterations
+0.000001 ! Accuracy (ignored because max iterations = min iterations)
+0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present)
+1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement)
+0 ! Helicity Sum/event 0=exact
+1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
+--------------------
+Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/4
+ [NGOODHEL] ngoodhel/ncomb = 64/64
+ [XSECTION] VECSIZE_USED = 8192
+ [XSECTION] MultiChannel = TRUE
+ [XSECTION] Configuration = 1
+ [XSECTION] ChannelId = 2
+ [XSECTION] Cross section = 0.0003628 [3.6277388844638422E-004] fbridge_mode=1
+ [UNWEIGHT] Wrote 59 events (found 420 events)
+ [COUNTERS] PROGRAM TOTAL : 1.2103s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.7457s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.4646s for 8192 events => throughput is 1.76E+04 events/s
+
+*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
+
+OK! xsec from fortran (3.6277277311352982E-004) and cpp (3.6277388844638422E-004) differ by less than 4E-4 (3.074466820685018e-06)
+
+*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
+
+OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical
+
+*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) ***
+--------------------
+CUDACPP_RUNTIME_FBRIDGEMODE = (not set)
+CUDACPP_RUNTIME_VECSIZEUSED = 8192
+--------------------
+81920 1 1 ! Number of events and max and min iterations
+0.000001 ! Accuracy (ignored because max iterations = min iterations)
+0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present)
+1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement)
+0 ! Helicity Sum/event 0=exact
+1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
+--------------------
+Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/4
+ [NGOODHEL] ngoodhel/ncomb = 64/64
+ [XSECTION] VECSIZE_USED = 8192
+ [XSECTION] MultiChannel = TRUE
+ [XSECTION] Configuration = 1
+ [XSECTION] ChannelId = 2
+ [XSECTION] Cross section = 0.000158 [1.5803773310773457E-004] fbridge_mode=1
+ [UNWEIGHT] Wrote 207 events (found 1235 events)
+ [COUNTERS] PROGRAM TOTAL : 7.6352s
+ [COUNTERS] Fortran Overhead ( 0 ) : 2.5218s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 5.1134s for 90112 events => throughput is 1.76E+04 events/s
+
+*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
+
+OK! xsec from fortran (1.5803725748421158E-004) and cpp (1.5803773310773457E-004) differ by less than 4E-4 (3.0095657856943347e-06)
+
+*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
+
+OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
+
+*** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 1.751909e+04 ) sec^-1

 *** EXECUTE CHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.397579e+04 ) sec^-1
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 1.760230e+04 ) sec^-1
+
+*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
+--------------------
+CUDACPP_RUNTIME_FBRIDGEMODE = (not set)
+CUDACPP_RUNTIME_VECSIZEUSED = 8192
+--------------------
+8192 1 1 ! Number of events and max and min iterations
+0.000001 ! Accuracy (ignored because max iterations = min iterations)
+0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present)
+1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement)
+0 ! Helicity Sum/event 0=exact
+1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
+--------------------
+Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/4
+ [NGOODHEL] ngoodhel/ncomb = 64/64
+ [XSECTION] VECSIZE_USED = 8192
+ [XSECTION] MultiChannel = TRUE
+ [XSECTION] Configuration = 1
+ [XSECTION] ChannelId = 2
+ [XSECTION] Cross section = 0.0003628 [3.6277396133530942E-004] fbridge_mode=1
+ [UNWEIGHT] Wrote 59 events (found 420 events)
+ [COUNTERS] PROGRAM TOTAL : 1.4750s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.8777s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.5973s for 8192 events => throughput is 1.37E+04 events/s
+
+*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
+
+OK! xsec from fortran (3.6277277311352982E-004) and cpp (3.6277396133530942E-004) differ by less than 4E-4 (3.2753885288450135e-06)
+
+*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***

-*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) ***
+OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical

-*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) ***
+*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) ***
+--------------------
+CUDACPP_RUNTIME_FBRIDGEMODE = (not set)
+CUDACPP_RUNTIME_VECSIZEUSED = 8192
+--------------------
+81920 1 1 ! Number of events and max and min iterations
+0.000001 ! Accuracy (ignored because max iterations = min iterations)
+0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present)
+1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement)
+0 ! Helicity Sum/event 0=exact
+1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
+--------------------
+Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/4
+ [NGOODHEL] ngoodhel/ncomb = 64/64
+ [XSECTION] VECSIZE_USED = 8192
+ [XSECTION] MultiChannel = TRUE
+ [XSECTION] Configuration = 1
+ [XSECTION] ChannelId = 2
+ [XSECTION] Cross section = 0.000158 [1.5803777739454609E-004] fbridge_mode=1
+ [UNWEIGHT] Wrote 207 events (found 1235 events)
+ [COUNTERS] PROGRAM TOTAL : 9.1796s
+ [COUNTERS] Fortran Overhead ( 0 ) : 2.7011s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 6.4785s for 90112 events => throughput is 1.39E+04 events/s
+
+*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
+
+OK! xsec from fortran (1.5803725748421158E-004) and cpp (1.5803777739454609E-004) differ by less than 4E-4 (3.2897959809652377e-06)
+
+*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
+
+OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
+
+*** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 1.411903e+04 ) sec^-1
+
+*** EXECUTE CHECK(8192) -p 256 32 1 ***
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 1.410632e+04 ) sec^-1

 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) ***
 --------------------
@@ -357,22 +505,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
 0 ! Helicity Sum/event 0=exact
 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
 --------------------
-Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggttgg_x1_cudacpp > /tmp/valassia/output_ggttgg_x1_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/128
+Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/4
  [NGOODHEL] ngoodhel/ncomb = 64/64
  [XSECTION] VECSIZE_USED = 8192
  [XSECTION] MultiChannel = TRUE
  [XSECTION] Configuration = 1
  [XSECTION] ChannelId = 2
- [XSECTION] Cross section = 0.0003629 [3.6287414523737644E-004] fbridge_mode=1
+ [XSECTION] Cross section = 0.0003628 [3.6277400478491265E-004] fbridge_mode=1
  [UNWEIGHT] Wrote 59 events (found 420 events)
- [COUNTERS] PROGRAM TOTAL : 0.7020s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.6445s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.0575s for 8192 events => throughput is 1.43E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.7967s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.7754s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0214s for 8192 events => throughput is 3.83E+05 events/s

 *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec ***

-OK! xsec from fortran (3.6287295813310820E-004) and cpp (3.6287414523737644E-004) differ by less than 4E-4 (3.271404610538653e-06)
+OK! xsec from fortran (3.6277277311352982E-004) and cpp (3.6277400478491265E-004) differ by less than 4E-4 (3.395159378305479e-06)

 *** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
@@ -390,65 +538,65 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
 0 ! Helicity Sum/event 0=exact
 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
 --------------------
-Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggttgg_x10_cudacpp > /tmp/valassia/output_ggttgg_x10_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/128
+Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/4
  [NGOODHEL] ngoodhel/ncomb = 64/64
  [XSECTION] VECSIZE_USED = 8192
  [XSECTION] MultiChannel = TRUE
  [XSECTION] Configuration = 1
  [XSECTION] ChannelId = 2
- [XSECTION] Cross section = 0.000158 [1.5803508418967395E-004] fbridge_mode=1
+ [XSECTION] Cross section = 0.000158 [1.5803779990154892E-004] fbridge_mode=1
  [UNWEIGHT] Wrote 207 events (found 1235 events)
- [COUNTERS] PROGRAM TOTAL : 2.5342s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.9016s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.6326s for 90112 events => throughput is 1.42E+05 events/s
+ [COUNTERS] PROGRAM TOTAL : 2.7835s
+ [COUNTERS] Fortran Overhead ( 0 ) : 2.5473s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.2361s for 90112 events => throughput is 3.82E+05 events/s

 *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec ***

-OK! xsec from fortran (1.5803455880587238E-004) and cpp (1.5803508418967395E-004) differ by less than 4E-4 (3.324486780309499e-06)
+OK! xsec from fortran (1.5803725748421158E-004) and cpp (1.5803779990154892E-004) differ by less than 4E-4 (3.432211783227501e-06)

 *** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***

 OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical

 *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.439273e+05 ) sec^-1
+Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 3.582485e+05 ) sec^-1

 *** EXECUTE GCHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 9.523220e+04 ) sec^-1
+Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 3.942798e+05 ) sec^-1

 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge ***
-Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 4.697708e+05 ) sec^-1
+Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 8.492976e+05 ) sec^-1

 *** EXECUTE GCHECK(MAX) -p 16384 32 1 ***
-Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.275257e+05 ) sec^-1
+Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 8.638150e+05 ) sec^-1

 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge ***
-Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 4.699598e+05 ) sec^-1
+Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 8.493239e+05 ) sec^-1

 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 ***
-Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.973303e+05 ) sec^-1
+Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 8.638925e+05 ) sec^-1

 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge ***
-Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 4.680513e+05 ) sec^-1
+Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 8.453709e+05 ) sec^-1

 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 ***
-Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.315750e+04 ) sec^-1
+Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 2.527726e+05 ) sec^-1

 TEST COMPLETED
diff --git a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt
index 9c049e812e..a372850ebe 100644
--- a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt
+++ b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt
@@ -1,42 +1,42 @@
-Working directory (build): /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg
+Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg

 CUDACPP_BUILDDIR='.'
-make USEBUILDDIR=1 AVX=none
-make USEBUILDDIR=1 AVX=sse4
-make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
-make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
-make USEBUILDDIR=1 AVX=avx2
+make USEBUILDDIR=1 AVX=none
+make USEBUILDDIR=1 AVX=sse4
+make USEBUILDDIR=1 AVX=avx2
 make USEBUILDDIR=1 AVX=512y
-make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
-make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 make USEBUILDDIR=1 AVX=512z
-make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 CUDACPP_BUILDDIR='build.none_m_inl0_hrd0'
 CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0'
-CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0'
 CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0'
-CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'

 OMP_NUM_THREADS=
-DATE: 2024-01-31_15:22:43
+DATE: 2024-01-30_06:21:53

-On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]:
-Working directory (run): /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg
+On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
+Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg

 *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) ***
 --------------------
@@ -50,18 +50,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
 0 ! Helicity Sum/event 0=exact
 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
 --------------------
-Executing ' ./madevent_fortran < /tmp/valassia/input_ggttgg_x1_fortran > /tmp/valassia/output_ggttgg_x1_fortran'
- [OPENMPTH] omp_get_max_threads/nproc = 1/128
+Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/avalassi/output_ggttgg_x1_fortran'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/4
  [NGOODHEL] ngoodhel/ncomb = 64/64
  [XSECTION] VECSIZE_USED = 8192
  [XSECTION] MultiChannel = TRUE
  [XSECTION] Configuration = 1
  [XSECTION] ChannelId = 2
- [XSECTION] Cross section = 0.0003629 [3.6287295813310820E-004] fbridge_mode=0
+ [XSECTION] Cross section = 0.0003628 [3.6277277311352982E-004] fbridge_mode=0
  [UNWEIGHT] Wrote 48 events (found 439 events)
- [COUNTERS] PROGRAM TOTAL : 2.7330s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.2344s
- [COUNTERS] Fortran MEs ( 1 ) : 2.4986s for 8192 events => throughput is 3.28E+03 events/s
+ [COUNTERS] PROGRAM TOTAL : 4.7321s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.3006s
+ [COUNTERS] Fortran MEs ( 1 ) : 4.4315s for 8192 events => throughput is 1.85E+03 events/s

 *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) ***
 --------------------
@@ -75,18 +75,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
 0 ! Helicity Sum/event 0=exact
 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
 --------------------
-Executing ' ./madevent_fortran < /tmp/valassia/input_ggttgg_x1_fortran > /tmp/valassia/output_ggttgg_x1_fortran'
- [OPENMPTH] omp_get_max_threads/nproc = 1/128
+Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/avalassi/output_ggttgg_x1_fortran'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/4
  [NGOODHEL] ngoodhel/ncomb = 64/64
  [XSECTION] VECSIZE_USED = 8192
  [XSECTION] MultiChannel = TRUE
  [XSECTION] Configuration = 1
  [XSECTION] ChannelId = 2
- [XSECTION] Cross section = 0.0003629 [3.6287295813310820E-004] fbridge_mode=0
+ [XSECTION] Cross section = 0.0003628 [3.6277277311352982E-004] fbridge_mode=0
  [UNWEIGHT] Wrote 59 events (found 420 events)
- [COUNTERS] PROGRAM TOTAL : 2.7304s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.2320s
- [COUNTERS] Fortran MEs ( 1 ) : 2.4984s for 8192 events => throughput is 3.28E+03 events/s
+ [COUNTERS] PROGRAM TOTAL : 4.7365s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.2954s
+ [COUNTERS] Fortran MEs ( 1 ) : 4.4410s for 8192 events => throughput is 1.84E+03 events/s

 *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) ***
 --------------------
@@ -100,18 +100,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
 0 ! Helicity Sum/event 0=exact
 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
 --------------------
-Executing ' ./madevent_fortran < /tmp/valassia/input_ggttgg_x10_fortran > /tmp/valassia/output_ggttgg_x10_fortran'
- [OPENMPTH] omp_get_max_threads/nproc = 1/128
+Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x10_fortran > /tmp/avalassi/output_ggttgg_x10_fortran'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/4
  [NGOODHEL] ngoodhel/ncomb = 64/64
  [XSECTION] VECSIZE_USED = 8192
  [XSECTION] MultiChannel = TRUE
  [XSECTION] Configuration = 1
  [XSECTION] ChannelId = 2
- [XSECTION] Cross section = 0.000158 [1.5803455880587238E-004] fbridge_mode=0
+ [XSECTION] Cross section = 0.000158 [1.5803725748421158E-004] fbridge_mode=0
  [UNWEIGHT] Wrote 207 events (found 1235 events)
- [COUNTERS] PROGRAM TOTAL : 28.9955s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.4995s
- [COUNTERS] Fortran MEs ( 1 ) : 27.4960s for 90112 events => throughput is 3.28E+03 events/s
+ [COUNTERS] PROGRAM TOTAL : 50.8810s
+ [COUNTERS] Fortran Overhead ( 0 ) : 2.1085s
+ [COUNTERS] Fortran MEs ( 1 ) : 48.7725s for 90112 events => throughput is 1.85E+03 events/s

 *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -125,22 +125,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
 0 ! Helicity Sum/event 0=exact
 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
 --------------------
-Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x1_cudacpp > /tmp/valassia/output_ggttgg_x1_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/128
+Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/4
  [NGOODHEL] ngoodhel/ncomb = 64/64
  [XSECTION] VECSIZE_USED = 8192
  [XSECTION] MultiChannel = TRUE
  [XSECTION] Configuration = 1
  [XSECTION] ChannelId = 2
- [XSECTION] Cross section = 0.0003629 [3.6287295930626011E-004] fbridge_mode=1
+ [XSECTION] Cross section = 0.0003628 [3.6277277432965013E-004] fbridge_mode=1
  [UNWEIGHT] Wrote 59 events (found 420 events)
- [COUNTERS] PROGRAM TOTAL : 7.7609s
- [COUNTERS] Fortran Overhead ( 0 ) : 3.9574s
- [COUNTERS] CudaCpp MEs ( 2 ) : 3.8034s for 8192 events => throughput is 2.15E+03 events/s
+ [COUNTERS] PROGRAM TOTAL : 9.6340s
+ [COUNTERS] Fortran Overhead ( 0 ) : 4.9027s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 4.7313s for 8192 events => throughput is 1.73E+03 events/s

 *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***

-OK! xsec from fortran (3.6287295813310820E-004) and cpp (3.6287295930626011E-004) differ by less than 2E-4 (3.232954792764531e-09)
+OK! xsec from fortran (3.6277277311352982E-004) and cpp (3.6277277432965013E-004) differ by less than 2E-4 (3.352291999547674e-09)

 *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
@@ -158,36 +158,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
 0 ! Helicity Sum/event 0=exact
 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
 --------------------
-Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x10_cudacpp > /tmp/valassia/output_ggttgg_x10_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/128
+Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/4
  [NGOODHEL] ngoodhel/ncomb = 64/64
  [XSECTION] VECSIZE_USED = 8192
  [XSECTION] MultiChannel = TRUE
  [XSECTION] Configuration = 1
  [XSECTION] ChannelId = 2
- [XSECTION] Cross section = 0.000158 [1.5803455945000286E-004] fbridge_mode=1
+ [XSECTION] Cross section = 0.000158 [1.5803725813026107E-004] fbridge_mode=1
  [UNWEIGHT] Wrote 207 events (found 1235 events)
- [COUNTERS] PROGRAM TOTAL : 47.1027s
- [COUNTERS] Fortran Overhead ( 0 ) : 5.2266s
- [COUNTERS] CudaCpp MEs ( 2 ) : 41.8761s for 90112 events => throughput is 2.15E+03 events/s
+ [COUNTERS] PROGRAM TOTAL : 58.7075s
+ [COUNTERS] Fortran Overhead ( 0 ) : 6.6716s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 52.0359s for 90112 events => throughput is 1.73E+03 events/s

 *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***

-OK! xsec from fortran (1.5803455880587238E-004) and cpp (1.5803455945000286E-004) differ by less than 2E-4 (4.075883630605404e-09)
+OK! xsec from fortran (1.5803725748421158E-004) and cpp (1.5803725813026107E-004) differ by less than 2E-4 (4.087956861908992e-09)

 *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***

 OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical

 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.202379e+03 ) sec^-1
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 1.784786e+03 ) sec^-1

 *** EXECUTE CHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 2.202036e+03 ) sec^-1
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 1.785841e+03 ) sec^-1

 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -201,22 +201,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
 0 ! Helicity Sum/event 0=exact
 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
 --------------------
-Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x1_cudacpp > /tmp/valassia/output_ggttgg_x1_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/128
+Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/4
  [NGOODHEL] ngoodhel/ncomb = 64/64
  [XSECTION] VECSIZE_USED = 8192
  [XSECTION] MultiChannel = TRUE
  [XSECTION] Configuration = 1
  [XSECTION] ChannelId = 2
- [XSECTION] Cross section = 0.0003629 [3.6287295929360709E-004] fbridge_mode=1
+ [XSECTION] Cross section = 0.0003628 [3.6277277430934459E-004] fbridge_mode=1
  [UNWEIGHT] Wrote 59 events (found 420 events)
- [COUNTERS] PROGRAM TOTAL : 3.4788s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.8371s
- [COUNTERS] CudaCpp MEs ( 2 ) : 1.6417s for 8192 events => throughput is 4.99E+03 events/s
+ [COUNTERS] PROGRAM TOTAL : 5.0010s
+ [COUNTERS] Fortran Overhead ( 0 ) : 2.6173s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 2.3837s for 8192 events => throughput is 3.44E+03 events/s

 *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***

-OK! xsec from fortran (3.6287295813310820E-004) and cpp (3.6287295929360709E-004) differ by less than 2E-4 (3.1980857961855236e-09)
+OK! xsec from fortran (3.6277277311352982E-004) and cpp (3.6277277430934459E-004) differ by less than 2E-4 (3.296318995538172e-09)

 *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
@@ -234,36 +234,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
 0 ! Helicity Sum/event 0=exact
 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
 --------------------
-Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x10_cudacpp > /tmp/valassia/output_ggttgg_x10_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/128
+Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/4
  [NGOODHEL] ngoodhel/ncomb = 64/64
  [XSECTION] VECSIZE_USED = 8192
  [XSECTION] MultiChannel = TRUE
  [XSECTION] Configuration = 1
  [XSECTION] ChannelId = 2
- [XSECTION] Cross section = 0.000158 [1.5803455948191442E-004] fbridge_mode=1
+ [XSECTION] Cross section = 0.000158 [1.5803725816246315E-004] fbridge_mode=1
  [UNWEIGHT] Wrote 207 events (found 1235 events)
- [COUNTERS] PROGRAM TOTAL : 21.1084s
- [COUNTERS] Fortran Overhead ( 0 ) : 3.1042s
- [COUNTERS] CudaCpp MEs ( 2 ) : 18.0042s for 90112 events => throughput is 5.01E+03 events/s
+ [COUNTERS] PROGRAM TOTAL : 30.6721s
+ [COUNTERS] Fortran Overhead ( 0 ) : 4.4448s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 26.2273s for 90112 events => throughput is 3.44E+03 events/s

 *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***

-OK! xsec from fortran (1.5803455880587238E-004) and cpp (1.5803455948191442E-004) differ by less than 2E-4 (4.277811438413437e-09)
+OK! xsec from fortran (1.5803725748421158E-004) and cpp (1.5803725816246315E-004) differ by less than 2E-4 (4.291719424287521e-09)

 *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***

 OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical

 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 5.131114e+03 ) sec^-1
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 3.519557e+03 ) sec^-1

 *** EXECUTE CHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 5.127519e+03 ) sec^-1
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 3.539213e+03 ) sec^-1

 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
 --------------------
@@ -277,22 +277,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
 0 ! Helicity Sum/event 0=exact
 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
 --------------------
-Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x1_cudacpp > /tmp/valassia/output_ggttgg_x1_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/128
+Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/4
  [NGOODHEL] ngoodhel/ncomb = 64/64
  [XSECTION] VECSIZE_USED = 8192
  [XSECTION] MultiChannel = TRUE
  [XSECTION] Configuration = 1
  [XSECTION] ChannelId = 2
- [XSECTION] Cross section = 0.0003629 [3.6287295916873888E-004] fbridge_mode=1
+ [XSECTION] Cross section = 0.0003628 [3.6277277419683297E-004] fbridge_mode=1
  [UNWEIGHT] Wrote 59 events (found 420 events)
- [COUNTERS] PROGRAM TOTAL : 1.5800s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.8982s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.6819s for 8192 events => throughput is 1.20E+04 events/s
+ [COUNTERS] PROGRAM TOTAL : 2.3362s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.3065s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 1.0298s for 8192 events => throughput is 7.96E+03 events/s

 *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***

-OK! xsec from fortran (3.6287295813310820E-004) and cpp (3.6287295916873888E-004) differ by less than 2E-4 (2.85397594446124e-09)
+OK! xsec from fortran (3.6277277311352982E-004) and cpp (3.6277277419683297E-004) differ by less than 2E-4 (2.9861755290738756e-09)

 *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
@@ -310,40 +310,188 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
 0 ! Helicity Sum/event 0=exact
 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
 --------------------
-Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttgg_x10_cudacpp > /tmp/valassia/output_ggttgg_x10_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/128
+Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/4
  [NGOODHEL] ngoodhel/ncomb = 64/64
  [XSECTION] VECSIZE_USED = 8192
  [XSECTION] MultiChannel = TRUE
  [XSECTION] Configuration = 1
  [XSECTION] ChannelId = 2
- [XSECTION] Cross section = 0.000158 [1.5803455942593628E-004] fbridge_mode=1
+ [XSECTION] Cross section = 0.000158 [1.5803725810769321E-004] fbridge_mode=1
  [UNWEIGHT] Wrote 207 events (found 1235 events)
- [COUNTERS] PROGRAM TOTAL : 9.6911s
- [COUNTERS] Fortran Overhead ( 0 ) : 2.1649s
- [COUNTERS] CudaCpp MEs ( 2 ) : 7.5262s for 90112 events => throughput is 1.20E+04 events/s
+ [COUNTERS] PROGRAM TOTAL : 14.5206s
+ [COUNTERS] Fortran Overhead ( 0 ) : 3.0822s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 11.4384s for 90112 events => throughput is 7.88E+03 events/s

 *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***

-OK! xsec from fortran (1.5803455880587238E-004) and cpp (1.5803455942593628E-004) differ by less than 2E-4 (3.923596780808225e-09)
+OK! xsec from fortran (1.5803725748421158E-004) and cpp (1.5803725810769321E-004) differ by less than 2E-4 (3.945155979678816e-09)

 *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***

 OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical

 *** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.236690e+04 ) sec^-1
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 8.081577e+03 ) sec^-1
+
+*** EXECUTE CHECK(8192) -p 256 32 1 ***
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 8.110336e+03 ) sec^-1
+
+*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
+--------------------
+CUDACPP_RUNTIME_FBRIDGEMODE = (not set)
+CUDACPP_RUNTIME_VECSIZEUSED = 8192
+--------------------
+8192 1 1 ! Number of events and max and min iterations
+0.000001 ! Accuracy (ignored because max iterations = min iterations)
+0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present)
+1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement)
+0 ! Helicity Sum/event 0=exact
+1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
+--------------------
+Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/4
+ [NGOODHEL] ngoodhel/ncomb = 64/64
+ [XSECTION] VECSIZE_USED = 8192
+ [XSECTION] MultiChannel = TRUE
+ [XSECTION] Configuration = 1
+ [XSECTION] ChannelId = 2
+ [XSECTION] Cross section = 0.0003628 [3.6277277419683297E-004] fbridge_mode=1
+ [UNWEIGHT] Wrote 59 events (found 420 events)
+ [COUNTERS] PROGRAM TOTAL : 2.0953s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.1807s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.9146s for 8192 events => throughput is 8.96E+03 events/s
+
+*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
+
+OK! xsec from fortran (3.6277277311352982E-004) and cpp (3.6277277419683297E-004) differ by less than 2E-4 (2.9861755290738756e-09)
+
+*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
+
+OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical
+
+*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) ***
+--------------------
+CUDACPP_RUNTIME_FBRIDGEMODE = (not set)
+CUDACPP_RUNTIME_VECSIZEUSED = 8192
+--------------------
+81920 1 1 ! Number of events and max and min iterations
+0.000001 ! Accuracy (ignored because max iterations = min iterations)
+0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present)
+1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement)
+0 ! Helicity Sum/event 0=exact
+1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
+--------------------
+Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/4
+ [NGOODHEL] ngoodhel/ncomb = 64/64
+ [XSECTION] VECSIZE_USED = 8192
+ [XSECTION] MultiChannel = TRUE
+ [XSECTION] Configuration = 1
+ [XSECTION] ChannelId = 2
+ [XSECTION] Cross section = 0.000158 [1.5803725810769321E-004] fbridge_mode=1
+ [UNWEIGHT] Wrote 207 events (found 1235 events)
+ [COUNTERS] PROGRAM TOTAL : 13.0092s
+ [COUNTERS] Fortran Overhead ( 0 ) : 2.9623s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 10.0469s for 90112 events => throughput is 8.97E+03 events/s
+
+*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
+
+OK! xsec from fortran (1.5803725748421158E-004) and cpp (1.5803725810769321E-004) differ by less than 2E-4 (3.945155979678816e-09)
+
+*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
+
+OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
+
+*** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 9.305280e+03 ) sec^-1

 *** EXECUTE CHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.234370e+04 ) sec^-1
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 9.278381e+03 ) sec^-1
+
+*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) ***
+--------------------
+CUDACPP_RUNTIME_FBRIDGEMODE = (not set)
+CUDACPP_RUNTIME_VECSIZEUSED = 8192
+--------------------
+8192 1 1 ! Number of events and max and min iterations
+0.000001 ! Accuracy (ignored because max iterations = min iterations)
+0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present)
+1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement)
+0 ! Helicity Sum/event 0=exact
+1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
+--------------------
+Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/4
+ [NGOODHEL] ngoodhel/ncomb = 64/64
+ [XSECTION] VECSIZE_USED = 8192
+ [XSECTION] MultiChannel = TRUE
+ [XSECTION] Configuration = 1
+ [XSECTION] ChannelId = 2
+ [XSECTION] Cross section = 0.0003628 [3.6277277419683297E-004] fbridge_mode=1
+ [UNWEIGHT] Wrote 59 events (found 420 events)
+ [COUNTERS] PROGRAM TOTAL : 2.6707s
+ [COUNTERS] Fortran Overhead ( 0 ) : 1.4755s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 1.1952s for 8192 events => throughput is 6.85E+03 events/s
+
+*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec ***
+
+OK! xsec from fortran (3.6277277311352982E-004) and cpp (3.6277277419683297E-004) differ by less than 2E-4 (2.9861755290738756e-09)
+
+*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***

-*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) ***
+OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical

-*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) ***
+*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) ***
+--------------------
+CUDACPP_RUNTIME_FBRIDGEMODE = (not set)
+CUDACPP_RUNTIME_VECSIZEUSED = 8192
+--------------------
+81920 1 1 ! Number of events and max and min iterations
+0.000001 ! Accuracy (ignored because max iterations = min iterations)
+0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present)
+1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement)
+0 ! Helicity Sum/event 0=exact
+1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
+--------------------
+Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/4
+ [NGOODHEL] ngoodhel/ncomb = 64/64
+ [XSECTION] VECSIZE_USED = 8192
+ [XSECTION] MultiChannel = TRUE
+ [XSECTION] Configuration = 1
+ [XSECTION] ChannelId = 2
+ [XSECTION] Cross section = 0.000158 [1.5803725810769321E-004] fbridge_mode=1
+ [UNWEIGHT] Wrote 207 events (found 1235 events)
+ [COUNTERS] PROGRAM TOTAL : 16.3107s
+ [COUNTERS] Fortran Overhead ( 0 ) : 3.2346s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 13.0761s for 90112 events => throughput is 6.89E+03 events/s
+
+*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec ***
+
+OK! xsec from fortran (1.5803725748421158E-004) and cpp (1.5803725810769321E-004) differ by less than 2E-4 (3.945155979678816e-09)
+
+*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
+
+OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical
+
+*** EXECUTE CHECK(8192) -p 256 32 1 --bridge ***
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 6.979300e+03 ) sec^-1
+
+*** EXECUTE CHECK(8192) -p 256 32 1 ***
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 6.981216e+03 ) sec^-1

 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) ***
 --------------------
@@ -357,22 +505,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
 0 ! Helicity Sum/event 0=exact
 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
 --------------------
-Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggttgg_x1_cudacpp > /tmp/valassia/output_ggttgg_x1_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/128
+Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/4
  [NGOODHEL] ngoodhel/ncomb = 64/64
  [XSECTION] VECSIZE_USED = 8192
  [XSECTION] MultiChannel = TRUE
  [XSECTION] Configuration = 1
  [XSECTION] ChannelId = 2
- [XSECTION] Cross section = 0.0003629 [3.6287295792920187E-004] fbridge_mode=1
+ [XSECTION] Cross section = 0.0003628 [3.6277277293084701E-004] fbridge_mode=1
  [UNWEIGHT] Wrote 59 events (found 420 events)
- [COUNTERS] PROGRAM TOTAL : 0.8446s
- [COUNTERS] Fortran Overhead ( 0 ) : 0.7292s
- [COUNTERS] CudaCpp MEs ( 2 ) : 0.1153s for 8192 events => throughput is 7.10E+04 events/s
+ [COUNTERS] PROGRAM TOTAL : 0.8344s
+ [COUNTERS] Fortran Overhead ( 0 ) : 0.8014s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.0331s for 8192 events => throughput is 2.48E+05 events/s

 *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec ***

-OK! xsec from fortran (3.6287295813310820E-004) and cpp (3.6287295792920187E-004) differ by less than 2E-4 (5.619220644348388e-10)
+OK! xsec from fortran (3.6277277311352982E-004) and cpp (3.6277277293084701E-004) differ by less than 2E-4 (5.03573627241849e-10)

 *** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***
@@ -390,65 +538,65 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192
 0 ! Helicity Sum/event 0=exact
 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!)
 --------------------
-Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggttgg_x10_cudacpp > /tmp/valassia/output_ggttgg_x10_cudacpp'
- [OPENMPTH] omp_get_max_threads/nproc = 1/128
+Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp'
+ [OPENMPTH] omp_get_max_threads/nproc = 1/4
  [NGOODHEL] ngoodhel/ncomb = 64/64
  [XSECTION] VECSIZE_USED = 8192
  [XSECTION] MultiChannel = TRUE
  [XSECTION] Configuration = 1
  [XSECTION] ChannelId = 2
- [XSECTION] Cross section = 0.000158 [1.5803455870960301E-004] fbridge_mode=1
+ [XSECTION] Cross section = 0.000158 [1.5803725738731039E-004] fbridge_mode=1
  [UNWEIGHT] Wrote 207 events (found 1235 events)
- [COUNTERS] PROGRAM TOTAL : 3.2641s
- [COUNTERS] Fortran Overhead ( 0 ) : 1.9918s
- [COUNTERS] CudaCpp MEs ( 2 ) : 1.2723s for 90112 events => throughput is 7.08E+04 events/s
+ [COUNTERS] PROGRAM TOTAL : 2.9283s
+ [COUNTERS] Fortran Overhead ( 0 ) : 2.5651s
+ [COUNTERS] CudaCpp MEs ( 2 ) : 0.3632s for 90112 events => throughput is 2.48E+05 events/s

 *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec ***

-OK! xsec from fortran (1.5803455880587238E-004) and cpp (1.5803455870960301E-004) differ by less than 2E-4 (6.091666060470402e-10)
+OK! xsec from fortran (1.5803725748421158E-004) and cpp (1.5803725738731039E-004) differ by less than 2E-4 (6.131540830622839e-10)

 *** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) ***

 OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical

 *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge ***
-Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 7.186070e+04 ) sec^-1
+Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 2.286633e+05 ) sec^-1

 *** EXECUTE GCHECK(8192) -p 256 32 1 ***
-Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = HIP:MIX+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 5.461379e+04 ) sec^-1
+Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 2.522365e+05 ) sec^-1

 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge ***
-Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.243473e+05 ) sec^-1
+Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 4.122442e+05 ) sec^-1

 *** EXECUTE GCHECK(MAX) -p 16384 32 1 ***
-Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = HIP:MIX+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 8.016222e+04 ) sec^-1
+Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 4.148040e+05 ) sec^-1

 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge ***
-Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.240703e+05 ) sec^-1
+Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 4.112221e+05 ) sec^-1

 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 ***
-Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = HIP:MIX+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.229491e+05 ) sec^-1
+Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 4.164785e+05 ) sec^-1

 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge ***
-Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 1.241630e+05 ) sec^-1
+Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 4.108117e+05 ) sec^-1

 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 ***
-Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = HIP:MIX+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK
-EvtsPerSec[MECalcOnly] (3a) = ( 3.381724e+04 ) sec^-1
+Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
+EvtsPerSec[MECalcOnly] (3a) = ( 1.430780e+05 ) sec^-1

 TEST COMPLETED
diff --git a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt
index fac9a60d3b..bc47a109df 100644
--- a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt
+++ b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt
@@ -1,42 +1,42 @@
-Working directory (build): /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg
+Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg

 CUDACPP_BUILDDIR='.'
+
 make USEBUILDDIR=1 AVX=none
-make USEBUILDDIR=1 AVX=sse4
-make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
-make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
+make USEBUILDDIR=1 AVX=sse4
 make USEBUILDDIR=1 AVX=avx2
-
 make USEBUILDDIR=1 AVX=512y
-make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
-make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
 make USEBUILDDIR=1 AVX=512z
-make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
-CUDACPP_BUILDDIR='build.none_d_inl0_hrd0'
-CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0'
-CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0'
-CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0'
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
 CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0'
+CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0'
+CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0'
+CUDACPP_BUILDDIR='build.none_d_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
+make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
+make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' OMP_NUM_THREADS= -DATE: 2024-01-31_15:27:32 +DATE: 2024-01-30_06:27:59 -On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: -Working directory (run): /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -50,18 +50,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggttggg_x1_fortran > /tmp/valassia/output_ggttggg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/avalassi/output_ggttggg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.169e-06 [1.1692567358747583E-006] fbridge_mode=0 + [XSECTION] Cross section = 1.169e-06 [1.1693100945435806E-006] fbridge_mode=0 [UNWEIGHT] Wrote 1 events (found 166 events) - [COUNTERS] PROGRAM TOTAL : 54.8219s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4154s - [COUNTERS] Fortran MEs ( 1 ) : 54.4066s for 8192 events => throughput is 1.51E+02 events/s + [COUNTERS] PROGRAM TOTAL : 101.9143s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4823s + [COUNTERS] Fortran MEs ( 1 ) : 101.4320s for 8192 events => throughput is 8.08E+01 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -75,18 +75,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggttggg_x1_fortran > /tmp/valassia/output_ggttggg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/avalassi/output_ggttggg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.169e-06 [1.1692567358747583E-006] fbridge_mode=0 + [XSECTION] Cross section = 1.169e-06 [1.1693100945435806E-006] fbridge_mode=0 [UNWEIGHT] Wrote 15 events (found 163 events) - [COUNTERS] PROGRAM TOTAL : 54.7889s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3599s - [COUNTERS] Fortran MEs ( 1 ) : 54.4290s for 8192 events => throughput is 1.51E+02 events/s + [COUNTERS] PROGRAM TOTAL : 101.8572s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4740s + [COUNTERS] Fortran MEs ( 1 ) : 101.3832s for 8192 events => throughput is 8.08E+01 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -100,18 +100,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggttggg_x10_fortran > /tmp/valassia/output_ggttggg_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x10_fortran > /tmp/avalassi/output_ggttggg_x10_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.136e-07 [2.1358347510913632E-007] fbridge_mode=0 + [XSECTION] Cross section = 2.136e-07 [2.1358436158813979E-007] fbridge_mode=0 [UNWEIGHT] Wrote 84 events (found 808 events) - [COUNTERS] PROGRAM TOTAL : 600.8898s - [COUNTERS] Fortran Overhead ( 0 ) : 2.9391s - [COUNTERS] Fortran MEs ( 1 ) : 597.9506s for 90112 events => throughput is 1.51E+02 events/s + [COUNTERS] PROGRAM TOTAL : 1118.0575s + [COUNTERS] Fortran Overhead ( 0 ) : 4.3730s + [COUNTERS] Fortran MEs ( 1 ) : 1113.6844s for 90112 events => throughput is 8.09E+01 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -125,22 +125,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_x1_cudacpp > /tmp/valassia/output_ggttggg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.169e-06 [1.1692567358747608E-006] fbridge_mode=1 + [XSECTION] Cross section = 1.169e-06 [1.1693100945435831E-006] fbridge_mode=1 [UNWEIGHT] Wrote 15 events (found 163 events) - [COUNTERS] PROGRAM TOTAL : 174.2219s - [COUNTERS] Fortran Overhead ( 0 ) : 79.6854s - [COUNTERS] CudaCpp MEs ( 2 ) : 94.5365s for 8192 events => throughput is 8.67E+01 events/s + [COUNTERS] PROGRAM TOTAL : 222.3358s + [COUNTERS] Fortran Overhead ( 0 ) : 102.7450s + [COUNTERS] CudaCpp MEs ( 2 ) : 119.5908s for 8192 events => throughput is 6.85E+01 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.1692567358747583E-006) and cpp (1.1692567358747608E-006) differ by less than 2E-14 (2.220446049250313e-15) +OK! xsec from fortran (1.1693100945435806E-006) and cpp (1.1693100945435831E-006) differ by less than 2E-14 (2.220446049250313e-15) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -158,36 +158,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_x10_cudacpp > /tmp/valassia/output_ggttggg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.136e-07 [2.1358347510913627E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.136e-07 [2.1358436158813950E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 808 events) - [COUNTERS] PROGRAM TOTAL : 1125.3822s - [COUNTERS] Fortran Overhead ( 0 ) : 82.5529s - [COUNTERS] CudaCpp MEs ( 2 ) : 1042.8293s for 90112 events => throughput is 8.64E+01 events/s + [COUNTERS] PROGRAM TOTAL : 1439.6055s + [COUNTERS] Fortran Overhead ( 0 ) : 107.7197s + [COUNTERS] CudaCpp MEs ( 2 ) : 1331.8857s for 90112 events => throughput is 6.77E+01 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.1358347510913632E-007) and cpp (2.1358347510913627E-007) differ by less than 2E-14 (2.220446049250313e-16) +OK! xsec from fortran (2.1358436158813979E-007) and cpp (2.1358436158813950E-007) differ by less than 2E-14 (1.3322676295501878e-15) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.034374e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.948640e+01 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.030138e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.570768e+01 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -201,22 +201,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_x1_cudacpp > /tmp/valassia/output_ggttggg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.169e-06 [1.1692567358747610E-006] fbridge_mode=1 + [XSECTION] Cross section = 1.169e-06 [1.1693100945435831E-006] fbridge_mode=1 [UNWEIGHT] Wrote 15 events (found 163 events) - [COUNTERS] PROGRAM TOTAL : 81.7577s - [COUNTERS] Fortran Overhead ( 0 ) : 36.8116s - [COUNTERS] CudaCpp MEs ( 2 ) : 44.9462s for 8192 events => throughput is 1.82E+02 events/s + [COUNTERS] PROGRAM TOTAL : 115.4155s + [COUNTERS] Fortran Overhead ( 0 ) : 52.8722s + [COUNTERS] CudaCpp MEs ( 2 ) : 62.5433s for 8192 events => throughput is 1.31E+02 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.1692567358747583E-006) and cpp (1.1692567358747610E-006) differ by less than 2E-14 (2.4424906541753444e-15) +OK! xsec from fortran (1.1693100945435806E-006) and cpp (1.1693100945435831E-006) differ by less than 2E-14 (2.220446049250313e-15) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -234,36 +234,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_x10_cudacpp > /tmp/valassia/output_ggttggg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.136e-07 [2.1358347510913637E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.136e-07 [2.1358436158813958E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 808 events) - [COUNTERS] PROGRAM TOTAL : 534.2423s - [COUNTERS] Fortran Overhead ( 0 ) : 39.3232s - [COUNTERS] CudaCpp MEs ( 2 ) : 494.9191s for 90112 events => throughput is 1.82E+02 events/s + [COUNTERS] PROGRAM TOTAL : 742.6112s + [COUNTERS] Fortran Overhead ( 0 ) : 56.7177s + [COUNTERS] CudaCpp MEs ( 2 ) : 685.8936s for 90112 events => throughput is 1.31E+02 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.1358347510913632E-007) and cpp (2.1358347510913637E-007) differ by less than 2E-14 (2.220446049250313e-16) +OK! xsec from fortran (2.1358436158813979E-007) and cpp (2.1358436158813958E-007) differ by less than 2E-14 (9.992007221626409e-16) *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.255815e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.569503e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.262018e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.568080e+02 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -277,22 +277,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_x1_cudacpp > /tmp/valassia/output_ggttggg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.169e-06 [1.1692567358747600E-006] fbridge_mode=1 + [XSECTION] Cross section = 1.169e-06 [1.1693100945435827E-006] fbridge_mode=1 [UNWEIGHT] Wrote 15 events (found 163 events) - [COUNTERS] PROGRAM TOTAL : 34.9158s - [COUNTERS] Fortran Overhead ( 0 ) : 15.8355s - [COUNTERS] CudaCpp MEs ( 2 ) : 19.0803s for 8192 events => throughput is 4.29E+02 events/s + [COUNTERS] PROGRAM TOTAL : 53.0772s + [COUNTERS] Fortran Overhead ( 0 ) : 24.7915s + [COUNTERS] CudaCpp MEs ( 2 ) : 28.2857s for 8192 events => throughput is 2.90E+02 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.1692567358747583E-006) and cpp (1.1692567358747600E-006) differ by less than 2E-14 (1.5543122344752192e-15) +OK! xsec from fortran (1.1693100945435806E-006) and cpp (1.1693100945435827E-006) differ by less than 2E-14 (1.7763568394002505e-15) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -310,40 +310,188 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_x10_cudacpp > /tmp/valassia/output_ggttggg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.136e-07 [2.1358347510913632E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.136e-07 [2.1358436158813958E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 808 events) - [COUNTERS] PROGRAM TOTAL : 228.0263s - [COUNTERS] Fortran Overhead ( 0 ) : 18.3642s - [COUNTERS] CudaCpp MEs ( 2 ) : 209.6621s for 90112 events => throughput is 4.30E+02 events/s + [COUNTERS] PROGRAM TOTAL : 340.2848s + [COUNTERS] Fortran Overhead ( 0 ) : 28.6537s + [COUNTERS] CudaCpp MEs ( 2 ) : 311.6311s for 90112 events => throughput is 2.89E+02 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.1358347510913632E-007) and cpp (2.1358347510913632E-007) differ by less than 2E-14 (0.0) +OK! xsec from fortran (2.1358436158813979E-007) and cpp (2.1358436158813958E-007) differ by less than 2E-14 (9.992007221626409e-16) *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.298576e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.398486e+02 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.399363e+02 ) sec^-1 + +*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 1.169e-06 [1.1693100945435827E-006] fbridge_mode=1 + [UNWEIGHT] Wrote 15 events (found 163 events) + [COUNTERS] PROGRAM TOTAL : 46.8647s + [COUNTERS] Fortran Overhead ( 0 ) : 21.6683s + [COUNTERS] CudaCpp MEs ( 2 ) : 25.1965s for 8192 events => throughput is 3.25E+02 events/s + +*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (1.1693100945435806E-006) and cpp (1.1693100945435827E-006) differ by less than 2E-14 (1.7763568394002505e-15) + +*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 2.136e-07 [2.1358436158813958E-007] fbridge_mode=1 + [UNWEIGHT] Wrote 84 events (found 808 events) + [COUNTERS] PROGRAM TOTAL : 302.3706s + [COUNTERS] Fortran Overhead ( 0 ) : 25.4811s + [COUNTERS] CudaCpp MEs ( 2 ) : 276.8895s for 90112 events => throughput is 3.25E+02 events/s + +*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (2.1358436158813979E-007) and cpp (2.1358436158813958E-007) differ by less than 2E-14 (9.992007221626409e-16) + +*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.882449e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.317212e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.889860e+02 ) sec^-1 + +*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 1.169e-06 [1.1693100945435827E-006] fbridge_mode=1 + [UNWEIGHT] Wrote 15 events (found 163 events) + [COUNTERS] PROGRAM TOTAL : 50.0231s + [COUNTERS] Fortran Overhead ( 0 ) : 24.6335s + [COUNTERS] CudaCpp MEs ( 2 ) : 25.3895s for 8192 events => throughput is 3.23E+02 events/s -*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) *** +*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) *** +OK! 
xsec from fortran (1.1693100945435806E-006) and cpp (1.1693100945435827E-006) differ by less than 2E-14 (1.7763568394002505e-15) + +*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 2.136e-07 [2.1358436158813958E-007] fbridge_mode=1 + [UNWEIGHT] Wrote 84 events (found 808 events) + [COUNTERS] PROGRAM TOTAL : 308.5181s + [COUNTERS] Fortran Overhead ( 0 ) : 28.4720s + [COUNTERS] CudaCpp MEs ( 2 ) : 280.0461s for 90112 events => throughput is 3.22E+02 events/s + +*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (2.1358436158813979E-007) and cpp (2.1358436158813958E-007) differ by less than 2E-14 (9.992007221626409e-16) + +*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.386684e+02 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.385729e+02 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -357,22 +505,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggttggg_x1_cudacpp > /tmp/valassia/output_ggttggg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.169e-06 [1.1692567358747604E-006] fbridge_mode=1 + [XSECTION] Cross section = 1.169e-06 [1.1693100945435829E-006] fbridge_mode=1 [UNWEIGHT] Wrote 15 events (found 163 events) - [COUNTERS] PROGRAM TOTAL : 11.6796s - [COUNTERS] Fortran Overhead ( 0 ) : 7.7273s - [COUNTERS] CudaCpp MEs ( 2 ) : 3.9522s for 8192 events => throughput is 2.07E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.2467s + [COUNTERS] Fortran Overhead ( 0 ) : 3.1625s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.0842s for 8192 events => throughput is 7.56E+03 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.1692567358747583E-006) and cpp (1.1692567358747604E-006) differ by less than 2E-14 (1.7763568394002505e-15) +OK! xsec from fortran (1.1693100945435806E-006) and cpp (1.1693100945435829E-006) differ by less than 2E-14 (1.9984014443252818e-15) *** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -390,65 +538,65 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggttggg_x10_cudacpp > /tmp/valassia/output_ggttggg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.136e-07 [2.1358347510913632E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.136e-07 [2.1358436158813960E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 808 events) - [COUNTERS] PROGRAM TOTAL : 53.7849s - [COUNTERS] Fortran Overhead ( 0 ) : 10.3304s - [COUNTERS] CudaCpp MEs ( 2 ) : 43.4545s for 90112 events => throughput is 2.07E+03 events/s + [COUNTERS] PROGRAM TOTAL : 18.9518s + [COUNTERS] Fortran Overhead ( 0 ) : 7.0338s + [COUNTERS] CudaCpp MEs ( 2 ) : 11.9179s for 90112 events => throughput is 7.56E+03 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.1358347510913632E-007) and cpp (2.1358347510913632E-007) differ by less than 2E-14 (0.0) +OK! xsec from fortran (2.1358436158813979E-007) and cpp (2.1358436158813960E-007) differ by less than 2E-14 (8.881784197001252e-16) *** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.097186e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.528868e+03 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.160043e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.249701e+03 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 512 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.470954e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.231891e+03 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 512 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.436635e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.557033e+03 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 128 128 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.490950e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.244700e+03 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 128 128 1 *** -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.435708e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.446855e+03 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.482049e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.214530e+03 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 *** -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.114797e+03 ) sec^-1 
+Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.244468e+03 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt index e876937058..c35aa0a017 100644 --- a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg CUDACPP_BUILDDIR='.' + + make USEBUILDDIR=1 AVX=none make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' - make USEBUILDDIR=1 AVX=avx2 - make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' OMP_NUM_THREADS= -DATE: 2024-01-31_16:22:53 +DATE: 2024-01-30_07:59:04 -On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: -Working directory (run): /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -50,18 +50,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggttggg_x1_fortran > /tmp/valassia/output_ggttggg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/avalassi/output_ggttggg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.169e-06 [1.1692567358747583E-006] fbridge_mode=0 + [XSECTION] Cross section = 1.169e-06 [1.1693100945435806E-006] fbridge_mode=0 [UNWEIGHT] Wrote 1 events (found 166 events) - [COUNTERS] PROGRAM TOTAL : 54.7916s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3438s - [COUNTERS] Fortran MEs ( 1 ) : 54.4478s for 8192 events => throughput is 1.50E+02 events/s + [COUNTERS] PROGRAM TOTAL : 101.8466s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4779s + [COUNTERS] Fortran MEs ( 1 ) : 101.3687s for 8192 events => throughput is 8.08E+01 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -75,18 +75,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggttggg_x1_fortran > /tmp/valassia/output_ggttggg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/avalassi/output_ggttggg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.169e-06 [1.1692567358747583E-006] fbridge_mode=0 + [XSECTION] Cross section = 1.169e-06 [1.1693100945435806E-006] fbridge_mode=0 [UNWEIGHT] Wrote 15 events (found 163 events) - [COUNTERS] PROGRAM TOTAL : 54.7291s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3431s - [COUNTERS] Fortran MEs ( 1 ) : 54.3860s for 8192 events => throughput is 1.51E+02 events/s + [COUNTERS] PROGRAM TOTAL : 101.7818s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4752s + [COUNTERS] Fortran MEs ( 1 ) : 101.3066s for 8192 events => throughput is 8.09E+01 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -100,18 +100,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggttggg_x10_fortran > /tmp/valassia/output_ggttggg_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x10_fortran > /tmp/avalassi/output_ggttggg_x10_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.136e-07 [2.1358347510913632E-007] fbridge_mode=0 + [XSECTION] Cross section = 2.136e-07 [2.1358436158813979E-007] fbridge_mode=0 [UNWEIGHT] Wrote 84 events (found 808 events) - [COUNTERS] PROGRAM TOTAL : 601.1562s - [COUNTERS] Fortran Overhead ( 0 ) : 2.9495s - [COUNTERS] Fortran MEs ( 1 ) : 598.2067s for 90112 events => throughput is 1.51E+02 events/s + [COUNTERS] PROGRAM TOTAL : 1118.6550s + [COUNTERS] Fortran Overhead ( 0 ) : 4.3767s + [COUNTERS] Fortran MEs ( 1 ) : 1114.2783s for 90112 events => throughput is 8.09E+01 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -125,22 +125,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_x1_cudacpp > /tmp/valassia/output_ggttggg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.169e-06 [1.1694234561464155E-006] fbridge_mode=1 + [XSECTION] Cross section = 1.169e-06 [1.1694768374083672E-006] fbridge_mode=1 [UNWEIGHT] Wrote 15 events (found 163 events) - [COUNTERS] PROGRAM TOTAL : 162.5170s - [COUNTERS] Fortran Overhead ( 0 ) : 74.5337s - [COUNTERS] CudaCpp MEs ( 2 ) : 87.9834s for 8192 events => throughput is 9.31E+01 events/s + [COUNTERS] PROGRAM TOTAL : 205.5797s + [COUNTERS] Fortran Overhead ( 0 ) : 95.6059s + [COUNTERS] CudaCpp MEs ( 2 ) : 109.9738s for 8192 events => throughput is 7.45E+01 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.1692567358747583E-006) and cpp (1.1694234561464155E-006) differ by less than 4E-4 (0.00014258653941601196) +OK! xsec from fortran (1.1693100945435806E-006) and cpp (1.1694768374083672E-006) differ by less than 4E-4 (0.00014259935458071915) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -158,36 +158,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_x10_cudacpp > /tmp/valassia/output_ggttggg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.136e-07 [2.1361347375693870E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.136e-07 [2.1361435710758843E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 808 events) - [COUNTERS] PROGRAM TOTAL : 1043.3068s - [COUNTERS] Fortran Overhead ( 0 ) : 77.0455s - [COUNTERS] CudaCpp MEs ( 2 ) : 966.2612s for 90112 events => throughput is 9.33E+01 events/s + [COUNTERS] PROGRAM TOTAL : 1305.3326s + [COUNTERS] Fortran Overhead ( 0 ) : 98.5377s + [COUNTERS] CudaCpp MEs ( 2 ) : 1206.7949s for 90112 events => throughput is 7.47E+01 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.1358347510913632E-007) and cpp (2.1361347375693870E-007) differ by less than 4E-4 (0.0001404539737310806) +OK! xsec from fortran (2.1358436158813979E-007) and cpp (2.1361435710758843E-007) differ by less than 4E-4 (0.0001404387438554977) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.108680e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.692219e+01 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.111948e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.699275e+01 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -201,22 +201,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_x1_cudacpp > /tmp/valassia/output_ggttggg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.169e-06 [1.1694233472680930E-006] fbridge_mode=1 + [XSECTION] Cross section = 1.169e-06 [1.1694765360831655E-006] fbridge_mode=1 [UNWEIGHT] Wrote 15 events (found 163 events) - [COUNTERS] PROGRAM TOTAL : 39.5436s - [COUNTERS] Fortran Overhead ( 0 ) : 18.1607s - [COUNTERS] CudaCpp MEs ( 2 ) : 21.3829s for 8192 events => throughput is 3.83E+02 events/s + [COUNTERS] PROGRAM TOTAL : 52.0497s + [COUNTERS] Fortran Overhead ( 0 ) : 24.6638s + [COUNTERS] CudaCpp MEs ( 2 ) : 27.3859s for 8192 events => throughput is 2.99E+02 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.1692567358747583E-006) and cpp (1.1694233472680930E-006) differ by less than 4E-4 (0.0001424934218661189) +OK! xsec from fortran (1.1693100945435806E-006) and cpp (1.1694765360831655E-006) differ by less than 4E-4 (0.00014234165972015766) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -234,36 +234,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_x10_cudacpp > /tmp/valassia/output_ggttggg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.136e-07 [2.1361343132853083E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.136e-07 [2.1361429212586563E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 808 events) - [COUNTERS] PROGRAM TOTAL : 255.6894s - [COUNTERS] Fortran Overhead ( 0 ) : 20.7025s - [COUNTERS] CudaCpp MEs ( 2 ) : 234.9869s for 90112 events => throughput is 3.83E+02 events/s + [COUNTERS] PROGRAM TOTAL : 336.4854s + [COUNTERS] Fortran Overhead ( 0 ) : 29.3396s + [COUNTERS] CudaCpp MEs ( 2 ) : 307.1459s for 90112 events => throughput is 2.93E+02 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.1358347510913632E-007) and cpp (2.1361343132853083E-007) differ by less than 4E-4 (0.00014025532349459802) +OK! xsec from fortran (2.1358436158813979E-007) and cpp (2.1361429212586563E-007) differ by less than 4E-4 (0.00014013450003202976) *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.624456e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.371429e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.616503e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.391230e+02 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -277,22 +277,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_x1_cudacpp > /tmp/valassia/output_ggttggg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.169e-06 [1.1694232419162335E-006] fbridge_mode=1 + [XSECTION] Cross section = 1.169e-06 [1.1694764906356561E-006] fbridge_mode=1 [UNWEIGHT] Wrote 15 events (found 163 events) - [COUNTERS] PROGRAM TOTAL : 17.8468s - [COUNTERS] Fortran Overhead ( 0 ) : 8.2769s - [COUNTERS] CudaCpp MEs ( 2 ) : 9.5699s for 8192 events => throughput is 8.56E+02 events/s + [COUNTERS] PROGRAM TOTAL : 27.0721s + [COUNTERS] Fortran Overhead ( 0 ) : 12.6637s + [COUNTERS] CudaCpp MEs ( 2 ) : 14.4085s for 8192 events => throughput is 5.69E+02 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.1692567358747583E-006) and cpp (1.1694232419162335E-006) differ by less than 4E-4 (0.0001424033203030195) +OK! xsec from fortran (1.1693100945435806E-006) and cpp (1.1694764906356561E-006) differ by less than 4E-4 (0.0001423027927767162) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -310,40 +310,188 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_x10_cudacpp > /tmp/valassia/output_ggttggg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.136e-07 [2.1361342605756045E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.136e-07 [2.1361429111797059E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 808 events) - [COUNTERS] PROGRAM TOTAL : 116.5354s - [COUNTERS] Fortran Overhead ( 0 ) : 10.8269s - [COUNTERS] CudaCpp MEs ( 2 ) : 105.7085s for 90112 events => throughput is 8.52E+02 events/s + [COUNTERS] PROGRAM TOTAL : 174.3265s + [COUNTERS] Fortran Overhead ( 0 ) : 16.5080s + [COUNTERS] CudaCpp MEs ( 2 ) : 157.8185s for 90112 events => throughput is 5.71E+02 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.1358347510913632E-007) and cpp (2.1361342605756045E-007) differ by less than 4E-4 (0.0001402306447575441) +OK! xsec from fortran (2.1358436158813979E-007) and cpp (2.1361429111797059E-007) differ by less than 4E-4 (0.00014012978107680318) *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.043447e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.735262e+02 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.739370e+02 ) sec^-1 + +*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 1.169e-06 [1.1694764906356561E-006] fbridge_mode=1 + [UNWEIGHT] Wrote 15 events (found 163 events) + [COUNTERS] PROGRAM TOTAL : 23.8611s + [COUNTERS] Fortran Overhead ( 0 ) : 11.1738s + [COUNTERS] CudaCpp MEs ( 2 ) : 12.6873s for 8192 events => throughput is 6.46E+02 events/s + +*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (1.1693100945435806E-006) and cpp (1.1694764906356561E-006) differ by less than 4E-4 (0.0001423027927767162) + +*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 2.136e-07 [2.1361429111797059E-007] fbridge_mode=1 + [UNWEIGHT] Wrote 84 events (found 808 events) + [COUNTERS] PROGRAM TOTAL : 154.4819s + [COUNTERS] Fortran Overhead ( 0 ) : 15.1638s + [COUNTERS] CudaCpp MEs ( 2 ) : 139.3180s for 90112 events => throughput is 6.47E+02 events/s + +*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (2.1358436158813979E-007) and cpp (2.1361429111797059E-007) differ by less than 4E-4 (0.00014012978107680318) + +*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.672552e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.039749e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.678478e+02 ) sec^-1 + +*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 1.169e-06 [1.1694768276769753E-006] fbridge_mode=1 + [UNWEIGHT] Wrote 15 events (found 163 events) + [COUNTERS] PROGRAM TOTAL : 25.4092s + [COUNTERS] Fortran Overhead ( 0 ) : 12.6834s + [COUNTERS] CudaCpp MEs ( 2 ) : 12.7257s for 8192 events => throughput is 6.44E+02 events/s -*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) *** +*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) *** +OK! 
xsec from fortran (1.1693100945435806E-006) and cpp (1.1694768276769753E-006) differ by less than 4E-4 (0.00014259103224434355) + +*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 2.136e-07 [2.1361435948756818E-007] fbridge_mode=1 + [UNWEIGHT] Wrote 84 events (found 808 events) + [COUNTERS] PROGRAM TOTAL : 156.3015s + [COUNTERS] Fortran Overhead ( 0 ) : 16.4754s + [COUNTERS] CudaCpp MEs ( 2 ) : 139.8261s for 90112 events => throughput is 6.44E+02 events/s + +*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (2.1358436158813979E-007) and cpp (2.1361435948756818E-007) differ by less than 4E-4 (0.00014044988689865257) + +*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.776081e+02 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.750726e+02 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -357,22 +505,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggttggg_x1_cudacpp > /tmp/valassia/output_ggttggg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.169e-06 [1.1694234612933678E-006] fbridge_mode=1 + [XSECTION] Cross section = 1.169e-06 [1.1694770708194997E-006] fbridge_mode=1 [UNWEIGHT] Wrote 15 events (found 163 events) - [COUNTERS] PROGRAM TOTAL : 6.2810s - [COUNTERS] Fortran Overhead ( 0 ) : 4.3959s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.8851s for 8192 events => throughput is 4.35E+03 events/s + [COUNTERS] PROGRAM TOTAL : 2.5003s + [COUNTERS] Fortran Overhead ( 0 ) : 2.0019s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.4984s for 8192 events => throughput is 1.64E+04 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.1692567358747583E-006) and cpp (1.1694234612933678E-006) differ by less than 4E-4 (0.0001425909413168558) +OK! xsec from fortran (1.1693100945435806E-006) and cpp (1.1694770708194997E-006) differ by less than 4E-4 (0.00014279896898039546) *** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -390,65 +538,65 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggttggg_x10_cudacpp > /tmp/valassia/output_ggttggg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.136e-07 [2.1361349638985098E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.136e-07 [2.1361443477565656E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 808 events) - [COUNTERS] PROGRAM TOTAL : 27.6605s - [COUNTERS] Fortran Overhead ( 0 ) : 6.8371s - [COUNTERS] CudaCpp MEs ( 2 ) : 20.8234s for 90112 events => throughput is 4.33E+03 events/s + [COUNTERS] PROGRAM TOTAL : 11.2881s + [COUNTERS] Fortran Overhead ( 0 ) : 5.8695s + [COUNTERS] CudaCpp MEs ( 2 ) : 5.4186s for 90112 events => throughput is 1.66E+04 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.1358347510913632E-007) and cpp (2.1361349638985098E-007) differ by less than 4E-4 (0.00014055994125627969) +OK! xsec from fortran (2.1358436158813979E-007) and cpp (2.1361443477565656E-007) differ by less than 4E-4 (0.00014080238503022535) *** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.339238e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.635547e+04 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.447652e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.633264e+04 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 512 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.122381e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.309560e+04 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 512 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.385965e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.405493e+04 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 128 128 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.151108e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.341562e+04 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 128 128 1 *** -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.855986e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.341458e+04 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.113295e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.336833e+04 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 *** -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.045829e+03 ) sec^-1 
+Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.413620e+03 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt index 8e208f24ab..b9faa14c51 100644 --- a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg CUDACPP_BUILDDIR='.' -make USEBUILDDIR=1 AVX=none + +make USEBUILDDIR=1 AVX=none make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=avx2 - make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' -CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' OMP_NUM_THREADS= -DATE: 2024-01-31_17:07:16 +DATE: 2024-01-30_09:07:55 -On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: -Working directory (run): /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -50,18 +50,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggttggg_x1_fortran > /tmp/valassia/output_ggttggg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/avalassi/output_ggttggg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.169e-06 [1.1692567358747583E-006] fbridge_mode=0 + [XSECTION] Cross section = 1.169e-06 [1.1693100945435806E-006] fbridge_mode=0 [UNWEIGHT] Wrote 1 events (found 166 events) - [COUNTERS] PROGRAM TOTAL : 54.7288s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3430s - [COUNTERS] Fortran MEs ( 1 ) : 54.3859s for 8192 events => throughput is 1.51E+02 events/s + [COUNTERS] PROGRAM TOTAL : 101.9697s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4771s + [COUNTERS] Fortran MEs ( 1 ) : 101.4926s for 8192 events => throughput is 8.07E+01 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -75,18 +75,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggttggg_x1_fortran > /tmp/valassia/output_ggttggg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/avalassi/output_ggttggg_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.169e-06 [1.1692567358747583E-006] fbridge_mode=0 + [XSECTION] Cross section = 1.169e-06 [1.1693100945435806E-006] fbridge_mode=0 [UNWEIGHT] Wrote 15 events (found 163 events) - [COUNTERS] PROGRAM TOTAL : 54.7250s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3459s - [COUNTERS] Fortran MEs ( 1 ) : 54.3790s for 8192 events => throughput is 1.51E+02 events/s + [COUNTERS] PROGRAM TOTAL : 101.6914s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4764s + [COUNTERS] Fortran MEs ( 1 ) : 101.2150s for 8192 events => throughput is 8.09E+01 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -100,18 +100,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_ggttggg_x10_fortran > /tmp/valassia/output_ggttggg_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x10_fortran > /tmp/avalassi/output_ggttggg_x10_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.136e-07 [2.1358347510913632E-007] fbridge_mode=0 + [XSECTION] Cross section = 2.136e-07 [2.1358436158813979E-007] fbridge_mode=0 [UNWEIGHT] Wrote 84 events (found 808 events) - [COUNTERS] PROGRAM TOTAL : 601.1975s - [COUNTERS] Fortran Overhead ( 0 ) : 2.9597s - [COUNTERS] Fortran MEs ( 1 ) : 598.2379s for 90112 events => throughput is 1.51E+02 events/s + [COUNTERS] PROGRAM TOTAL : 1118.2550s + [COUNTERS] Fortran Overhead ( 0 ) : 4.3831s + [COUNTERS] Fortran MEs ( 1 ) : 1113.8719s for 90112 events => throughput is 8.09E+01 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -125,22 +125,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_x1_cudacpp > /tmp/valassia/output_ggttggg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.169e-06 [1.1692567430116567E-006] fbridge_mode=1 + [XSECTION] Cross section = 1.169e-06 [1.1693101016896844E-006] fbridge_mode=1 [UNWEIGHT] Wrote 15 events (found 163 events) - [COUNTERS] PROGRAM TOTAL : 175.6033s - [COUNTERS] Fortran Overhead ( 0 ) : 80.7142s - [COUNTERS] CudaCpp MEs ( 2 ) : 94.8891s for 8192 events => throughput is 8.63E+01 events/s + [COUNTERS] PROGRAM TOTAL : 224.2502s + [COUNTERS] Fortran Overhead ( 0 ) : 103.5045s + [COUNTERS] CudaCpp MEs ( 2 ) : 120.7457s for 8192 events => throughput is 6.78E+01 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.1692567358747583E-006) and cpp (1.1692567430116567E-006) differ by less than 2E-4 (6.103790806122333e-09) +OK! xsec from fortran (1.1693100945435806E-006) and cpp (1.1693101016896844E-006) differ by less than 2E-4 (6.1113847316107694e-09) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -158,36 +158,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_x10_cudacpp > /tmp/valassia/output_ggttggg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.136e-07 [2.1358347627977553E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.136e-07 [2.1358436275882778E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 808 events) - [COUNTERS] PROGRAM TOTAL : 1120.6920s - [COUNTERS] Fortran Overhead ( 0 ) : 82.8854s - [COUNTERS] CudaCpp MEs ( 2 ) : 1037.8066s for 90112 events => throughput is 8.68E+01 events/s + [COUNTERS] PROGRAM TOTAL : 1439.9019s + [COUNTERS] Fortran Overhead ( 0 ) : 107.2148s + [COUNTERS] CudaCpp MEs ( 2 ) : 1332.6870s for 90112 events => throughput is 6.76E+01 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.1358347510913632E-007) and cpp (2.1358347627977553E-007) differ by less than 2E-4 (5.480944587077374e-09) +OK! xsec from fortran (2.1358436158813979E-007) and cpp (2.1358436275882778E-007) differ by less than 2E-4 (5.48115042242614e-09) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.026980e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.977492e+01 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.025112e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.962894e+01 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -201,22 +201,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_x1_cudacpp > /tmp/valassia/output_ggttggg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.169e-06 [1.1692567434129498E-006] fbridge_mode=1 + [XSECTION] Cross section = 1.169e-06 [1.1693101020910778E-006] fbridge_mode=1 [UNWEIGHT] Wrote 15 events (found 163 events) - [COUNTERS] PROGRAM TOTAL : 78.7358s - [COUNTERS] Fortran Overhead ( 0 ) : 35.0931s - [COUNTERS] CudaCpp MEs ( 2 ) : 43.6427s for 8192 events => throughput is 1.88E+02 events/s + [COUNTERS] PROGRAM TOTAL : 114.2875s + [COUNTERS] Fortran Overhead ( 0 ) : 54.1504s + [COUNTERS] CudaCpp MEs ( 2 ) : 60.1370s for 8192 events => throughput is 1.36E+02 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.1692567358747583E-006) and cpp (1.1692567434129498E-006) differ by less than 2E-4 (6.446994271769313e-09) +OK! xsec from fortran (1.1693100945435806E-006) and cpp (1.1693101020910778E-006) differ by less than 2E-4 (6.454658363352905e-09) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -234,36 +234,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_x10_cudacpp > /tmp/valassia/output_ggttggg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.136e-07 [2.1358347636244846E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.136e-07 [2.1358436284111587E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 808 events) - [COUNTERS] PROGRAM TOTAL : 518.4869s - [COUNTERS] Fortran Overhead ( 0 ) : 37.5672s - [COUNTERS] CudaCpp MEs ( 2 ) : 480.9197s for 90112 events => throughput is 1.87E+02 events/s + [COUNTERS] PROGRAM TOTAL : 713.5498s + [COUNTERS] Fortran Overhead ( 0 ) : 58.1042s + [COUNTERS] CudaCpp MEs ( 2 ) : 655.4456s for 90112 events => throughput is 1.37E+02 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.1358347510913632E-007) and cpp (2.1358347636244846E-007) differ by less than 2E-4 (5.868020069854651e-09) +OK! xsec from fortran (2.1358436158813979E-007) and cpp (2.1358436284111587E-007) differ by less than 2E-4 (5.866422458922216e-09) *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.361474e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.524488e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.358615e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.529646e+02 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -277,22 +277,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_x1_cudacpp > /tmp/valassia/output_ggttggg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.169e-06 [1.1692567435042426E-006] fbridge_mode=1 + [XSECTION] Cross section = 1.169e-06 [1.1693101021831069E-006] fbridge_mode=1 [UNWEIGHT] Wrote 15 events (found 163 events) - [COUNTERS] PROGRAM TOTAL : 34.2555s - [COUNTERS] Fortran Overhead ( 0 ) : 15.3193s - [COUNTERS] CudaCpp MEs ( 2 ) : 18.9362s for 8192 events => throughput is 4.33E+02 events/s + [COUNTERS] PROGRAM TOTAL : 50.7719s + [COUNTERS] Fortran Overhead ( 0 ) : 23.4236s + [COUNTERS] CudaCpp MEs ( 2 ) : 27.3483s for 8192 events => throughput is 3.00E+02 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.1692567358747583E-006) and cpp (1.1692567435042426E-006) differ by less than 2E-4 (6.525072038243707e-09) +OK! xsec from fortran (1.1693100945435806E-006) and cpp (1.1693101021831069E-006) differ by less than 2E-4 (6.533362073568583e-09) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -310,40 +310,188 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_ggttggg_x10_cudacpp > /tmp/valassia/output_ggttggg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.136e-07 [2.1358347633600335E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.136e-07 [2.1358436281462147E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 808 events) - [COUNTERS] PROGRAM TOTAL : 225.2695s - [COUNTERS] Fortran Overhead ( 0 ) : 17.7559s - [COUNTERS] CudaCpp MEs ( 2 ) : 207.5136s for 90112 events => throughput is 4.34E+02 events/s + [COUNTERS] PROGRAM TOTAL : 326.3386s + [COUNTERS] Fortran Overhead ( 0 ) : 27.1544s + [COUNTERS] CudaCpp MEs ( 2 ) : 299.1842s for 90112 events => throughput is 3.01E+02 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.1358347510913632E-007) and cpp (2.1358347633600335E-007) differ by less than 2E-4 (5.744204001345565e-09) +OK! xsec from fortran (2.1358436158813979E-007) and cpp (2.1358436281462147E-007) differ by less than 2E-4 (5.7423759081132175e-09) *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.547813e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.567666e+02 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.597479e+02 ) sec^-1 + +*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 1.169e-06 [1.1693101021831069E-006] fbridge_mode=1 + [UNWEIGHT] Wrote 15 events (found 163 events) + [COUNTERS] PROGRAM TOTAL : 45.3921s + [COUNTERS] Fortran Overhead ( 0 ) : 20.5216s + [COUNTERS] CudaCpp MEs ( 2 ) : 24.8705s for 8192 events => throughput is 3.29E+02 events/s + +*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (1.1693100945435806E-006) and cpp (1.1693101021831069E-006) differ by less than 2E-4 (6.533362073568583e-09) + +*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 2.136e-07 [2.1358436281462147E-007] fbridge_mode=1 + [UNWEIGHT] Wrote 84 events (found 808 events) + [COUNTERS] PROGRAM TOTAL : 301.9509s + [COUNTERS] Fortran Overhead ( 0 ) : 24.4062s + [COUNTERS] CudaCpp MEs ( 2 ) : 277.5446s for 90112 events => throughput is 3.25E+02 events/s + +*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (2.1358436158813979E-007) and cpp (2.1358436281462147E-007) differ by less than 2E-4 (5.7423759081132175e-09) + +*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.101718e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.519857e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.081457e+02 ) sec^-1 -*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) *** +*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 1.169e-06 [1.1693101021831069E-006] fbridge_mode=1 + [UNWEIGHT] Wrote 15 events (found 163 events) + [COUNTERS] PROGRAM TOTAL : 48.9503s + [COUNTERS] Fortran Overhead ( 0 ) : 23.9070s + [COUNTERS] CudaCpp MEs ( 2 ) : 25.0433s for 8192 events => throughput is 3.27E+02 events/s + +*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (1.1693100945435806E-006) and cpp (1.1693101021831069E-006) differ by less than 2E-4 (6.533362073568583e-09) + +*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical -*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) *** +*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 2.136e-07 [2.1358436281462147E-007] fbridge_mode=1 + [UNWEIGHT] Wrote 84 events (found 808 events) + [COUNTERS] PROGRAM TOTAL : 301.8849s + [COUNTERS] Fortran Overhead ( 0 ) : 27.8392s + [COUNTERS] CudaCpp MEs ( 2 ) : 274.0457s for 90112 events => throughput is 3.29E+02 events/s + +*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (2.1358436158813979E-007) and cpp (2.1358436281462147E-007) differ by less than 2E-4 (5.7423759081132175e-09) + +*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.501339e+02 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.509355e+02 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -357,22 +505,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggttggg_x1_cudacpp > /tmp/valassia/output_ggttggg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.169e-06 [1.1692567356511786E-006] fbridge_mode=1 + [XSECTION] Cross section = 1.169e-06 [1.1693100942770687E-006] fbridge_mode=1 [UNWEIGHT] Wrote 15 events (found 163 events) - [COUNTERS] PROGRAM TOTAL : 12.6233s - [COUNTERS] Fortran Overhead ( 0 ) : 8.2752s - [COUNTERS] CudaCpp MEs ( 2 ) : 4.3480s for 8192 events => throughput is 1.88E+03 events/s + [COUNTERS] PROGRAM TOTAL : 3.5767s + [COUNTERS] Fortran Overhead ( 0 ) : 2.7138s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.8630s for 8192 events => throughput is 9.49E+03 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.1692567358747583E-006) and cpp (1.1692567356511786E-006) differ by less than 2E-4 (1.9121515482112272e-10) +OK! xsec from fortran (1.1693100945435806E-006) and cpp (1.1693100942770687E-006) differ by less than 2E-4 (2.279223476620018e-10) *** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -390,65 +538,65 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/valassia/input_ggttggg_x10_cudacpp > /tmp/valassia/output_ggttggg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.136e-07 [2.1358347509627304E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.136e-07 [2.1358436157495368E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 808 events) - [COUNTERS] PROGRAM TOTAL : 58.4466s - [COUNTERS] Fortran Overhead ( 0 ) : 10.6616s - [COUNTERS] CudaCpp MEs ( 2 ) : 47.7850s for 90112 events => throughput is 1.89E+03 events/s + [COUNTERS] PROGRAM TOTAL : 16.0811s + [COUNTERS] Fortran Overhead ( 0 ) : 6.5876s + [COUNTERS] CudaCpp MEs ( 2 ) : 9.4935s for 90112 events => throughput is 9.49E+03 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.1358347510913632E-007) and cpp (2.1358347509627304E-007) differ by less than 2E-4 (6.022604637223594e-11) +OK! xsec from fortran (2.1358436158813979E-007) and cpp (2.1358436157495368E-007) differ by less than 2E-4 (6.173717093105324e-11) *** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.904439e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.427839e+03 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.931601e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.087147e+04 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 512 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.220929e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.109838e+04 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 512 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.368320e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.157967e+04 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 128 128 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.250028e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.106789e+04 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 128 128 1 *** -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.156635e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.114403e+04 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.216963e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.111816e+04 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 *** -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.043067e+03 ) sec^-1 
+Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.650481e+03 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt index aefc17f4c0..1fb13570ed 100644 --- a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu +Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu CUDACPP_BUILDDIR='.' -make USEBUILDDIR=1 AVX=none +make USEBUILDDIR=1 AVX=none make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make USEBUILDDIR=1 AVX=avx2 +make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' OMP_NUM_THREADS= -DATE: 2024-01-31_15:26:32 +DATE: 2024-01-30_06:26:25 -On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: -Working directory (run): /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -50,18 +50,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_gqttq_x1_fortran > /tmp/valassia/output_gqttq_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/avalassi/output_gqttq_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2605 [0.26049452069554319] fbridge_mode=0 + [XSECTION] Cross section = 0.2605 [0.26050333309703716] fbridge_mode=0 [UNWEIGHT] Wrote 78 events (found 561 events) - [COUNTERS] PROGRAM TOTAL : 0.3177s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2695s - [COUNTERS] Fortran MEs ( 1 ) : 0.0483s for 8192 events => throughput is 1.70E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3322s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2581s + [COUNTERS] Fortran MEs ( 1 ) : 0.0741s for 8192 events => throughput is 1.11E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -75,18 +75,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_gqttq_x1_fortran > /tmp/valassia/output_gqttq_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/avalassi/output_gqttq_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2605 [0.26049452069554319] fbridge_mode=0 + [XSECTION] Cross section = 0.2605 [0.26050333309703716] fbridge_mode=0 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.2503s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2021s - [COUNTERS] Fortran MEs ( 1 ) : 0.0482s for 8192 events => throughput is 1.70E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3239s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2500s + [COUNTERS] Fortran MEs ( 1 ) : 0.0739s for 8192 events => throughput is 1.11E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -100,18 +100,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_gqttq_x10_fortran > /tmp/valassia/output_gqttq_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x10_fortran > /tmp/avalassi/output_gqttq_x10_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.218 [0.21801276513379142] fbridge_mode=0 + [XSECTION] Cross section = 0.218 [0.21801182648615872] fbridge_mode=0 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 1.6969s - [COUNTERS] Fortran Overhead ( 0 ) : 1.1709s - [COUNTERS] Fortran MEs ( 1 ) : 0.5261s for 90112 events => throughput is 1.71E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.4169s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6004s + [COUNTERS] Fortran MEs ( 1 ) : 0.8166s for 90112 events => throughput is 1.10E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -125,22 +125,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x1_cudacpp > /tmp/valassia/output_gqttq_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2605 [0.26049452069554313] fbridge_mode=1 + [XSECTION] Cross section = 0.2605 [0.26050333309703710] fbridge_mode=1 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.3419s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2721s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0698s for 8192 events => throughput is 1.17E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4205s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3380s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0825s for 8192 events => throughput is 9.92E+04 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.26049452069554319) and cpp (0.26049452069554313) differ by less than 2E-14 (2.220446049250313e-16) +OK! xsec from fortran (0.26050333309703716) and cpp (0.26050333309703710) differ by less than 2E-14 (2.220446049250313e-16) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -158,36 +158,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x10_cudacpp > /tmp/valassia/output_gqttq_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.218 [0.21801276513379142] fbridge_mode=1 + [XSECTION] Cross section = 0.218 [0.21801182648615872] fbridge_mode=1 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 2.0132s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2460s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.7672s for 90112 events => throughput is 1.17E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.6157s + [COUNTERS] Fortran Overhead ( 0 ) : 1.7094s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.9063s for 90112 events => throughput is 9.94E+04 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.21801276513379142) and cpp (0.21801276513379142) differ by less than 2E-14 (0.0) +OK! xsec from fortran (0.21801182648615872) and cpp (0.21801182648615872) differ by less than 2E-14 (0.0) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.199571e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.011343e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.201530e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.009603e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -201,22 +201,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x1_cudacpp > /tmp/valassia/output_gqttq_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2605 [0.26049452069554313] fbridge_mode=1 + [XSECTION] Cross section = 0.2605 [0.26050333309703727] fbridge_mode=1 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.2704s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2370s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0334s for 8192 events => throughput is 2.46E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3388s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2965s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0423s for 8192 events => throughput is 1.94E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.26049452069554319) and cpp (0.26049452069554313) differ by less than 2E-14 (2.220446049250313e-16) +OK! xsec from fortran (0.26050333309703716) and cpp (0.26050333309703727) differ by less than 2E-14 (4.440892098500626e-16) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -234,36 +234,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x10_cudacpp > /tmp/valassia/output_gqttq_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.218 [0.21801276513379142] fbridge_mode=1 + [XSECTION] Cross section = 0.218 [0.21801182648615872] fbridge_mode=1 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 1.5755s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2083s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3672s for 90112 events => throughput is 2.45E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.1314s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6642s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.4672s for 90112 events => throughput is 1.93E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.21801276513379142) and cpp (0.21801276513379142) differ by less than 2E-14 (0.0) +OK! xsec from fortran (0.21801182648615872) and cpp (0.21801182648615872) differ by less than 2E-14 (0.0) *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.481551e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.951695e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.483003e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.960178e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -277,22 +277,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x1_cudacpp > /tmp/valassia/output_gqttq_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2605 [0.26049452069554313] fbridge_mode=1 + [XSECTION] Cross section = 0.2605 [0.26050333309703727] fbridge_mode=1 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.2382s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2210s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0173s for 8192 events => throughput is 4.74E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3038s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2788s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0250s for 8192 events => throughput is 3.28E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.26049452069554319) and cpp (0.26049452069554313) differ by less than 2E-14 (2.220446049250313e-16) +OK! xsec from fortran (0.26050333309703716) and cpp (0.26050333309703727) differ by less than 2E-14 (4.440892098500626e-16) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -310,40 +310,188 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x10_cudacpp > /tmp/valassia/output_gqttq_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.218 [0.21801276513379150] fbridge_mode=1 + [XSECTION] Cross section = 0.218 [0.21801182648615869] fbridge_mode=1 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 1.3855s - [COUNTERS] Fortran Overhead ( 0 ) : 1.1955s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1900s for 90112 events => throughput is 4.74E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.9222s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6468s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2754s for 90112 events => throughput is 3.27E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.21801276513379142) and cpp (0.21801276513379150) differ by less than 2E-14 (4.440892098500626e-16) +OK! xsec from fortran (0.21801182648615872) and cpp (0.21801182648615869) differ by less than 2E-14 (1.1102230246251565e-16) *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.783829e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.305728e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.868821e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.308522e+05 ) sec^-1 + +*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/32 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.2605 [0.26050333309703727] fbridge_mode=1 + [UNWEIGHT] Wrote 81 events (found 540 events) + [COUNTERS] PROGRAM TOTAL : 0.2978s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2759s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0219s for 8192 events => throughput is 3.74E+05 events/s + +*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (0.26050333309703716) and cpp (0.26050333309703727) differ by less than 2E-14 (4.440892098500626e-16) + +*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/32 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.218 [0.21801182648615869] fbridge_mode=1 + [UNWEIGHT] Wrote 853 events (found 1849 events) + [COUNTERS] PROGRAM TOTAL : 1.8887s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6471s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2415s for 90112 events => throughput is 3.73E+05 events/s + +*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) *** +OK! xsec from fortran (0.21801182648615872) and cpp (0.21801182648615869) differ by less than 2E-14 (1.1102230246251565e-16) -*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) *** +*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.792718e+05 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.861917e+05 ) sec^-1 + +*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/32 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.2605 [0.26050333309703727] fbridge_mode=1 + [UNWEIGHT] Wrote 81 events (found 540 events) + [COUNTERS] PROGRAM TOTAL : 0.3207s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2877s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0330s for 8192 events => throughput is 2.48E+05 events/s + +*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (0.26050333309703716) and cpp (0.26050333309703727) differ by less than 2E-14 (4.440892098500626e-16) + +*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! 
events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/32 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.218 [0.21801182648615869] fbridge_mode=1 + [UNWEIGHT] Wrote 853 events (found 1849 events) + [COUNTERS] PROGRAM TOTAL : 2.0189s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6567s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3622s for 90112 events => throughput is 2.49E+05 events/s + +*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (0.21801182648615872) and cpp (0.21801182648615869) differ by less than 2E-14 (1.1102230246251565e-16) + +*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.535117e+05 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.517520e+05 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -357,15 +505,98 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/valassia/input_gqttq_x1_cudacpp > /tmp/valassia/output_gqttq_x1_cudacpp' -ERROR! ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/valassia/input_gqttq_x1_cudacpp > /tmp/valassia/output_gqttq_x1_cudacpp' failed - PDF set = nn23lo1 - alpha_s(Mz)= 0.1300 running at 2 loops. - alpha_s(Mz)= 0.1300 running at 2 loops. 
- Renormalization scale set on event-by-event basis - Factorization scale set on event-by-event basis +Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/32 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.2605 [0.26050333309703733] fbridge_mode=1 + [UNWEIGHT] Wrote 81 events (found 540 events) + [COUNTERS] PROGRAM TOTAL : 0.6879s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6872s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.21E+07 events/s + +*** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (0.26050333309703716) and cpp (0.26050333309703733) differ by less than 2E-14 (6.661338147750939e-16) + +*** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical + +*** (3) EXECUTE MADEVENT_CUDA x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/32 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.218 [0.21801182648615869] fbridge_mode=1 + [UNWEIGHT] Wrote 853 events (found 1849 events) + [COUNTERS] PROGRAM TOTAL : 2.0638s + [COUNTERS] Fortran Overhead ( 0 ) : 2.0555s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0083s for 90112 events => throughput is 1.08E+07 events/s + +*** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (0.21801182648615872) and cpp (0.21801182648615869) differ by less than 2E-14 (1.1102230246251565e-16) + +*** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical + +*** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.535376e+07 ) sec^-1 + +*** EXECUTE GCHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.131781e+07 ) sec^-1 + +*** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.380880e+07 ) sec^-1 + +*** EXECUTE GCHECK(MAX) -p 16384 32 1 *** +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.511409e+07 ) sec^-1 + +*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.374806e+07 ) sec^-1 + +*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.787335e+07 ) sec^-1 + +*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.380768e+07 ) sec^-1 +*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.782273e+07 ) sec^-1 - getting user params -Enter number of events and max and min iterations: - Number of events and iterations 8192 1 1 +TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt index e3c6b9eae7..4985f151b2 100644 --- a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu +Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu CUDACPP_BUILDDIR='.' 
-make USEBUILDDIR=1 AVX=none +make USEBUILDDIR=1 AVX=none make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make USEBUILDDIR=1 AVX=avx2 +make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' OMP_NUM_THREADS= -DATE: 2024-01-31_15:26:53 +DATE: 2024-01-30_06:26:56 -On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: -Working directory (run): /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -50,18 +50,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_gqttq_x1_fortran > /tmp/valassia/output_gqttq_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/avalassi/output_gqttq_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2605 [0.26049452069554319] fbridge_mode=0 + [XSECTION] Cross section = 0.2605 [0.26050333309703716] fbridge_mode=0 [UNWEIGHT] Wrote 78 events (found 561 events) - [COUNTERS] PROGRAM TOTAL : 0.2535s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2053s - [COUNTERS] Fortran MEs ( 1 ) : 0.0482s for 8192 events => throughput is 1.70E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3277s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2529s + [COUNTERS] Fortran MEs ( 1 ) : 0.0748s for 8192 events => throughput is 1.10E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -75,18 +75,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_gqttq_x1_fortran > /tmp/valassia/output_gqttq_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/avalassi/output_gqttq_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2605 [0.26049452069554319] fbridge_mode=0 + [XSECTION] Cross section = 0.2605 [0.26050333309703716] fbridge_mode=0 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.2506s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2024s - [COUNTERS] Fortran MEs ( 1 ) : 0.0482s for 8192 events => throughput is 1.70E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3225s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2485s + [COUNTERS] Fortran MEs ( 1 ) : 0.0740s for 8192 events => throughput is 1.11E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -100,18 +100,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! 
Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_gqttq_x10_fortran > /tmp/valassia/output_gqttq_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x10_fortran > /tmp/avalassi/output_gqttq_x10_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.218 [0.21801276513379142] fbridge_mode=0 + [XSECTION] Cross section = 0.218 [0.21801182648615872] fbridge_mode=0 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 1.6972s - [COUNTERS] Fortran Overhead ( 0 ) : 1.1710s - [COUNTERS] Fortran MEs ( 1 ) : 0.5262s for 90112 events => throughput is 1.71E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.4071s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5926s + [COUNTERS] Fortran MEs ( 1 ) : 0.8145s for 90112 events => throughput is 1.11E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -125,22 +125,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x1_cudacpp > /tmp/valassia/output_gqttq_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2605 [0.26049433846970949] fbridge_mode=1 + [XSECTION] Cross section = 0.2605 [0.26050314903825744] fbridge_mode=1 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.3163s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2596s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0567s for 8192 events => throughput is 1.45E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4015s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3277s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0738s for 8192 events => throughput is 1.11E+05 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.26049452069554319) and cpp (0.26049433846970949) differ by less than 4E-4 (6.995380679164498e-07) +OK! xsec from fortran (0.26050333309703716) and cpp (0.26050314903825744) differ by less than 4E-4 (7.065505747139156e-07) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -158,36 +158,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x10_cudacpp > /tmp/valassia/output_gqttq_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.218 [0.21801276051306751] fbridge_mode=1 + [XSECTION] Cross section = 0.218 [0.21801181770186087] fbridge_mode=1 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 1.8565s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2329s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.6236s for 90112 events => throughput is 1.44E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.5139s + [COUNTERS] Fortran Overhead ( 0 ) : 1.7000s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.8139s for 90112 events => throughput is 1.11E+05 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.21801276513379142) and cpp (0.21801276051306751) differ by less than 4E-4 (2.119474018513756e-08) +OK! xsec from fortran (0.21801182648615872) and cpp (0.21801181770186087) differ by less than 4E-4 (4.0292758352045155e-08) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.470346e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.131056e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.471180e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.131141e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -201,22 +201,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x1_cudacpp > /tmp/valassia/output_gqttq_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2605 [0.26049432213942514] fbridge_mode=1 + [XSECTION] Cross section = 0.2605 [0.26050310835231938] fbridge_mode=1 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.2433s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2233s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0201s for 8192 events => throughput is 4.08E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3069s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2800s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0269s for 8192 events => throughput is 3.04E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.26049452069554319) and cpp (0.26049432213942514) differ by less than 4E-4 (7.622276181340482e-07) +OK! xsec from fortran (0.26050333309703716) and cpp (0.26050310835231938) differ by less than 4E-4 (8.627325996934943e-07) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -234,36 +234,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x10_cudacpp > /tmp/valassia/output_gqttq_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.218 [0.21801274044068764] fbridge_mode=1 + [XSECTION] Cross section = 0.218 [0.21801177817838580] fbridge_mode=1 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 1.4175s - [COUNTERS] Fortran Overhead ( 0 ) : 1.1970s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2206s for 90112 events => throughput is 4.09E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.9246s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6468s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2778s for 90112 events => throughput is 3.24E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.21801276513379142) and cpp (0.21801274044068764) differ by less than 4E-4 (1.1326448601245431e-07) +OK! xsec from fortran (0.21801182648615872) and cpp (0.21801177817838580) differ by less than 4E-4 (2.2158326773435988e-07) *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.277540e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.299610e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.287798e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.290596e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -277,22 +277,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x1_cudacpp > /tmp/valassia/output_gqttq_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2605 [0.26049432091919483] fbridge_mode=1 + [XSECTION] Cross section = 0.2605 [0.26050310803492405] fbridge_mode=1 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.2232s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2135s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0096s for 8192 events => throughput is 8.52E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2803s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2672s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0131s for 8192 events => throughput is 6.25E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.26049452069554319) and cpp (0.26049432091919483) differ by less than 4E-4 (7.66911902094769e-07) +OK! xsec from fortran (0.26050333309703716) and cpp (0.26050310803492405) differ by less than 4E-4 (8.639509921914978e-07) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -310,40 +310,188 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x10_cudacpp > /tmp/valassia/output_gqttq_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.218 [0.21801273719964992] fbridge_mode=1 + [XSECTION] Cross section = 0.218 [0.21801177493542723] fbridge_mode=1 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 1.2900s - [COUNTERS] Fortran Overhead ( 0 ) : 1.1843s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1057s for 90112 events => throughput is 8.53E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.7784s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6325s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1459s for 90112 events => throughput is 6.18E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.21801276513379142) and cpp (0.21801273719964992) differ by less than 4E-4 (1.2813076100126608e-07) +OK! xsec from fortran (0.21801182648615872) and cpp (0.21801177493542723) differ by less than 4E-4 (2.364584175129636e-07) *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.719343e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.309270e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.796795e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.331612e+05 ) sec^-1 + +*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/32 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.2605 [0.26050310803492405] fbridge_mode=1 + [UNWEIGHT] Wrote 81 events (found 540 events) + [COUNTERS] PROGRAM TOTAL : 0.2783s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2663s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0120s for 8192 events => throughput is 6.84E+05 events/s + +*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (0.26050333309703716) and cpp (0.26050310803492405) differ by less than 4E-4 (8.639509921914978e-07) + +*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/32 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.218 [0.21801177493542723] fbridge_mode=1 + [UNWEIGHT] Wrote 853 events (found 1849 events) + [COUNTERS] PROGRAM TOTAL : 1.7659s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6337s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1322s for 90112 events => throughput is 6.82E+05 events/s + +*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) *** +OK! xsec from fortran (0.21801182648615872) and cpp (0.21801177493542723) differ by less than 4E-4 (2.364584175129636e-07) -*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) *** +*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.987405e+05 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.079276e+05 ) sec^-1 + +*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/32 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.2605 [0.26050317064561834] fbridge_mode=1 + [UNWEIGHT] Wrote 81 events (found 540 events) + [COUNTERS] PROGRAM TOTAL : 0.2869s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2697s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0173s for 8192 events => throughput is 4.74E+05 events/s + +*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (0.26050333309703716) and cpp (0.26050317064561834) differ by less than 4E-4 (6.236059127973093e-07) + +*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/32 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.218 [0.21801182143140752] fbridge_mode=1 + [UNWEIGHT] Wrote 853 events (found 1849 events) + [COUNTERS] PROGRAM TOTAL : 1.8266s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6389s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1877s for 90112 events => throughput is 4.80E+05 events/s + +*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (0.21801182648615872) and cpp (0.21801182143140752) differ by less than 4E-4 (2.3185674269399215e-08) + +*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.948471e+05 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.943070e+05 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -357,15 +505,98 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/valassia/input_gqttq_x1_cudacpp > /tmp/valassia/output_gqttq_x1_cudacpp' -ERROR! ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/valassia/input_gqttq_x1_cudacpp > /tmp/valassia/output_gqttq_x1_cudacpp' failed - PDF set = nn23lo1 - alpha_s(Mz)= 0.1300 running at 2 loops. - alpha_s(Mz)= 0.1300 running at 2 loops. - Renormalization scale set on event-by-event basis - Factorization scale set on event-by-event basis +Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/32 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.2605 [0.26050319131407651] fbridge_mode=1 + [UNWEIGHT] Wrote 81 events (found 540 events) + [COUNTERS] PROGRAM TOTAL : 0.6865s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6860s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0005s for 8192 events => throughput is 1.55E+07 events/s + +*** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (0.26050333309703716) and cpp (0.26050319131407651) differ by less than 4E-4 (5.442654378295941e-07) + +*** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical + +*** (3) EXECUTE MADEVENT_CUDA x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! 
Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/32 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.218 [0.21801186038252196] fbridge_mode=1 + [UNWEIGHT] Wrote 853 events (found 1849 events) + [COUNTERS] PROGRAM TOTAL : 2.0580s + [COUNTERS] Fortran Overhead ( 0 ) : 2.0515s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0065s for 90112 events => throughput is 1.38E+07 events/s + +*** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (0.21801182648615872) and cpp (0.21801186038252196) differ by less than 4E-4 (1.5547946996541384e-07) + +*** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical + +*** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.646730e+07 ) sec^-1 + +*** EXECUTE GCHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.486298e+07 ) sec^-1 + +*** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.789487e+07 ) sec^-1 + +*** EXECUTE GCHECK(MAX) -p 16384 32 1 *** +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.699254e+08 ) sec^-1 + +*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.776807e+07 ) sec^-1 + +*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.780761e+08 ) sec^-1 + +*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.361160e+07 ) sec^-1 +*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK 
+EvtsPerSec[MECalcOnly] (3a) = ( 5.995376e+07 ) sec^-1 - getting user params -Enter number of events and max and min iterations: - Number of events and iterations 8192 1 1 +TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt index ccfe354c14..44df8a9e3d 100644 --- a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu +Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu CUDACPP_BUILDDIR='.' + make USEBUILDDIR=1 AVX=none make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make USEBUILDDIR=1 AVX=avx2 - make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' -CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' OMP_NUM_THREADS= -DATE: 2024-01-31_15:27:12 +DATE: 2024-01-30_06:27:27 -On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: -Working directory (run): /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -50,18 +50,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_gqttq_x1_fortran > /tmp/valassia/output_gqttq_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/avalassi/output_gqttq_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2605 [0.26049452069554319] fbridge_mode=0 + [XSECTION] Cross section = 0.2605 [0.26050333309703716] fbridge_mode=0 [UNWEIGHT] Wrote 78 events (found 561 events) - [COUNTERS] PROGRAM TOTAL : 0.2533s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2051s - [COUNTERS] Fortran MEs ( 1 ) : 0.0482s for 8192 events => throughput is 1.70E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3260s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2519s + [COUNTERS] Fortran MEs ( 1 ) : 0.0740s for 8192 events => throughput is 1.11E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -75,18 +75,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_gqttq_x1_fortran > /tmp/valassia/output_gqttq_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/avalassi/output_gqttq_x1_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2605 [0.26049452069554319] fbridge_mode=0 + [XSECTION] Cross section = 0.2605 [0.26050333309703716] fbridge_mode=0 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.2507s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2026s - [COUNTERS] Fortran MEs ( 1 ) : 0.0482s for 8192 events => throughput is 1.70E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3236s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2490s + [COUNTERS] Fortran MEs ( 1 ) : 0.0746s for 8192 events => throughput is 1.10E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -100,18 +100,18 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./madevent_fortran < /tmp/valassia/input_gqttq_x10_fortran > /tmp/valassia/output_gqttq_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x10_fortran > /tmp/avalassi/output_gqttq_x10_fortran' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.218 [0.21801276513379142] fbridge_mode=0 + [XSECTION] Cross section = 0.218 [0.21801182648615872] fbridge_mode=0 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 1.6990s - [COUNTERS] Fortran Overhead ( 0 ) : 1.1727s - [COUNTERS] Fortran MEs ( 1 ) : 0.5263s for 90112 events => throughput is 1.71E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.4069s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5940s + [COUNTERS] Fortran MEs ( 1 ) : 0.8130s for 90112 events => throughput is 1.11E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -125,22 +125,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x1_cudacpp > /tmp/valassia/output_gqttq_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2605 [0.26049452042320692] fbridge_mode=1 + [XSECTION] Cross section = 0.2605 [0.26050333282657206] fbridge_mode=1 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.3441s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2747s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0695s for 8192 events => throughput is 1.18E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4196s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3367s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0829s for 8192 events => throughput is 9.88E+04 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.26049452069554319) and cpp (0.26049452042320692) differ by less than 2E-4 (1.0454587195951603e-09) +OK! xsec from fortran (0.26050333309703716) and cpp (0.26050333282657206) differ by less than 2E-4 (1.0382404935782574e-09) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -158,36 +158,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x10_cudacpp > /tmp/valassia/output_gqttq_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.218 [0.21801276501483957] fbridge_mode=1 + [XSECTION] Cross section = 0.218 [0.21801182636608801] fbridge_mode=1 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 2.0473s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2845s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.7628s for 90112 events => throughput is 1.18E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.6225s + [COUNTERS] Fortran Overhead ( 0 ) : 1.7110s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.9116s for 90112 events => throughput is 9.89E+04 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.21801276513379142) and cpp (0.21801276501483957) differ by less than 2E-4 (5.456187723851258e-10) +OK! xsec from fortran (0.21801182648615872) and cpp (0.21801182636608801) differ by less than 2E-4 (5.507531097848073e-10) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.196605e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.000214e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.195912e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.992747e+04 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -201,22 +201,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x1_cudacpp > /tmp/valassia/output_gqttq_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2605 [0.26049452042320692] fbridge_mode=1 + [XSECTION] Cross section = 0.2605 [0.26050333282657212] fbridge_mode=1 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.2698s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2370s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0328s for 8192 events => throughput is 2.50E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3393s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2968s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0425s for 8192 events => throughput is 1.93E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.26049452069554319) and cpp (0.26049452042320692) differ by less than 2E-4 (1.0454587195951603e-09) +OK! xsec from fortran (0.26050333309703716) and cpp (0.26050333282657212) differ by less than 2E-4 (1.0382402715336525e-09) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -234,36 +234,36 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x10_cudacpp > /tmp/valassia/output_gqttq_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.218 [0.21801276501483957] fbridge_mode=1 + [XSECTION] Cross section = 0.218 [0.21801182636608804] fbridge_mode=1 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 1.5686s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2079s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3607s for 90112 events => throughput is 2.50E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.1463s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6727s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.4735s for 90112 events => throughput is 1.90E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.21801276513379142) and cpp (0.21801276501483957) differ by less than 2E-4 (5.456187723851258e-10) +OK! xsec from fortran (0.21801182648615872) and cpp (0.21801182636608804) differ by less than 2E-4 (5.507529987625048e-10) *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.500614e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.936317e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.499482e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.928004e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -277,22 +277,22 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
-------------------- -Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x1_cudacpp > /tmp/valassia/output_gqttq_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2605 [0.26049452049989580] fbridge_mode=1 + [XSECTION] Cross section = 0.2605 [0.26050333291481387] fbridge_mode=1 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.2381s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2209s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0172s for 8192 events => throughput is 4.77E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3040s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2793s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0246s for 8192 events => throughput is 3.33E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.26049452069554319) and cpp (0.26049452049989580) differ by less than 2E-4 (7.510614352668199e-10) +OK! xsec from fortran (0.26050333309703716) and cpp (0.26050333291481387) differ by less than 2E-4 (6.99504676404672e-10) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -310,40 +310,188 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/valassia/input_gqttq_x10_cudacpp > /tmp/valassia/output_gqttq_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/128 +Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.218 [0.21801276503688793] fbridge_mode=1 + [XSECTION] Cross section = 0.218 [0.21801182638680733] fbridge_mode=1 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 1.3765s - [COUNTERS] Fortran Overhead ( 0 ) : 1.1872s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1893s for 90112 events => throughput is 4.76E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.9238s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6513s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2724s for 90112 events => throughput is 3.31E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.21801276513379142) and cpp (0.21801276503688793) differ by less than 2E-4 (4.4448533742524887e-10) +OK! xsec from fortran (0.21801182648615872) and cpp (0.21801182638680733) differ by less than 2E-4 (4.557155763862397e-10) *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.826787e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.358469e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.857511e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.409112e+05 ) sec^-1 + +*** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/32 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.2605 [0.26050333291481387] fbridge_mode=1 + [UNWEIGHT] Wrote 81 events (found 540 events) + [COUNTERS] PROGRAM TOTAL : 0.2986s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2774s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0212s for 8192 events => throughput is 3.86E+05 events/s + +*** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (0.26050333309703716) and cpp (0.26050333291481387) differ by less than 2E-4 (6.99504676404672e-10) + +*** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical -*** (2-512y) WARNING! SKIP MADEVENT_CPP (512y is not supported on this node) *** +*** (2-512y) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) 
+-------------------- +Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/32 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.218 [0.21801182638680733] fbridge_mode=1 + [UNWEIGHT] Wrote 853 events (found 1849 events) + [COUNTERS] PROGRAM TOTAL : 1.8897s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6536s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2361s for 90112 events => throughput is 3.82E+05 events/s + +*** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (0.21801182648615872) and cpp (0.21801182638680733) differ by less than 2E-4 (4.557155763862397e-10) + +*** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.861136e+05 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.873456e+05 ) sec^-1 + +*** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/32 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.2605 [0.26050333291481387] fbridge_mode=1 + [UNWEIGHT] Wrote 81 events (found 540 events) + [COUNTERS] PROGRAM TOTAL : 0.3231s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2893s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0338s for 8192 events => throughput is 2.42E+05 events/s -*** (2-512z) WARNING! SKIP MADEVENT_CPP (512z is not supported on this node) *** +*** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (0.26050333309703716) and cpp (0.26050333291481387) differ by less than 2E-4 (6.99504676404672e-10) + +*** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.1 and events.lhe.ref.1 are identical + +*** (2-512z) EXECUTE MADEVENT_CPP x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! 
Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/32 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.218 [0.21801182638680733] fbridge_mode=1 + [UNWEIGHT] Wrote 853 events (found 1849 events) + [COUNTERS] PROGRAM TOTAL : 2.0365s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6643s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3722s for 90112 events => throughput is 2.42E+05 events/s + +*** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (0.21801182648615872) and cpp (0.21801182638680733) differ by less than 2E-4 (4.557155763862397e-10) + +*** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical + +*** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.448669e+05 ) sec^-1 + +*** EXECUTE CHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.422091e+05 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -357,15 +505,98 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 0 ! Helicity Sum/event 0=exact 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- -Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/valassia/input_gqttq_x1_cudacpp > /tmp/valassia/output_gqttq_x1_cudacpp' -ERROR! ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/valassia/input_gqttq_x1_cudacpp > /tmp/valassia/output_gqttq_x1_cudacpp' failed - PDF set = nn23lo1 - alpha_s(Mz)= 0.1300 running at 2 loops. - alpha_s(Mz)= 0.1300 running at 2 loops. - Renormalization scale set on event-by-event basis - Factorization scale set on event-by-event basis +Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/32 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.2605 [0.26050333301029699] fbridge_mode=1 + [UNWEIGHT] Wrote 81 events (found 540 events) + [COUNTERS] PROGRAM TOTAL : 0.6889s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6882s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.21E+07 events/s + +*** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! 
xsec from fortran (0.26050333309703716) and cpp (0.26050333301029699) differ by less than 2E-4 (3.329714282074292e-10) + +*** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical + +*** (3) EXECUTE MADEVENT_CUDA x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/32 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.218 [0.21801182637219937] fbridge_mode=1 + [UNWEIGHT] Wrote 853 events (found 1849 events) + [COUNTERS] PROGRAM TOTAL : 2.0663s + [COUNTERS] Fortran Overhead ( 0 ) : 2.0581s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0082s for 90112 events => throughput is 1.10E+07 events/s + +*** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (0.21801182648615872) and cpp (0.21801182637219937) differ by less than 2E-4 (5.227208665914418e-10) + +*** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical + +*** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.534715e+07 ) sec^-1 + +*** EXECUTE GCHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.123919e+07 ) sec^-1 + +*** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.382422e+07 ) sec^-1 + +*** EXECUTE GCHECK(MAX) -p 16384 32 1 *** +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.503129e+07 ) sec^-1 + +*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.385930e+07 ) sec^-1 + +*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.826918e+07 ) sec^-1 + +*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.379565e+07 ) sec^-1 +*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.789199e+07 ) sec^-1 - getting user params -Enter number of events and max and min iterations: - Number of events and iterations 8192 1 1 +TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt index d8bb554a39..15dbd5f8d1 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt @@ -1,164 +1,209 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasNoCurand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) +RNDGEN=hasCurand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-01-31_13:48:01 +DATE: 2024-01-30_04:51:46 -On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.295982e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.113400e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.341424e+07 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 4.999466 sec - 15,384,344,417 cycles:u # 2.941 GHz (74.97%) - 53,752,468 stalled-cycles-frontend:u # 0.35% frontend cycles idle (75.02%) - 6,944,996,178 stalled-cycles-backend:u # 45.14% backend cycles idle (75.06%) - 11,608,343,150 instructions:u # 0.75 insn per cycle - # 0.60 stalled cycles per insn (74.87%) - 5.541765598 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 5.572573e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.281942e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.116391e+08 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 0.839714 sec + 2,719,217,340 cycles # 2.832 GHz + 4,277,615,433 instructions # 1.57 insn per cycle + 1.175143775 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 166 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.282804e-02 -Avg ME (F77/CUDA) = 1.2828039868165208E-002 -Relative difference = 1.0277079981222336e-08 +Avg ME (F77/CUDA) = 1.2828039868165201E-002 +Relative difference = 1.0277080522138477e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.249676e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.429139e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.429139e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 5.771162 sec - 19,518,544,594 cycles:u # 3.364 GHz (74.91%) - 49,953,673 stalled-cycles-frontend:u # 0.26% frontend cycles idle (74.97%) - 62,395,027 stalled-cycles-backend:u # 0.32% backend cycles idle (75.04%) - 46,990,078,774 instructions:u # 2.41 insn per cycle - # 0.00 stalled cycles per insn (75.05%) - 5.805858662 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 471) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 9.879157e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.147243e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.147243e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 6.790847 sec + 19,539,640,504 cycles # 2.876 GHz + 46,935,351,432 instructions # 2.40 insn per cycle + 6.804517518 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 472) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.926426e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.430773e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.430773e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 4.000237 sec - 13,293,955,872 cycles:u # 3.297 GHz (75.00%) - 49,498,965 stalled-cycles-frontend:u # 0.37% frontend cycles idle (75.00%) - 995,469,090 stalled-cycles-backend:u # 7.49% backend cycles idle (75.01%) - 31,161,260,421 instructions:u # 2.34 insn per cycle - # 0.03 stalled cycles per insn (75.02%) - 4.036423376 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.545376e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.021398e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.021398e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 4.488904 sec + 12,869,370,410 cycles # 2.864 GHz + 31,186,180,279 instructions # 2.42 insn per cycle + 4.505888529 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1626) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.653277e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.528834e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.528834e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 3.105945 sec - 10,166,660,682 cycles:u # 3.240 GHz (74.93%) - 48,671,590 stalled-cycles-frontend:u # 0.48% frontend cycles idle (75.02%) - 439,852,148 stalled-cycles-backend:u # 4.33% backend cycles idle (75.02%) - 19,408,273,106 instructions:u # 1.91 insn per cycle - # 0.02 stalled cycles per insn (75.02%) - 3.142037322 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1946) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.955981e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.735873e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.735873e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.633222 sec + 10,032,348,170 cycles # 2.758 GHz + 19,481,701,848 instructions # 1.94 insn per cycle + 3.651370321 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1964) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165090E-002 Relative difference = 1.0277089176796747e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe -p 2048 256 12 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.070263e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.978600e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.978600e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.453661 sec + 9,572,367,477 cycles # 2.767 GHz + 18,943,715,958 instructions # 1.98 insn per cycle + 3.473553059 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1655) (512y: 161) (512z: 0) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.282804e-02 +Avg ME (F77/C++) = 1.2828039868165090E-002 +Relative difference = 1.0277089176796747e-08 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe -p 2048 256 12 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.819162e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.469996e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.469996e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.879359 sec + 8,193,098,191 cycles # 2.110 GHz + 15,513,331,501 instructions # 1.89 insn per cycle + 3.898953032 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 920) (512y: 59) (512z: 1220) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.282804e-02 +Avg ME (F77/C++) = 1.2828039868165090E-002 +Relative difference = 1.0277089176796747e-08 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_bridge.txt index eca66f0c00..f78ea7251e 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_bridge.txt @@ -1,170 +1,222 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasNoCurand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) +RNDGEN=hasCurand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-01-31_14:38:29 +DATE: 2024-01-30_05:45:26 -On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 12 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 12 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.488604e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.339655e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.339655e+07 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 5.547931 sec - 18,342,913,114 cycles:u # 3.284 GHz (74.95%) - 120,415,042 stalled-cycles-frontend:u # 0.66% frontend cycles idle (74.96%) - 6,982,028,143 stalled-cycles-backend:u # 38.06% backend cycles idle (75.02%) - 17,140,288,864 instructions:u # 0.93 insn per cycle - # 0.41 stalled cycles per insn (75.08%) - 5.611856863 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 4.460171e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.485962e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.485962e+07 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 2.319187 sec + 7,341,770,811 cycles # 2.857 GHz + 13,101,723,847 instructions # 1.78 insn per cycle + 2.628471382 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) +WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) +==PROF== Profiling "sigmaKin": launch__registers_per_thread 166 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.282804e-02 -Avg ME (F77/CUDA) = 1.2828039868165208E-002 -Relative difference = 1.0277079981222336e-08 +Avg ME (F77/CUDA) = 1.2828039868165201E-002 +Relative difference = 1.0277080522138477e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.233619e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.408095e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.408095e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 5.950601 sec - 19,929,648,484 cycles:u # 3.325 GHz (74.91%) - 51,037,859 stalled-cycles-frontend:u # 0.26% frontend cycles idle (74.95%) - 111,782,691 stalled-cycles-backend:u # 0.56% backend cycles idle (75.01%) - 47,317,725,455 instructions:u # 2.37 insn per cycle - # 0.00 stalled cycles per insn (75.05%) - 5.995864091 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 471) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 9.576223e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.107198e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.107198e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 7.190455 sec + 20,703,597,440 cycles # 2.877 GHz + 47,160,901,733 instructions # 2.28 insn per cycle + 7.198222207 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 472) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest.exe [ 
PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.869740e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.339491e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.339491e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 4.242404 sec - 13,932,338,033 cycles:u # 3.251 GHz (75.02%) - 51,648,161 stalled-cycles-frontend:u # 0.37% frontend cycles idle (75.00%) - 1,023,128,866 stalled-cycles-backend:u # 7.34% backend cycles idle (74.99%) - 31,992,864,934 instructions:u # 2.30 insn per cycle - # 0.03 stalled cycles per insn (74.99%) - 4.289164967 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.473769e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.897978e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.897978e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 4.898106 sec + 14,084,591,919 cycles # 2.873 GHz + 32,028,151,491 instructions # 2.27 insn per cycle + 4.906157596 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1626) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.545589e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.346989e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.346989e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 3.357207 sec - 10,846,967,886 cycles:u # 3.190 GHz (74.86%) - 50,608,240 stalled-cycles-frontend:u # 0.47% frontend cycles idle (74.97%) - 521,276,562 stalled-cycles-backend:u # 4.81% backend cycles idle (75.07%) - 20,691,777,146 instructions:u # 1.91 insn per cycle - # 0.03 stalled cycles per insn (75.08%) - 3.403927192 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1946) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.834615e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.502061e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.502061e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 4.065584 sec + 11,264,443,170 cycles # 2.767 GHz + 20,844,723,129 instructions # 1.85 insn per cycle + 4.073296839 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1964) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
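The event counts in these runs follow from the "-p 2048 256 12" arguments: each of the 12 iterations processes gpublocks*gputhreads events, which is the nevt=524288 printed when the Bridge is instantiated, and the EvtsPerSec counters presumably divide the grand total by the time spent in the corresponding phase rather than by the TOTAL wall time. A hypothetical arithmetic sketch:

#include <cstdio>

int main()
{
  const long gpublocks = 2048, gputhreads = 256, niter = 12; // from "-p 2048 256 12"
  const long nevt = gpublocks * gputhreads; // 524288, as in "Instantiate ... Bridge (nevt=524288, ...)"
  printf( "events per iteration = %ld, total events = %ld\n", nevt, nevt * niter ); // 524288, 6291456
  return 0;
}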
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.282804e-02 +Avg ME (F77/C++) = 1.2828039868165090E-002 +Relative difference = 1.0277089176796747e-08 +OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! Instantiate host Bridge (nevt=524288) +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.930005e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.695920e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.695920e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.900573 sec + 10,821,072,419 cycles # 2.771 GHz + 20,305,054,668 instructions # 1.88 insn per cycle + 3.908355042 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1655) (512y: 161) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165090E-002 Relative difference = 1.0277089176796747e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! 
Instantiate host Bridge (nevt=524288) +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.707724e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.274502e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.274502e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 4.333313 sec + 9,497,951,325 cycles # 2.189 GHz + 16,666,820,850 instructions # 1.75 insn per cycle + 4.341233179 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 920) (512y: 59) (512z: 1220) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.282804e-02 +Avg ME (F77/C++) = 1.2828039868165090E-002 +Relative difference = 1.0277089176796747e-08 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_common.txt index c2faab2d60..f072467bfa 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_common.txt @@ -1,164 +1,209 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasNoCurand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) +RNDGEN=hasCurand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-01-31_14:52:23 +DATE: 2024-01-30_05:59:18 -On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 12 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 12 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.261681e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.110113e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.338630e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.483909e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.562012e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.071690e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 4.688984 sec - 15,405,560,202 cycles:u # 3.276 GHz (74.94%) - 53,825,141 stalled-cycles-frontend:u # 0.35% frontend cycles idle (74.99%) - 6,937,223,184 stalled-cycles-backend:u # 45.03% backend cycles idle (75.00%) - 11,638,228,833 instructions:u # 0.76 insn per cycle - # 0.60 stalled cycles per insn (75.00%) - 4.746802261 seconds time elapsed +TOTAL : 1.371489 sec + 4,620,404,364 cycles # 2.861 GHz + 7,153,271,516 instructions # 1.55 insn per cycle + 1.672602435 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --common +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 166 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.282804e-02 -Avg ME (F77/CUDA) = 1.2828039868165208E-002 -Relative difference = 1.0277079981222336e-08 +Avg ME (F77/CUDA) = 1.2828039868165201E-002 +Relative difference = 1.0277080522138477e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.249339e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.428218e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.428218e+06 ) sec^-1 +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 9.952512e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.155636e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.155636e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 5.773449 sec - 19,539,350,931 cycles:u # 3.366 GHz (74.92%) - 50,052,041 stalled-cycles-frontend:u # 0.26% frontend cycles idle (74.97%) - 54,884,087 stalled-cycles-backend:u # 0.28% backend cycles idle (75.04%) - 46,993,014,397 instructions:u # 2.41 insn per cycle - # 0.00 stalled cycles per insn (75.06%) - 5.807755800 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 471) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 7.107803 sec + 20,592,800,911 cycles # 2.895 GHz + 47,037,031,319 instructions # 2.28 insn per cycle + 7.114495241 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 472) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.921355e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.420017e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.420017e+06 ) sec^-1 +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.558277e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.038534e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.038534e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 4.007344 sec - 13,356,890,506 cycles:u # 3.307 GHz (74.88%) - 49,517,641 stalled-cycles-frontend:u # 0.37% frontend cycles idle (74.98%) - 1,103,447,662 stalled-cycles-backend:u # 8.26% backend cycles idle (75.04%) - 31,130,342,637 instructions:u # 2.33 insn per cycle - # 0.04 stalled cycles per insn (75.05%) - 4.041464440 seconds time elapsed +TOTAL : 4.822482 sec + 13,870,774,877 cycles # 2.874 GHz + 31,186,249,487 instructions # 2.25 insn per cycle + 4.828845646 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1626) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.656218e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.533268e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.533268e+06 ) sec^-1 +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.951724e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.730389e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.730389e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 3.104627 sec - 10,149,444,998 cycles:u # 3.236 GHz (75.01%) - 48,969,969 stalled-cycles-frontend:u # 0.48% frontend cycles idle (75.01%) - 452,522,891 stalled-cycles-backend:u # 4.46% backend cycles idle (75.01%) - 19,371,433,086 instructions:u # 1.91 insn per cycle - # 0.02 stalled cycles per insn (75.03%) - 3.138941761 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1946) (512y: 0) (512z: 0) +TOTAL : 4.015384 sec + 11,119,337,735 cycles # 2.766 GHz + 19,381,852,554 instructions # 1.74 insn per cycle + 4.022009475 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1964) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165090E-002 Relative difference = 1.0277089176796747e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.063314e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.951443e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.951443e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 +TOTAL : 3.845408 sec + 10,662,597,452 cycles # 2.769 GHz + 18,643,141,459 instructions # 1.75 insn per cycle + 3.852109381 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1655) (512y: 161) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.282804e-02 +Avg ME (F77/C++) = 1.2828039868165090E-002 +Relative difference = 1.0277089176796747e-08 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.811483e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.460421e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.460421e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 +TOTAL : 4.272939 sec + 9,279,488,955 cycles # 2.169 GHz + 15,212,537,826 instructions # 1.64 insn per cycle + 4.279485071 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 920) (512y: 59) (512z: 1220) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
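The vector widths reported in the "Internal loops fptype_sv" lines follow directly from the register size divided by the floating-point width: with FPTYPE=d (64-bit doubles), 128-bit SSE4 gives VECTOR[2], 256-bit AVX2 and 512y give VECTOR[4], and 512-bit 512z gives VECTOR[8], exactly as printed above. A small illustrative sketch of that arithmetic:

#include <cstdio>

int main()
{
  const int fptypeBits = 64; // double precision, FPTYPE=d
  for( int regBits : { 128, 256, 512 } )
    printf( "%d-bit registers -> VECTOR[%d]\n", regBits, regBits / fptypeBits );
  return 0;
}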
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.282804e-02 +Avg ME (F77/C++) = 1.2828039868165090E-002 +Relative difference = 1.0277089176796747e-08 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_curhst.txt index 280278479d..a6db5de426 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_curhst.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_curhst.txt @@ -1,133 +1,209 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasNoCurand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) +RNDGEN=hasCurand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-01-31_14:50:03 +DATE: 2024-01-30_05:55:54 -On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 12 --curhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 12 --curhst OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 6.492089e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.565509e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.085712e+08 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 0.999229 sec + 3,503,665,967 cycles # 2.851 GHz + 7,040,796,455 instructions # 2.01 insn per cycle + 1.289089254 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --curhst WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe: Aborted - 53,192,877 cycles:u # 2.433 GHz (63.44%) - 38,671 stalled-cycles-frontend:u # 0.07% frontend cycles idle (63.44%) - 642,394 stalled-cycles-backend:u # 1.21% backend cycles idle (63.44%) - 41,176,822 instructions:u # 0.77 insn per cycle - # 0.02 stalled cycles per insn (65.67%) - 0.022859651 seconds time elapsed +==PROF== Profiling "sigmaKin": launch__registers_per_thread 166 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.282804e-02 -Avg ME (F77/CUDA) = 1.2828039868165208E-002 -Relative difference = 1.0277079981222336e-08 +Avg ME (F77/CUDA) = 1.2828039868165201E-002 +Relative difference = 1.0277080522138477e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe: Aborted - 56,283,909 cycles:u # 2.613 GHz (62.89%) - 41,850 stalled-cycles-frontend:u # 0.07% frontend cycles idle (62.90%) - 601,698 stalled-cycles-backend:u # 1.07% backend cycles idle (62.90%) - 42,399,243 instructions:u # 0.75 insn per cycle - # 0.01 stalled cycles per insn (59.04%) - 0.022848500 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 471) (avx2: 0) (512y: 0) (512z: 0) +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 9.897604e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.152411e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.152411e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 6.777699 sec + 19,525,012,140 cycles # 2.879 GHz + 46,935,602,227 instructions # 2.40 insn per cycle + 6.784496054 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 472) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe: Aborted - 56,518,310 cycles:u # 2.612 GHz (63.07%) - 43,325 stalled-cycles-frontend:u # 0.08% frontend cycles idle (63.07%) - 585,678 stalled-cycles-backend:u # 1.04% backend cycles idle (63.07%) - 42,553,804 instructions:u # 0.75 insn per cycle - # 0.01 stalled cycles per insn (58.39%) - 0.023052274 seconds time elapsed +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.565929e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.046315e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.046315e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 4.431371 sec + 12,844,580,525 cycles # 2.895 GHz + 31,183,505,413 instructions # 2.43 insn per cycle + 4.438022505 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1626) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe: Aborted - 51,166,849 cycles:u # 2.361 GHz (63.11%) - 45,358 stalled-cycles-frontend:u # 0.09% frontend cycles idle (63.11%) - 595,939 stalled-cycles-backend:u # 1.16% backend cycles idle (63.11%) - 43,196,692 instructions:u # 0.84 insn per cycle - # 0.01 stalled cycles per insn (64.81%) - 0.023009761 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1946) (512y: 0) (512z: 0) +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.956069e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.738681e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.738681e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.632695 sec + 10,040,197,478 cycles # 2.761 GHz + 19,480,754,402 instructions # 1.94 insn per cycle + 3.639336589 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1964) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165090E-002 Relative difference = 1.0277089176796747e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.068909e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.973543e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.973543e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.456026 sec + 9,583,252,780 cycles # 2.770 GHz + 18,943,299,087 instructions # 1.98 insn per cycle + 3.462550493 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1655) (512y: 161) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.282804e-02 +Avg ME (F77/C++) = 1.2828039868165090E-002 +Relative difference = 1.0277089176796747e-08 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.820163e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.473451e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.473451e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.874228 sec + 8,184,248,497 cycles # 2.110 GHz + 15,512,168,002 instructions # 1.90 insn per cycle + 3.880483923 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 920) (512y: 59) (512z: 1220) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.282804e-02 +Avg ME (F77/C++) = 1.2828039868165090E-002 +Relative difference = 1.0277089176796747e-08 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_rmbhst.txt index 716313b078..4dded3e862 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_rmbhst.txt @@ -1,164 +1,211 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasNoCurand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) +RNDGEN=hasCurand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-01-31_14:46:17 +DATE: 2024-01-30_05:52:26 -On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 12 --rmbhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 12 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+MESDEV/none+NAVBRK +WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.521065e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.087989e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.316527e+07 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 5.382298 sec - 17,842,680,178 cycles:u # 3.292 GHz (75.07%) - 118,997,598 stalled-cycles-frontend:u # 0.67% frontend cycles idle (75.07%) - 6,884,127,997 stalled-cycles-backend:u # 38.58% backend cycles idle (75.06%) - 16,790,757,485 instructions:u # 0.94 insn per cycle - # 0.41 stalled cycles per insn (75.03%) - 5.439280481 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 5.831383e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.529080e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.990768e+08 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 1.936415 sec + 6,196,996,673 cycles # 2.858 GHz + 11,355,646,527 instructions # 1.83 insn per cycle + 2.226164304 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --rmbhst +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +==PROF== Profiling "sigmaKin": launch__registers_per_thread 166 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.282804e-02 -Avg ME (F77/CUDA) = 1.2828039868165208E-002 -Relative difference = 1.0277079981222336e-08 +Avg ME (F77/CUDA) = 1.2828039868165201E-002 +Relative difference = 1.0277080522138477e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.250417e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.429575e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.429575e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 5.768884 sec - 19,515,269,263 cycles:u # 3.365 GHz (74.95%) - 50,331,013 stalled-cycles-frontend:u # 0.26% frontend cycles idle (75.02%) - 60,390,410 stalled-cycles-backend:u # 0.31% backend cycles idle (75.04%) - 47,000,896,839 instructions:u # 2.41 insn per cycle - # 0.00 stalled cycles per insn (75.04%) - 5.803374749 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 471) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 9.923680e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.152570e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.152570e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 6.755774 sec + 19,508,468,124 cycles # 2.886 GHz + 46,934,079,079 instructions # 2.41 insn per cycle + 6.762162730 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 472) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.919522e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.420724e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.420724e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 4.012332 sec - 13,381,132,630 cycles:u # 3.309 GHz (74.88%) - 49,972,164 stalled-cycles-frontend:u # 0.37% frontend cycles idle (74.89%) - 1,038,416,264 stalled-cycles-backend:u # 7.76% backend cycles idle (74.99%) - 31,108,050,469 instructions:u # 2.32 insn per cycle - # 0.03 stalled cycles per insn (75.08%) - 4.046628929 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.560350e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.041132e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.041132e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 4.445978 sec + 12,824,682,223 cycles # 2.881 GHz + 31,183,984,467 instructions # 2.43 insn per cycle + 4.452647644 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1626) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.656353e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.534875e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.534875e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 3.103048 sec - 10,154,943,843 cycles:u # 3.239 GHz (74.99%) - 49,102,982 stalled-cycles-frontend:u # 0.48% frontend cycles idle (74.99%) - 434,988,979 stalled-cycles-backend:u # 4.28% backend cycles idle (74.99%) - 19,375,707,710 instructions:u # 1.91 insn per cycle - # 0.02 stalled cycles per insn (75.02%) - 3.137538734 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1946) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.945035e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.719021e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.719021e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.651562 sec + 10,054,417,482 cycles # 2.750 GHz + 19,480,651,159 instructions # 1.94 insn per cycle + 3.658175830 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1964) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165090E-002 Relative difference = 1.0277089176796747e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.065244e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.964476e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.964476e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.463334 sec + 9,575,609,591 cycles # 2.761 GHz + 18,944,249,093 instructions # 1.98 insn per cycle + 3.469928809 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1655) (512y: 161) (512z: 0) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.282804e-02 +Avg ME (F77/C++) = 1.2828039868165090E-002 +Relative difference = 1.0277089176796747e-08 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.819790e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.476564e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.476564e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.875473 sec + 8,194,000,405 cycles # 2.112 GHz + 15,512,267,676 instructions # 1.89 insn per cycle + 3.882168596 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 920) (512y: 59) (512z: 1220) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.282804e-02 +Avg ME (F77/C++) = 1.2828039868165090E-002 +Relative difference = 1.0277089176796747e-08 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd1.txt index 78355813e9..9238de7bbb 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd1.txt @@ -1,164 +1,209 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasNoCurand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) +RNDGEN=hasCurand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd1' +CUDACPP_BUILDDIR='build.512y_d_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-01-31_13:48:32 +DATE: 2024-01-30_04:52:22 -On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/gcheck.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/gcheck.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.593715e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.569869e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.888804e+07 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 4.652863 sec - 15,320,835,933 cycles:u # 3.274 GHz (74.95%) - 53,752,772 stalled-cycles-frontend:u # 0.35% frontend cycles idle (74.99%) - 6,932,962,718 stalled-cycles-backend:u # 45.25% backend cycles idle (75.06%) - 11,508,742,478 instructions:u # 0.75 insn per cycle - # 0.60 stalled cycles per insn (75.06%) - 4.711242016 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 5.433269e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.304294e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.211626e+08 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 0.708580 sec + 2,678,035,833 cycles # 2.828 GHz + 4,219,258,618 instructions # 1.58 insn per cycle + 1.025396427 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/gcheck.exe -p 2048 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 154 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.282804e-02 -Avg ME (F77/CUDA) = 1.2828039868165216E-002 -Relative difference = 1.0277079305077159e-08 +Avg ME (F77/CUDA) = 1.2828039868165206E-002 +Relative difference = 1.027708011645137e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.320903e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.522950e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.522950e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 5.497632 sec - 18,560,081,042 cycles:u # 3.357 GHz (74.97%) - 51,404,905 stalled-cycles-frontend:u # 0.28% frontend cycles idle (74.98%) - 64,148,698 stalled-cycles-backend:u # 0.35% backend cycles idle (74.98%) - 44,859,448,593 instructions:u # 2.42 insn per cycle - # 0.00 stalled cycles per insn (74.90%) - 5.532302089 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 485) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.057712e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.240764e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.240764e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 6.363915 sec + 18,420,155,453 cycles # 2.892 GHz + 44,716,833,361 instructions # 2.43 insn per cycle + 6.376789264 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 486) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164921E-002 Relative difference = 1.0277102294013186e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd1/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd1/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.012252e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.561147e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.561147e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 3.861847 sec - 12,799,524,672 cycles:u # 3.287 GHz (74.95%) - 48,923,305 stalled-cycles-frontend:u # 0.38% frontend cycles idle (74.95%) - 107,403,190 stalled-cycles-backend:u # 0.84% backend cycles idle (74.94%) - 30,132,025,093 instructions:u # 2.35 insn per cycle - # 0.00 stalled cycles per insn (74.98%) - 3.898000257 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.624136e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.147437e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.147437e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 4.286124 sec + 12,429,118,549 cycles # 2.897 GHz + 30,107,231,858 instructions # 2.42 insn per cycle + 4.302706533 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1569) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164921E-002 Relative difference = 1.0277102294013186e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd1/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd1/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.591787e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.432648e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.432648e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 3.163229 sec - 10,369,794,084 cycles:u # 3.245 GHz (74.99%) - 49,953,629 stalled-cycles-frontend:u # 0.48% frontend cycles idle (74.97%) - 285,363,281 stalled-cycles-backend:u # 2.75% backend cycles idle (74.98%) - 19,016,184,956 instructions:u # 1.83 insn per cycle - # 0.02 stalled cycles per insn (74.98%) - 3.199312894 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1884) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.942189e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.705004e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.705004e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.656079 sec + 10,127,428,804 cycles # 2.766 GHz + 19,115,519,637 instructions # 1.89 insn per cycle + 3.673885868 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1902) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165093E-002 Relative difference = 1.0277088906338675e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd1/check.exe -p 2048 256 12 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.094903e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.039710e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.039710e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.417483 sec + 9,477,381,758 cycles # 2.768 GHz + 18,489,351,216 instructions # 1.95 insn per cycle + 3.434681568 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1576) (512y: 159) (512z: 0) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.282804e-02 +Avg ME (F77/C++) = 1.2828039868165093E-002 +Relative difference = 1.0277088906338675e-08 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd1/check.exe -p 2048 256 12 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.183418e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.193735e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.193735e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.298580 sec + 7,210,521,695 cycles # 2.182 GHz + 13,864,693,183 instructions # 1.92 insn per cycle + 3.315590461 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 818) (512y: 57) (512z: 898) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.282804e-02 +Avg ME (F77/C++) = 1.2828039868165093E-002 +Relative difference = 1.0277088906338675e-08 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd0.txt index f09c7ac494..09e3552971 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd0.txt @@ -1,164 +1,209 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasNoCurand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) +RNDGEN=hasCurand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.avx2_d_inl1_hrd0' +CUDACPP_BUILDDIR='build.512y_d_inl1_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-01-31_14:19:13 +DATE: 2024-01-30_05:33:56 -On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/gcheck.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/gcheck.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.293150e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.104547e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.333336e+07 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 4.667156 sec - 15,394,349,846 cycles:u # 3.274 GHz (75.07%) - 53,848,125 stalled-cycles-frontend:u # 0.35% frontend cycles idle (74.95%) - 6,942,774,491 stalled-cycles-backend:u # 45.10% backend cycles idle (74.94%) - 11,571,980,671 instructions:u # 0.75 insn per cycle - # 0.60 stalled cycles per insn (74.99%) - 4.724485567 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 6.454720e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.590982e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.126095e+08 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 0.682889 sec + 2,611,559,388 cycles # 2.831 GHz + 3,986,840,129 instructions # 1.53 insn per cycle + 0.986209294 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/gcheck.exe -p 2048 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 166 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.282804e-02 -Avg ME (F77/CUDA) = 1.2828039868165208E-002 -Relative difference = 1.0277079981222336e-08 +Avg ME (F77/CUDA) = 1.2828039868165201E-002 +Relative difference = 1.0277080522138477e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.776160e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.164284e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.164284e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 4.280355 sec - 14,197,543,075 cycles:u # 3.292 GHz (74.96%) - 45,822,161 stalled-cycles-frontend:u # 0.32% frontend cycles idle (74.97%) - 534,603,013 stalled-cycles-backend:u # 3.77% backend cycles idle (74.96%) - 36,933,424,641 instructions:u # 2.60 insn per cycle - # 0.01 stalled cycles per insn (74.96%) - 4.314915144 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.350945e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.669369e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.669369e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 5.075739 sec + 14,632,134,397 cycles # 2.880 GHz + 36,697,212,873 instructions # 2.51 insn per cycle + 5.082665504 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 707) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd0/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd0/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.400347e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.238738e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.238738e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 3.352610 sec - 11,041,411,662 cycles:u # 3.262 GHz (74.85%) - 49,430,935 stalled-cycles-frontend:u # 0.45% frontend cycles idle (74.85%) - 65,289,997 stalled-cycles-backend:u # 0.59% backend cycles idle (74.95%) - 24,715,980,656 instructions:u # 2.24 insn per cycle - # 0.00 stalled cycles per insn (75.03%) - 3.388700835 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.975416e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.812212e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.812212e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.599579 sec + 10,391,716,980 cycles # 2.883 GHz + 24,753,509,930 instructions # 2.38 insn per cycle + 3.606361950 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2334) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.998612e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.177017e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.177017e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 2.834155 sec - 9,225,543,059 cycles:u # 3.218 GHz (74.93%) - 49,920,264 stalled-cycles-frontend:u # 0.54% frontend cycles idle (74.89%) - 523,058,244 stalled-cycles-backend:u # 5.67% backend cycles idle (74.88%) - 16,856,415,715 instructions:u # 1.83 insn per cycle - # 0.03 stalled cycles per insn (74.93%) - 2.870336452 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1586) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.206864e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.274609e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.274609e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.273737 sec + 8,884,033,270 cycles # 2.722 GHz + 16,960,441,009 instructions # 1.91 insn per cycle + 3.280558312 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1604) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165090E-002 Relative difference = 1.0277089176796747e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd0/check.exe -p 2048 256 12 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.436675e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.780065e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.780065e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 2.997375 sec + 8,315,936,313 cycles # 2.769 GHz + 16,298,181,743 instructions # 1.96 insn per cycle + 3.004046425 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2403) (512y: 292) (512z: 0) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd0/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.282804e-02 +Avg ME (F77/C++) = 1.2828039868165090E-002 +Relative difference = 1.0277089176796747e-08 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd0/check.exe -p 2048 256 12 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.987391e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.794180e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.794180e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.583817 sec + 7,670,874,044 cycles # 2.137 GHz + 14,352,448,248 instructions # 1.87 insn per cycle + 3.590538974 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 892) (512y: 63) (512z: 975) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd0/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.282804e-02 +Avg ME (F77/C++) = 1.2828039868165090E-002 +Relative difference = 1.0277089176796747e-08 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd1.txt index 25c71260e9..508008a0c5 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd1.txt @@ -1,164 +1,209 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasNoCurand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) +RNDGEN=hasCurand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.avx2_d_inl1_hrd1' +CUDACPP_BUILDDIR='build.512y_d_inl1_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-01-31_14:19:42 +DATE: 2024-01-30_05:34:28 -On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/gcheck.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/gcheck.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.862243e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.571665e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.890778e+07 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 4.656807 sec - 15,360,030,989 cycles:u # 3.275 GHz (74.94%) - 53,784,472 stalled-cycles-frontend:u # 0.35% frontend cycles idle (74.92%) - 6,957,453,969 stalled-cycles-backend:u # 45.30% backend cycles idle (74.93%) - 11,504,128,443 instructions:u # 0.75 insn per cycle - # 0.60 stalled cycles per insn (74.93%) - 4.716015553 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 6.464301e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.594213e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.177261e+08 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 0.680513 sec + 2,594,214,158 cycles # 2.833 GHz + 3,992,420,158 instructions # 1.54 insn per cycle + 0.978034885 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/gcheck.exe -p 2048 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 154 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.282804e-02 -Avg ME (F77/CUDA) = 1.2828039868165216E-002 -Relative difference = 1.0277079305077159e-08 +Avg ME (F77/CUDA) = 1.2828039868165206E-002 +Relative difference = 1.027708011645137e-08 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.430798e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.218429e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.218429e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 3.325483 sec - 10,899,459,723 cycles:u # 3.247 GHz (74.98%) - 51,006,099 stalled-cycles-frontend:u # 0.47% frontend cycles idle (75.00%) - 55,681,565 stalled-cycles-backend:u # 0.51% backend cycles idle (74.88%) - 28,438,633,768 instructions:u # 2.61 insn per cycle - # 0.00 stalled cycles per insn (74.86%) - 3.359880853 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.895468e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.581482e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.581482e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.738704 sec + 10,794,188,443 cycles # 2.885 GHz + 28,356,720,092 instructions # 2.63 insn per cycle + 3.745371478 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 600) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164921E-002 Relative difference = 1.0277102294013186e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd1/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd1/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.619196e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.647337e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.647337e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 3.139951 sec - 10,274,909,594 cycles:u # 3.238 GHz (75.04%) - 49,997,017 stalled-cycles-frontend:u # 0.49% frontend cycles idle (75.04%) - 73,156,824 stalled-cycles-backend:u # 0.71% backend cycles idle (75.04%) - 21,493,261,529 instructions:u # 2.09 insn per cycle - # 0.00 stalled cycles per insn (74.94%) - 3.176692165 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.231818e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.360148e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.360148e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.232648 sec + 9,331,358,518 cycles # 2.882 GHz + 21,587,159,141 instructions # 2.31 insn per cycle + 3.239331570 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2117) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164921E-002 Relative difference = 1.0277102294013186e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd1/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd1/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.291532e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.758064e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.758064e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 2.645509 sec - 8,558,488,971 cycles:u # 3.196 GHz (74.94%) - 49,337,655 stalled-cycles-frontend:u # 0.58% frontend cycles idle (74.93%) - 143,499,117 stalled-cycles-backend:u # 1.68% backend cycles idle (74.93%) - 15,863,646,395 instructions:u # 1.85 insn per cycle - # 0.01 stalled cycles per insn (74.91%) - 2.681607729 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1479) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.406271e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.696326e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.696326e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.030114 sec + 8,381,289,955 cycles # 2.761 GHz + 15,943,872,727 instructions # 1.90 insn per cycle + 3.036686774 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1497) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165093E-002 Relative difference = 1.0277088906338675e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd1/check.exe -p 2048 256 12 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.611770e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.211566e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.211566e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 2.823652 sec + 7,834,743,570 cycles # 2.770 GHz + 15,370,444,400 instructions # 1.96 insn per cycle + 2.830226684 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2179) (512y: 307) (512z: 0) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd1/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.282804e-02 +Avg ME (F77/C++) = 1.2828039868165093E-002 +Relative difference = 1.0277088906338675e-08 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd1/check.exe -p 2048 256 12 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.110110e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.044152e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.044152e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.399029 sec + 7,342,854,469 cycles # 2.157 GHz + 13,880,932,107 instructions # 1.89 insn per cycle + 3.405583219 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 853) (512y: 69) (512z: 905) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd1/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.282804e-02 +Avg ME (F77/C++) = 1.2828039868165093E-002 +Relative difference = 1.0277088906338675e-08 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt index 9d85c8125b..30054d0a8f 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt @@ -1,164 +1,209 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasNoCurand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) +RNDGEN=hasCurand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-01-31_13:49:03 +DATE: 2024-01-30_04:52:57 -On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.822503e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.886194e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.564933e+08 ) sec^-1 -MeanMatrixElemValue = ( 1.371895e-02 +- 3.272985e-06 ) GeV^0 -TOTAL : 4.540991 sec - 14,964,746,430 cycles:u # 3.275 GHz (75.08%) - 53,367,635 stalled-cycles-frontend:u # 0.36% frontend cycles idle (75.02%) - 6,902,312,333 stalled-cycles-backend:u # 46.12% backend cycles idle (74.96%) - 11,491,236,018 instructions:u # 0.77 insn per cycle - # 0.60 stalled cycles per insn (74.95%) - 4.596843421 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 1.089125e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.083340e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.291553e+09 ) sec^-1 +MeanMatrixElemValue = ( 1.371687e-02 +- 3.270220e-06 ) GeV^0 +TOTAL : 0.592260 sec + 2,336,196,912 cycles # 2.833 GHz + 3,633,132,034 instructions # 1.56 insn per cycle + 0.902800684 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 117 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.282802e-02 -Avg ME (F77/CUDA) = 1.2828036033170065E-002 -Relative difference = 1.2498553996774023e-06 +Avg ME (F77/CUDA) = 1.2828112125134794E-002 +Relative difference = 7.1815552823662555e-06 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.418440e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.645579e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.645579e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371887e-02 +- 3.270267e-06 ) GeV^0 -TOTAL : 5.112835 sec - 17,308,897,976 cycles:u # 3.368 GHz (74.94%) - 39,897,530 stalled-cycles-frontend:u # 0.23% frontend cycles idle (74.95%) - 35,730,790 stalled-cycles-backend:u # 0.21% backend cycles idle (74.95%) - 47,288,452,597 instructions:u # 2.73 insn per cycle - # 0.00 stalled cycles per insn (74.95%) - 5.142750348 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 541) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.035118e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.220346e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.220346e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 +TOTAL : 6.458158 sec + 18,623,778,658 cycles # 2.882 GHz + 47,047,597,520 instructions # 2.53 insn per cycle + 6.468376899 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 542) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039569285465E-002 -Relative difference = 3.357602059382168e-08 +Avg ME (F77/C++) = 1.2828039441956207E-002 +Relative difference = 4.35018750695023e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.920622e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.133455e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.133455e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371887e-02 +- 3.270266e-06 ) GeV^0 -TOTAL : 2.831499 sec - 9,329,735,116 cycles:u # 3.263 GHz (74.87%) - 41,446,131 stalled-cycles-frontend:u # 0.44% frontend cycles idle (74.84%) - 634,040,066 stalled-cycles-backend:u # 6.80% backend cycles idle (74.96%) - 22,145,927,626 instructions:u # 2.37 insn per cycle - # 0.03 stalled cycles per insn (75.09%) - 2.863019748 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.220597e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.402817e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.402817e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 +TOTAL : 3.207438 sec + 9,259,856,985 cycles # 2.882 GHz + 22,093,069,841 instructions # 2.39 insn per cycle + 3.223491423 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1883) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039385567536E-002 -Relative difference = 4.7897610623017996e-08 +Avg ME (F77/C++) = 1.2828039280066150E-002 +Relative difference = 5.612189004572479e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.417312e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.003968e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.003968e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 -TOTAL : 2.517278 sec - 8,221,686,635 cycles:u # 3.231 GHz (74.73%) - 42,237,080 stalled-cycles-frontend:u # 0.51% frontend cycles idle (74.72%) - 1,468,343,547 stalled-cycles-backend:u # 17.86% backend cycles idle (75.01%) - 15,538,735,402 instructions:u # 1.89 insn per cycle - # 0.09 stalled cycles per insn (75.17%) - 2.548646629 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2601) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.440699e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.781387e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.781387e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 +TOTAL : 2.957121 sec + 8,193,990,799 cycles # 2.766 GHz + 15,625,791,555 instructions # 1.91 insn per cycle + 2.973833384 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2619) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828053369958070E-002 -Relative difference = 2.627022867500074e-07 +Avg ME (F77/C++) = 1.2828053255361738E-002 +Relative difference = 2.5376902468575066e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe -p 2048 256 12 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.532783e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.026282e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.026282e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 +TOTAL : 2.864857 sec + 7,877,312,491 cycles # 2.746 GHz + 15,298,553,606 instructions # 1.94 insn per cycle + 2.880238416 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2414) (512y: 13) (512z: 0) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.282805e-02 +Avg ME (F77/C++) = 1.2828053255361738E-002 +Relative difference = 2.5376902468575066e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe -p 2048 256 12 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.515538e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.925634e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.925634e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 +TOTAL : 2.878384 sec + 6,411,016,127 cycles # 2.223 GHz + 12,624,518,195 instructions # 1.97 insn per cycle + 2.897065980 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1615) (512y: 12) (512z: 1404) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.282805e-02 +Avg ME (F77/C++) = 1.2828052589611616E-002 +Relative difference = 2.0187102602673518e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_bridge.txt index 1e7b5259fb..cb0960cef7 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_bridge.txt @@ -1,170 +1,222 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasNoCurand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) +RNDGEN=hasCurand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-01-31_14:39:01 +DATE: 2024-01-30_05:46:05 -On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 12 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 12 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.591289e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.292543e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.292543e+08 ) sec^-1 -MeanMatrixElemValue = ( 1.371886e-02 +- 3.270260e-06 ) GeV^0 -TOTAL : 5.354674 sec - 17,801,711,429 cycles:u # 3.305 GHz (74.96%) - 119,136,937 stalled-cycles-frontend:u # 0.67% frontend cycles idle (74.96%) - 6,965,849,221 stalled-cycles-backend:u # 39.13% backend cycles idle (74.94%) - 17,030,804,835 instructions:u # 0.96 insn per cycle - # 0.41 stalled cycles per insn (74.98%) - 5.411875647 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 6.896245e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.389243e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.389243e+07 ) sec^-1 +MeanMatrixElemValue = ( 1.371710e-02 +- 3.270389e-06 ) GeV^0 +TOTAL : 1.734031 sec + 5,668,072,364 cycles # 2.868 GHz + 10,146,395,921 instructions # 1.79 insn per cycle + 2.033339529 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) +WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) +==PROF== Profiling "sigmaKin": launch__registers_per_thread 117 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.282802e-02 -Avg ME (F77/CUDA) = 1.2828036033170065E-002 -Relative difference = 1.2498553996774023e-06 +Avg ME (F77/CUDA) = 1.2828112125134794E-002 +Relative difference = 7.1815552823662555e-06 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.406552e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.630049e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.630049e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371887e-02 +- 3.270267e-06 ) GeV^0 -TOTAL : 5.208125 sec - 17,519,950,376 cycles:u # 3.343 GHz (74.98%) - 39,726,529 stalled-cycles-frontend:u # 0.23% frontend cycles idle (74.91%) - 62,585,991 stalled-cycles-backend:u # 0.36% backend cycles idle (74.90%) - 47,398,710,400 instructions:u # 2.71 insn per cycle - # 0.00 stalled cycles per insn (74.97%) - 5.243207445 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 541) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.023723e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.199962e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.199962e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 +TOTAL : 6.631109 sec + 19,198,970,802 cycles # 2.893 GHz + 47,195,604,267 instructions # 2.46 insn per cycle + 6.638520301 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 542) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039569285465E-002 -Relative difference = 3.357602059382168e-08 +Avg ME (F77/C++) = 1.2828039441956207E-002 +Relative difference = 4.35018750695023e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.841478e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.976405e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.976405e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371887e-02 +- 3.270266e-06 ) GeV^0 -TOTAL : 2.979950 sec - 9,712,189,466 cycles:u # 3.223 GHz (75.00%) - 42,454,538 stalled-cycles-frontend:u # 0.44% frontend cycles idle (75.05%) - 673,634,001 stalled-cycles-backend:u # 6.94% backend cycles idle (75.05%) - 23,422,171,384 instructions:u # 2.41 insn per cycle - # 0.03 stalled cycles per insn (75.05%) - 3.016536490 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.130711e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.183569e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.183569e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 +TOTAL : 3.452422 sec + 9,989,387,225 cycles # 2.889 GHz + 23,431,077,272 instructions # 2.35 insn per cycle + 3.459894158 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1883) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039385567536E-002 -Relative difference = 4.7897610623017996e-08 +Avg ME (F77/C++) = 1.2828039280066150E-002 +Relative difference = 5.612189004572479e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.323002e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.808846e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.808846e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 -TOTAL : 2.650715 sec - 8,541,815,755 cycles:u # 3.182 GHz (75.00%) - 42,861,880 stalled-cycles-frontend:u # 0.50% frontend cycles idle (74.97%) - 1,469,731,134 stalled-cycles-backend:u # 17.21% backend cycles idle (74.99%) - 16,642,700,791 instructions:u # 1.95 insn per cycle - # 0.09 stalled cycles per insn (74.99%) - 2.687678893 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2601) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.341081e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.547294e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.547294e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 +TOTAL : 3.196012 sec + 8,906,176,925 cycles # 2.782 GHz + 16,751,991,837 instructions # 1.88 insn per cycle + 3.203321936 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2619) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.282805e-02 +Avg ME (F77/C++) = 1.2828053255361738E-002 +Relative difference = 2.5376902468575066e-07 +OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! 
Instantiate host Bridge (nevt=524288) +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.434021e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.786427e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.786427e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 +TOTAL : 3.093664 sec + 8,635,370,178 cycles # 2.786 GHz + 16,424,138,356 instructions # 1.90 insn per cycle + 3.101132741 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2414) (512y: 13) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828053369958070E-002 -Relative difference = 2.627022867500074e-07 +Avg ME (F77/C++) = 1.2828053255361738E-002 +Relative difference = 2.5376902468575066e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! Instantiate host Bridge (nevt=524288) +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.383314e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.611676e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.611676e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 +TOTAL : 3.145258 sec + 7,151,980,153 cycles # 2.270 GHz + 13,850,467,115 instructions # 1.94 insn per cycle + 3.152590479 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1615) (512y: 12) (512z: 1404) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
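[Editor's sketch, not part of the patch] The EvtsPerSec figures relate to the `-p <blocks> <threads> <iterations>` arguments of the runExe lines: the Bridge warnings confirm nevt = gpublocks*gputhreads = 524288 per iteration for `-p 2048 256 12`, and each rate is total events divided by the time spent in the corresponding phase. A minimal reconstruction under that assumption (the phase-time derivation is illustrative, not the check.exe implementation):

```cpp
#include <cstdio>

// Hypothetical reconstruction: nevt per iteration = blocks * threads,
// matching "Instantiate host Bridge (nevt=524288)" for "-p 2048 256 12".
int main()
{
  const long blocks = 2048, threads = 256, iterations = 12;
  const long nevtPerIter = blocks * threads; // 524288 events per iteration
  // Rmb+ME rate from the 512y_f_inl0_hrd0 bridge log above (2.434021e+06 /s)
  const double secondsRmbME = ( nevtPerIter * iterations ) / 2.434021e+06;
  std::printf( "nevt/iter = %ld, total events = %ld, implied Rmb+ME time ~ %.3f s\n",
               nevtPerIter, nevtPerIter * iterations, secondsRmbME );
  return 0; // TOTAL in the log (3.09 s) also includes phases outside Rmb+ME
}
```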
------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.282805e-02 +Avg ME (F77/C++) = 1.2828052589611616E-002 +Relative difference = 2.0187102602673518e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_common.txt index f932e39c83..26c818590d 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_common.txt @@ -1,164 +1,209 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasNoCurand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) +RNDGEN=hasCurand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-01-31_14:52:53 +DATE: 2024-01-30_05:59:57 -On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 12 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 12 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.826922e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.880208e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.555099e+08 ) sec^-1 -MeanMatrixElemValue = ( 1.371895e-02 +- 3.272985e-06 ) GeV^0 -TOTAL : 4.540773 sec - 14,945,912,811 cycles:u # 3.273 GHz (75.06%) - 53,537,566 stalled-cycles-frontend:u # 0.36% frontend cycles idle (74.99%) - 14,415,846 stalled-cycles-backend:u # 0.10% backend cycles idle (74.90%) - 11,535,467,523 instructions:u # 0.77 insn per cycle - # 0.00 stalled cycles per insn (74.91%) - 4.590921512 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 1.303596e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.175288e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.243996e+09 ) sec^-1 +MeanMatrixElemValue = ( 1.371863e-02 +- 3.269951e-06 ) GeV^0 +TOTAL : 1.207260 sec + 4,082,591,214 cycles # 2.858 GHz + 6,515,356,659 instructions # 1.60 insn per cycle + 1.486873600 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --common +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 117 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.282802e-02 -Avg ME (F77/CUDA) = 1.2828036033170065E-002 -Relative difference = 1.2498553996774023e-06 +Avg ME (F77/CUDA) = 1.2828112125134794E-002 +Relative difference = 7.1815552823662555e-06 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.416115e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.645493e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.645493e+06 ) sec^-1 +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.039099e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.222240e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.222240e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371887e-02 +- 3.270267e-06 ) GeV^0 -TOTAL : 5.118619 sec - 17,328,202,172 cycles:u # 3.368 GHz (74.97%) - 39,972,803 stalled-cycles-frontend:u # 0.23% frontend cycles idle (74.97%) - 37,213,488 stalled-cycles-backend:u # 0.21% backend cycles idle (74.97%) - 47,246,133,917 instructions:u # 2.73 insn per cycle - # 0.00 stalled cycles per insn (74.98%) - 5.148163373 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 541) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.777770 sec + 19,569,392,860 cycles # 2.885 GHz + 47,229,099,277 instructions # 2.41 insn per cycle + 6.784024049 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 542) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039569285465E-002 -Relative difference = 3.357602059382168e-08 +Avg ME (F77/C++) = 1.2828039441956207E-002 +Relative difference = 4.35018750695023e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.922683e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.133650e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.133650e+06 ) sec^-1 +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.224011e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.394362e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.394362e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371887e-02 +- 3.270266e-06 ) GeV^0 -TOTAL : 2.826629 sec - 9,320,790,539 cycles:u # 3.266 GHz (74.87%) - 41,188,075 stalled-cycles-frontend:u # 0.44% frontend cycles idle (75.01%) - 645,773,111 stalled-cycles-backend:u # 6.93% backend cycles idle (75.05%) - 22,128,687,314 instructions:u # 2.37 insn per cycle - # 0.03 stalled cycles per insn (75.06%) - 2.856154656 seconds time elapsed +TOTAL : 3.543713 sec + 10,250,573,649 cycles # 2.890 GHz + 22,173,775,935 instructions # 2.16 insn per cycle + 3.550219999 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1883) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039385567536E-002 -Relative difference = 4.7897610623017996e-08 +Avg ME (F77/C++) = 1.2828039280066150E-002 +Relative difference = 5.612189004572479e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.412141e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.996034e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.996034e+06 ) sec^-1 +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.458663e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.813529e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.813529e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 -TOTAL : 2.520340 sec - 8,200,553,276 cycles:u # 3.219 GHz (74.88%) - 41,966,268 stalled-cycles-frontend:u # 0.51% frontend cycles idle (74.90%) - 1,471,137,468 stalled-cycles-backend:u # 17.94% backend cycles idle (74.90%) - 15,573,562,618 instructions:u # 1.90 insn per cycle - # 0.09 stalled cycles per insn (74.96%) - 2.549956102 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2601) (512y: 0) (512z: 0) +TOTAL : 3.280080 sec + 9,161,776,432 cycles # 2.789 GHz + 15,536,168,479 instructions # 1.70 insn per cycle + 3.286291256 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2619) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828053369958070E-002 -Relative difference = 2.627022867500074e-07 +Avg ME (F77/C++) = 1.2828053255361738E-002 +Relative difference = 2.5376902468575066e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.554649e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.077981e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.077981e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 +TOTAL : 3.189150 sec + 8,891,496,493 cycles # 2.784 GHz + 15,006,164,122 instructions # 1.69 insn per cycle + 3.195486341 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2414) (512y: 13) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.282805e-02 +Avg ME (F77/C++) = 1.2828053255361738E-002 +Relative difference = 2.5376902468575066e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.516232e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.934012e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.934012e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 +TOTAL : 3.229540 sec + 7,432,998,054 cycles # 2.298 GHz + 12,333,053,960 instructions # 1.66 insn per cycle + 3.235962697 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1615) (512y: 12) (512z: 1404) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
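[Editor's sketch, not part of the patch] The "Internal loops fptype_sv" lines report the SIMD width per build tag in FLOAT precision: sse4 packs 4 floats into a 128-bit register, avx2 and 512y pack 8 into 256 bits, and 512z packs 16 into 512 bits. A minimal sketch of that relation (the function name is illustrative, not a cudacpp type):

```cpp
#include <cstdio>

// Illustrative only: lanes = register width in bits / (8 * sizeof(float)),
// matching the VECTOR[4]/VECTOR[8]/VECTOR[16] lines in these logs.
constexpr int lanes( int registerBits ) { return registerBits / ( 8 * (int)sizeof( float ) ); }

int main()
{
  std::printf( "sse4 (128bit):      VECTOR[%d]\n", lanes( 128 ) ); // 4
  std::printf( "avx2/512y (256bit): VECTOR[%d]\n", lanes( 256 ) ); // 8
  std::printf( "512z (512bit):      VECTOR[%d]\n", lanes( 512 ) ); // 16
  return 0;
}
```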
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.282805e-02 +Avg ME (F77/C++) = 1.2828052589611616E-002 +Relative difference = 2.0187102602673518e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_curhst.txt index 1c74b1aeb4..90d7f62db4 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_curhst.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_curhst.txt @@ -1,133 +1,209 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasNoCurand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) +RNDGEN=hasCurand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-01-31_14:50:16 +DATE: 2024-01-30_05:56:29 -On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 12 --curhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 12 --curhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe: Aborted - 51,917,821 cycles:u # 2.385 GHz (63.27%) - 38,761 stalled-cycles-frontend:u # 0.07% frontend cycles idle (63.27%) - 606,072 stalled-cycles-backend:u # 1.17% backend cycles idle (63.27%) - 42,794,904 instructions:u # 0.82 insn per cycle - # 0.01 stalled cycles per insn (65.21%) - 0.022695324 seconds time elapsed +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 1.305141e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.181296e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.274552e+09 ) sec^-1 +MeanMatrixElemValue = ( 1.371687e-02 +- 3.270220e-06 ) GeV^0 +TOTAL : 0.867173 sec + 3,085,877,327 cycles # 2.830 GHz + 6,333,420,740 instructions # 2.05 insn per cycle + 1.147827940 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --curhst +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 117 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.282802e-02 -Avg ME (F77/CUDA) = 1.2828036033170065E-002 -Relative difference = 1.2498553996774023e-06 +Avg ME (F77/CUDA) = 1.2828112125134794E-002 +Relative difference = 7.1815552823662555e-06 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe: Aborted - 56,020,284 cycles:u # 2.606 GHz (62.82%) - 42,671 stalled-cycles-frontend:u # 0.08% frontend cycles idle (62.82%) - 614,725 stalled-cycles-backend:u # 1.10% backend cycles idle (62.82%) - 39,130,829 instructions:u # 0.70 insn per cycle - # 0.02 stalled cycles per insn (64.46%) - 0.022799927 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 541) (avx2: 0) (512y: 0) (512z: 0) +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.039832e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.222763e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.222763e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 +TOTAL : 6.429819 sec + 18,561,263,651 cycles # 2.885 GHz + 47,048,334,209 instructions # 2.53 insn per cycle + 6.436326918 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 542) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039569285465E-002 -Relative difference = 3.357602059382168e-08 +Avg ME (F77/C++) = 1.2828039441956207E-002 +Relative difference = 4.35018750695023e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe: Aborted - 52,602,539 cycles:u # 2.430 GHz (63.07%) - 44,114 stalled-cycles-frontend:u # 0.08% frontend cycles idle (63.07%) - 591,873 stalled-cycles-backend:u # 1.13% backend cycles idle (63.07%) - 41,944,859 instructions:u # 0.80 insn per cycle - # 0.01 stalled cycles per insn (64.69%) - 0.022939957 seconds time elapsed +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.222730e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.393980e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.393980e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 +TOTAL : 3.203768 sec + 9,238,443,218 cycles # 2.879 GHz + 22,092,244,938 instructions # 2.39 insn per cycle + 3.210105048 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1883) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039385567536E-002 -Relative difference = 4.7897610623017996e-08 +Avg ME (F77/C++) = 1.2828039280066150E-002 +Relative difference = 5.612189004572479e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe: Aborted - 56,603,806 cycles:u # 2.608 GHz (63.17%) - 34,889 stalled-cycles-frontend:u # 0.06% frontend cycles idle (63.17%) - 570,698 stalled-cycles-backend:u # 1.01% backend cycles idle (63.17%) - 42,471,825 instructions:u # 0.75 insn per cycle - # 0.01 stalled cycles per insn (58.35%) - 0.023997783 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2601) (512y: 0) (512z: 0) +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.418509e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.733909e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.733909e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 +TOTAL : 2.982066 sec + 8,185,679,734 cycles # 2.740 GHz + 15,625,107,028 instructions # 1.91 insn per cycle + 2.988278371 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2619) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828053369958070E-002 -Relative difference = 2.627022867500074e-07 +Avg ME (F77/C++) = 1.2828053255361738E-002 +Relative difference = 2.5376902468575066e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.558846e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.085053e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.085053e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 +TOTAL : 2.831486 sec + 7,894,514,850 cycles # 2.783 GHz + 15,296,644,493 instructions # 1.94 insn per cycle + 2.837958999 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2414) (512y: 13) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.282805e-02 +Avg ME (F77/C++) = 1.2828053255361738E-002 +Relative difference = 2.5376902468575066e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.525394e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.942507e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.942507e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 +TOTAL : 2.867904 sec + 6,407,267,092 cycles # 2.230 GHz + 12,623,570,741 instructions # 1.97 insn per cycle + 2.874115235 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1615) (512y: 12) (512z: 1404) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.282805e-02 +Avg ME (F77/C++) = 1.2828052589611616E-002 +Relative difference = 2.0187102602673518e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_rmbhst.txt index dd80ca1417..91671fa84d 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_rmbhst.txt @@ -1,164 +1,211 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasNoCurand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) +RNDGEN=hasCurand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-01-31_14:46:48 +DATE: 2024-01-30_05:53:04 -On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 12 --rmbhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 12 --rmbhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+MESDEV/none+NAVBRK +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.353204e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.699417e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.359789e+08 ) sec^-1 -MeanMatrixElemValue = ( 1.371886e-02 +- 3.270260e-06 ) GeV^0 -TOTAL : 5.254917 sec - 17,475,736,169 cycles:u # 3.306 GHz (75.03%) - 118,130,291 stalled-cycles-frontend:u # 0.68% frontend cycles idle (75.05%) - 6,884,676,285 stalled-cycles-backend:u # 39.40% backend cycles idle (75.05%) - 16,715,227,456 instructions:u # 0.96 insn per cycle - # 0.41 stalled cycles per insn (75.04%) - 5.304271267 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 8.674927e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.142204e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.126513e+09 ) sec^-1 +MeanMatrixElemValue = ( 1.371710e-02 +- 3.270389e-06 ) GeV^0 +TOTAL : 1.522177 sec + 5,014,296,377 cycles # 2.858 GHz + 9,135,258,914 instructions # 1.82 insn per cycle + 1.813578794 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --rmbhst +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +==PROF== Profiling "sigmaKin": launch__registers_per_thread 117 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.282802e-02 -Avg ME (F77/CUDA) = 1.2828036033170065E-002 -Relative difference = 1.2498553996774023e-06 +Avg ME (F77/CUDA) = 1.2828112125134794E-002 +Relative difference = 7.1815552823662555e-06 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.415589e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.645535e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.645535e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371887e-02 +- 3.270267e-06 ) GeV^0 -TOTAL : 5.119285 sec - 17,331,666,900 cycles:u # 3.368 GHz (74.97%) - 40,669,852 stalled-cycles-frontend:u # 0.23% frontend cycles idle (74.97%) - 36,759,472 stalled-cycles-backend:u # 0.21% backend cycles idle (74.97%) - 47,239,596,291 instructions:u # 2.73 insn per cycle - # 0.00 stalled cycles per insn (74.99%) - 5.148875716 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 541) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.043183e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.226572e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.226572e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 +TOTAL : 6.408092 sec + 18,567,709,150 cycles # 2.896 GHz + 47,047,255,730 instructions # 2.53 insn per cycle + 6.414419955 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 542) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039569285465E-002 -Relative difference = 3.357602059382168e-08 +Avg ME (F77/C++) = 1.2828039441956207E-002 +Relative difference = 4.35018750695023e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.925021e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.135088e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.135088e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371887e-02 +- 3.270266e-06 ) GeV^0 -TOTAL : 2.827945 sec - 9,319,315,337 cycles:u # 3.264 GHz (74.84%) - 41,387,005 stalled-cycles-frontend:u # 0.44% frontend cycles idle (74.98%) - 643,492,388 stalled-cycles-backend:u # 6.90% backend cycles idle (75.06%) - 22,144,122,734 instructions:u # 2.38 insn per cycle - # 0.03 stalled cycles per insn (75.07%) - 2.857459790 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.231919e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.414648e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.414648e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 +TOTAL : 3.191678 sec + 9,246,166,536 cycles # 2.894 GHz + 22,093,449,321 instructions # 2.39 insn per cycle + 3.197919261 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1883) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039385567536E-002 -Relative difference = 4.7897610623017996e-08 +Avg ME (F77/C++) = 1.2828039280066150E-002 +Relative difference = 5.612189004572479e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.418028e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.008897e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.008897e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 -TOTAL : 2.515058 sec - 8,209,426,221 cycles:u # 3.230 GHz (74.85%) - 42,192,838 stalled-cycles-frontend:u # 0.51% frontend cycles idle (74.85%) - 1,461,738,597 stalled-cycles-backend:u # 17.81% backend cycles idle (74.92%) - 15,535,734,341 instructions:u # 1.89 insn per cycle - # 0.09 stalled cycles per insn (75.08%) - 2.544365775 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2601) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.455778e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.806689e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.806689e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 +TOTAL : 2.938294 sec + 8,179,243,825 cycles # 2.779 GHz + 15,624,915,954 instructions # 1.91 insn per cycle + 2.944456642 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2619) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828053369958070E-002 -Relative difference = 2.627022867500074e-07 +Avg ME (F77/C++) = 1.2828053255361738E-002 +Relative difference = 2.5376902468575066e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.562111e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.082808e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.082808e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 +TOTAL : 2.828979 sec + 7,880,998,863 cycles # 2.781 GHz + 15,296,291,599 instructions # 1.94 insn per cycle + 2.835269816 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2414) (512y: 13) (512z: 0) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.282805e-02 +Avg ME (F77/C++) = 1.2828053255361738E-002 +Relative difference = 2.5376902468575066e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.528595e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.951135e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.951135e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 +TOTAL : 2.864434 sec + 6,402,503,393 cycles # 2.232 GHz + 12,623,594,501 instructions # 1.97 insn per cycle + 2.870718249 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1615) (512y: 12) (512z: 1404) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.282805e-02 +Avg ME (F77/C++) = 1.2828052589611616E-002 +Relative difference = 2.0187102602673518e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd1.txt index e59c139d2f..cc5700bb60 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd1.txt @@ -1,164 +1,209 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasNoCurand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) +RNDGEN=hasCurand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd1' +CUDACPP_BUILDDIR='build.512y_f_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-01-31_13:49:31 +DATE: 2024-01-30_04:53:28 -On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/gcheck.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/gcheck.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.837530e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.912256e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.600050e+08 ) sec^-1 -MeanMatrixElemValue = ( 1.371895e-02 +- 3.272985e-06 ) GeV^0 -TOTAL : 4.542847 sec - 14,980,531,541 cycles:u # 3.276 GHz (75.06%) - 53,460,149 stalled-cycles-frontend:u # 0.36% frontend cycles idle (74.98%) - 6,912,146,752 stalled-cycles-backend:u # 46.14% backend cycles idle (74.93%) - 11,476,019,042 instructions:u # 0.77 insn per cycle - # 0.60 stalled cycles per insn (74.92%) - 4.596974255 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 1.091291e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.093645e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.338052e+09 ) sec^-1 +MeanMatrixElemValue = ( 1.371687e-02 +- 3.270220e-06 ) GeV^0 +TOTAL : 0.585723 sec + 2,310,991,948 cycles # 2.835 GHz + 3,567,792,024 instructions # 1.54 insn per cycle + 0.889438316 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/gcheck.exe -p 2048 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 95 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.282802e-02 -Avg ME (F77/CUDA) = 1.2828036033170065E-002 -Relative difference = 1.2498553996774023e-06 +Avg ME (F77/CUDA) = 1.2828112125134794E-002 +Relative difference = 7.1815552823662555e-06 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.541888e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.813728e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.813728e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371887e-02 +- 3.270267e-06 ) GeV^0 -TOTAL : 4.755661 sec - 16,087,215,170 cycles:u # 3.364 GHz (74.84%) - 39,830,326 stalled-cycles-frontend:u # 0.25% frontend cycles idle (74.85%) - 34,401,704 stalled-cycles-backend:u # 0.21% backend cycles idle (75.00%) - 44,045,394,048 instructions:u # 2.74 insn per cycle - # 0.00 stalled cycles per insn (75.08%) - 4.785452751 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 466) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.092050e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.295990e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.295990e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 +TOTAL : 6.138113 sec + 17,749,278,373 cycles # 2.890 GHz + 43,890,075,557 instructions # 2.47 insn per cycle + 6.149965364 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 467) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039569285465E-002 -Relative difference = 3.357602059382168e-08 +Avg ME (F77/C++) = 1.2828039441956207E-002 +Relative difference = 4.35018750695023e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd1/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd1/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.021652e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.329118e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.329118e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371887e-02 +- 3.270266e-06 ) GeV^0 -TOTAL : 2.759658 sec - 9,064,951,892 cycles:u # 3.252 GHz (75.04%) - 42,419,061 stalled-cycles-frontend:u # 0.47% frontend cycles idle (75.03%) - 124,707,018 stalled-cycles-backend:u # 1.38% backend cycles idle (75.03%) - 21,624,549,669 instructions:u # 2.39 insn per cycle - # 0.01 stalled cycles per insn (75.03%) - 2.790596878 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.281832e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.528866e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.528866e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 +TOTAL : 3.131918 sec + 9,063,997,030 cycles # 2.890 GHz + 21,583,444,087 instructions # 2.38 insn per cycle + 3.172631085 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1827) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039385567536E-002 -Relative difference = 4.7897610623017996e-08 +Avg ME (F77/C++) = 1.2828039280066150E-002 +Relative difference = 5.612189004572479e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd1/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd1/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.468848e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.115626e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.115626e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 -TOTAL : 2.491231 sec - 8,118,328,833 cycles:u # 3.223 GHz (74.94%) - 42,223,386 stalled-cycles-frontend:u # 0.52% frontend cycles idle (74.91%) - 1,786,726,359 stalled-cycles-backend:u # 22.01% backend cycles idle (74.78%) - 15,402,271,138 instructions:u # 1.90 insn per cycle - # 0.12 stalled cycles per insn (74.80%) - 2.522582463 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2524) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.471429e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.850830e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.850830e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 +TOTAL : 2.922404 sec + 8,130,490,307 cycles # 2.776 GHz + 15,429,884,484 instructions # 1.90 insn per cycle + 2.941222784 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2542) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828053369958070E-002 -Relative difference = 2.627022867500074e-07 +Avg ME (F77/C++) = 1.2828053255361738E-002 +Relative difference = 2.5376902468575066e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd1/check.exe -p 2048 256 12 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.565898e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.093653e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.093653e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 +TOTAL : 2.826189 sec + 7,861,694,964 cycles # 2.776 GHz + 15,087,354,653 instructions # 1.92 insn per cycle + 2.844638276 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2323) (512y: 15) (512z: 0) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.282805e-02 +Avg ME (F77/C++) = 1.2828053255361738E-002 +Relative difference = 2.5376902468575066e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd1/check.exe -p 2048 256 12 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.637184e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.244046e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.244046e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 +TOTAL : 2.766988 sec + 6,178,543,208 cycles # 2.228 GHz + 12,245,131,195 instructions # 1.98 insn per cycle + 2.787936795 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1538) (512y: 8) (512z: 1258) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.282805e-02 +Avg ME (F77/C++) = 1.2828052431359538E-002 +Relative difference = 1.895346165094282e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd0.txt index de78a5beb5..df038945e7 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd0.txt @@ -1,164 +1,209 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasNoCurand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) +RNDGEN=hasCurand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.avx2_f_inl1_hrd0' +CUDACPP_BUILDDIR='build.512y_f_inl1_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-01-31_14:20:08 +DATE: 2024-01-30_05:34:57 -On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/gcheck.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/gcheck.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.804352e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.871683e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.544845e+08 ) sec^-1 -MeanMatrixElemValue = ( 1.371895e-02 +- 3.272985e-06 ) GeV^0 -TOTAL : 4.541034 sec - 15,015,745,431 cycles:u # 3.287 GHz (74.88%) - 53,650,626 stalled-cycles-frontend:u # 0.36% frontend cycles idle (74.87%) - 7,018,832,750 stalled-cycles-backend:u # 46.74% backend cycles idle (74.79%) - 11,152,740,261 instructions:u # 0.74 insn per cycle - # 0.63 stalled cycles per insn (74.94%) - 4.596556877 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 1.293279e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.189438e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.292426e+09 ) sec^-1 +MeanMatrixElemValue = ( 1.371687e-02 +- 3.270220e-06 ) GeV^0 +TOTAL : 0.574531 sec + 2,278,103,742 cycles # 2.838 GHz + 3,559,192,155 instructions # 1.56 insn per cycle + 0.862169679 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/gcheck.exe -p 2048 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 117 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.282802e-02 -Avg ME (F77/CUDA) = 1.2828036033170065E-002 -Relative difference = 1.2498553996774023e-06 +Avg ME (F77/CUDA) = 1.2828112125134794E-002 +Relative difference = 7.1815552823662555e-06 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0]
+Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
-EvtsPerSec[Rmb+ME] (23) = ( 1.929309e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.375070e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.375070e+06 ) sec^-1
-MeanMatrixElemValue = ( 1.371887e-02 +- 3.270267e-06 ) GeV^0
-TOTAL : 3.936701 sec
- 13,183,554,443 cycles:u # 3.326 GHz (74.97%)
- 39,532,639 stalled-cycles-frontend:u # 0.30% frontend cycles idle (74.97%)
- 1,219,014,067 stalled-cycles-backend:u # 9.25% backend cycles idle (74.98%)
- 38,014,245,383 instructions:u # 2.88 insn per cycle
- # 0.03 stalled cycles per insn (74.99%)
- 3.966500451 seconds time elapsed
+OMP threads / `nproc --all` = 1 / 4
+EvtsPerSec[Rmb+ME] (23) = ( 1.401205e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.755017e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.755017e+06 ) sec^-1
+MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0
+TOTAL : 4.863284 sec
+ 13,757,936,316 cycles # 2.826 GHz
+ 37,850,126,745 instructions # 2.75 insn per cycle
+ 4.870249581 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 833) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/runTest.exe
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/runTest.exe
 [ PASSED ] 6 tests.
 -------------------------------------------------------------------------
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/check.exe --common -p 2 64 2
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/fcheck.exe 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/check.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/fcheck.exe 2 64 2
 Avg ME (C++/C++) = 1.282804e-02
-Avg ME (F77/C++) = 1.2828039543819614E-002
-Relative difference = 3.5561191488957804e-08
+Avg ME (F77/C++) = 1.2828039414671366E-002
+Relative difference = 4.562884388571957e-08
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/check.exe -p 2048 256 12 OMP=
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/check.exe -p 2048 256 12 OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0]
+Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 3.453671e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.336262e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 5.336262e+06 ) sec^-1
-MeanMatrixElemValue = ( 1.371887e-02 +- 3.270266e-06 ) GeV^0
-TOTAL : 2.495200 sec
- 8,136,309,828 cycles:u # 3.225 GHz (74.97%)
- 41,422,914 stalled-cycles-frontend:u # 0.51% frontend cycles idle (74.95%)
- 225,227,499 stalled-cycles-backend:u # 2.77% backend cycles idle (74.95%)
- 18,686,478,479 instructions:u # 2.30 insn per cycle
- # 0.01 stalled cycles per insn (74.97%)
- 2.526474400 seconds time elapsed
+OMP threads / `nproc --all` = 1 / 4
+EvtsPerSec[Rmb+ME] (23) = ( 2.651233e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.514070e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.514070e+06 ) sec^-1
+MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0
+TOTAL : 2.748031 sec
+ 7,929,384,882 cycles # 2.881 GHz
+ 18,604,713,730 instructions # 2.35 insn per cycle
+ 2.754502860 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 2808) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/runTest.exe
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/runTest.exe
 [ PASSED ] 6 tests.
 -------------------------------------------------------------------------
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/check.exe --common -p 2 64 2
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/fcheck.exe 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/check.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/fcheck.exe 2 64 2
 Avg ME (C++/C++) = 1.282804e-02
-Avg ME (F77/C++) = 1.2828039385567536E-002
-Relative difference = 4.7897610623017996e-08
+Avg ME (F77/C++) = 1.2828039280066150E-002
+Relative difference = 5.612189004572479e-08
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/check.exe -p 2048 256 12 OMP=
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/check.exe -p 2048 256 12 OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0]
+Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 3.861407e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.019102e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.019102e+06 ) sec^-1
-MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0
-TOTAL : 2.304154 sec
- 7,466,062,192 cycles:u # 3.202 GHz (74.99%)
- 43,746,325 stalled-cycles-frontend:u # 0.59% frontend cycles idle (74.96%)
- 1,067,727,519 stalled-cycles-backend:u # 14.30% backend cycles idle (74.96%)
- 14,266,963,445 instructions:u # 1.91 insn per cycle
- # 0.07 stalled cycles per insn (74.81%)
- 2.335449030 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2233) (512y: 0) (512z: 0)
+OMP threads / `nproc --all` = 1 / 4
+EvtsPerSec[Rmb+ME] (23) = ( 2.730630e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.541231e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.541231e+06 ) sec^-1
+MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0
+TOTAL : 2.679636 sec
+ 7,420,774,430 cycles # 2.764 GHz
+ 14,339,383,869 instructions # 1.93 insn per cycle
+ 2.686088553 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2251) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/runTest.exe
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/runTest.exe
 [ PASSED ] 6 tests.
 -------------------------------------------------------------------------
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/check.exe --common -p 2 64 2
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/fcheck.exe 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/check.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/fcheck.exe 2 64 2
 Avg ME (C++/C++) = 1.282805e-02
-Avg ME (F77/C++) = 1.2828053337216261E-002
-Relative difference = 2.601499261602198e-07
+Avg ME (F77/C++) = 1.2828053246266791E-002
+Relative difference = 2.5306003563303186e-07
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
-/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo)
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd0/check.exe -p 2048 256 12 OMP=
+WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0]
+Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
+FP precision = FLOAT (NaN/abnormal=0, zero=0)
+Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
+OMP threads / `nproc --all` = 1 / 4
+EvtsPerSec[Rmb+ME] (23) = ( 2.796396e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.739468e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.739468e+06 ) sec^-1
+MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0
+TOTAL : 2.625810 sec
+ 7,304,334,176 cycles # 2.778 GHz
+ 13,955,275,285 instructions # 1.91 insn per cycle
+ 2.632447793 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3875) (512y: 9) (512z: 0)
 -------------------------------------------------------------------------
-/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo)
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd0/runTest.exe
+[ PASSED ] 6 tests.
+-------------------------------------------------------------------------
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd0/check.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd0/fcheck.exe 2 64 2
+Avg ME (C++/C++) = 1.282805e-02
+Avg ME (F77/C++) = 1.2828053277189611E-002
+Relative difference = 2.5547059841227576e-07
+OK (relative difference <= 5E-3)
+-------------------------------------------------------------------------
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd0/check.exe -p 2048 256 12 OMP=
+WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0]
+Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
+FP precision = FLOAT (NaN/abnormal=0, zero=0)
+Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
+OMP threads / `nproc --all` = 1 / 4
+EvtsPerSec[Rmb+ME] (23) = ( 2.601296e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.146430e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.146430e+06 ) sec^-1
+MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0
+TOTAL : 2.796781 sec
+ 6,273,154,150 cycles # 2.239 GHz
+ 13,210,323,797 instructions # 2.11 insn per cycle
+ 2.803318258 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1734) (512y: 3) (512z: 1266)
+-------------------------------------------------------------------------
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd0/runTest.exe
+[ PASSED ] 6 tests.
+-------------------------------------------------------------------------
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd0/check.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd0/fcheck.exe 2 64 2
+Avg ME (C++/C++) = 1.282805e-02
+Avg ME (F77/C++) = 1.2828052540498902E-002
+Relative difference = 1.980424851420537e-07
+OK (relative difference <= 5E-3)
 =========================================================================
 TEST COMPLETED
diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd1.txt
index 95bb38adb1..784101060d 100644
--- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd1.txt
+++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd1.txt
@@ -1,164 +1,209 @@
 export CUDACPP_RUNTIME_ENABLEFPE=on
-Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum
-OMPFLAGS=
-AVX=avx2
+Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum
+OMPFLAGS=-fopenmp
+AVX=512y
 FPTYPE=d
 HELINL=0
 HRDCOD=0
-RNDGEN=hasNoCurand
-Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1)
+RNDGEN=hasCurand
+Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1)
 make: Nothing to be done for 'gtestlibs'.
-CUDACPP_BUILDDIR='build.avx2_f_inl1_hrd1'
+CUDACPP_BUILDDIR='build.512y_f_inl1_hrd1'
 make USEBUILDDIR=1 AVX=none
-make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 CUDACPP_BUILDDIR='build.none_f_inl1_hrd1'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 make USEBUILDDIR=1 AVX=sse4
-make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 CUDACPP_BUILDDIR='build.sse4_f_inl1_hrd1'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 make USEBUILDDIR=1 AVX=avx2
-make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 CUDACPP_BUILDDIR='build.avx2_f_inl1_hrd1'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 make USEBUILDDIR=1 AVX=512y
-make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 CUDACPP_BUILDDIR='build.512y_f_inl1_hrd1'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 make USEBUILDDIR=1 AVX=512z
-make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 CUDACPP_BUILDDIR='build.512z_f_inl1_hrd1'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
-DATE: 2024-01-31_14:20:35
+DATE: 2024-01-30_05:35:26
-On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]:
+On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/gcheck.exe -p 2048 256 12 OMP=
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/gcheck.exe -p 2048 256 12 OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=1] [hardcodePARAM=1]
-Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK
+Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=1] [hardcodePARAM=1]
+Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 1.836133e+08 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.902950e+08 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.587351e+08 ) sec^-1
-MeanMatrixElemValue = ( 1.371895e-02 +- 3.272985e-06 ) GeV^0
-TOTAL : 4.536960 sec
- 14,978,317,252 cycles:u # 3.280 GHz (75.01%)
- 53,404,800 stalled-cycles-frontend:u # 0.36% frontend cycles idle (75.03%)
- 6,945,859,333 stalled-cycles-backend:u # 46.37% backend cycles idle (74.97%)
- 11,211,979,977 instructions:u # 0.75 insn per cycle
- # 0.62 stalled cycles per insn (74.95%)
- 4.593705124 seconds time elapsed
+EvtsPerSec[Rmb+ME] (23) = ( 1.300997e+08 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.192378e+09 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.323768e+09 ) sec^-1
+MeanMatrixElemValue = ( 1.371687e-02 +- 3.270220e-06 ) GeV^0
+TOTAL : 0.574497 sec
+ 2,274,789,999 cycles # 2.831 GHz
+ 3,565,149,005 instructions # 1.57 insn per cycle
+ 0.863293975 seconds time elapsed
+runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/gcheck.exe -p 2048 256 1
+WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
+==PROF== Profiling "sigmaKin": launch__registers_per_thread 95
+==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
 -------------------------------------------------------------------------
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/gcheck.exe --common -p 2 64 2
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/fgcheck.exe 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/gcheck.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/fgcheck.exe 2 64 2
 Avg ME (C++/CUDA) = 1.282802e-02
-Avg ME (F77/CUDA) = 1.2828036033170065E-002
-Relative difference = 1.2498553996774023e-06
+Avg ME (F77/CUDA) = 1.2828112125134794E-002
+Relative difference = 7.1815552823662555e-06
 OK (relative difference <= 5E-3)
 =========================================================================
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/check.exe -p 2048 256 12 OMP=
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/check.exe -p 2048 256 12 OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=1]
-Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1]
+Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
-EvtsPerSec[Rmb+ME] (23) = ( 2.677087e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.618912e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.618912e+06 ) sec^-1
-MeanMatrixElemValue = ( 1.371887e-02 +- 3.270267e-06 ) GeV^0
-TOTAL : 3.026740 sec
- 9,998,889,028 cycles:u # 3.274 GHz (74.87%)
- 38,455,022 stalled-cycles-frontend:u # 0.38% frontend cycles idle (74.90%)
- 29,286,585 stalled-cycles-backend:u # 0.29% backend cycles idle (75.03%)
- 28,571,948,254 instructions:u # 2.86 insn per cycle
- # 0.00 stalled cycles per insn (75.12%)
- 3.056524855 seconds time elapsed
+OMP threads / `nproc --all` = 1 / 4
+EvtsPerSec[Rmb+ME] (23) = ( 1.974769e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.758467e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.758467e+06 ) sec^-1
+MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0
+TOTAL : 3.560083 sec
+ 10,128,258,424 cycles # 2.841 GHz
+ 28,399,859,483 instructions # 2.80 insn per cycle
+ 3.566485849 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 632) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/runTest.exe
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/runTest.exe
 [ PASSED ] 6 tests.
 -------------------------------------------------------------------------
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/check.exe --common -p 2 64 2
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/fcheck.exe 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/check.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/fcheck.exe 2 64 2
 Avg ME (C++/C++) = 1.282804e-02
-Avg ME (F77/C++) = 1.2828039569285465E-002
-Relative difference = 3.357602059382168e-08
+Avg ME (F77/C++) = 1.2828039441956207E-002
+Relative difference = 4.35018750695023e-08
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd1/check.exe -p 2048 256 12 OMP=
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd1/check.exe -p 2048 256 12 OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=1]
-Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1]
+Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 3.843963e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.258267e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.258267e+06 ) sec^-1
-MeanMatrixElemValue = ( 1.371887e-02 +- 3.270266e-06 ) GeV^0
-TOTAL : 2.313249 sec
- 7,469,142,741 cycles:u # 3.191 GHz (75.00%)
- 40,231,675 stalled-cycles-frontend:u # 0.54% frontend cycles idle (75.05%)
- 31,915,046 stalled-cycles-backend:u # 0.43% backend cycles idle (75.05%)
- 16,932,716,531 instructions:u # 2.27 insn per cycle
- # 0.00 stalled cycles per insn (75.06%)
- 2.344830688 seconds time elapsed
+OMP threads / `nproc --all` = 1 / 4
+EvtsPerSec[Rmb+ME] (23) = ( 2.921662e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.360866e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.360866e+06 ) sec^-1
+MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0
+TOTAL : 2.529327 sec
+ 7,292,501,410 cycles # 2.880 GHz
+ 16,787,289,445 instructions # 2.30 insn per cycle
+ 2.535811154 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 2463) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd1/runTest.exe
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd1/runTest.exe
 [ PASSED ] 6 tests.
 -------------------------------------------------------------------------
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd1/check.exe --common -p 2 64 2
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd1/fcheck.exe 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd1/check.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd1/fcheck.exe 2 64 2
 Avg ME (C++/C++) = 1.282804e-02
-Avg ME (F77/C++) = 1.2828039385567536E-002
-Relative difference = 4.7897610623017996e-08
+Avg ME (F77/C++) = 1.2828039280066150E-002
+Relative difference = 5.612189004572479e-08
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/check.exe -p 2048 256 12 OMP=
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/check.exe -p 2048 256 12 OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=1]
-Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1]
+Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 4.056970e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.511923e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.511923e+06 ) sec^-1
-MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0
-TOTAL : 2.225961 sec
- 7,192,051,533 cycles:u # 3.191 GHz (74.90%)
- 42,074,082 stalled-cycles-frontend:u # 0.59% frontend cycles idle (74.82%)
- 370,974,550 stalled-cycles-backend:u # 5.16% backend cycles idle (74.85%)
- 13,657,358,681 instructions:u # 1.90 insn per cycle
- # 0.03 stalled cycles per insn (75.03%)
- 2.258560302 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2064) (512y: 0) (512z: 0)
+OMP threads / `nproc --all` = 1 / 4
+EvtsPerSec[Rmb+ME] (23) = ( 2.902980e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.008268e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.008268e+06 ) sec^-1
+MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0
+TOTAL : 2.546290 sec
+ 7,099,294,688 cycles # 2.783 GHz
+ 13,729,465,706 instructions # 1.93 insn per cycle
+ 2.552602290 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2082) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/runTest.exe
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/runTest.exe
 [ PASSED ] 6 tests.
 -------------------------------------------------------------------------
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/check.exe --common -p 2 64 2
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/fcheck.exe 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/check.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/fcheck.exe 2 64 2
 Avg ME (C++/C++) = 1.282805e-02
-Avg ME (F77/C++) = 1.2828053331759293E-002
-Relative difference = 2.597245327285885e-07
+Avg ME (F77/C++) = 1.2828053198973066E-002
+Relative difference = 2.4937329255889414e-07
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
-/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo)
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd1/check.exe -p 2048 256 12 OMP=
+WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1]
+Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
+FP precision = FLOAT (NaN/abnormal=0, zero=0)
+Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
+OMP threads / `nproc --all` = 1 / 4
+EvtsPerSec[Rmb+ME] (23) = ( 2.894124e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.023412e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.023412e+06 ) sec^-1
+MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0
+TOTAL : 2.549860 sec
+ 7,037,352,059 cycles # 2.755 GHz
+ 13,462,222,302 instructions # 1.91 insn per cycle
+ 2.556338558 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3649) (512y: 12) (512z: 0)
 -------------------------------------------------------------------------
-/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo)
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd1/runTest.exe
+[ PASSED ] 6 tests.
+-------------------------------------------------------------------------
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd1/check.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd1/fcheck.exe 2 64 2
+Avg ME (C++/C++) = 1.282805e-02
+Avg ME (F77/C++) = 1.2828053198973066E-002
+Relative difference = 2.4937329255889414e-07
+OK (relative difference <= 5E-3)
+-------------------------------------------------------------------------
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd1/check.exe -p 2048 256 12 OMP=
+WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1]
+Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
+FP precision = FLOAT (NaN/abnormal=0, zero=0)
+Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
+OMP threads / `nproc --all` = 1 / 4
+EvtsPerSec[Rmb+ME] (23) = ( 2.741921e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.505340e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.505340e+06 ) sec^-1
+MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0
+TOTAL : 2.671598 sec
+ 6,046,764,080 cycles # 2.259 GHz
+ 12,911,501,907 instructions # 2.14 insn per cycle
+ 2.677952936 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1671) (512y: 3) (512z: 1155)
+-------------------------------------------------------------------------
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd1/runTest.exe
+[ PASSED ] 6 tests.
+-------------------------------------------------------------------------
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd1/check.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd1/fcheck.exe 2 64 2
+Avg ME (C++/C++) = 1.282805e-02
+Avg ME (F77/C++) = 1.2828052431359538E-002
+Relative difference = 1.895346165094282e-07
+OK (relative difference <= 5E-3)
 =========================================================================
 TEST COMPLETED
diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt
index b83f428d97..7a09642823 100644
--- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt
+++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt
@@ -1,164 +1,209 @@
 export CUDACPP_RUNTIME_ENABLEFPE=on
-Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum
-OMPFLAGS=
-AVX=avx2
+Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum
+OMPFLAGS=-fopenmp
+AVX=512y
 FPTYPE=d
 HELINL=0
 HRDCOD=0
-RNDGEN=hasNoCurand
-Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1)
+RNDGEN=hasCurand
+Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1)
 make: Nothing to be done for 'gtestlibs'.
-CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0'
+CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0'
 make USEBUILDDIR=1 AVX=none
-make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 CUDACPP_BUILDDIR='build.none_m_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 make USEBUILDDIR=1 AVX=sse4
-make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 make USEBUILDDIR=1 AVX=avx2
-make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 make USEBUILDDIR=1 AVX=512y
-make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 make USEBUILDDIR=1 AVX=512z
-make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
-DATE: 2024-01-31_13:49:58
+DATE: 2024-01-30_04:54:00
-On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]:
+On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/gcheck.exe -p 2048 256 12 OMP=
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/gcheck.exe -p 2048 256 12 OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = HIP:MIX+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK
+Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 5.293135e+07 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.114268e+07 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.342215e+07 ) sec^-1
-MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0
-TOTAL : 4.657555 sec
- 15,365,174,038 cycles:u # 3.275 GHz (74.94%)
- 53,634,970 stalled-cycles-frontend:u # 0.35% frontend cycles idle (74.94%)
- 6,932,864,082 stalled-cycles-backend:u # 45.12% backend cycles idle (75.01%)
- 11,485,086,301 instructions:u # 0.75 insn per cycle
- # 0.60 stalled cycles per insn (75.10%)
- 4.716089042 seconds time elapsed
+EvtsPerSec[Rmb+ME] (23) = ( 5.434258e+07 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.281519e+08 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.171049e+08 ) sec^-1
+MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
+TOTAL : 0.704261 sec
+ 2,701,570,097 cycles # 2.831 GHz
+ 4,244,340,283 instructions # 1.57 insn per cycle
+ 1.033944641 seconds time elapsed
+runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/gcheck.exe -p 2048 256 1
+WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
+==PROF== Profiling "sigmaKin": launch__registers_per_thread 166
+==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
 -------------------------------------------------------------------------
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/gcheck.exe --common -p 2 64 2
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/fgcheck.exe 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/gcheck.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/fgcheck.exe 2 64 2
 Avg ME (C++/CUDA) = 1.282804e-02
-Avg ME (F77/CUDA) = 1.2828039901590281E-002
-Relative difference = 7.67145406542181e-09
+Avg ME (F77/CUDA) = 1.2828039901590279E-002
+Relative difference = 7.671454200650844e-09
 OK (relative difference <= 5E-3)
 =========================================================================
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/check.exe -p 2048 256 12 OMP=
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/check.exe -p 2048 256 12 OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
-EvtsPerSec[Rmb+ME] (23) = ( 1.242473e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.418692e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.418692e+06 ) sec^-1
-MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0
-TOTAL : 5.800763 sec
- 19,618,512,463 cycles:u # 3.363 GHz (74.95%)
- 48,717,247 stalled-cycles-frontend:u # 0.25% frontend cycles idle (75.02%)
- 149,768,230 stalled-cycles-backend:u # 0.76% backend cycles idle (75.04%)
- 47,103,320,911 instructions:u # 2.40 insn per cycle
- # 0.00 stalled cycles per insn (75.04%)
- 5.835627988 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 473) (avx2: 0) (512y: 0) (512z: 0)
+OMP threads / `nproc --all` = 1 / 4
+EvtsPerSec[Rmb+ME] (23) = ( 9.829628e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.139787e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.139787e+06 ) sec^-1
+MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
+TOTAL : 6.819159 sec
+ 19,690,827,956 cycles # 2.885 GHz
+ 46,971,779,576 instructions # 2.39 insn per cycle
+ 6.832663552 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 474) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/runTest.exe
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/runTest.exe
 [ PASSED ] 6 tests.
 -------------------------------------------------------------------------
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/fcheck.exe 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/fcheck.exe 2 64 2
 Avg ME (C++/C++) = 1.282804e-02
 Avg ME (F77/C++) = 1.2828039952548879E-002
 Relative difference = 3.6990156841838714e-09
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/check.exe -p 2048 256 12 OMP=
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/check.exe -p 2048 256 12 OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 1.978746e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.509635e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.509635e+06 ) sec^-1
-MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0
-TOTAL : 3.916740 sec
- 13,061,630,358 cycles:u # 3.308 GHz (74.88%)
- 53,677,065 stalled-cycles-frontend:u # 0.41% frontend cycles idle (74.92%)
- 2,198,728,312 stalled-cycles-backend:u # 16.83% backend cycles idle (75.02%)
- 30,732,838,719 instructions:u # 2.35 insn per cycle
- # 0.07 stalled cycles per insn (75.09%)
- 3.952822018 seconds time elapsed
+OMP threads / `nproc --all` = 1 / 4
+EvtsPerSec[Rmb+ME] (23) = ( 1.605344e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.116934e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.116934e+06 ) sec^-1
+MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
+TOTAL : 4.334479 sec
+ 12,518,471,325 cycles # 2.884 GHz
+ 30,922,888,427 instructions # 2.47 insn per cycle
+ 4.354467708 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 1667) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/runTest.exe
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/runTest.exe
 [ PASSED ] 6 tests.
 -------------------------------------------------------------------------
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/fcheck.exe 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/fcheck.exe 2 64 2
 Avg ME (C++/C++) = 1.282804e-02
 Avg ME (F77/C++) = 1.2828039952548879E-002
 Relative difference = 3.6990156841838714e-09
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/check.exe -p 2048 256 12 OMP=
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/check.exe -p 2048 256 12 OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 2.584854e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.407963e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.407963e+06 ) sec^-1
-MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0
-TOTAL : 3.170189 sec
- 10,407,251,362 cycles:u # 3.250 GHz (74.95%)
- 50,130,427 stalled-cycles-frontend:u # 0.48% frontend cycles idle (75.02%)
- 903,788,182 stalled-cycles-backend:u # 8.68% backend cycles idle (75.02%)
- 19,380,592,287 instructions:u # 1.86 insn per cycle
- # 0.05 stalled cycles per insn (75.02%)
- 3.206228498 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2101) (512y: 0) (512z: 0)
+OMP threads / `nproc --all` = 1 / 4
+EvtsPerSec[Rmb+ME] (23) = ( 1.917239e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.660472e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.660472e+06 ) sec^-1
+MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
+TOTAL : 3.702387 sec
+ 10,174,876,030 cycles # 2.745 GHz
+ 19,548,406,942 instructions # 1.92 insn per cycle
+ 3.720275920 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2119) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/runTest.exe
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/runTest.exe
 [ PASSED ] 6 tests.
 -------------------------------------------------------------------------
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/fcheck.exe 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/fcheck.exe 2 64 2
 Avg ME (C++/C++) = 1.282804e-02
 Avg ME (F77/C++) = 1.2828039951670679E-002
 Relative difference = 3.767475112924841e-09
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
-/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo)
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd0/check.exe -p 2048 256 12 OMP=
+WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
+FP precision = MIXED (NaN/abnormal=0, zero=0)
+Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
+OMP threads / `nproc --all` = 1 / 4
+EvtsPerSec[Rmb+ME] (23) = ( 2.029293e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.888276e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.888276e+06 ) sec^-1
+MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
+TOTAL : 3.515786 sec
+ 9,723,051,646 cycles # 2.761 GHz
+ 18,859,468,530 instructions # 1.94 insn per cycle
+ 3.531121351 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1850) (512y: 174) (512z: 0)
 -------------------------------------------------------------------------
-/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo)
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd0/runTest.exe
+[ PASSED ] 6 tests.
+-------------------------------------------------------------------------
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd0/fcheck.exe 2 64 2
+Avg ME (C++/C++) = 1.282804e-02
+Avg ME (F77/C++) = 1.2828039951670679E-002
+Relative difference = 3.767475112924841e-09
+OK (relative difference <= 5E-3)
+-------------------------------------------------------------------------
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd0/check.exe -p 2048 256 12 OMP=
+WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
+FP precision = MIXED (NaN/abnormal=0, zero=0)
+Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
+OMP threads / `nproc --all` = 1 / 4
+EvtsPerSec[Rmb+ME] (23) = ( 1.839848e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.512898e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.512898e+06 ) sec^-1
+MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
+TOTAL : 3.838759 sec
+ 8,110,381,366 cycles # 2.110 GHz
+ 14,814,382,883 instructions # 1.83 insn per cycle
+ 3.856049832 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1023) (512y: 64) (512z: 1327)
+-------------------------------------------------------------------------
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd0/runTest.exe
+[ PASSED ] 6 tests.
+-------------------------------------------------------------------------
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd0/fcheck.exe 2 64 2
+Avg ME (C++/C++) = 1.282804e-02
+Avg ME (F77/C++) = 1.2828039951670679E-002
+Relative difference = 3.767475112924841e-09
+OK (relative difference <= 5E-3)
 =========================================================================
 TEST COMPLETED
diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd1.txt
index d77c48d5b0..385e9ed225 100644
--- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd1.txt
+++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd1.txt
@@ -1,164 +1,209 @@
 export CUDACPP_RUNTIME_ENABLEFPE=on
-Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum
-OMPFLAGS=
-AVX=avx2
+Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum
+OMPFLAGS=-fopenmp
+AVX=512y
 FPTYPE=d
 HELINL=0
 HRDCOD=0
-RNDGEN=hasNoCurand
-Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1)
+RNDGEN=hasCurand
+Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1)
 make: Nothing to be done for 'gtestlibs'.
-CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd1'
+CUDACPP_BUILDDIR='build.512y_m_inl0_hrd1'
 make USEBUILDDIR=1 AVX=none
-make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 CUDACPP_BUILDDIR='build.none_m_inl0_hrd1'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 make USEBUILDDIR=1 AVX=sse4
-make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd1'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 make USEBUILDDIR=1 AVX=avx2
-make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd1'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 make USEBUILDDIR=1 AVX=512y
-make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 CUDACPP_BUILDDIR='build.512y_m_inl0_hrd1'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 make USEBUILDDIR=1 AVX=512z
-make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
 CUDACPP_BUILDDIR='build.512z_m_inl0_hrd1'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
+make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum'
-DATE: 2024-01-31_13:50:29
+DATE: 2024-01-30_04:54:36
-On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]:
+On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/gcheck.exe -p 2048 256 12 OMP=
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/gcheck.exe -p 2048 256 12 OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_EPEM_MUPMUM_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1]
-Workflow summary = HIP:MIX+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK
+Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1]
+Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 5.862136e+07 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 9.572230e+07 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 9.892440e+07 ) sec^-1
-MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0
-TOTAL : 4.648744 sec
- 15,325,650,566 cycles:u # 3.273 GHz (74.92%)
- 53,884,227 stalled-cycles-frontend:u # 0.35% frontend cycles idle (74.97%)
- 6,927,612,166 stalled-cycles-backend:u # 45.20% backend cycles idle (75.07%)
- 11,502,059,340 instructions:u # 0.75 insn per cycle
- # 0.60 stalled cycles per insn (75.06%)
- 4.708036945 seconds time elapsed
+EvtsPerSec[Rmb+ME] (23) = ( 5.428632e+07 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.291557e+08 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.197877e+08 ) sec^-1
+MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
+TOTAL : 0.704173 sec
+ 2,700,513,236 cycles # 2.833 GHz
+ 4,160,757,344 instructions # 1.54 insn per cycle
+ 1.040080983 seconds time elapsed
+runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/gcheck.exe -p 2048 256 1
+WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
+==PROF== Profiling "sigmaKin": launch__registers_per_thread 154
+==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
 -------------------------------------------------------------------------
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/gcheck.exe --common -p 2 64 2
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/fgcheck.exe 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/gcheck.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/fgcheck.exe 2 64 2
 Avg ME (C++/CUDA) = 1.282804e-02
-Avg ME (F77/CUDA) = 1.2828039901590284E-002
-Relative difference = 7.67145379496374e-09
+Avg ME (F77/CUDA) = 1.2828039901590279E-002
+Relative difference = 7.671454200650844e-09
 OK (relative difference <= 5E-3)
 =========================================================================
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/check.exe -p 2048 256 12 OMP=
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/check.exe -p 2048 256 12 OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1]
-Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1]
+Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
-EvtsPerSec[Rmb+ME] (23) = ( 1.312263e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.512184e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.512184e+06 ) sec^-1
-MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0
-TOTAL : 5.529721 sec
- 18,672,385,681 cycles:u # 3.358 GHz (74.97%)
- 51,499,925 stalled-cycles-frontend:u # 0.28% frontend cycles idle (74.97%)
- 50,915,058 stalled-cycles-backend:u # 0.27% backend cycles idle (74.98%)
- 44,718,089,446 instructions:u # 2.39 insn per cycle
- # 0.00 stalled cycles per insn (74.98%)
- 5.564148944 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 497) (avx2: 0) (512y: 0) (512z: 0)
+OMP threads / `nproc --all` = 1 / 4
+EvtsPerSec[Rmb+ME] (23) = ( 1.048898e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.230601e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.230601e+06 ) sec^-1
+MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
+TOTAL : 6.414952 sec
+ 18,538,807,361 cycles # 2.888 GHz
+ 44,591,647,960 instructions # 2.41 insn per cycle
+ 6.426389730 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 498) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/runTest.exe
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/runTest.exe
 [ PASSED ] 6 tests.
 -------------------------------------------------------------------------
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/check.exe --common -p 2 64 2
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/fcheck.exe 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/check.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/fcheck.exe 2 64 2
 Avg ME (C++/C++) = 1.282804e-02
 Avg ME (F77/C++) = 1.2828039952548879E-002
 Relative difference = 3.6990156841838714e-09
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd1/check.exe -p 2048 256 12 OMP=
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd1/check.exe -p 2048 256 12 OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1]
-Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK
+Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1]
+Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 2.023957e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.583519e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.583519e+06 ) sec^-1
-MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0
-TOTAL : 3.842400 sec
- 12,751,330,882 cycles:u # 3.291 GHz (74.96%)
- 49,158,998 stalled-cycles-frontend:u # 0.39% frontend cycles idle (75.02%)
- 1,837,349,313 stalled-cycles-backend:u # 14.41% backend cycles idle (75.02%)
- 30,166,434,355 instructions:u # 2.37 insn per cycle
- # 0.06 stalled cycles per insn (75.02%)
- 3.878606955 seconds time elapsed
+OMP threads / `nproc --all` = 1 / 4
+EvtsPerSec[Rmb+ME] (23) = ( 1.655305e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.204388e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.204388e+06 ) sec^-1
+MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0
+TOTAL : 4.214890 sec
+ 12,207,966,974 cycles # 2.892 GHz
+ 30,217,340,923 instructions # 2.48 insn per cycle
+ 4.236133486 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 1650) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd1/runTest.exe
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd1/runTest.exe
 [ PASSED ] 6 tests.
 -------------------------------------------------------------------------
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd1/check.exe --common -p 2 64 2
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd1/fcheck.exe 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd1/check.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd1/fcheck.exe 2 64 2
 Avg ME (C++/C++) = 1.282804e-02
 Avg ME (F77/C++) = 1.2828039952548879E-002
 Relative difference = 3.6990156841838714e-09
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd1/check.exe -p 2048 256 12 OMP=
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd1/check.exe -p 2048 256 12 OMP=
 WARNING!
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.607314e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.449826e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.449826e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 3.147113 sec - 10,343,092,922 cycles:u # 3.253 GHz (74.86%) - 50,008,630 stalled-cycles-frontend:u # 0.48% frontend cycles idle (74.98%) - 288,268,733 stalled-cycles-backend:u # 2.79% backend cycles idle (75.09%) - 18,718,552,499 instructions:u # 1.81 insn per cycle - # 0.02 stalled cycles per insn (75.09%) - 3.183147608 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2054) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.899712e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.627205e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.627205e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.730288 sec + 10,158,219,608 cycles # 2.719 GHz + 19,037,132,874 instructions # 1.87 insn per cycle + 3.746558078 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2072) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039951670679E-002 Relative difference = 3.767475112924841e-09 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd1/check.exe -p 2048 256 12 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.048047e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.931283e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.931283e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.492411 sec + 9,571,391,969 cycles # 2.738 GHz + 18,453,150,608 instructions # 1.93 insn per cycle + 3.509341045 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1775) (512y: 174) (512z: 0) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.282804e-02 +Avg ME (F77/C++) = 1.2828039951670679E-002 +Relative difference = 3.767475112924841e-09 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd1/check.exe -p 2048 256 12 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.170414e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.170487e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.170487e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.317793 sec + 7,240,072,684 cycles # 2.179 GHz + 13,244,781,040 instructions # 1.83 insn per cycle + 3.341198784 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 911) (512y: 56) (512z: 993) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.282804e-02 +Avg ME (F77/C++) = 1.2828039951670679E-002 +Relative difference = 3.767475112924841e-09 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt index 8fce5dda32..2453732bed 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt @@ -1,164 +1,209 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasNoCurand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) +RNDGEN=hasCurand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-01-31_13:50:59 +DATE: 2024-01-30_04:55:10 -On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.775167e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.954588e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.009008e+07 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 1.065658 sec - 3,204,556,883 cycles:u # 2.916 GHz (75.27%) - 10,676,524 stalled-cycles-frontend:u # 0.33% frontend cycles idle (75.27%) - 1,170,364,335 stalled-cycles-backend:u # 36.52% backend cycles idle (75.11%) - 2,993,337,490 instructions:u # 0.93 insn per cycle - # 0.39 stalled cycles per insn (75.05%) - 1.123586308 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 4.010275e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.133419e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.272295e+08 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 0.538830 sec + 2,187,358,219 cycles # 2.824 GHz + 3,139,905,445 instructions # 1.44 insn per cycle + 0.856073288 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 2.028807e+00 -Avg ME (F77/CUDA) = 2.0288063388516817 -Relative difference = 3.258803416564443e-07 +Avg ME (F77/CUDA) = 2.0288063388516822 +Relative difference = 3.2588034143755247e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.517443e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.582929e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.582929e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 4.347170 sec - 14,992,597,035 cycles:u # 3.423 GHz (74.98%) - 9,832,695 stalled-cycles-frontend:u # 0.07% frontend cycles idle (74.98%) - 755,893,575 stalled-cycles-backend:u # 5.04% backend cycles idle (74.99%) - 38,737,774,126 instructions:u # 2.58 insn per cycle - # 0.02 stalled cycles per insn (74.99%) - 4.382868805 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.073581e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.135755e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.135755e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 5.168677 sec + 14,980,961,047 cycles # 2.896 GHz + 38,724,485,120 instructions # 2.58 insn per cycle + 5.178651966 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 719) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515649 Relative difference = 3.258803992249869e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.486268e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.711661e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.711661e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 2.519670 sec - 8,575,904,466 cycles:u # 3.359 GHz (74.93%) - 9,207,162 stalled-cycles-frontend:u # 0.11% frontend cycles idle (74.95%) - 198,649,396 stalled-cycles-backend:u # 2.32% backend cycles idle (74.95%) - 24,465,619,820 instructions:u # 2.85 insn per cycle - # 0.01 stalled cycles per insn (74.94%) - 2.556908642 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2071) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.523460e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.721558e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.721558e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 3.090012 sec + 8,952,192,290 cycles # 2.893 GHz + 24,430,503,496 instructions # 2.73 insn per cycle + 3.108451490 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 2067) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515654 Relative difference = 3.2588039900609506e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.682012e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.275744e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.275744e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 1.547318 sec - 5,200,461,740 cycles:u # 3.290 GHz (74.70%) - 9,378,138 stalled-cycles-frontend:u # 0.18% frontend cycles idle (74.78%) - 1,061,960,703 stalled-cycles-backend:u # 20.42% backend cycles idle (75.02%) - 11,471,924,648 instructions:u # 2.21 insn per cycle - # 0.09 stalled cycles per insn (75.21%) - 1.584729817 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2383) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 5.390626e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.850527e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.850527e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 2.056967 sec + 5,535,228,908 cycles # 2.683 GHz + 11,562,552,185 instructions # 2.09 insn per cycle + 2.068379535 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2396) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe -p 2048 256 2 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 6.323214e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.965355e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.965355e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 1.769440 sec + 4,825,692,035 cycles # 2.719 GHz + 10,341,008,591 instructions # 2.14 insn per cycle + 1.786949030 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1972) (512y: 131) (512z: 0) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028807e+00 +Avg ME (F77/C++) = 2.0288063388516204 +Relative difference = 3.2588037186351226e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe -p 2048 256 2 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 4.039053e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.289363e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.289363e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 2.707049 sec + 4,944,236,176 cycles # 1.822 GHz + 7,554,838,116 instructions # 1.53 insn per cycle + 2.726854934 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1212) (512y: 65) (512z: 1543) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028807e+00 +Avg ME (F77/C++) = 2.0288063388516204 +Relative difference = 3.2588037186351226e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_bridge.txt index 2ef95ac563..adcfa48462 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_bridge.txt @@ -1,170 +1,222 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasNoCurand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) +RNDGEN=hasCurand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-01-31_14:39:30 +DATE: 2024-01-30_05:46:39 -On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 2 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! 
Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.960794e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.780103e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.780103e+07 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 1.233984 sec - 3,742,212,800 cycles:u # 2.937 GHz (74.94%) - 21,590,088 stalled-cycles-frontend:u # 0.58% frontend cycles idle (74.91%) - 13,609,560 stalled-cycles-backend:u # 0.36% backend cycles idle (74.90%) - 3,915,240,315 instructions:u # 1.05 insn per cycle - # 0.01 stalled cycles per insn (74.83%) - 1.300304990 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 4.344134e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.848581e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.848581e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 0.830257 sec + 3,050,711,174 cycles # 2.837 GHz + 4,744,287,151 instructions # 1.56 insn per cycle + 1.134543078 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) +WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) +==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 2.028807e+00 -Avg ME (F77/CUDA) = 2.0288063388516817 -Relative difference = 3.258803416564443e-07 +Avg ME (F77/CUDA) = 2.0288063388516822 +Relative difference = 3.2588034143755247e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.507286e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.571820e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.571820e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 4.444725 sec - 15,123,005,974 cycles:u # 3.369 GHz (74.95%) - 9,450,189 stalled-cycles-frontend:u # 0.06% frontend cycles idle (75.03%) - 1,114,462,839 stalled-cycles-backend:u # 7.37% backend cycles idle (74.97%) - 38,884,708,432 instructions:u # 2.57 insn per cycle - # 0.03 stalled cycles per insn (74.98%) - 4.491345952 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.051768e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.112380e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.112380e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 5.299158 sec + 15,311,911,023 cycles # 2.886 GHz + 38,783,796,929 instructions # 2.53 insn per cycle + 5.307164517 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 719) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515649 Relative difference = 3.258803992249869e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.456681e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.678201e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.678201e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 2.622127 sec - 8,746,750,895 cycles:u # 3.280 GHz (74.80%) - 10,047,928 stalled-cycles-frontend:u # 0.11% frontend cycles idle (74.93%) - 222,114,170 stalled-cycles-backend:u # 2.54% backend cycles idle (75.07%) - 24,624,407,451 instructions:u # 2.82 insn per cycle - # 0.01 stalled cycles per insn (75.13%) - 2.670262132 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2071) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.466739e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.657869e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.657869e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 3.216841 sec + 9,297,524,138 cycles # 2.885 GHz + 24,613,723,387 instructions # 2.65 insn per cycle + 3.224967553 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 2067) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515654 Relative difference = 3.2588039900609506e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.577155e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.154338e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.154338e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 1.655053 sec - 5,347,323,924 cycles:u # 3.147 GHz (74.85%) - 10,320,638 stalled-cycles-frontend:u # 0.19% frontend cycles idle (74.96%) - 1,075,680,183 stalled-cycles-backend:u # 20.12% backend cycles idle (75.07%) - 11,838,991,239 instructions:u # 2.21 insn per cycle - # 0.09 stalled cycles per insn (75.09%) - 1.704077905 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2383) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 5.363369e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.815752e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.815752e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 2.149413 sec + 5,860,102,645 cycles # 2.720 GHz + 11,849,599,468 instructions # 2.02 insn per cycle + 2.157292568 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2396) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028807e+00 +Avg ME (F77/C++) = 2.0288063388516204 +Relative difference = 3.2588037186351226e-07 +OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! 
Instantiate host Bridge (nevt=524288) +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 6.162124e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.773170e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.773170e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 1.893501 sec + 5,161,881,245 cycles # 2.717 GHz + 10,626,023,875 instructions # 2.06 insn per cycle + 1.901369932 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1972) (512y: 131) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! Instantiate host Bridge (nevt=524288) +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.945106e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.186618e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.186618e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 2.853446 sec + 5,298,812,686 cycles # 1.853 GHz + 7,800,536,018 instructions # 1.47 insn per cycle + 2.861501356 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1212) (512y: 65) (512z: 1543) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028807e+00 +Avg ME (F77/C++) = 2.0288063388516204 +Relative difference = 3.2588037186351226e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_common.txt index 933d7c92ad..b23b4b948e 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_common.txt @@ -1,164 +1,209 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasNoCurand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) +RNDGEN=hasCurand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-01-31_14:53:21 +DATE: 2024-01-30_06:00:31 -On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 2 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 2 --common OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.670915e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.961936e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.016085e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.565155e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.155605e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.269580e+08 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 1.067979 sec - 3,228,626,800 cycles:u # 2.930 GHz (74.62%) - 10,712,797 stalled-cycles-frontend:u # 0.33% frontend cycles idle (74.91%) - 1,139,960,341 stalled-cycles-backend:u # 35.31% backend cycles idle (75.37%) - 2,973,885,711 instructions:u # 0.92 insn per cycle - # 0.38 stalled cycles per insn (75.32%) - 1.123409557 seconds time elapsed +TOTAL : 0.625620 sec + 2,433,553,514 cycles # 2.839 GHz + 3,531,317,325 instructions # 1.45 insn per cycle + 0.914840106 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --common +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 2.028807e+00 -Avg ME (F77/CUDA) = 2.0288063388516817 -Relative difference = 3.258803416564443e-07 +Avg ME (F77/CUDA) = 2.0288063388516822 +Relative difference = 3.2588034143755247e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.514410e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.579489e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.579489e+05 ) sec^-1 +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.073060e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.134707e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.134707e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 4.352579 sec - 15,017,071,691 cycles:u # 3.424 GHz (75.01%) - 10,104,870 stalled-cycles-frontend:u # 0.07% frontend cycles idle (75.01%) - 1,131,250,444 stalled-cycles-backend:u # 7.53% backend cycles idle (75.01%) - 38,711,980,403 instructions:u # 2.58 insn per cycle - # 0.03 stalled cycles per insn (74.93%) - 4.388124543 seconds time elapsed +TOTAL : 5.229961 sec + 15,157,435,498 cycles # 2.896 GHz + 38,739,723,091 instructions # 2.56 insn per cycle + 5.236486145 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 719) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515649 Relative difference = 3.258803992249869e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.486799e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.712857e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.712857e+05 ) sec^-1 +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.526696e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.723798e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.723798e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 2.519538 sec - 8,573,730,911 cycles:u # 3.359 GHz (74.95%) - 9,322,072 stalled-cycles-frontend:u # 0.11% frontend cycles idle (74.96%) - 200,778,264 stalled-cycles-backend:u # 2.34% backend cycles idle (74.94%) - 24,461,372,498 instructions:u # 2.85 insn per cycle - # 0.01 stalled cycles per insn (74.93%) - 2.555255997 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2071) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.143207 sec + 9,122,833,846 cycles # 2.898 GHz + 24,428,638,513 instructions # 2.68 insn per cycle + 3.149727451 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 2067) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515654 Relative difference = 3.2588039900609506e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.681424e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.275449e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.275449e+05 ) sec^-1 +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 5.453278e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.923487e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.923487e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 1.547574 sec - 5,204,688,501 cycles:u # 3.294 GHz (74.69%) - 9,388,391 stalled-cycles-frontend:u # 0.18% frontend cycles idle (74.70%) - 1,063,036,716 stalled-cycles-backend:u # 20.42% backend cycles idle (74.91%) - 11,504,264,567 instructions:u # 2.21 insn per cycle - # 0.09 stalled cycles per insn (75.15%) - 1.582904825 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2383) (512y: 0) (512z: 0) +TOTAL : 2.094012 sec + 5,713,399,327 cycles # 2.721 GHz + 11,544,398,198 instructions # 2.02 insn per cycle + 2.100575275 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2396) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 6.340982e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.000324e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.000324e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 1.826424 sec + 5,007,819,577 cycles # 2.734 GHz + 10,288,512,439 instructions # 2.05 insn per cycle + 1.833139039 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1972) (512y: 131) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028807e+00 +Avg ME (F77/C++) = 2.0288063388516204 +Relative difference = 3.2588037186351226e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 4.024689e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.274198e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.274198e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 2.778758 sec + 5,115,298,192 cycles # 1.837 GHz + 7,503,411,062 instructions # 1.47 insn per cycle + 2.785395708 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1212) (512y: 65) (512z: 1543) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028807e+00 +Avg ME (F77/C++) = 2.0288063388516204 +Relative difference = 3.2588037186351226e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_curhst.txt index 3800ac2c9e..66a621d02a 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_curhst.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_curhst.txt @@ -1,133 +1,209 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasNoCurand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) +RNDGEN=hasCurand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-01-31_14:50:29 +DATE: 2024-01-30_05:57:01 -On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 2 --curhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 2 --curhst OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 4.578143e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.159887e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.277521e+08 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 0.567703 sec + 2,256,406,335 cycles # 2.832 GHz + 3,552,290,336 instructions # 1.57 insn per cycle + 0.856591173 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --curhst WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe: Aborted - 51,935,826 cycles:u # 2.391 GHz (63.20%) - 44,099 stalled-cycles-frontend:u # 0.08% frontend cycles idle (63.20%) - 647,716 stalled-cycles-backend:u # 1.25% backend cycles idle (63.20%) - 39,610,708 instructions:u # 0.76 insn per cycle - # 0.02 stalled cycles per insn (65.11%) - 0.022647752 seconds time elapsed +==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 2.028807e+00 -Avg ME (F77/CUDA) = 2.0288063388516817 -Relative difference = 3.258803416564443e-07 +Avg ME (F77/CUDA) = 2.0288063388516822 +Relative difference = 3.2588034143755247e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe: Aborted - 53,696,974 cycles:u # 2.491 GHz (62.92%) - 44,914 stalled-cycles-frontend:u # 0.08% frontend cycles idle (62.92%) - 548,464 stalled-cycles-backend:u # 1.02% backend cycles idle (62.92%) - 41,043,423 instructions:u # 0.76 insn per cycle - # 0.01 stalled cycles per insn (64.63%) - 0.022851105 seconds time elapsed +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.061569e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.123242e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.123242e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 5.196625 sec + 14,980,592,489 cycles # 2.880 GHz + 38,723,298,937 instructions # 2.58 insn per cycle + 5.203366404 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 719) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515649 Relative difference = 3.258803992249869e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe: Aborted - 53,124,098 cycles:u # 2.450 GHz (63.14%) - 47,517 stalled-cycles-frontend:u # 0.09% frontend cycles idle (63.14%) - 599,506 stalled-cycles-backend:u # 1.13% backend cycles idle (63.14%) - 41,225,044 instructions:u # 0.78 insn per cycle - # 0.01 stalled cycles per insn (65.15%) - 0.023077322 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2071) (avx2: 0) (512y: 0) (512z: 0) +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.518700e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.715553e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.715553e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 3.090850 sec + 8,946,489,145 cycles # 2.890 GHz + 24,429,263,818 instructions # 2.73 insn per cycle + 3.097198356 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 2067) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515654 Relative difference = 3.2588039900609506e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe: Aborted - 54,938,080 cycles:u # 2.544 GHz (62.98%) - 43,710 stalled-cycles-frontend:u # 0.08% frontend cycles idle (62.98%) - 616,158 stalled-cycles-backend:u # 1.12% backend cycles idle (62.98%) - 39,751,973 instructions:u # 0.72 insn per cycle - # 0.02 stalled cycles per insn (64.78%) - 0.022926913 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2383) (512y: 0) (512z: 0) +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 5.476437e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.948509e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.948509e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 2.025257 sec + 5,523,468,825 cycles # 2.720 GHz + 11,561,737,650 instructions # 2.09 insn per cycle + 2.031752517 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2396) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 6.358069e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.007551e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.007551e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 1.759129 sec + 4,801,841,802 cycles # 2.722 GHz + 10,338,992,386 instructions # 2.15 insn per cycle + 1.765685267 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1972) (512y: 131) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028807e+00 +Avg ME (F77/C++) = 2.0288063388516204 +Relative difference = 3.2588037186351226e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 4.036811e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.287808e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.287808e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 2.707972 sec + 4,942,835,417 cycles # 1.822 GHz + 7,554,452,946 instructions # 1.53 insn per cycle + 2.714536601 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1212) (512y: 65) (512z: 1543) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028807e+00 +Avg ME (F77/C++) = 2.0288063388516204 +Relative difference = 3.2588037186351226e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_rmbhst.txt index 736354f8c0..defb46a739 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_rmbhst.txt @@ -1,164 +1,211 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasNoCurand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) +RNDGEN=hasCurand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-01-31_14:47:17 +DATE: 2024-01-30_05:53:36 -On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 2 --rmbhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 2 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+MESDEV/none+NAVBRK +WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.832332e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.949244e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.003292e+07 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 1.180540 sec - 3,633,951,217 cycles:u # 2.981 GHz (75.06%) - 21,429,347 stalled-cycles-frontend:u # 0.59% frontend cycles idle (75.09%) - 1,154,914,290 stalled-cycles-backend:u # 31.78% backend cycles idle (75.12%) - 3,842,620,949 instructions:u # 1.06 insn per cycle - # 0.30 stalled cycles per insn (75.09%) - 1.235886270 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 5.688012e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.154108e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.269539e+08 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 0.720498 sec + 2,707,766,521 cycles # 2.848 GHz + 4,278,662,893 instructions # 1.58 insn per cycle + 1.009865256 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --rmbhst +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 2.028807e+00 -Avg ME (F77/CUDA) = 2.0288063388516817 -Relative difference = 3.258803416564443e-07 +Avg ME (F77/CUDA) = 2.0288063388516822 +Relative difference = 3.2588034143755247e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.514363e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.579371e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.579371e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 4.352130 sec - 15,020,504,376 cycles:u # 3.425 GHz (74.96%) - 10,169,638 stalled-cycles-frontend:u # 0.07% frontend cycles idle (75.01%) - 1,098,660,659 stalled-cycles-backend:u # 7.31% backend cycles idle (75.01%) - 38,671,250,350 instructions:u # 2.57 insn per cycle - # 0.03 stalled cycles per insn (75.02%) - 4.388382110 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.066193e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.127901e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.127901e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 5.184582 sec + 14,984,631,159 cycles # 2.888 GHz + 38,723,388,390 instructions # 2.58 insn per cycle + 5.191155299 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 719) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515649 Relative difference = 3.258803992249869e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.492938e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.721485e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.721485e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 2.515981 sec - 8,578,848,652 cycles:u # 3.365 GHz (74.92%) - 9,425,944 stalled-cycles-frontend:u # 0.11% frontend cycles idle (74.91%) - 196,079,645 stalled-cycles-backend:u # 2.29% backend cycles idle (74.90%) - 24,441,818,243 instructions:u # 2.85 insn per cycle - # 0.01 stalled cycles per insn (74.97%) - 2.551686085 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2071) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.511860e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.708377e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.708377e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 3.095761 sec + 8,950,231,816 cycles # 2.886 GHz + 24,430,052,071 instructions # 2.73 insn per cycle + 3.102564983 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 2067) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515654 Relative difference = 3.2588039900609506e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.684584e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.279871e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.279871e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 1.547039 sec - 5,203,617,968 cycles:u # 3.293 GHz (74.69%) - 9,371,257 stalled-cycles-frontend:u # 0.18% frontend cycles idle (74.69%) - 1,063,000,473 stalled-cycles-backend:u # 20.43% backend cycles idle (74.93%) - 11,490,848,219 instructions:u # 2.21 insn per cycle - # 0.09 stalled cycles per insn (75.19%) - 1.582473684 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2383) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 5.454029e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.925499e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.925499e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 2.033615 sec + 5,531,582,240 cycles # 2.713 GHz + 11,562,288,179 instructions # 2.09 insn per cycle + 2.040383969 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2396) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 6.327959e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.977069e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.977069e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 1.768254 sec + 4,816,907,251 cycles # 2.716 GHz + 10,339,308,595 instructions # 2.15 insn per cycle + 1.774968996 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1972) (512y: 131) (512z: 0) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028807e+00 +Avg ME (F77/C++) = 2.0288063388516204 +Relative difference = 3.2588037186351226e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.992436e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.241387e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.241387e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 2.737995 sec + 4,943,973,305 cycles # 1.803 GHz + 7,555,690,658 instructions # 1.53 insn per cycle + 2.744582139 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1212) (512y: 65) (512z: 1543) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
+-------------------------------------------------------------------------
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2
+Avg ME (C++/C++) = 2.028807e+00
+Avg ME (F77/C++) = 2.0288063388516204
+Relative difference = 3.2588037186351226e-07
+OK (relative difference <= 5E-3)
 =========================================================================
 TEST COMPLETED
diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd1.txt
index eeb8545967..fe6f195aa6 100644
--- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd1.txt
+++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd1.txt
@@ -1,164 +1,209 @@
 export CUDACPP_RUNTIME_ENABLEFPE=on
-Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx
-OMPFLAGS=
-AVX=avx2
+Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx
+OMPFLAGS=-fopenmp
+AVX=512y
 FPTYPE=d
 HELINL=0
 HRDCOD=0
-RNDGEN=hasNoCurand
-Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1)
+RNDGEN=hasCurand
+Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1)
 make: Nothing to be done for 'gtestlibs'.
-CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd1'
+CUDACPP_BUILDDIR='build.512y_d_inl0_hrd1'
 make USEBUILDDIR=1 AVX=none
-make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 CUDACPP_BUILDDIR='build.none_d_inl0_hrd1'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 make USEBUILDDIR=1 AVX=sse4
-make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd1'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 make USEBUILDDIR=1 AVX=avx2
-make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd1'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 make USEBUILDDIR=1 AVX=512y
-make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 CUDACPP_BUILDDIR='build.512y_d_inl0_hrd1'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 make USEBUILDDIR=1 AVX=512z
-make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 CUDACPP_BUILDDIR='build.512z_d_inl0_hrd1'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
-DATE: 2024-01-31_13:51:21
+DATE: 2024-01-30_04:55:39
-On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]:
+On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/gcheck.exe -p 2048 256 2 OMP=
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/gcheck.exe -p 2048 256 2 OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1]
-Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK
+Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1]
+Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 2.811456e+07 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.916789e+07 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.969732e+07 ) sec^-1
-MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0
-TOTAL : 1.063480 sec
- 3,238,472,233 cycles:u # 2.948 GHz (74.73%)
- 10,776,735 stalled-cycles-frontend:u # 0.33% frontend cycles idle (74.74%)
- 1,146,001,929 stalled-cycles-backend:u # 35.39% backend cycles idle (74.94%)
- 2,974,660,974 instructions:u # 0.92 insn per cycle
- # 0.39 stalled cycles per insn (74.89%)
- 1.119704313 seconds time elapsed
+EvtsPerSec[Rmb+ME] (23) = ( 4.125481e+07 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.158117e+08 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.273663e+08 ) sec^-1
+MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0
+TOTAL : 0.534485 sec
+ 2,191,778,134 cycles # 2.834 GHz
+ 3,140,951,827 instructions # 1.43 insn per cycle
+ 0.850685752 seconds time elapsed
+runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/gcheck.exe -p 2048 256 1
+WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
+==PROF== Profiling "sigmaKin": launch__registers_per_thread 208
+==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
 -------------------------------------------------------------------------
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/gcheck.exe --common -p 2 64 2
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/fgcheck.exe 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/gcheck.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/fgcheck.exe 2 64 2
 Avg ME (C++/CUDA) = 2.028807e+00
-Avg ME (F77/CUDA) = 2.0288063388516817
-Relative difference = 3.258803416564443e-07
+Avg ME (F77/CUDA) = 2.0288063388516822
+Relative difference = 3.2588034143755247e-07
 OK (relative difference <= 5E-3)
 =========================================================================
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/check.exe -p 2048 256 2 OMP=
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/check.exe -p 2048 256 2 OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1]
-Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK
+Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1]
+Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
-EvtsPerSec[Rmb+ME] (23) = ( 2.435903e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.497119e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.497119e+05 ) sec^-1
-MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0
-TOTAL : 4.487080 sec
- 15,515,065,184 cycles:u # 3.433 GHz (74.87%)
- 10,157,980 stalled-cycles-frontend:u # 0.07% frontend cycles idle (74.96%)
- 13,998,426 stalled-cycles-backend:u # 0.09% backend cycles idle (75.05%)
- 39,485,501,686 instructions:u # 2.54 insn per cycle
- # 0.00 stalled cycles per insn (75.05%)
- 4.522693165 seconds time elapsed
+OMP threads / `nproc --all` = 1 / 4
+EvtsPerSec[Rmb+ME] (23) = ( 2.109309e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.173415e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.173415e+05 ) sec^-1
+MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0
+TOTAL : 5.081724 sec
+ 14,685,294,357 cycles # 2.887 GHz
+ 39,544,026,748 instructions # 2.69 insn per cycle
+ 5.093038112 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 596) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/runTest.exe
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/runTest.exe
 [ PASSED ] 6 tests.
 -------------------------------------------------------------------------
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/check.exe --common -p 2 64 2
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/fcheck.exe 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/check.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/fcheck.exe 2 64 2
 Avg ME (C++/C++) = 2.028807e+00
 Avg ME (F77/C++) = 2.0288063388515649
 Relative difference = 3.258803992249869e-07
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/check.exe -p 2048 256 2 OMP=
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/check.exe -p 2048 256 2 OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1]
-Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK
+Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1]
+Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 4.393621e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.610679e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.610679e+05 ) sec^-1
-MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0
-TOTAL : 2.568675 sec
- 8,779,562,714 cycles:u # 3.375 GHz (74.79%)
- 10,583,458 stalled-cycles-frontend:u # 0.12% frontend cycles idle (74.94%)
- 1,218,655,804 stalled-cycles-backend:u # 13.88% backend cycles idle (75.09%)
- 23,482,387,475 instructions:u # 2.67 insn per cycle
- # 0.05 stalled cycles per insn (75.10%)
- 2.605973588 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 1952) (avx2: 0) (512y: 0) (512z: 0)
+OMP threads / `nproc --all` = 1 / 4
+EvtsPerSec[Rmb+ME] (23) = ( 3.661768e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.875473e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.875473e+05 ) sec^-1
+MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0
+TOTAL : 2.973090 sec
+ 8,600,238,365 cycles # 2.886 GHz
+ 23,576,508,735 instructions # 2.74 insn per cycle
+ 2.991032269 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 1948) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/runTest.exe
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/runTest.exe
 [ PASSED ] 6 tests.
 -------------------------------------------------------------------------
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/check.exe --common -p 2 64 2
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/fcheck.exe 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/check.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/fcheck.exe 2 64 2
 Avg ME (C++/C++) = 2.028807e+00
 Avg ME (F77/C++) = 2.0288063388515654
 Relative difference = 3.2588039900609506e-07
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/check.exe -p 2048 256 2 OMP=
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/check.exe -p 2048 256 2 OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1]
-Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK
+Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1]
+Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 6.904717e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.380877e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.380877e+05 ) sec^-1
-MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0
-TOTAL : 1.700922 sec
- 5,712,275,197 cycles:u # 3.294 GHz (74.88%)
- 9,340,024 stalled-cycles-frontend:u # 0.16% frontend cycles idle (75.10%)
- 997,171,641 stalled-cycles-backend:u # 17.46% backend cycles idle (75.10%)
- 13,125,301,039 instructions:u # 2.30 insn per cycle
- # 0.08 stalled cycles per insn (75.10%)
- 1.738237090 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2547) (512y: 0) (512z: 0)
+OMP threads / `nproc --all` = 1 / 4
+EvtsPerSec[Rmb+ME] (23) = ( 4.966204e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.352181e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.352181e+05 ) sec^-1
+MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0
+TOTAL : 2.222703 sec
+ 5,964,350,122 cycles # 2.676 GHz
+ 13,193,903,385 instructions # 2.21 insn per cycle
+ 2.290428549 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2560) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/runTest.exe
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/runTest.exe
 [ PASSED ] 6 tests.
 -------------------------------------------------------------------------
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/check.exe --common -p 2 64 2
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/fcheck.exe 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/check.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/fcheck.exe 2 64 2
 Avg ME (C++/C++) = 2.028807e+00
 Avg ME (F77/C++) = 2.0288063388516204
 Relative difference = 3.2588037186351226e-07
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
-/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo)
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/check.exe -p 2048 256 2 OMP=
+WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
+Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1]
+Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
+FP precision = DOUBLE (NaN/abnormal=0, zero=0)
+Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
+OMP threads / `nproc --all` = 1 / 4
+EvtsPerSec[Rmb+ME] (23) = ( 5.425705e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.897406e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.897406e+05 ) sec^-1
+MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0
+TOTAL : 2.043603 sec
+ 5,539,021,528 cycles # 2.702 GHz
+ 12,103,311,893 instructions # 2.19 insn per cycle
+ 2.060365335 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2030) (512y: 278) (512z: 0)
 -------------------------------------------------------------------------
-/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo)
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/runTest.exe
+[ PASSED ] 6 tests.
+-------------------------------------------------------------------------
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/check.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/fcheck.exe 2 64 2
+Avg ME (C++/C++) = 2.028807e+00
+Avg ME (F77/C++) = 2.0288063388516204
+Relative difference = 3.2588037186351226e-07
+OK (relative difference <= 5E-3)
+-------------------------------------------------------------------------
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/check.exe -p 2048 256 2 OMP=
+WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
+Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1]
+Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
+FP precision = DOUBLE (NaN/abnormal=0, zero=0)
+Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
+OMP threads / `nproc --all` = 1 / 4
+EvtsPerSec[Rmb+ME] (23) = ( 3.662802e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.870728e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.870728e+05 ) sec^-1
+MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0
+TOTAL : 2.974749 sec
+ 5,366,303,915 cycles # 1.800 GHz
+ 9,381,926,109 instructions # 1.75 insn per cycle
+ 2.994553633 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1350) (512y: 88) (512z: 1989)
+-------------------------------------------------------------------------
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/runTest.exe
+[ PASSED ] 6 tests.
+-------------------------------------------------------------------------
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/check.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/fcheck.exe 2 64 2
+Avg ME (C++/C++) = 2.028807e+00
+Avg ME (F77/C++) = 2.0288063388516204
+Relative difference = 3.2588037186351226e-07
+OK (relative difference <= 5E-3)
 =========================================================================
 TEST COMPLETED
diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd0.txt
index 087aae64d6..8cd37966a9 100644
--- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd0.txt
+++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd0.txt
@@ -1,164 +1,209 @@
 export CUDACPP_RUNTIME_ENABLEFPE=on
-Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx
-OMPFLAGS=
-AVX=avx2
+Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx
+OMPFLAGS=-fopenmp
+AVX=512y
 FPTYPE=d
 HELINL=0
 HRDCOD=0
-RNDGEN=hasNoCurand
-Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1)
+RNDGEN=hasCurand
+Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1)
 make: Nothing to be done for 'gtestlibs'.
-CUDACPP_BUILDDIR='build.avx2_d_inl1_hrd0'
+CUDACPP_BUILDDIR='build.512y_d_inl1_hrd0'
 make USEBUILDDIR=1 AVX=none
-make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 CUDACPP_BUILDDIR='build.none_d_inl1_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 make USEBUILDDIR=1 AVX=sse4
-make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 CUDACPP_BUILDDIR='build.sse4_d_inl1_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 make USEBUILDDIR=1 AVX=avx2
-make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 CUDACPP_BUILDDIR='build.avx2_d_inl1_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 make USEBUILDDIR=1 AVX=512y
-make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 CUDACPP_BUILDDIR='build.512y_d_inl1_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 make USEBUILDDIR=1 AVX=512z
-make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 CUDACPP_BUILDDIR='build.512z_d_inl1_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
-DATE: 2024-01-31_14:21:00
+DATE: 2024-01-30_05:35:53
-On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]:
+On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/gcheck.exe -p 2048 256 2 OMP=
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/gcheck.exe -p 2048 256 2 OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=1] [hardcodePARAM=0]
-Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK
+Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=1] [hardcodePARAM=0]
+Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 2.761044e+07 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.954770e+07 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.008707e+07 ) sec^-1
-MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0
-TOTAL : 1.068074 sec
- 3,269,729,629 cycles:u # 2.969 GHz (75.15%)
- 10,826,176 stalled-cycles-frontend:u # 0.33% frontend cycles idle (74.86%)
- 1,132,096,656 stalled-cycles-backend:u # 34.62% backend cycles idle (74.78%)
- 3,039,323,944 instructions:u # 0.93 insn per cycle
- # 0.37 stalled cycles per insn (74.62%)
- 1.127599788 seconds time elapsed
+EvtsPerSec[Rmb+ME] (23) = ( 4.561376e+07 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.154966e+08 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.270589e+08 ) sec^-1
+MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0
+TOTAL : 0.529335 sec
+ 2,159,762,911 cycles # 2.829 GHz
+ 3,107,803,545 instructions # 1.44 insn per cycle
+ 0.822533200 seconds time elapsed
+runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/gcheck.exe -p 2048 256 1
+WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
+==PROF== Profiling "sigmaKin": launch__registers_per_thread 214
+==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
 -------------------------------------------------------------------------
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/gcheck.exe --common -p 2 64 2
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/fgcheck.exe 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/gcheck.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/fgcheck.exe 2 64 2
 Avg ME (C++/CUDA) = 2.028807e+00
-Avg ME (F77/CUDA) = 2.0288063388516817
-Relative difference = 3.258803416564443e-07
+Avg ME (F77/CUDA) = 2.0288063388516822
+Relative difference = 3.2588034143755247e-07
 OK (relative difference <= 5E-3)
 =========================================================================
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/check.exe -p 2048 256 2 OMP=
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/check.exe -p 2048 256 2 OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=0]
-Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK
+Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0]
+Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
-EvtsPerSec[Rmb+ME] (23) = ( 2.851563e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.936112e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.936112e+05 ) sec^-1
-MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0
-TOTAL : 3.858759 sec
- 13,289,116,499 cycles:u # 3.415 GHz (74.94%)
- 9,877,100 stalled-cycles-frontend:u # 0.07% frontend cycles idle (74.94%)
- 561,229,808 stalled-cycles-backend:u # 4.22% backend cycles idle (74.92%)
- 35,881,093,632 instructions:u # 2.70 insn per cycle
- # 0.02 stalled cycles per insn (74.93%)
- 3.894497837 seconds time elapsed
+OMP threads / `nproc --all` = 1 / 4
+EvtsPerSec[Rmb+ME] (23) = ( 2.227004e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.298943e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.298943e+05 ) sec^-1
+MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0
+TOTAL : 4.818572 sec
+ 13,907,927,893 cycles # 2.883 GHz
+ 35,849,684,316 instructions # 2.58 insn per cycle
+ 4.825096940 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 1078) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/runTest.exe
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/runTest.exe
 [ PASSED ] 6 tests.
 -------------------------------------------------------------------------
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/check.exe --common -p 2 64 2
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/fcheck.exe 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/check.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/fcheck.exe 2 64 2
 Avg ME (C++/C++) = 2.028807e+00
 Avg ME (F77/C++) = 2.0288063388515649
 Relative difference = 3.258803992249869e-07
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd0/check.exe -p 2048 256 2 OMP=
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd0/check.exe -p 2048 256 2 OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=0]
-Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK
+Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0]
+Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 4.420765e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.642646e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.642646e+05 ) sec^-1
-MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0
-TOTAL : 2.552803 sec
- 8,706,839,853 cycles:u # 3.367 GHz (74.99%)
- 10,160,636 stalled-cycles-frontend:u # 0.12% frontend cycles idle (74.95%)
- 2,349,532,908 stalled-cycles-backend:u # 26.98% backend cycles idle (74.97%)
- 21,909,575,708 instructions:u # 2.52 insn per cycle
- # 0.11 stalled cycles per insn (74.96%)
- 2.590046894 seconds time elapsed
+OMP threads / `nproc --all` = 1 / 4
+EvtsPerSec[Rmb+ME] (23) = ( 3.848483e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.087109e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.087109e+05 ) sec^-1
+MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0
+TOTAL : 2.835129 sec
+ 8,213,185,511 cycles # 2.892 GHz
+ 21,908,282,308 instructions # 2.67 insn per cycle
+ 2.841971377 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 2334) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd0/runTest.exe
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd0/runTest.exe
 [ PASSED ] 6 tests.
 -------------------------------------------------------------------------
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd0/check.exe --common -p 2 64 2
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd0/fcheck.exe 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd0/check.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd0/fcheck.exe 2 64 2
 Avg ME (C++/C++) = 2.028807e+00
 Avg ME (F77/C++) = 2.0288063388515654
 Relative difference = 3.2588039900609506e-07
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/check.exe -p 2048 256 2 OMP=
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/check.exe -p 2048 256 2 OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=0]
-Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK
+Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0]
+Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision = DOUBLE (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 6.666808e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.109706e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.109706e+05 ) sec^-1
-MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0
-TOTAL : 1.755370 sec
- 5,890,439,803 cycles:u # 3.293 GHz (74.96%)
- 9,331,242 stalled-cycles-frontend:u # 0.16% frontend cycles idle (74.97%)
- 2,249,642,175 stalled-cycles-backend:u # 38.19% backend cycles idle (75.00%)
- 12,103,314,118 instructions:u # 2.05 insn per cycle
- # 0.19 stalled cycles per insn (74.99%)
- 1.792746465 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3046) (512y: 0) (512z: 0)
+OMP threads / `nproc --all` = 1 / 4
+EvtsPerSec[Rmb+ME] (23) = ( 5.473983e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.948336e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.948336e+05 ) sec^-1
+MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0
+TOTAL : 2.025782 sec
+ 5,530,364,572 cycles # 2.723 GHz
+ 12,076,349,288 instructions # 2.18 insn per cycle
+ 2.032542267 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3062) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/runTest.exe
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/runTest.exe
 [ PASSED ] 6 tests.
 -------------------------------------------------------------------------
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/check.exe --common -p 2 64 2
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/fcheck.exe 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/check.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/fcheck.exe 2 64 2
 Avg ME (C++/C++) = 2.028807e+00
 Avg ME (F77/C++) = 2.0288063388516204
 Relative difference = 3.2588037186351226e-07
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
-/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo)
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd0/check.exe -p 2048 256 2 OMP=
+WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
+Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0]
+Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
+FP precision = DOUBLE (NaN/abnormal=0, zero=0)
+Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
+OMP threads / `nproc --all` = 1 / 4
+EvtsPerSec[Rmb+ME] (23) = ( 5.936500e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 6.499652e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.499652e+05 ) sec^-1
+MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0
+TOTAL : 1.876359 sec
+ 5,112,015,535 cycles # 2.716 GHz
+ 11,141,551,976 instructions # 2.18 insn per cycle
+ 1.883163972 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2527) (512y: 224) (512z: 0)
 -------------------------------------------------------------------------
-/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo)
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd0/runTest.exe
+[ PASSED ] 6 tests.
+-------------------------------------------------------------------------
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd0/check.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd0/fcheck.exe 2 64 2
+Avg ME (C++/C++) = 2.028807e+00
+Avg ME (F77/C++) = 2.0288063388516204
+Relative difference = 3.2588037186351226e-07
+OK (relative difference <= 5E-3)
+-------------------------------------------------------------------------
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd0/check.exe -p 2048 256 2 OMP=
+WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
+Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0]
+Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
+FP precision = DOUBLE (NaN/abnormal=0, zero=0)
+Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
+OMP threads / `nproc --all` = 1 / 4
+EvtsPerSec[Rmb+ME] (23) = ( 4.149105e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.416003e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.416003e+05 ) sec^-1
+MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0
+TOTAL : 2.637284 sec
+ 4,829,728,502 cycles # 1.827 GHz
+ 8,842,382,666 instructions # 1.83 insn per cycle
+ 2.644418009 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1821) (512y: 97) (512z: 2034)
+-------------------------------------------------------------------------
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd0/runTest.exe
+[ PASSED ] 6 tests.
+-------------------------------------------------------------------------
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd0/check.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd0/fcheck.exe 2 64 2
+Avg ME (C++/C++) = 2.028807e+00
+Avg ME (F77/C++) = 2.0288063388516204
+Relative difference = 3.2588037186351226e-07
+OK (relative difference <= 5E-3)
 =========================================================================
 TEST COMPLETED
diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd1.txt
index b62288ac7e..8eec31c0d3 100644
--- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd1.txt
+++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd1.txt
@@ -1,164 +1,209 @@
 export CUDACPP_RUNTIME_ENABLEFPE=on
-Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx
-OMPFLAGS=
-AVX=avx2
+Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx
+OMPFLAGS=-fopenmp
+AVX=512y
 FPTYPE=d
 HELINL=0
 HRDCOD=0
-RNDGEN=hasNoCurand
-Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1)
+RNDGEN=hasCurand
+Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1)
 make: Nothing to be done for 'gtestlibs'.
-CUDACPP_BUILDDIR='build.avx2_d_inl1_hrd1'
+CUDACPP_BUILDDIR='build.512y_d_inl1_hrd1'
 make USEBUILDDIR=1 AVX=none
-make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 CUDACPP_BUILDDIR='build.none_d_inl1_hrd1'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 make USEBUILDDIR=1 AVX=sse4
-make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 CUDACPP_BUILDDIR='build.sse4_d_inl1_hrd1'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 make USEBUILDDIR=1 AVX=avx2
-make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 CUDACPP_BUILDDIR='build.avx2_d_inl1_hrd1'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-01-31_14:21:22 +DATE: 2024-01-30_05:36:21 -On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/gcheck.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/gcheck.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.783075e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.914487e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.967342e+07 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 1.064985 sec - 3,240,436,157 cycles:u # 2.942 GHz (74.55%) - 10,749,114 stalled-cycles-frontend:u # 0.33% frontend cycles idle (75.20%) - 1,139,603,683 stalled-cycles-backend:u # 35.17% backend cycles idle (75.38%) - 3,002,755,196 instructions:u # 0.93 insn per cycle - # 0.38 stalled cycles per insn (75.37%) - 1.124308464 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 4.565410e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.157958e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.274503e+08 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 0.528794 sec + 2,178,979,969 cycles # 2.840 GHz + 3,111,172,536 instructions # 1.43 insn per cycle + 0.825662442 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/gcheck.exe -p 2048 256 1 +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 208 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 2.028807e+00 -Avg ME (F77/CUDA) = 2.0288063388516817 -Relative difference = 3.258803416564443e-07 +Avg ME (F77/CUDA) = 2.0288063388516822 +Relative difference = 3.2588034143755247e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 3.211326e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.318376e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.318376e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 3.447382 sec - 11,836,561,603 cycles:u # 3.401 GHz (74.95%) - 9,684,924 stalled-cycles-frontend:u # 0.08% frontend cycles idle (74.97%) - 23,063,598 stalled-cycles-backend:u # 0.19% backend cycles idle (74.97%) - 35,762,758,230 instructions:u # 3.02 insn per cycle - # 0.00 stalled cycles per insn (74.95%) - 3.483054665 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.483554e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.573797e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.573797e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 4.331634 sec + 12,513,147,299 cycles # 2.885 GHz + 35,729,824,625 instructions # 2.86 insn per cycle + 4.338115382 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 469) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515649 Relative difference = 3.258803992249869e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd1/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd1/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.808666e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.070704e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.070704e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 2.362709 sec - 8,036,718,287 cycles:u # 3.354 GHz (75.01%) - 10,456,947 stalled-cycles-frontend:u # 0.13% frontend cycles idle (74.96%) - 1,748,701,910 stalled-cycles-backend:u # 21.76% backend cycles idle (74.98%) - 21,246,867,161 instructions:u # 2.64 insn per cycle - # 0.08 stalled cycles per insn (74.99%) - 2.400320396 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.944859e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.193242e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.193242e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 2.766913 sec + 8,026,265,535 cycles # 2.895 GHz + 21,260,291,484 instructions # 2.65 insn per cycle + 2.773559046 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2088) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515654 Relative difference = 3.2588039900609506e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd1/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd1/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.914365e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.546212e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.546212e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 1.507943 sec - 5,027,160,241 cycles:u # 3.261 GHz (74.92%) - 9,342,894 stalled-cycles-frontend:u # 0.19% frontend cycles idle (75.09%) - 303,402,377 stalled-cycles-backend:u # 6.04% backend cycles idle (75.09%) - 11,346,307,110 instructions:u # 2.26 insn per cycle - # 0.03 stalled cycles per insn (75.10%) - 1.545294997 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2354) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 5.719292e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.240372e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.240372e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 1.943097 sec + 5,300,809,350 cycles # 2.722 GHz + 11,405,959,044 instructions # 2.15 insn per cycle + 1.950186269 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2370) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. 
 -------------------------------------------------------------------------
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd1/check.exe --common -p 2 64 2
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd1/fcheck.exe 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd1/check.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd1/fcheck.exe 2 64 2
 Avg ME (C++/C++) = 2.028807e+00
 Avg ME (F77/C++) = 2.0288063388516204
 Relative difference = 3.2588037186351226e-07
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
-/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo)
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd1/check.exe -p 2048 256 2 OMP=
+WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
+Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1]
+Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
+FP precision = DOUBLE (NaN/abnormal=0, zero=0)
+Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
+OMP threads / `nproc --all` = 1 / 4
+EvtsPerSec[Rmb+ME] (23) = ( 6.116224e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 6.720108e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.720108e+05 ) sec^-1
+MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0
+TOTAL : 1.828206 sec
+ 4,977,318,735 cycles # 2.718 GHz
+ 10,599,506,112 instructions # 2.13 insn per cycle
+ 1.834822870 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1970) (512y: 162) (512z: 0)
 -------------------------------------------------------------------------
-/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo)
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd1/runTest.exe
+[ PASSED ] 6 tests.
+-------------------------------------------------------------------------
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd1/check.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd1/fcheck.exe 2 64 2
+Avg ME (C++/C++) = 2.028807e+00
+Avg ME (F77/C++) = 2.0288063388516204
+Relative difference = 3.2588037186351226e-07
+OK (relative difference <= 5E-3)
+-------------------------------------------------------------------------
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd1/check.exe -p 2048 256 2 OMP=
+WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
+Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1]
+Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
+FP precision = DOUBLE (NaN/abnormal=0, zero=0)
+Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
+OMP threads / `nproc --all` = 1 / 4
+EvtsPerSec[Rmb+ME] (23) = ( 4.275159e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.556705e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.556705e+05 ) sec^-1
+MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0
+TOTAL : 2.563497 sec
+ 4,703,376,134 cycles # 1.831 GHz
+ 8,567,908,292 instructions # 1.82 insn per cycle
+ 2.570320519 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1392) (512y: 70) (512z: 1630)
+-------------------------------------------------------------------------
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd1/runTest.exe
+[ PASSED ] 6 tests.
+-------------------------------------------------------------------------
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd1/check.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd1/fcheck.exe 2 64 2
+Avg ME (C++/C++) = 2.028807e+00
+Avg ME (F77/C++) = 2.0288063388516204
+Relative difference = 3.2588037186351226e-07
+OK (relative difference <= 5E-3)
 =========================================================================
 TEST COMPLETED
diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt
index 9d33924327..03334a40e8 100644
--- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt
+++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt
@@ -1,164 +1,209 @@
 export CUDACPP_RUNTIME_ENABLEFPE=on
-Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx
-OMPFLAGS=
-AVX=avx2
+Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx
+OMPFLAGS=-fopenmp
+AVX=512y
 FPTYPE=d
 HELINL=0
 HRDCOD=0
-RNDGEN=hasNoCurand
-Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1)
+RNDGEN=hasCurand
+Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1)
 make: Nothing to be done for 'gtestlibs'.
-CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0'
+CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0'
 make USEBUILDDIR=1 AVX=none
-make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 CUDACPP_BUILDDIR='build.none_f_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 make USEBUILDDIR=1 AVX=sse4
-make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 make USEBUILDDIR=1 AVX=avx2
-make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 make USEBUILDDIR=1 AVX=512y
-make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 make USEBUILDDIR=1 AVX=512z
-make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
-DATE: 2024-01-31_13:51:44
+DATE: 2024-01-30_04:56:08
-On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]:
+On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 2 OMP=
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 2 OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK
+Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 8.865799e+07 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.872508e+08 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.028078e+08 ) sec^-1
-MeanMatrixElemValue = ( 2.080169e+00 +- 3.463853e-03 ) GeV^0
-TOTAL : 1.013471 sec
- 3,120,326,875 cycles:u # 2.992 GHz (74.66%)
- 10,877,225 stalled-cycles-frontend:u # 0.35% frontend cycles idle (74.66%)
- 1,161,438,597 stalled-cycles-backend:u # 37.22% backend cycles idle (75.03%)
- 2,808,430,330 instructions:u # 0.90 insn per cycle
- # 0.41 stalled cycles per insn (75.40%)
- 1.068198918 seconds time elapsed
+EvtsPerSec[Rmb+ME] (23) = ( 8.266078e+07 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.583524e+08 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.962786e+08 ) sec^-1
+MeanMatrixElemValue = ( 2.086718e+00 +- 3.413389e-03 ) GeV^0
+TOTAL : 0.486964 sec
+ 2,022,378,491 cycles # 2.826 GHz
+ 2,872,554,108 instructions # 1.42 insn per cycle
+ 0.794836465 seconds time elapsed
+runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1
+WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
+==PROF== Profiling "sigmaKin": launch__registers_per_thread 128
+==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
 -------------------------------------------------------------------------
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2
-Avg ME (C++/CUDA) = 2.028815e+00
-Avg ME (F77/CUDA) = 2.0288173652952537
-Relative difference = 1.1658506339321586e-06
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2
+Avg ME (C++/CUDA) = 2.028811e+00
+Avg ME (F77/CUDA) = 2.0288499749731272
+Relative difference = 1.9210746159747678e-05
 OK (relative difference <= 5E-3)
 =========================================================================
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe -p 2048 256 2 OMP=
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe -p 2048 256 2 OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK
+Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
-EvtsPerSec[Rmb+ME] (23) = ( 2.977157e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.066362e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.066362e+05 ) sec^-1
-MeanMatrixElemValue = ( 2.079573e+00 +- 3.404712e-03 ) GeV^0
-TOTAL : 3.666908 sec
- 12,716,669,353 cycles:u # 3.442 GHz (74.90%)
- 7,108,813 stalled-cycles-frontend:u # 0.06% frontend cycles idle (74.90%)
- 12,449,971 stalled-cycles-backend:u # 0.10% backend cycles idle (74.92%)
- 37,088,931,898 instructions:u # 2.92 insn per cycle
- # 0.00 stalled cycles per insn (75.03%)
- 3.696733758 seconds time elapsed
+OMP threads / `nproc --all` = 1 / 4
+EvtsPerSec[Rmb+ME] (23) = ( 2.220233e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.293728e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.293728e+05 ) sec^-1
+MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0
+TOTAL : 4.812282 sec
+ 13,901,639,181 cycles # 2.885 GHz
+ 37,078,732,469 instructions # 2.67 insn per cycle
+ 4.824222975 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 578) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest.exe
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest.exe
 [ PASSED ] 6 tests.
 -------------------------------------------------------------------------
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck.exe 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck.exe 2 64 2
 Avg ME (C++/C++) = 2.028820e+00
-Avg ME (F77/C++) = 2.0288198367925361
-Relative difference = 8.044452636897417e-08
+Avg ME (F77/C++) = 2.0288197983754799
+Relative difference = 9.938019153537065e-08
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 2 OMP=
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 2 OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK
+Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 6.083038e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.487057e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.487057e+05 ) sec^-1
-MeanMatrixElemValue = ( 2.079573e+00 +- 3.404713e-03 ) GeV^0
-TOTAL : 1.868298 sec
- 6,412,469,462 cycles:u # 3.381 GHz (74.90%)
- 7,184,992 stalled-cycles-frontend:u # 0.11% frontend cycles idle (75.06%)
- 2,239,939,343 stalled-cycles-backend:u # 34.93% backend cycles idle (75.12%)
- 15,206,507,608 instructions:u # 2.37 insn per cycle
- # 0.15 stalled cycles per insn (75.12%)
- 1.899876170 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 2463) (avx2: 0) (512y: 0) (512z: 0)
+OMP threads / `nproc --all` = 1 / 4
+EvtsPerSec[Rmb+ME] (23) = ( 5.150516e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.595808e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.595808e+05 ) sec^-1
+MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0
+TOTAL : 2.124737 sec
+ 6,168,101,005 cycles # 2.895 GHz
+ 15,212,489,109 instructions # 2.47 insn per cycle
+ 2.142108549 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 2459) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest.exe
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest.exe
 [ PASSED ] 6 tests.
 -------------------------------------------------------------------------
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 2.028820e+00
-Avg ME (F77/C++) = 2.0288198773050681
-Relative difference = 6.047600673895608e-08
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2
+Avg ME (C++/C++) = 2.028819e+00
+Avg ME (F77/C++) = 2.0288191968575120
+Relative difference = 9.703059369476286e-08
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 2 OMP=
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 2 OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK
+Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 1.221758e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.379085e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.379085e+06 ) sec^-1
-MeanMatrixElemValue = ( 2.079551e+00 +- 3.404208e-03 ) GeV^0
-TOTAL : 1.002858 sec
- 3,400,909,569 cycles:u # 3.299 GHz (74.55%)
- 8,045,324 stalled-cycles-frontend:u # 0.24% frontend cycles idle (74.72%)
- 917,395,280 stalled-cycles-backend:u # 26.97% backend cycles idle (75.10%)
- 7,672,145,231 instructions:u # 2.26 insn per cycle
- # 0.12 stalled cycles per insn (75.17%)
- 1.034197542 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3055) (512y: 0) (512z: 0)
+OMP threads / `nproc --all` = 1 / 4
+EvtsPerSec[Rmb+ME] (23) = ( 8.954385e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.029179e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.029179e+06 ) sec^-1
+MeanMatrixElemValue = ( 2.086810e+00 +- 3.414230e-03 ) GeV^0
+TOTAL : 1.259990 sec
+ 3,437,290,204 cycles # 2.715 GHz
+ 7,715,643,345 instructions # 2.24 insn per cycle
+ 1.287994689 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3071) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest.exe
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest.exe
 [ PASSED ] 6 tests.
 -------------------------------------------------------------------------
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 2.028819e+00
-Avg ME (F77/C++) = 2.0288186294492334
-Relative difference = 1.826435805832187e-07
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2
+Avg ME (C++/C++) = 2.028818e+00
+Avg ME (F77/C++) = 2.0288179996423423
+Relative difference = 1.7628858734720142e-10
+OK (relative difference <= 5E-3)
+-------------------------------------------------------------------------
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe -p 2048 256 2 OMP=
+WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
+Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
+FP precision = FLOAT (NaN/abnormal=0, zero=0)
+Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
+OMP threads / `nproc --all` = 1 / 4
+EvtsPerSec[Rmb+ME] (23) = ( 9.805420e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.144112e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.144112e+06 ) sec^-1
+MeanMatrixElemValue = ( 2.086810e+00 +- 3.414230e-03 ) GeV^0
+TOTAL : 1.162105 sec
+ 3,179,163,625 cycles # 2.727 GHz
+ 7,109,925,739 instructions # 2.24 insn per cycle
+ 1.178171652 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2733) (512y: 13) (512z: 0)
+-------------------------------------------------------------------------
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest.exe
+[ PASSED ] 6 tests.
+-------------------------------------------------------------------------
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2
+Avg ME (C++/C++) = 2.028818e+00
+Avg ME (F77/C++) = 2.0288179996423423
+Relative difference = 1.7628858734720142e-10
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
-/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo)
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe -p 2048 256 2 OMP=
+WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
+Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
+FP precision = FLOAT (NaN/abnormal=0, zero=0)
+Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
+OMP threads / `nproc --all` = 1 / 4
+EvtsPerSec[Rmb+ME] (23) = ( 7.071814e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.862424e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.862424e+05 ) sec^-1
+MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0
+TOTAL : 1.572617 sec
+ 2,980,157,633 cycles # 1.888 GHz
+ 5,763,820,562 instructions # 1.93 insn per cycle
+ 1.590552097 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2088) (512y: 20) (512z: 1914)
+-------------------------------------------------------------------------
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest.exe
+[ PASSED ] 6 tests.
 -------------------------------------------------------------------------
-/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo)
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2
+Avg ME (C++/C++) = 2.028818e+00
+Avg ME (F77/C++) = 2.0288183195516467
+Relative difference = 1.5750631496822894e-07
+OK (relative difference <= 5E-3)
 =========================================================================
 TEST COMPLETED
diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_bridge.txt
index 6339bf0352..3a80a864ae 100644
--- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_bridge.txt
+++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_bridge.txt
@@ -1,170 +1,222 @@
 export CUDACPP_RUNTIME_ENABLEFPE=on
-Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx
-OMPFLAGS=
-AVX=avx2
+Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx
+OMPFLAGS=-fopenmp
+AVX=512y
 FPTYPE=d
 HELINL=0
 HRDCOD=0
-RNDGEN=hasNoCurand
-Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1)
+RNDGEN=hasCurand
+Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1)
 make: Nothing to be done for 'gtestlibs'.
-CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0'
+CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0'
 make USEBUILDDIR=1 AVX=none
-make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 CUDACPP_BUILDDIR='build.none_f_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 make USEBUILDDIR=1 AVX=sse4
-make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 make USEBUILDDIR=1 AVX=avx2
-make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 make USEBUILDDIR=1 AVX=512y
-make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 make USEBUILDDIR=1 AVX=512z
-make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
-DATE: 2024-01-31_14:39:53
+DATE: 2024-01-30_05:47:08
-On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]:
+On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 2 --bridge OMP=
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 2 --bridge OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost
+WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
 WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288)
 WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288)
-Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK
+Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 7.452616e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.045320e+08 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.045320e+08 ) sec^-1
-MeanMatrixElemValue = ( 2.079682e+00 +- 3.408341e-03 ) GeV^0
-TOTAL : 1.157777 sec
- 3,529,050,057 cycles:u # 2.961 GHz (75.18%)
- 21,090,755 stalled-cycles-frontend:u # 0.60% frontend cycles idle (75.23%)
- 581,377,578 stalled-cycles-backend:u # 16.47% backend cycles idle (75.21%)
- 3,877,066,359 instructions:u # 1.10 insn per cycle
- # 0.15 stalled cycles per insn (74.93%)
- 1.216337024 seconds time elapsed
+EvtsPerSec[Rmb+ME] (23) = ( 6.753522e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.358863e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.358863e+07 ) sec^-1
+MeanMatrixElemValue = ( 2.086805e+00 +- 3.414078e-03 ) GeV^0
+TOTAL : 0.684539 sec
+ 2,591,525,623 cycles # 2.839 GHz
+ 3,989,244,311 instructions # 1.54 insn per cycle
+ 0.972564077 seconds time elapsed
+runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge
+WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
+WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost
+WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
+WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288)
+WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288)
+==PROF== Profiling "sigmaKin": launch__registers_per_thread 128
+==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
 -------------------------------------------------------------------------
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2
-Avg ME (C++/CUDA) = 2.028815e+00
-Avg ME (F77/CUDA) = 2.0288173652952537
-Relative difference = 1.1658506339321586e-06
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2
+Avg ME (C++/CUDA) = 2.028811e+00
+Avg ME (F77/CUDA) = 2.0288499749731272
+Relative difference = 1.9210746159747678e-05
 OK (relative difference <= 5E-3)
 =========================================================================
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP=
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 WARNING! Instantiate host Bridge (nevt=524288)
-Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK
+Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
-EvtsPerSec[Rmb+ME] (23) = ( 2.976508e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.066016e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.066016e+05 ) sec^-1
-MeanMatrixElemValue = ( 2.079573e+00 +- 3.404712e-03 ) GeV^0
-TOTAL : 3.708112 sec
- 12,739,588,572 cycles:u # 3.405 GHz (74.99%)
- 7,663,639 stalled-cycles-frontend:u # 0.06% frontend cycles idle (74.99%)
- 22,130,215 stalled-cycles-backend:u # 0.17% backend cycles idle (75.00%)
- 37,075,008,796 instructions:u # 2.91 insn per cycle
- # 0.00 stalled cycles per insn (75.01%)
- 3.743899139 seconds time elapsed
+OMP threads / `nproc --all` = 1 / 4
+EvtsPerSec[Rmb+ME] (23) = ( 2.212668e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.285744e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.285744e+05 ) sec^-1
+MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0
+TOTAL : 4.870798 sec
+ 14,070,285,227 cycles # 2.885 GHz
+ 37,122,197,019 instructions # 2.64 insn per cycle
+ 4.878379515 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 578) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest.exe
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest.exe
 [ PASSED ] 6 tests.
 -------------------------------------------------------------------------
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck.exe 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck.exe 2 64 2
 Avg ME (C++/C++) = 2.028820e+00
-Avg ME (F77/C++) = 2.0288198367925361
-Relative difference = 8.044452636897417e-08
+Avg ME (F77/C++) = 2.0288197983754799
+Relative difference = 9.938019153537065e-08
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP=
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 WARNING! Instantiate host Bridge (nevt=524288)
-Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK
+Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 6.249587e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.668513e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.668513e+05 ) sec^-1
-MeanMatrixElemValue = ( 2.079573e+00 +- 3.404713e-03 ) GeV^0
-TOTAL : 1.869596 sec
- 6,317,353,993 cycles:u # 3.319 GHz (74.87%)
- 7,706,055 stalled-cycles-frontend:u # 0.12% frontend cycles idle (74.79%)
- 2,155,272,535 stalled-cycles-backend:u # 34.12% backend cycles idle (74.79%)
- 15,501,273,701 instructions:u # 2.45 insn per cycle
- # 0.14 stalled cycles per insn (74.98%)
- 1.906860784 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 2463) (avx2: 0) (512y: 0) (512z: 0)
+OMP threads / `nproc --all` = 1 / 4
+EvtsPerSec[Rmb+ME] (23) = ( 5.080420e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.515170e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.515170e+05 ) sec^-1
+MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0
+TOTAL : 2.198766 sec
+ 6,358,773,769 cycles # 2.884 GHz
+ 15,492,113,204 instructions # 2.44 insn per cycle
+ 2.206392318 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 2459) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest.exe
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest.exe
 [ PASSED ] 6 tests.
 -------------------------------------------------------------------------
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 2.028820e+00
-Avg ME (F77/C++) = 2.0288198773050681
-Relative difference = 6.047600673895608e-08
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2
+Avg ME (C++/C++) = 2.028819e+00
+Avg ME (F77/C++) = 2.0288191968575120
+Relative difference = 9.703059369476286e-08
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP=
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
 WARNING! Instantiate host Bridge (nevt=524288)
-Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK
+Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 1.207007e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.360484e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.360484e+06 ) sec^-1
-MeanMatrixElemValue = ( 2.079551e+00 +- 3.404208e-03 ) GeV^0
-TOTAL : 1.058756 sec
- 3,443,949,436 cycles:u # 3.152 GHz (75.02%)
- 7,472,976 stalled-cycles-frontend:u # 0.22% frontend cycles idle (75.11%)
- 916,987,954 stalled-cycles-backend:u # 26.63% backend cycles idle (75.11%)
- 7,891,775,014 instructions:u # 2.29 insn per cycle
- # 0.12 stalled cycles per insn (75.17%)
- 1.096109008 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3055) (512y: 0) (512z: 0)
+OMP threads / `nproc --all` = 1 / 4
+EvtsPerSec[Rmb+ME] (23) = ( 8.787706e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.007873e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.007873e+06 ) sec^-1
+MeanMatrixElemValue = ( 2.086810e+00 +- 3.414230e-03 ) GeV^0
+TOTAL : 1.328722 sec
+ 3,633,771,509 cycles # 2.722 GHz
+ 7,954,097,743 instructions # 2.19 insn per cycle
+ 1.336366634 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3071) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest.exe
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest.exe
 [ PASSED ] 6 tests.
 -------------------------------------------------------------------------
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 2.028819e+00
-Avg ME (F77/C++) = 2.0288186294492334
-Relative difference = 1.826435805832187e-07
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2
+Avg ME (C++/C++) = 2.028818e+00
+Avg ME (F77/C++) = 2.0288179996423423
+Relative difference = 1.7628858734720142e-10
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
-/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo)
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP=
+WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
+WARNING! Instantiate host Bridge (nevt=524288)
+Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK
+FP precision = FLOAT (NaN/abnormal=0, zero=0)
+Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
+OMP threads / `nproc --all` = 1 / 4
+EvtsPerSec[Rmb+ME] (23) = ( 9.612179e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.118037e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.118037e+06 ) sec^-1
+MeanMatrixElemValue = ( 2.086810e+00 +- 3.414230e-03 ) GeV^0
+TOTAL : 1.225517 sec
+ 3,366,927,421 cycles # 2.733 GHz
+ 7,347,508,752 instructions # 2.18 insn per cycle
+ 1.232992993 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2733) (512y: 13) (512z: 0)
 -------------------------------------------------------------------------
-/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo)
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest.exe
+[ PASSED ] 6 tests.
+-------------------------------------------------------------------------
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2
+Avg ME (C++/C++) = 2.028818e+00
+Avg ME (F77/C++) = 2.0288179996423423
+Relative difference = 1.7628858734720142e-10
+OK (relative difference <= 5E-3)
+-------------------------------------------------------------------------
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP=
+WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
+WARNING! Instantiate host Bridge (nevt=524288)
+Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK
+FP precision = FLOAT (NaN/abnormal=0, zero=0)
+Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
+OMP threads / `nproc --all` = 1 / 4
+EvtsPerSec[Rmb+ME] (23) = ( 6.960005e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.722467e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.722467e+05 ) sec^-1
+MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0
+TOTAL : 1.642570 sec
+ 3,181,631,608 cycles # 1.930 GHz
+ 6,021,725,956 instructions # 1.89 insn per cycle
+ 1.650041277 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2088) (512y: 20) (512z: 1914)
+-------------------------------------------------------------------------
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest.exe
+[ PASSED ] 6 tests.
+-------------------------------------------------------------------------
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2
+Avg ME (C++/C++) = 2.028818e+00
+Avg ME (F77/C++) = 2.0288183195516467
+Relative difference = 1.5750631496822894e-07
+OK (relative difference <= 5E-3)
 =========================================================================
 TEST COMPLETED
diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_common.txt
index b9b451f7a7..38a7216065 100644
--- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_common.txt
+++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_common.txt
@@ -1,164 +1,209 @@
 export CUDACPP_RUNTIME_ENABLEFPE=on
-Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx
-OMPFLAGS=
-AVX=avx2
+Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx
+OMPFLAGS=-fopenmp
+AVX=512y
 FPTYPE=d
 HELINL=0
 HRDCOD=0
-RNDGEN=hasNoCurand
-Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1)
+RNDGEN=hasCurand
+Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1)
 make: Nothing to be done for 'gtestlibs'.
-CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0'
+CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0'
 make USEBUILDDIR=1 AVX=none
-make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 CUDACPP_BUILDDIR='build.none_f_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 make USEBUILDDIR=1 AVX=sse4
-make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 make USEBUILDDIR=1 AVX=avx2
-make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 make USEBUILDDIR=1 AVX=512y
-make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 make USEBUILDDIR=1 AVX=512z
-make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
-DATE: 2024-01-31_14:53:44
+DATE: 2024-01-30_06:00:59
-On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]:
+On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 2 --common OMP=
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 2 --common OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK
+Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CUD:FLT+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 8.530039e+07 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.873981e+08 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.029409e+08 ) sec^-1
-MeanMatrixElemValue = ( 2.080169e+00 +- 3.463853e-03 ) GeV^0
-TOTAL : 1.011207 sec
- 3,109,468,699 cycles:u # 2.989 GHz (74.64%)
- 10,675,373 stalled-cycles-frontend:u # 0.34% frontend cycles idle (75.03%)
- 1,155,728,733 stalled-cycles-backend:u # 37.17% backend cycles idle (75.40%)
- 2,757,158,995 instructions:u # 0.89 insn per cycle
- # 0.42 stalled cycles per insn (75.40%)
- 1.063509800 seconds time elapsed
+EvtsPerSec[Rmb+ME] (23) = ( 9.412461e+07 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.631522e+08 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.951868e+08 ) sec^-1
+MeanMatrixElemValue = ( 2.079446e+00 +- 3.403306e-03 ) GeV^0
+TOTAL : 0.573986 sec
+ 2,244,851,144 cycles # 2.822 GHz
+ 3,300,445,554 instructions # 1.47 insn per cycle
+ 0.853607464 seconds time elapsed
+runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --common
+WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
+==PROF== Profiling "sigmaKin": launch__registers_per_thread 128
+==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
 -------------------------------------------------------------------------
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2
-Avg ME (C++/CUDA) = 2.028815e+00
-Avg ME (F77/CUDA) = 2.0288173652952537
-Relative difference = 1.1658506339321586e-06
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2
+Avg ME (C++/CUDA) = 2.028811e+00
+Avg ME (F77/CUDA) = 2.0288499749731272
+Relative difference = 1.9210746159747678e-05
 OK (relative difference <= 5E-3)
 =========================================================================
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe -p 2048 256 2 --common OMP=
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe -p 2048 256 2 --common OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
-EvtsPerSec[Rmb+ME] (23) = ( 2.984186e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.073806e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.073806e+05 ) sec^-1
-MeanMatrixElemValue = ( 2.079573e+00 +- 3.404712e-03 ) GeV^0
-TOTAL : 3.659266 sec
- 12,698,234,408 cycles:u # 3.444 GHz (74.88%)
- 7,066,710 stalled-cycles-frontend:u # 0.06% frontend cycles idle (74.98%)
- 12,699,589 stalled-cycles-backend:u # 0.10% backend cycles idle (75.05%)
- 37,066,954,874 instructions:u # 2.92 insn per cycle
- # 0.00 stalled cycles per insn (75.05%)
- 3.688889541 seconds time elapsed
+OMP threads / `nproc --all` = 1 / 4
+EvtsPerSec[Rmb+ME] (23) = ( 2.218192e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.291861e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.291861e+05 ) sec^-1
+MeanMatrixElemValue = ( 2.079572e+00 +- 3.404712e-03 ) GeV^0
+TOTAL : 4.876695 sec
+ 14,064,697,494 cycles # 2.884 GHz
+ 37,110,369,611 instructions # 2.64 insn per cycle
+ 4.882981134 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 578) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest.exe
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest.exe
 [ PASSED ] 6 tests.
 -------------------------------------------------------------------------
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck.exe 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck.exe 2 64 2
 Avg ME (C++/C++) = 2.028820e+00
-Avg ME (F77/C++) = 2.0288198367925361
-Relative difference = 8.044452636897417e-08
+Avg ME (F77/C++) = 2.0288197983754799
+Relative difference = 9.938019153537065e-08
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 2 --common OMP=
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 2 --common OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 6.291162e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.715010e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.715010e+05 ) sec^-1
-MeanMatrixElemValue = ( 2.079573e+00 +- 3.404713e-03 ) GeV^0
-TOTAL : 1.811368 sec
- 6,217,112,223 cycles:u # 3.380 GHz (74.81%)
- 6,897,784 stalled-cycles-frontend:u # 0.11% frontend cycles idle (74.78%)
- 2,145,885,127 stalled-cycles-backend:u # 34.52% backend cycles idle (75.00%)
- 15,247,265,769 instructions:u # 2.45 insn per cycle
- # 0.14 stalled cycles per insn (75.00%)
- 1.841031156 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 2463) (avx2: 0) (512y: 0) (512z: 0)
+OMP threads / `nproc --all` = 1 / 4
+EvtsPerSec[Rmb+ME] (23) = ( 5.131220e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.575839e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.575839e+05 ) sec^-1
+MeanMatrixElemValue = ( 2.079572e+00 +- 3.404711e-03 ) GeV^0
+TOTAL : 2.187823 sec
+ 6,322,431,284 cycles # 2.883 GHz
+ 15,223,876,723 instructions # 2.41 insn per cycle
+ 2.194184928 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 2459) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest.exe
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest.exe
 [ PASSED ] 6 tests.
 -------------------------------------------------------------------------
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 2.028820e+00
-Avg ME (F77/C++) = 2.0288198773050681
-Relative difference = 6.047600673895608e-08
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2
+Avg ME (C++/C++) = 2.028819e+00
+Avg ME (F77/C++) = 2.0288191968575120
+Relative difference = 9.703059369476286e-08
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 2 --common OMP=
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 2 --common OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 1.221639e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.378458e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.378458e+06 ) sec^-1
-MeanMatrixElemValue = ( 2.079551e+00 +- 3.404208e-03 ) GeV^0
-TOTAL : 1.002912 sec
- 3,392,615,870 cycles:u # 3.292 GHz (74.55%)
- 7,908,977 stalled-cycles-frontend:u # 0.23% frontend cycles idle (74.95%)
- 905,004,374 stalled-cycles-backend:u # 26.68% backend cycles idle (75.17%)
- 7,653,140,390 instructions:u # 2.26 insn per cycle
- # 0.12 stalled cycles per insn (75.17%)
- 1.033182842 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3055) (512y: 0) (512z: 0)
+OMP threads / `nproc --all` = 1 / 4
+EvtsPerSec[Rmb+ME] (23) = ( 8.948892e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.027773e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.027773e+06 ) sec^-1
+MeanMatrixElemValue = ( 2.079550e+00 +- 3.404207e-03 ) GeV^0
+TOTAL : 1.320835 sec
+ 3,601,071,923 cycles # 2.719 GHz
+ 7,699,828,133 instructions # 2.14 insn per cycle
+ 1.327138068 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3071) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest.exe
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest.exe
 [ PASSED ] 6 tests.
 -------------------------------------------------------------------------
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 2.028819e+00
-Avg ME (F77/C++) = 2.0288186294492334
-Relative difference = 1.826435805832187e-07
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2
+Avg ME (C++/C++) = 2.028818e+00
+Avg ME (F77/C++) = 2.0288179996423423
+Relative difference = 1.7628858734720142e-10
+OK (relative difference <= 5E-3)
+-------------------------------------------------------------------------
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe -p 2048 256 2 --common OMP=
+WARNING!
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 9.790537e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.142626e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.142626e+06 ) sec^-1 +MeanMatrixElemValue = ( 2.079550e+00 +- 3.404207e-03 ) GeV^0 +TOTAL : 1.218690 sec + 3,342,798,362 cycles # 2.731 GHz + 7,059,572,278 instructions # 2.11 insn per cycle + 1.225217680 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2733) (512y: 13) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028818e+00 +Avg ME (F77/C++) = 2.0288179996423423 +Relative difference = 1.7628858734720142e-10 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.022088e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.806836e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.806836e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079550e+00 +- 3.404208e-03 ) GeV^0 +TOTAL : 1.641075 sec + 3,147,503,652 cycles # 1.912 GHz + 5,713,849,148 instructions # 1.82 insn per cycle + 1.647331874 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2088) (512y: 20) (512z: 1914) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
 -------------------------------------------------------------------------
-/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo)
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2
+Avg ME (C++/C++) = 2.028818e+00
+Avg ME (F77/C++) = 2.0288183195516467
+Relative difference = 1.5750631496822894e-07
+OK (relative difference <= 5E-3)
 =========================================================================
 TEST COMPLETED
diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_curhst.txt
index 0f700f95ab..cb54d3236b 100644
--- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_curhst.txt
+++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_curhst.txt
@@ -1,133 +1,209 @@
 export CUDACPP_RUNTIME_ENABLEFPE=on
-Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx
-OMPFLAGS=
-AVX=avx2
+Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx
+OMPFLAGS=-fopenmp
+AVX=512y
 FPTYPE=d
 HELINL=0
 HRDCOD=0
-RNDGEN=hasNoCurand
-Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1)
+RNDGEN=hasCurand
+Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1)
 make: Nothing to be done for 'gtestlibs'.
-CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0'
+CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0'
 make USEBUILDDIR=1 AVX=none
-make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 CUDACPP_BUILDDIR='build.none_f_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 make USEBUILDDIR=1 AVX=sse4
-make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 make USEBUILDDIR=1 AVX=avx2
-make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 make USEBUILDDIR=1 AVX=512y
-make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 make USEBUILDDIR=1 AVX=512z
-make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
-DATE: 2024-01-31_14:50:42
+DATE: 2024-01-30_05:57:29
-On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]:
+On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 2 --curhst OMP=
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 2 --curhst OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe: Aborted
- 53,570,294 cycles:u # 2.465 GHz (63.21%)
- 44,370 stalled-cycles-frontend:u # 0.08% frontend cycles idle (63.22%)
- 600,308 stalled-cycles-backend:u # 1.12% backend cycles idle (63.22%)
- 41,293,137 instructions:u # 0.77 insn per cycle
- # 0.01 stalled cycles per insn (65.24%)
- 0.022677751 seconds time elapsed
+Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CUD:FLT+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK
+FP precision = FLOAT (NaN/abnormal=0, zero=0)
+EvtsPerSec[Rmb+ME] (23) = ( 9.414196e+07 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.655173e+08 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.981062e+08 ) sec^-1
+MeanMatrixElemValue = ( 2.086718e+00 +- 3.413389e-03 ) GeV^0
+TOTAL : 0.514856 sec
+ 2,089,539,478 cycles # 2.840 GHz
+ 3,296,506,746 instructions # 1.58 insn per cycle
+ 0.794530995 seconds time elapsed
+runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --curhst
+WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
+==PROF== Profiling "sigmaKin": launch__registers_per_thread 128
+==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
 -------------------------------------------------------------------------
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2
-Avg ME (C++/CUDA) = 2.028815e+00
-Avg ME (F77/CUDA) = 2.0288173652952537
-Relative difference = 1.1658506339321586e-06
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2
+Avg ME (C++/CUDA) = 2.028811e+00
+Avg ME (F77/CUDA) = 2.0288499749731272
+Relative difference = 1.9210746159747678e-05
 OK (relative difference <= 5E-3)
 =========================================================================
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP=
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe: Aborted
- 54,888,405 cycles:u # 2.552 GHz (62.84%)
- 34,498 stalled-cycles-frontend:u # 0.06% frontend cycles idle (62.84%)
- 614,835 stalled-cycles-backend:u # 1.12% backend cycles idle (62.84%)
- 39,911,154 instructions:u # 0.73 insn per cycle
- # 0.02 stalled cycles per insn (64.62%)
- 0.022797442 seconds time elapsed
+Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
+FP precision = FLOAT (NaN/abnormal=0, zero=0)
+Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
+OMP threads / `nproc --all` = 1 / 4
+EvtsPerSec[Rmb+ME] (23) = ( 2.227183e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.300870e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.300870e+05 ) sec^-1
+MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0
+TOTAL : 4.796202 sec
+ 13,896,514,461 cycles # 2.894 GHz
+ 37,078,595,071 instructions # 2.67 insn per cycle
+ 4.803618427 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 578) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest.exe
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest.exe
 [ PASSED ] 6 tests.
 -------------------------------------------------------------------------
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck.exe 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck.exe 2 64 2
 Avg ME (C++/C++) = 2.028820e+00
-Avg ME (F77/C++) = 2.0288198367925361
-Relative difference = 8.044452636897417e-08
+Avg ME (F77/C++) = 2.0288197983754799
+Relative difference = 9.938019153537065e-08
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP=
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe: Aborted
- 54,129,943 cycles:u # 2.514 GHz (62.87%)
- 44,123 stalled-cycles-frontend:u # 0.08% frontend cycles idle (62.88%)
- 616,440 stalled-cycles-backend:u # 1.14% backend cycles idle (62.88%)
- 40,740,714 instructions:u # 0.75 insn per cycle
- # 0.02 stalled cycles per insn (64.53%)
- 0.022769848 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 2463) (avx2: 0) (512y: 0) (512z: 0)
+Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
+FP precision = FLOAT (NaN/abnormal=0, zero=0)
+Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
+OMP threads / `nproc --all` = 1 / 4
+EvtsPerSec[Rmb+ME] (23) = ( 5.077190e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.527451e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.527451e+05 ) sec^-1
+MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0
+TOTAL : 2.154367 sec
+ 6,177,704,022 cycles # 2.870 GHz
+ 15,215,532,210 instructions # 2.46 insn per cycle
+ 2.160620609 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 2459) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest.exe
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest.exe
 [ PASSED ] 6 tests.
 -------------------------------------------------------------------------
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 2.028820e+00
-Avg ME (F77/C++) = 2.0288198773050681
-Relative difference = 6.047600673895608e-08
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2
+Avg ME (C++/C++) = 2.028819e+00
+Avg ME (F77/C++) = 2.0288191968575120
+Relative difference = 9.703059369476286e-08
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP=
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe: Aborted
- 51,261,943 cycles:u # 2.382 GHz (62.86%)
- 41,540 stalled-cycles-frontend:u # 0.08% frontend cycles idle (62.87%)
- 557,588 stalled-cycles-backend:u # 1.09% backend cycles idle (62.87%)
- 43,301,886 instructions:u # 0.84 insn per cycle
- # 0.01 stalled cycles per insn (64.72%)
- 0.022831988 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3055) (512y: 0) (512z: 0)
+Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
+FP precision = FLOAT (NaN/abnormal=0, zero=0)
+Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
+OMP threads / `nproc --all` = 1 / 4
+EvtsPerSec[Rmb+ME] (23) = ( 8.911398e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.023141e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.023141e+06 ) sec^-1
+MeanMatrixElemValue = ( 2.086810e+00 +- 3.414230e-03 ) GeV^0
+TOTAL : 1.265022 sec
+ 3,447,761,650 cycles # 2.714 GHz
+ 7,715,058,636 instructions # 2.24 insn per cycle
+ 1.271511064 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3071) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest.exe
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest.exe
 [ PASSED ] 6 tests.
 -------------------------------------------------------------------------
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 2.028819e+00
-Avg ME (F77/C++) = 2.0288186294492334
-Relative difference = 1.826435805832187e-07
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2
+Avg ME (C++/C++) = 2.028818e+00
+Avg ME (F77/C++) = 2.0288179996423423
+Relative difference = 1.7628858734720142e-10
+OK (relative difference <= 5E-3)
+-------------------------------------------------------------------------
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP=
+WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
+Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
+FP precision = FLOAT (NaN/abnormal=0, zero=0)
+Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
+OMP threads / `nproc --all` = 1 / 4
+EvtsPerSec[Rmb+ME] (23) = ( 9.829060e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.147412e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.147412e+06 ) sec^-1
+MeanMatrixElemValue = ( 2.086810e+00 +- 3.414230e-03 ) GeV^0
+TOTAL : 1.155347 sec
+ 3,170,001,813 cycles # 2.731 GHz
+ 7,109,524,161 instructions # 2.24 insn per cycle
+ 1.161808340 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2733) (512y: 13) (512z: 0)
+-------------------------------------------------------------------------
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest.exe
+[ PASSED ] 6 tests.
+-------------------------------------------------------------------------
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2
+Avg ME (C++/C++) = 2.028818e+00
+Avg ME (F77/C++) = 2.0288179996423423
+Relative difference = 1.7628858734720142e-10
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
-/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo)
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP=
+WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
+Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
+FP precision = FLOAT (NaN/abnormal=0, zero=0)
+Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
+OMP threads / `nproc --all` = 1 / 4
+EvtsPerSec[Rmb+ME] (23) = ( 6.999480e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.774350e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.774350e+05 ) sec^-1
+MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0
+TOTAL : 1.586909 sec
+ 2,978,718,352 cycles # 1.871 GHz
+ 5,762,941,941 instructions # 1.93 insn per cycle
+ 1.593095591 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2088) (512y: 20) (512z: 1914)
+-------------------------------------------------------------------------
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest.exe
+[ PASSED ] 6 tests.
 -------------------------------------------------------------------------
-/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo)
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2
+Avg ME (C++/C++) = 2.028818e+00
+Avg ME (F77/C++) = 2.0288183195516467
+Relative difference = 1.5750631496822894e-07
+OK (relative difference <= 5E-3)
 =========================================================================
 TEST COMPLETED
diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_rmbhst.txt
index 874a0d9227..5939268227 100644
--- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_rmbhst.txt
+++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_rmbhst.txt
@@ -1,164 +1,211 @@
 export CUDACPP_RUNTIME_ENABLEFPE=on
-Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx
-OMPFLAGS=
-AVX=avx2
+Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx
+OMPFLAGS=-fopenmp
+AVX=512y
 FPTYPE=d
 HELINL=0
 HRDCOD=0
-RNDGEN=hasNoCurand
-Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1)
+RNDGEN=hasCurand
+Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1)
 make: Nothing to be done for 'gtestlibs'.
-CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0'
+CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0'
 make USEBUILDDIR=1 AVX=none
-make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 CUDACPP_BUILDDIR='build.none_f_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 make USEBUILDDIR=1 AVX=sse4
-make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 make USEBUILDDIR=1 AVX=avx2
-make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 make USEBUILDDIR=1 AVX=512y
-make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 make USEBUILDDIR=1 AVX=512z
-make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
-DATE: 2024-01-31_14:47:40
+DATE: 2024-01-30_05:54:04
-On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]:
+On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 2 --rmbhst OMP=
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 2 --rmbhst OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+MESDEV/none+NAVBRK
+WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
+Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CUD:FLT+THX:CURHST+RMBHST+MESDEV/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 8.194070e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.863045e+08 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.019637e+08 ) sec^-1
-MeanMatrixElemValue = ( 2.079682e+00 +- 3.408341e-03 ) GeV^0
-TOTAL : 1.126583 sec
- 3,509,941,194 cycles:u # 3.030 GHz (75.16%)
- 22,018,880 stalled-cycles-frontend:u # 0.63% frontend cycles idle (75.21%)
- 1,148,370,067 stalled-cycles-backend:u # 32.72% backend cycles idle (75.20%)
- 3,765,296,391 instructions:u # 1.07 insn per cycle
- # 0.30 stalled cycles per insn (75.13%)
- 1.175184575 seconds time elapsed
+EvtsPerSec[Rmb+ME] (23) = ( 8.468280e+06 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.632924e+08 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.955888e+08 ) sec^-1
+MeanMatrixElemValue = ( 2.086805e+00 +- 3.414078e-03 ) GeV^0
+TOTAL : 0.625640 sec
+ 2,402,533,839 cycles # 2.841 GHz
+ 3,758,306,095 instructions # 1.56 insn per cycle
+ 0.905223049 seconds time elapsed
+runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --rmbhst
+WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
+WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
+==PROF== Profiling "sigmaKin": launch__registers_per_thread 128
+==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
 -------------------------------------------------------------------------
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2
-Avg ME (C++/CUDA) = 2.028815e+00
-Avg ME (F77/CUDA) = 2.0288173652952537
-Relative difference = 1.1658506339321586e-06
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2
+Avg ME (C++/CUDA) = 2.028811e+00
+Avg ME (F77/CUDA) = 2.0288499749731272
+Relative difference = 1.9210746159747678e-05
 OK (relative difference <= 5E-3)
 =========================================================================
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP=
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK
+Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
-EvtsPerSec[Rmb+ME] (23) = ( 2.984226e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.074104e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.074104e+05 ) sec^-1
-MeanMatrixElemValue = ( 2.079573e+00 +- 3.404712e-03 ) GeV^0
-TOTAL : 3.658913 sec
- 12,701,443,134 cycles:u # 3.445 GHz (74.86%)
- 7,095,454 stalled-cycles-frontend:u # 0.06% frontend cycles idle (74.97%)
- 9,911,546 stalled-cycles-backend:u # 0.08% backend cycles idle (75.05%)
- 37,066,452,270 instructions:u # 2.92 insn per cycle
- # 0.00 stalled cycles per insn (75.05%)
- 3.688367018 seconds time elapsed
+OMP threads / `nproc --all` = 1 / 4
+EvtsPerSec[Rmb+ME] (23) = ( 2.221197e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.294941e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.294941e+05 ) sec^-1
+MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0
+TOTAL : 4.809027 sec
+ 13,889,421,482 cycles # 2.885 GHz
+ 37,078,742,557 instructions # 2.67 insn per cycle
+ 4.815296717 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 578) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest.exe
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest.exe
 [ PASSED ] 6 tests.
 -------------------------------------------------------------------------
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck.exe 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck.exe 2 64 2
 Avg ME (C++/C++) = 2.028820e+00
-Avg ME (F77/C++) = 2.0288198367925361
-Relative difference = 8.044452636897417e-08
+Avg ME (F77/C++) = 2.0288197983754799
+Relative difference = 9.938019153537065e-08
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP=
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK
+Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 6.286494e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.712387e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.712387e+05 ) sec^-1
-MeanMatrixElemValue = ( 2.079573e+00 +- 3.404713e-03 ) GeV^0
-TOTAL : 1.812169 sec
- 6,203,283,474 cycles:u # 3.371 GHz (74.82%)
- 7,177,180 stalled-cycles-frontend:u # 0.12% frontend cycles idle (74.82%)
- 2,144,315,451 stalled-cycles-backend:u # 34.57% backend cycles idle (74.79%)
- 15,227,021,935 instructions:u # 2.45 insn per cycle
- # 0.14 stalled cycles per insn (75.01%)
- 1.841698994 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 2463) (avx2: 0) (512y: 0) (512z: 0)
+OMP threads / `nproc --all` = 1 / 4
+EvtsPerSec[Rmb+ME] (23) = ( 5.146065e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.592205e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.592205e+05 ) sec^-1
+MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0
+TOTAL : 2.125330 sec
+ 6,161,438,553 cycles # 2.892 GHz
+ 15,211,397,983 instructions # 2.47 insn per cycle
+ 2.131726868 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 2459) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest.exe
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest.exe
 [ PASSED ] 6 tests.
 -------------------------------------------------------------------------
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 2.028820e+00
-Avg ME (F77/C++) = 2.0288198773050681
-Relative difference = 6.047600673895608e-08
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2
+Avg ME (C++/C++) = 2.028819e+00
+Avg ME (F77/C++) = 2.0288191968575120
+Relative difference = 9.703059369476286e-08
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP=
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK
+Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 1.220443e+06 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.376990e+06 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.376990e+06 ) sec^-1
-MeanMatrixElemValue = ( 2.079551e+00 +- 3.404208e-03 ) GeV^0
-TOTAL : 1.004270 sec
- 3,403,854,776 cycles:u # 3.298 GHz (74.43%)
- 7,965,823 stalled-cycles-frontend:u # 0.23% frontend cycles idle (74.81%)
- 905,514,489 stalled-cycles-backend:u # 26.60% backend cycles idle (75.19%)
- 7,659,434,862 instructions:u # 2.25 insn per cycle
- # 0.12 stalled cycles per insn (75.21%)
- 1.034669678 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3055) (512y: 0) (512z: 0)
+OMP threads / `nproc --all` = 1 / 4
+EvtsPerSec[Rmb+ME] (23) = ( 8.991330e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.034099e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.034099e+06 ) sec^-1
+MeanMatrixElemValue = ( 2.086810e+00 +- 3.414230e-03 ) GeV^0
+TOTAL : 1.255088 sec
+ 3,440,029,043 cycles # 2.730 GHz
+ 7,714,775,848 instructions # 2.24 insn per cycle
+ 1.261283713 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3071) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest.exe
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest.exe
 [ PASSED ] 6 tests.
 -------------------------------------------------------------------------
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 2.028819e+00
-Avg ME (F77/C++) = 2.0288186294492334
-Relative difference = 1.826435805832187e-07
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2
+Avg ME (C++/C++) = 2.028818e+00
+Avg ME (F77/C++) = 2.0288179996423423
+Relative difference = 1.7628858734720142e-10
+OK (relative difference <= 5E-3)
+-------------------------------------------------------------------------
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP=
+WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
+Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
+FP precision = FLOAT (NaN/abnormal=0, zero=0)
+Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
+OMP threads / `nproc --all` = 1 / 4
+EvtsPerSec[Rmb+ME] (23) = ( 9.843583e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.149362e+06 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.149362e+06 ) sec^-1
+MeanMatrixElemValue = ( 2.086810e+00 +- 3.414230e-03 ) GeV^0
+TOTAL : 1.153935 sec
+ 3,172,826,861 cycles # 2.738 GHz
+ 7,109,210,779 instructions # 2.24 insn per cycle
+ 1.160268530 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2733) (512y: 13) (512z: 0)
+-------------------------------------------------------------------------
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest.exe
+[ PASSED ] 6 tests.
+-------------------------------------------------------------------------
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2
+Avg ME (C++/C++) = 2.028818e+00
+Avg ME (F77/C++) = 2.0288179996423423
+Relative difference = 1.7628858734720142e-10
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
-/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo)
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP=
+WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
+Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
+FP precision = FLOAT (NaN/abnormal=0, zero=0)
+Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
+OMP threads / `nproc --all` = 1 / 4
+EvtsPerSec[Rmb+ME] (23) = ( 7.077925e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.872855e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.872855e+05 ) sec^-1
+MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0
+TOTAL : 1.570762 sec
+ 2,979,903,068 cycles # 1.891 GHz
+ 5,762,829,882 instructions # 1.93 insn per cycle
+ 1.577195857 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2088) (512y: 20) (512z: 1914)
+-------------------------------------------------------------------------
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest.exe
+[ PASSED ] 6 tests.
 -------------------------------------------------------------------------
-/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo)
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2
+Avg ME (C++/C++) = 2.028818e+00
+Avg ME (F77/C++) = 2.0288183195516467
+Relative difference = 1.5750631496822894e-07
+OK (relative difference <= 5E-3)
 =========================================================================
 TEST COMPLETED
diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd1.txt
index d4bb6181a9..c96a0bb3db 100644
--- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd1.txt
+++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd1.txt
@@ -1,164 +1,209 @@
 export CUDACPP_RUNTIME_ENABLEFPE=on
-Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx
-OMPFLAGS=
-AVX=avx2
+Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx
+OMPFLAGS=-fopenmp
+AVX=512y
 FPTYPE=d
 HELINL=0
 HRDCOD=0
-RNDGEN=hasNoCurand
-Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1)
+RNDGEN=hasCurand
+Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1)
 make: Nothing to be done for 'gtestlibs'.
-CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd1'
+CUDACPP_BUILDDIR='build.512y_f_inl0_hrd1'
 make USEBUILDDIR=1 AVX=none
-make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 CUDACPP_BUILDDIR='build.none_f_inl0_hrd1'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 make USEBUILDDIR=1 AVX=sse4
-make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd1'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 make USEBUILDDIR=1 AVX=avx2
-make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd1'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 make USEBUILDDIR=1 AVX=512y
-make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 CUDACPP_BUILDDIR='build.512y_f_inl0_hrd1'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 make USEBUILDDIR=1 AVX=512z
-make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
 CUDACPP_BUILDDIR='build.512z_f_inl0_hrd1'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
+make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx'
-DATE: 2024-01-31_13:52:05
+DATE: 2024-01-30_04:56:32
-On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]:
+On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/gcheck.exe -p 2048 256 2 OMP=
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/gcheck.exe -p 2048 256 2 OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1]
-Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK
+Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1]
+Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 9.340799e+07 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.048487e+08 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.223095e+08 ) sec^-1
-MeanMatrixElemValue = ( 2.080169e+00 +- 3.463853e-03 ) GeV^0
-TOTAL : 1.012814 sec
- 3,102,074,072 cycles:u # 2.978 GHz (74.84%)
- 10,684,621 stalled-cycles-frontend:u # 0.34% frontend cycles idle (74.97%)
- 1,156,397,766 stalled-cycles-backend:u # 37.28% backend cycles idle (75.14%)
- 2,771,360,480 instructions:u # 0.89 insn per cycle
- # 0.42 stalled cycles per insn (75.40%)
- 1.068102599 seconds time elapsed
+EvtsPerSec[Rmb+ME] (23) = ( 8.421312e+07 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.704045e+08 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.041754e+08 ) sec^-1
+MeanMatrixElemValue = ( 2.086718e+00 +- 3.413389e-03 ) GeV^0
+TOTAL : 0.486618 sec
+ 2,018,521,842 cycles # 2.827 GHz
+ 2,837,894,141 instructions # 1.41 insn per cycle
+ 0.795623791 seconds time elapsed
+runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/gcheck.exe -p 2048 256 1
+WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
+==PROF== Profiling "sigmaKin": launch__registers_per_thread 127
+==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
 -------------------------------------------------------------------------
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/gcheck.exe --common -p 2 64 2
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/fgcheck.exe 2 64 2
-Avg ME (C++/CUDA) = 2.028815e+00
-Avg ME (F77/CUDA) = 2.0288173652952537
-Relative difference = 1.1658506339321586e-06
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/gcheck.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/fgcheck.exe 2 64 2
+Avg ME (C++/CUDA) = 2.028811e+00
+Avg ME (F77/CUDA) = 2.0288499749731272
+Relative difference = 1.9210746159747678e-05
 OK (relative difference <= 5E-3)
 =========================================================================
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/check.exe -p 2048 256 2 OMP=
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/check.exe -p 2048 256 2 OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1]
-Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK
+Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1]
+Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
-EvtsPerSec[Rmb+ME] (23) = ( 2.961291e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 3.049383e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 3.049383e+05 ) sec^-1
-MeanMatrixElemValue = ( 2.079573e+00 +- 3.404712e-03 ) GeV^0
-TOTAL : 3.686242 sec
- 12,771,638,095 cycles:u # 3.438 GHz (75.02%)
- 7,371,044 stalled-cycles-frontend:u # 0.06% frontend cycles idle (75.02%)
- 9,932,633 stalled-cycles-backend:u # 0.08% backend cycles idle (75.02%)
- 37,443,030,124 instructions:u # 2.93 insn per cycle
- # 0.00 stalled cycles per insn (75.02%)
- 3.716660029 seconds time elapsed
+OMP threads / `nproc --all` = 1 / 4
+EvtsPerSec[Rmb+ME] (23) = ( 2.245629e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.320181e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.320181e+05 ) sec^-1
+MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0
+TOTAL : 4.758439 sec
+ 13,805,800,630 cycles # 2.898 GHz
+ 37,480,161,839 instructions # 2.71 insn per cycle
+ 4.770650257 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 503) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/runTest.exe
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/runTest.exe
 [ PASSED ] 6 tests.
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028820e+00 -Avg ME (F77/C++) = 2.0288198367925361 -Relative difference = 8.044452636897417e-08 +Avg ME (F77/C++) = 2.0288197983754799 +Relative difference = 9.938019153537065e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.349724e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.935331e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.935331e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079573e+00 +- 3.404713e-03 ) GeV^0 -TOTAL : 1.571428 sec - 5,356,446,210 cycles:u # 3.349 GHz (75.00%) - 7,741,465 stalled-cycles-frontend:u # 0.14% frontend cycles idle (75.00%) - 1,299,135,011 stalled-cycles-backend:u # 24.25% backend cycles idle (75.00%) - 15,197,192,844 instructions:u # 2.84 insn per cycle - # 0.09 stalled cycles per insn (75.01%) - 1.602763498 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2334) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 5.821274e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.398672e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.398672e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 +TOTAL : 1.889404 sec + 5,475,292,589 cycles # 2.889 GHz + 15,244,893,114 instructions # 2.78 insn per cycle + 1.908184587 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 2330) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028820e+00 -Avg ME (F77/C++) = 2.0288198773050681 -Relative difference = 6.047600673895608e-08 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028819e+00 +Avg ME (F77/C++) = 2.0288191968575120 +Relative difference = 9.703059369476286e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 8.892015e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.695000e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.695000e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079551e+00 +- 3.404208e-03 ) GeV^0 -TOTAL : 1.324222 sec - 4,525,402,442 cycles:u # 3.346 GHz (74.69%) - 8,007,297 stalled-cycles-frontend:u # 0.18% frontend cycles idle (74.71%) - 1,664,879,635 stalled-cycles-backend:u # 36.79% backend cycles idle (75.00%) - 9,811,549,184 instructions:u # 2.17 insn per cycle - # 0.17 stalled cycles per insn (75.16%) - 1.355868030 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3734) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 6.385813e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.037637e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.037637e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414230e-03 ) GeV^0 +TOTAL : 1.731302 sec + 4,719,001,422 cycles # 2.717 GHz + 9,850,811,081 instructions # 2.09 insn per cycle + 1.750777348 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3750) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028819e+00 -Avg ME (F77/C++) = 2.0288186428369954 -Relative difference = 1.7604478492421832e-07 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028818e+00 +Avg ME (F77/C++) = 2.0288180243223906 +Relative difference = 1.1988453753912676e-08 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/check.exe -p 2048 256 2 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 6.683577e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.409489e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.409489e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414230e-03 ) GeV^0 +TOTAL : 1.659075 sec + 4,492,699,411 cycles # 2.699 GHz + 9,202,452,349 instructions # 2.05 insn per cycle + 1.671352513 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3384) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028818e+00 +Avg ME (F77/C++) = 2.0288180243223906 +Relative difference = 1.1988453753912676e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/check.exe -p 2048 256 2 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 5.938211e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.486110e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.486110e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 +TOTAL : 1.854854 sec + 3,463,720,216 cycles # 1.861 GHz + 6,875,040,962 instructions # 1.98 insn per cycle + 1.876340349 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2257) (512y: 8) (512z: 2261) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028818e+00 +Avg ME (F77/C++) = 2.0288183217635378 +Relative difference = 1.5859655131013432e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd0.txt index 61615af3b6..993f4107d6 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd0.txt @@ -1,164 +1,209 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasNoCurand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) +RNDGEN=hasCurand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.avx2_f_inl1_hrd0' +CUDACPP_BUILDDIR='build.512y_f_inl1_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-01-31_14:21:43 +DATE: 2024-01-30_05:36:48 -On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/gcheck.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/gcheck.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.735814e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.877863e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.033538e+08 ) sec^-1 -MeanMatrixElemValue = ( 2.080169e+00 +- 3.463853e-03 ) GeV^0 -TOTAL : 1.012208 sec - 3,119,807,941 cycles:u # 2.993 GHz (74.63%) - 10,767,069 stalled-cycles-frontend:u # 0.35% frontend cycles idle (74.63%) - 1,158,448,795 stalled-cycles-backend:u # 37.13% backend cycles idle (75.06%) - 2,770,838,012 instructions:u # 0.89 insn per cycle - # 0.42 stalled cycles per insn (75.39%) - 1.063714508 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 9.377362e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.649325e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.974675e+08 ) sec^-1 +MeanMatrixElemValue = ( 2.086718e+00 +- 3.413389e-03 ) GeV^0 +TOTAL : 0.484150 sec + 2,005,186,574 cycles # 2.831 GHz + 2,872,226,914 instructions # 1.43 insn per cycle + 0.768013554 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/gcheck.exe -p 2048 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 128 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 2.028815e+00 -Avg ME (F77/CUDA) = 2.0288173652952537 -Relative difference = 1.1658506339321586e-06 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 2.028811e+00 +Avg ME (F77/CUDA) = 2.0288499749731272 +Relative difference = 1.9210746159747678e-05 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 3.216757e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.321079e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.321079e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079573e+00 +- 3.404712e-03 ) GeV^0 -TOTAL : 3.404693 sec - 11,796,914,940 cycles:u # 3.437 GHz (74.90%) - 6,850,731 stalled-cycles-frontend:u # 0.06% frontend cycles idle (75.01%) - 1,685,582,811 stalled-cycles-backend:u # 14.29% backend cycles idle (75.06%) - 34,222,491,331 instructions:u # 2.90 insn per cycle - # 0.05 stalled cycles per insn (75.07%) - 3.434705962 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.479081e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.570421e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.570421e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 +TOTAL : 4.318821 sec + 12,411,469,267 cycles # 2.871 GHz + 34,216,954,204 instructions # 2.76 insn per cycle + 4.325006925 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 768) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028820e+00 Avg ME (F77/C++) = 2.0288199088536203 Relative difference = 4.4925808981097166e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.229229e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.799387e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.799387e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079573e+00 +- 3.404713e-03 ) GeV^0 -TOTAL : 1.604851 sec - 5,480,363,361 cycles:u # 3.356 GHz (74.93%) - 7,418,885 stalled-cycles-frontend:u # 0.14% frontend cycles idle (75.02%) - 2,030,766,975 stalled-cycles-backend:u # 37.06% backend cycles idle (75.02%) - 14,594,495,372 instructions:u # 2.66 insn per cycle - # 0.14 stalled cycles per insn (75.02%) - 1.635926264 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 5.935196e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.540988e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.540988e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 +TOTAL : 1.856505 sec + 5,363,525,325 cycles # 2.881 GHz + 14,587,825,944 instructions # 2.72 insn per cycle + 1.863141926 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2947) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028820e+00 -Avg ME (F77/C++) = 2.0288198769558221 -Relative difference = 6.06481491495597e-08 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028819e+00 +Avg ME (F77/C++) = 2.0288192580919713 +Relative difference = 1.2721291123071246e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 9.445478e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.035633e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.035633e+06 ) sec^-1 -MeanMatrixElemValue = ( 2.079551e+00 +- 3.404208e-03 ) GeV^0 -TOTAL : 1.255073 sec - 4,255,529,094 cycles:u # 3.316 GHz (75.10%) - 7,973,246 stalled-cycles-frontend:u # 0.19% frontend cycles idle (75.07%) - 1,640,245,882 stalled-cycles-backend:u # 38.54% backend cycles idle (75.07%) - 9,026,902,760 instructions:u # 2.12 insn per cycle - # 0.18 stalled cycles per insn (75.07%) - 1.286698152 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4485) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.475828e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.385170e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.385170e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414230e-03 ) GeV^0 +TOTAL : 1.492139 sec + 4,058,079,431 cycles # 2.710 GHz + 9,088,895,483 instructions # 2.24 insn per cycle + 1.498802038 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4501) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028819e+00 -Avg ME (F77/C++) = 2.0288186752004549 -Relative difference = 1.6009291367898262e-07 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028818e+00 +Avg ME (F77/C++) = 2.0288180499337614 +Relative difference = 2.4612242975974814e-08 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd0/check.exe -p 2048 256 2 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 8.052179e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.125609e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.125609e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414230e-03 ) GeV^0 +TOTAL : 1.390912 sec + 3,795,132,868 cycles # 2.718 GHz + 8,440,638,214 instructions # 2.22 insn per cycle + 1.397579629 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4043) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd0/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028818e+00 +Avg ME (F77/C++) = 2.0288180499337614 +Relative difference = 2.4612242975974814e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd0/check.exe -p 2048 256 2 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 5.426211e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.889827e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.889827e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 +TOTAL : 2.021883 sec + 3,727,709,927 cycles # 1.839 GHz + 7,572,021,248 instructions # 2.03 insn per cycle + 2.028341317 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3646) (512y: 1) (512z: 2853) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028818e+00 +Avg ME (F77/C++) = 2.0288183350348845 +Relative difference = 1.6513796936156652e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd1.txt index 30d2d52191..2891f046ff 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd1.txt @@ -1,164 +1,209 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasNoCurand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) +RNDGEN=hasCurand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.avx2_f_inl1_hrd1' +CUDACPP_BUILDDIR='build.512y_f_inl1_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-01-31_14:22:04 +DATE: 2024-01-30_05:37:12 -On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/gcheck.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/gcheck.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.949611e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.045213e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.219863e+08 ) sec^-1 -MeanMatrixElemValue = ( 2.080169e+00 +- 3.463853e-03 ) GeV^0 -TOTAL : 1.014593 sec - 3,118,954,529 cycles:u # 2.988 GHz (74.59%) - 10,853,428 stalled-cycles-frontend:u # 0.35% frontend cycles idle (74.98%) - 1,156,319,559 stalled-cycles-backend:u # 37.07% backend cycles idle (75.43%) - 2,823,007,789 instructions:u # 0.91 insn per cycle - # 0.41 stalled cycles per insn (75.43%) - 1.067855681 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 9.485748e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.689974e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.027356e+08 ) sec^-1 +MeanMatrixElemValue = ( 2.086718e+00 +- 3.413389e-03 ) GeV^0 +TOTAL : 0.482102 sec + 1,996,662,355 cycles # 2.812 GHz + 2,850,200,230 instructions # 1.43 insn per cycle + 0.768087139 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/gcheck.exe -p 2048 256 1 +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 127 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 2.028815e+00 -Avg ME (F77/CUDA) = 2.0288173652952537 -Relative difference = 1.1658506339321586e-06 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 2.028811e+00 +Avg ME (F77/CUDA) = 2.0288499749731272 +Relative difference = 1.9210746159747678e-05 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 3.443408e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.563240e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.563240e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079573e+00 +- 3.404712e-03 ) GeV^0 -TOTAL : 3.189746 sec - 11,035,685,759 cycles:u # 3.430 GHz (74.89%) - 7,526,740 stalled-cycles-frontend:u # 0.07% frontend cycles idle (74.91%) - 246,540,718 stalled-cycles-backend:u # 2.23% backend cycles idle (74.91%) - 35,471,888,877 instructions:u # 3.21 insn per cycle - # 0.01 stalled cycles per insn (74.97%) - 3.219755111 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.596095e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.696763e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.696763e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 +TOTAL : 4.127183 sec + 11,946,394,247 cycles # 2.891 GHz + 35,407,075,530 instructions # 2.96 insn per cycle + 4.133301161 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 469) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028820e+00 Avg ME (F77/C++) = 2.0288199088536203 Relative difference = 4.4925808981097166e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.747202e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.406647e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.406647e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079573e+00 +- 3.404713e-03 ) GeV^0 -TOTAL : 1.497749 sec - 5,106,890,668 cycles:u # 3.347 GHz (74.93%) - 7,729,401 stalled-cycles-frontend:u # 0.15% frontend cycles idle (74.84%) - 1,357,822,804 stalled-cycles-backend:u # 26.59% backend cycles idle (74.62%) - 14,100,437,399 instructions:u # 2.76 insn per cycle - # 0.10 stalled cycles per insn (74.62%) - 1.529202877 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 6.250434e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.927787e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.927787e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 +TOTAL : 1.766919 sec + 5,069,845,731 cycles # 2.861 GHz + 14,044,971,447 instructions # 2.77 insn per cycle + 1.773365949 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2487) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028820e+00 -Avg ME (F77/C++) = 2.0288198892958462 -Relative difference = 5.4565783974899003e-08 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028819e+00 +Avg ME (F77/C++) = 2.0288192554144189 +Relative difference = 1.2589315209891237e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.016100e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.123106e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.123106e+06 ) sec^-1 -MeanMatrixElemValue = ( 2.079551e+00 +- 3.404208e-03 ) GeV^0 -TOTAL : 1.176337 sec - 3,983,849,774 cycles:u # 3.308 GHz (74.50%) - 6,512,233 stalled-cycles-frontend:u # 0.16% frontend cycles idle (75.03%) - 1,443,046,357 stalled-cycles-backend:u # 36.22% backend cycles idle (75.03%) - 8,629,208,977 instructions:u # 2.17 insn per cycle - # 0.17 stalled cycles per insn (75.10%) - 1.207645395 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3406) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.559784e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.492213e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.492213e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414230e-03 ) GeV^0 +TOTAL : 1.476405 sec + 3,988,953,115 cycles # 2.692 GHz + 8,629,569,798 instructions # 2.16 insn per cycle + 1.482936821 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3422) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 2.028819e+00 -Avg ME (F77/C++) = 2.0288186836987734 -Relative difference = 1.559041129563128e-07 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028818e+00 +Avg ME (F77/C++) = 2.0288180815987289 +Relative difference = 4.021983692325164e-08 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd1/check.exe -p 2048 256 2 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 8.210818e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.331985e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.331985e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414230e-03 ) GeV^0 +TOTAL : 1.366166 sec + 3,694,176,022 cycles # 2.694 GHz + 8,100,845,822 instructions # 2.19 insn per cycle + 1.372646371 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3105) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd1/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028818e+00 +Avg ME (F77/C++) = 2.0288180815987289 +Relative difference = 4.021983692325164e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd1/check.exe -p 2048 256 2 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 5.670710e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.170464e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.170464e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 +TOTAL : 1.938240 sec + 3,580,879,514 cycles # 1.843 GHz + 7,373,942,234 instructions # 2.06 insn per cycle + 1.944698982 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2803) (512y: 1) (512z: 2230) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd1/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028818e+00 +Avg ME (F77/C++) = 2.0288183569209650 +Relative difference = 1.7592557106041962e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt index a82d3154e4..26cb412a69 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt @@ -1,164 +1,209 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasNoCurand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) +RNDGEN=hasCurand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-01-31_13:52:25 +DATE: 2024-01-30_04:56:57 -On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/gcheck.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/gcheck.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.794432e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.005141e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.060273e+07 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 1.064603 sec - 3,215,836,002 cycles:u # 2.927 GHz (74.53%) - 10,653,381 stalled-cycles-frontend:u # 0.33% frontend cycles idle (74.84%) - 568,951,210 stalled-cycles-backend:u # 17.69% backend cycles idle (75.30%) - 2,976,432,260 instructions:u # 0.93 insn per cycle - # 0.19 stalled cycles per insn (75.26%) - 1.119979873 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 4.567190e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.153367e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.271156e+08 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 0.531785 sec + 2,166,596,506 cycles # 2.818 GHz + 3,096,992,570 instructions # 1.43 insn per cycle + 0.839064322 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/gcheck.exe -p 2048 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 2.028807e+00 -Avg ME (F77/CUDA) = 2.0288063423243869 -Relative difference = 3.241686434838304e-07 +Avg ME (F77/CUDA) = 2.0288063423243874 +Relative difference = 3.241686432649386e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.478378e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.541836e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.541836e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 4.412755 sec - 15,231,146,764 cycles:u # 3.426 GHz (75.00%) - 10,099,136 stalled-cycles-frontend:u # 0.07% frontend cycles idle (74.99%) - 136,784,882 stalled-cycles-backend:u # 0.90% backend cycles idle (74.99%) - 39,286,082,116 instructions:u # 2.58 insn per cycle - # 0.00 stalled cycles per insn (75.01%) - 4.448390218 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.035137e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.096372e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.096372e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 5.263186 sec + 15,248,441,904 cycles # 2.894 GHz + 39,293,765,746 instructions # 2.58 insn per cycle + 5.273287972 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 740) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063903750300 Relative difference = 3.0048445715164216e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.548558e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.780380e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.780380e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 2.488012 sec - 8,458,483,870 cycles:u # 3.355 GHz (74.94%) - 9,397,002 stalled-cycles-frontend:u # 0.11% frontend cycles idle (74.96%) - 888,302,300 stalled-cycles-backend:u # 10.50% backend cycles idle (74.96%) - 24,127,794,576 instructions:u # 2.85 insn per cycle - # 0.04 stalled cycles per insn (74.94%) - 2.525312813 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.565129e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.766484e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.766484e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 3.050997 sec + 8,847,131,595 cycles # 2.894 GHz + 24,093,216,326 instructions # 2.72 insn per cycle + 3.069927720 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2102) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063903750300 Relative difference = 3.0048445715164216e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.851235e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.471021e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.471021e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 1.518408 sec - 5,089,441,147 cycles:u # 3.279 GHz (74.75%) - 9,358,375 stalled-cycles-frontend:u # 0.18% frontend cycles idle (74.74%) - 466,224,607 stalled-cycles-backend:u # 9.16% backend cycles idle (74.91%) - 11,400,997,607 instructions:u # 2.24 insn per cycle - # 0.04 stalled cycles per insn (75.17%) - 1.555951668 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2451) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 5.446912e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.914435e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.914435e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 2.035886 sec + 5,501,574,982 cycles # 2.694 GHz + 11,449,152,902 instructions # 2.08 insn per cycle + 2.052044507 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2467) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063930599014 Relative difference = 2.9916108265801754e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/check.exe -p 2048 256 2 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 6.398707e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.055840e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.055840e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 1.750074 sec + 4,773,598,492 cycles # 2.718 GHz + 10,317,257,525 instructions # 2.16 insn per cycle + 1.763056572 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2076) (512y: 133) (512z: 0) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028807e+00 +Avg ME (F77/C++) = 2.0288063930599014 +Relative difference = 2.9916108265801754e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/check.exe -p 2048 256 2 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 4.115786e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.377584e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.377584e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 2.659793 sec + 4,851,599,101 cycles # 1.820 GHz + 7,367,812,046 instructions # 1.52 insn per cycle + 2.678537528 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1366) (512y: 69) (512z: 1611) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028807e+00 +Avg ME (F77/C++) = 2.0288063930599014 +Relative difference = 2.9916108265801754e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd1.txt index dc3e5431af..3aadf8f9be 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd1.txt @@ -1,164 +1,209 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasNoCurand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) +RNDGEN=hasCurand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd1' +CUDACPP_BUILDDIR='build.512y_m_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-01-31_13:52:48 +DATE: 2024-01-30_04:57:25 -On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/gcheck.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/gcheck.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = HIP:MIX+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.773626e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.913093e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.965935e+07 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 1.064090 sec - 3,185,169,658 cycles:u # 2.901 GHz (75.25%) - 10,700,271 stalled-cycles-frontend:u # 0.34% frontend cycles idle (75.27%) - 1,150,129,221 stalled-cycles-backend:u # 36.11% backend cycles idle (75.27%) - 2,952,946,614 instructions:u # 0.93 insn per cycle - # 0.39 stalled cycles per insn (75.06%) - 1.123660960 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 4.571537e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.158030e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.273800e+08 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 0.527677 sec + 2,187,527,722 cycles # 2.838 GHz + 3,113,906,107 instructions # 1.42 insn per cycle + 0.843196902 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/gcheck.exe -p 2048 256 1 +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 208 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 2.028807e+00 -Avg ME (F77/CUDA) = 2.0288063423243869 -Relative difference = 3.241686434838304e-07 +Avg ME (F77/CUDA) = 2.0288063423243874 +Relative difference = 3.241686432649386e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.426536e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.487007e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.487007e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 4.502472 sec - 15,540,927,135 cycles:u # 3.427 GHz (74.95%) - 9,844,079 stalled-cycles-frontend:u # 0.06% frontend cycles idle (74.97%) - 19,078,127 stalled-cycles-backend:u # 0.12% backend cycles idle (74.96%) - 40,163,707,232 instructions:u # 2.58 insn per cycle - # 0.00 stalled cycles per insn (74.96%) - 4.538214277 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.053597e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.114429e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.114429e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 5.216554 sec + 15,076,935,035 cycles # 2.887 GHz + 40,115,062,840 instructions # 2.66 insn per cycle + 5.225437216 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 630) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063903750300 Relative difference = 3.0048445715164216e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.510516e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.740202e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.740202e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 2.506920 sec - 8,569,629,800 cycles:u # 3.373 GHz (74.81%) - 10,871,179 stalled-cycles-frontend:u # 0.13% frontend cycles idle (74.86%) - 651,048,749 stalled-cycles-backend:u # 7.60% backend cycles idle (75.01%) - 23,430,751,066 instructions:u # 2.73 insn per cycle - # 0.03 stalled cycles per insn (75.13%) - 2.544191965 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.498695e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.695294e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.695294e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 3.107625 sec + 8,698,982,275 cycles # 2.794 GHz + 23,534,504,437 instructions # 2.71 insn per cycle + 3.124975720 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1993) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063903750300 Relative difference = 3.0048445715164216e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.852986e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.321455e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.321455e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 1.713280 sec - 5,760,398,352 cycles:u # 3.298 GHz (74.89%) - 9,611,570 stalled-cycles-frontend:u # 0.17% frontend cycles idle (74.84%) - 719,104,526 stalled-cycles-backend:u # 12.48% backend cycles idle (74.81%) - 13,118,757,379 instructions:u # 2.28 insn per cycle - # 0.05 stalled cycles per insn (74.84%) - 1.750550085 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2695) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 4.826638e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.191418e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.191418e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 2.282934 sec + 6,198,059,216 cycles # 2.708 GHz + 13,103,377,766 instructions # 2.11 insn per cycle + 2.300648997 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2711) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063930599014 Relative difference = 2.9916108265801754e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/check.exe -p 2048 256 2 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 5.224417e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.653642e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.653642e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 2.117622 sec + 5,754,647,700 cycles # 2.709 GHz + 12,210,180,073 instructions # 2.12 insn per cycle + 2.133681313 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2201) (512y: 282) (512z: 0) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028807e+00 +Avg ME (F77/C++) = 2.0288063930599014 +Relative difference = 2.9916108265801754e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/check.exe -p 2048 256 2 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.752218e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.971190e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.971190e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 2.905718 sec + 5,261,261,771 cycles # 1.807 GHz + 8,449,535,603 instructions # 1.61 insn per cycle + 2.918034623 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1324) (512y: 84) (512z: 1919) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 2.028807e+00 +Avg ME (F77/C++) = 2.0288063930599014 +Relative difference = 2.9916108265801754e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt index 2c4872a3ef..93e04f110e 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt @@ -1,181 +1,223 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasNoCurand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) +RNDGEN=hasCurand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2024-01-31_13:53:11 +DATE: 2024-01-30_04:57:55 -On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.881857e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.034056e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.041695e+06 ) sec^-1 -MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 -TOTAL : 0.535207 sec - 1,561,352,723 cycles:u # 2.771 GHz (75.68%) - 7,936,172 stalled-cycles-frontend:u # 0.51% frontend cycles idle (76.46%) - 276,377,827 stalled-cycles-backend:u # 17.70% backend cycles idle (75.95%) - 1,807,555,937 instructions:u # 1.16 insn per cycle - # 0.15 stalled cycles per insn (75.19%) - 0.583404745 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 8.751466e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.044991e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.059567e+07 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 0.471853 sec + 1,938,631,197 cycles # 2.818 GHz + 2,775,429,754 instructions # 1.43 insn per cycle + 0.768838740 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.577002e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.847336e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.852472e+06 ) sec^-1 -MeanMatrixElemValue = ( 2.948724e+03 +- 1.840727e+03 ) GeV^-2 -TOTAL : 1.137772 sec - 3,521,764,321 cycles:u # 2.997 GHz (74.34%) - 21,218,659 stalled-cycles-frontend:u # 0.60% frontend cycles idle (74.56%) - 853,108,865 stalled-cycles-backend:u # 24.22% backend cycles idle (74.82%) - 3,256,826,481 instructions:u # 0.92 insn per cycle - # 0.26 stalled cycles per insn (74.81%) - 1.195192794 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 1.083310e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.323559e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.337755e+07 ) sec^-1 +MeanMatrixElemValue = ( 6.734461e+02 +- 4.775415e+02 ) GeV^-2 +TOTAL : 0.612248 sec + 2,402,912,694 cycles # 2.815 GHz + 3,669,599,520 instructions # 1.53 insn per cycle + 0.914185147 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.413122e+00 -Avg ME (F77/CUDA) = 1.4131213684418642 -Relative difference = 4.4692399933517674e-07 +Avg ME (F77/CUDA) = 1.4131213684418649 +Relative difference = 4.469239988637851e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.949926e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.962209e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.962209e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 -TOTAL : 5.577914 sec - 19,611,893,072 cycles:u # 3.502 GHz (74.99%) - 2,551,919 stalled-cycles-frontend:u # 0.01% frontend cycles idle (75.00%) - 3,431,866,251 stalled-cycles-backend:u # 17.50% backend cycles idle (75.00%) - 57,909,311,784 instructions:u # 2.95 insn per cycle - # 0.06 stalled cycles per insn (75.00%) - 5.602758302 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.436781e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.449292e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.449292e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 6.748293 sec + 19,527,368,133 cycles # 2.892 GHz + 57,921,410,950 instructions # 2.97 insn per cycle + 6.756473501 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1134) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213684432431 Relative difference = 4.4692302355460254e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.035773e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.086870e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.086870e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 -TOTAL : 2.738169 sec - 9,650,787,663 cycles:u # 3.496 GHz (74.83%) - 2,652,787 stalled-cycles-frontend:u # 0.03% frontend cycles idle (74.97%) - 2,369,218,335 stalled-cycles-backend:u # 24.55% backend cycles idle (75.08%) - 29,963,506,375 instructions:u # 3.10 insn per cycle - # 0.08 stalled cycles per insn (75.09%) - 2.764116067 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 4.689715e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.736371e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.736371e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 3.517761 sec + 10,204,769,485 cycles # 2.897 GHz + 29,944,325,485 instructions # 2.93 insn per cycle + 3.533017528 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 4742) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213684432433 Relative difference = 4.46923023397472e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.238244e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.259535e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.259535e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 -TOTAL : 1.346669 sec - 4,753,803,853 cycles:u # 3.472 GHz (74.88%) - 2,041,888 stalled-cycles-frontend:u # 0.04% frontend cycles idle (74.88%) - 1,473,673,367 stalled-cycles-backend:u # 31.00% backend cycles idle (74.88%) - 11,252,845,250 instructions:u # 2.37 insn per cycle - # 0.13 stalled cycles per insn (74.72%) - 1.372622722 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4378) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 9.110539e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.290286e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.290286e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 1.822880 sec + 4,929,256,319 cycles # 2.697 GHz + 11,212,094,634 instructions # 2.27 insn per cycle + 1.842452367 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4396) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213684416484 Relative difference = 4.469241520660492e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/check.exe -p 64 256 10 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.045459e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.068242e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.068242e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 1.591153 sec + 4,310,771,194 cycles # 2.701 GHz + 10,188,135,001 instructions # 2.36 insn per cycle + 1.604477930 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3895) (512y: 81) (512z: 0) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.413122e+00 +Avg ME (F77/C++) = 1.4131213684416484 +Relative difference = 4.469241520660492e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/check.exe -p 64 256 10 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.350984e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.465337e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.465337e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 2.255127 sec + 3,913,955,092 cycles # 1.732 GHz + 5,709,470,043 instructions # 1.46 insn per cycle + 2.269083887 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1258) (512y: 74) (512z: 3396) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.413122e+00 +Avg ME (F77/C++) = 1.4131213684416484 +Relative difference = 4.469241520660492e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0_bridge.txt index 5f5164ce2d..ec4707eb36 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0_bridge.txt @@ -1,190 +1,240 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasNoCurand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) +RNDGEN=hasCurand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2024-01-31_14:40:14 +DATE: 2024-01-30_05:47:33 -On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 10 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) WARNING! 
Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.466419e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.968594e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.968594e+06 ) sec^-1 -MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 -TOTAL : 0.563248 sec - 1,634,603,369 cycles:u # 2.762 GHz (75.78%) - 10,357,414 stalled-cycles-frontend:u # 0.63% frontend cycles idle (75.67%) - 290,360,114 stalled-cycles-backend:u # 17.76% backend cycles idle (75.69%) - 2,033,614,753 instructions:u # 1.24 insn per cycle - # 0.14 stalled cycles per insn (75.10%) - 0.614864085 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 2.528893e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.736864e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.736864e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 0.499721 sec + 2,019,319,467 cycles # 2.834 GHz + 3,049,308,251 instructions # 1.51 insn per cycle + 0.770515897 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) +WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.195303e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.674915e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.674915e+06 ) sec^-1 -MeanMatrixElemValue = ( 2.948724e+03 +- 1.840727e+03 ) GeV^-2 -TOTAL : 1.265744 sec - 3,836,823,091 cycles:u # 2.922 GHz (75.03%) - 30,357,682 stalled-cycles-frontend:u # 0.79% frontend cycles idle (75.10%) - 857,532,035 stalled-cycles-backend:u # 22.35% backend cycles idle (75.29%) - 3,845,436,342 instructions:u # 1.00 insn per cycle - # 0.22 stalled cycles per insn (75.29%) - 1.331728087 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 2.631733e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.469522e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.469522e+06 ) sec^-1 +MeanMatrixElemValue = ( 6.734461e+02 +- 4.775415e+02 ) GeV^-2 +TOTAL : 0.838079 sec + 3,105,645,423 cycles # 2.841 GHz + 4,885,001,867 instructions # 1.57 insn per cycle + 1.151319170 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.413122e+00 -Avg ME (F77/CUDA) = 1.4131213684418642 -Relative difference = 4.4692399933517674e-07 +Avg ME (F77/CUDA) = 1.4131213684418649 +Relative difference = 4.469239988637851e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.938856e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.951070e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.951070e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 -TOTAL : 5.602623 sec - 19,677,012,387 cycles:u # 3.498 GHz (74.97%) - 2,756,857 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.97%) - 3,430,818,000 stalled-cycles-backend:u # 17.44% backend cycles idle (74.97%) - 57,888,320,227 instructions:u # 2.94 insn per cycle - # 0.06 stalled cycles per insn (74.97%) - 5.627403169 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.430928e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.443345e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.443345e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 6.772792 sec + 19,550,332,735 cycles # 2.885 GHz + 57,928,238,854 instructions # 2.96 insn per cycle + 6.778111068 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1134) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213684432431 Relative difference = 4.4692302355460254e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.033146e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.084267e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.084267e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 -TOTAL : 2.743942 sec - 9,647,917,917 cycles:u # 3.487 GHz (74.85%) - 2,560,334 stalled-cycles-frontend:u # 0.03% frontend cycles idle (74.87%) - 2,372,794,359 stalled-cycles-backend:u # 24.59% backend cycles idle (74.95%) - 30,019,587,369 instructions:u # 3.11 insn per cycle - # 0.08 stalled cycles per insn (75.07%) - 2.770430958 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 4.642090e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.688492e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.688492e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 3.562259 sec + 10,259,962,003 cycles # 2.883 GHz + 29,997,071,393 instructions # 2.92 insn per cycle + 3.567805037 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 4742) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213684432433 Relative difference = 4.46923023397472e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.236121e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.257315e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.257315e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 -TOTAL : 1.353591 sec - 4,776,745,975 cycles:u # 3.470 GHz (75.08%) - 2,320,254 stalled-cycles-frontend:u # 0.05% frontend cycles idle (74.73%) - 1,475,503,524 stalled-cycles-backend:u # 30.89% backend cycles idle (74.73%) - 11,254,402,988 instructions:u # 2.36 insn per cycle - # 0.13 stalled cycles per insn (75.01%) - 1.379911947 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4378) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 9.060333e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.240360e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.240360e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 1.842606 sec + 4,975,429,359 cycles # 2.695 GHz + 11,262,132,806 instructions # 2.26 insn per cycle + 1.848498494 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4396) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.413122e+00 +Avg ME (F77/C++) = 1.4131213684416484 +Relative difference = 4.469241520660492e-07 +OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! 
Instantiate host Bridge (nevt=16384) +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.041344e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.064837e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.064837e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 1.605579 sec + 4,356,497,896 cycles # 2.706 GHz + 10,236,092,665 instructions # 2.35 insn per cycle + 1.611218031 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3895) (512y: 81) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213684416484 Relative difference = 4.469241520660492e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! Instantiate host Bridge (nevt=16384) +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.341333e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.457820e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.457820e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 2.270029 sec + 3,960,771,261 cycles # 1.743 GHz + 5,748,864,563 instructions # 1.45 insn per cycle + 2.275659808 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1258) (512y: 74) (512z: 3396) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.413122e+00 +Avg ME (F77/C++) = 1.4131213684416484 +Relative difference = 4.469241520660492e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd1.txt index c89c4acdab..e0fcb209a0 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd1.txt @@ -1,181 +1,223 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasNoCurand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) +RNDGEN=hasCurand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd1' +CUDACPP_BUILDDIR='build.512y_d_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.none_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2024-01-31_13:53:35 +DATE: 2024-01-30_04:58:25 -On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/gcheck.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/gcheck.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.862441e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.015054e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.020911e+06 ) sec^-1 -MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 -TOTAL : 0.533285 sec - 1,578,933,550 cycles:u # 2.806 GHz (74.51%) - 7,789,260 stalled-cycles-frontend:u # 0.49% frontend cycles idle (75.59%) - 272,593,452 stalled-cycles-backend:u # 17.26% backend cycles idle (75.74%) - 1,818,439,514 instructions:u # 1.15 insn per cycle - # 0.15 stalled cycles per insn (75.78%) - 0.578653790 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 8.715814e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.042075e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.056833e+07 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 0.470883 sec + 1,939,912,503 cycles # 2.822 GHz + 2,790,884,564 instructions # 1.44 insn per cycle + 0.765236939 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/gcheck.exe -p 64 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.546447e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.812875e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.817892e+06 ) sec^-1 -MeanMatrixElemValue = ( 2.948724e+03 +- 1.840727e+03 ) GeV^-2 -TOTAL : 1.134405 sec - 3,499,370,566 cycles:u # 2.987 GHz (74.50%) - 21,125,895 stalled-cycles-frontend:u # 0.60% frontend cycles idle (74.69%) - 853,546,816 stalled-cycles-backend:u # 24.39% backend cycles idle (74.75%) - 3,209,634,213 instructions:u # 0.92 insn per cycle - # 0.27 stalled cycles per insn (74.74%) - 1.190728884 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 1.074401e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.309128e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.323134e+07 ) sec^-1 +MeanMatrixElemValue = ( 6.734461e+02 +- 4.775415e+02 ) GeV^-2 +TOTAL : 0.606508 sec + 2,399,848,951 cycles # 2.837 GHz + 3,558,977,452 instructions # 1.48 insn per cycle + 0.907497861 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.413122e+00 -Avg ME (F77/CUDA) = 1.4131213684418642 -Relative difference = 4.4692399933517674e-07 +Avg ME (F77/CUDA) = 1.4131213684418649 +Relative difference = 4.469239988637851e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.937626e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.949805e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.949805e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 -TOTAL : 5.600925 sec - 19,689,549,010 cycles:u # 3.502 GHz (74.96%) - 2,204,846 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.96%) - 2,983,226,749 stalled-cycles-backend:u # 15.15% backend cycles idle (74.96%) - 57,700,780,946 instructions:u # 2.93 insn per cycle - # 0.05 stalled cycles per insn (74.96%) - 5.625400034 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.442527e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.455052e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.455052e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 6.732470 sec + 19,518,863,765 cycles # 2.898 GHz + 57,747,544,085 instructions # 2.96 insn per cycle + 6.739693684 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1087) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213684432431 Relative difference = 4.4692302355460254e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.955815e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.005495e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.005495e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 -TOTAL : 2.774401 sec - 9,772,730,996 cycles:u # 3.494 GHz (74.88%) - 2,329,022 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.75%) - 2,288,097,457 stalled-cycles-backend:u # 23.41% backend cycles idle (74.89%) - 30,363,583,450 instructions:u # 3.11 insn per cycle - # 0.08 stalled cycles per insn (75.12%) - 2.800409321 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 4.661123e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.707073e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.707073e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 3.538386 sec + 10,268,038,737 cycles # 2.898 GHz + 30,334,584,369 instructions # 2.95 insn per cycle + 3.554140482 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 4806) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213684432433 Relative difference = 4.46923023397472e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.198043e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.217941e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.217941e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 -TOTAL : 1.390870 sec - 4,932,527,405 cycles:u # 3.490 GHz (74.74%) - 2,111,451 stalled-cycles-frontend:u # 0.04% frontend cycles idle (74.93%) - 1,676,406,494 stalled-cycles-backend:u # 33.99% backend cycles idle (75.10%) - 11,671,634,167 instructions:u # 2.37 insn per cycle - # 0.14 stalled cycles per insn (75.10%) - 1.416813284 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4471) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 8.842618e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.012045e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.012045e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 1.876874 sec + 5,068,616,518 cycles # 2.693 GHz + 11,664,707,542 instructions # 2.30 insn per cycle + 1.896780245 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4489) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213684416484 Relative difference = 4.469241520660492e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd1/check.exe -p 64 256 10 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 9.766097e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.969139e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.969139e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 1.701579 sec + 4,623,474,911 cycles # 2.710 GHz + 10,806,178,257 instructions # 2.34 insn per cycle + 1.712732749 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3988) (512y: 237) (512z: 0) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.413122e+00 +Avg ME (F77/C++) = 1.4131213684416484 +Relative difference = 4.469241520660492e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd1/check.exe -p 64 256 10 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.261988e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.377447e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.377447e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 2.282726 sec + 3,962,643,032 cycles # 1.733 GHz + 5,999,265,657 instructions # 1.51 insn per cycle + 2.297742409 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1241) (512y: 81) (512z: 3500) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.413122e+00 +Avg ME (F77/C++) = 1.4131213684416484 +Relative difference = 4.469241520660492e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt index 9d749af286..809c0d4a45 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt @@ -1,181 +1,223 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasNoCurand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) +RNDGEN=hasCurand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2024-01-31_13:54:00 +DATE: 2024-01-30_04:58:56 -On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.233207e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.295589e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.365340e+06 ) sec^-1 -MeanMatrixElemValue = ( 5.334114e+02 +- 3.089427e+02 ) GeV^-2 -TOTAL : 0.467908 sec - 1,320,505,833 cycles:u # 2.660 GHz (75.02%) - 8,142,141 stalled-cycles-frontend:u # 0.62% frontend cycles idle (74.97%) - 274,045,440 stalled-cycles-backend:u # 20.75% backend cycles idle (75.77%) - 1,686,814,296 instructions:u # 1.28 insn per cycle - # 0.16 stalled cycles per insn (74.73%) - 0.515396159 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 2.450759e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.307242e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.403943e+07 ) sec^-1 +MeanMatrixElemValue = ( 1.008472e+02 +- 5.002447e+01 ) GeV^-2 +TOTAL : 0.453655 sec + 1,885,130,441 cycles # 2.809 GHz + 2,653,723,410 instructions # 1.41 insn per cycle + 0.747134110 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 254 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.299597e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.630371e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.635274e+07 ) sec^-1 -MeanMatrixElemValue = ( 2.954952e+03 +- 1.880090e+03 ) GeV^-2 -TOTAL : 0.959215 sec - 2,923,861,212 cycles:u # 2.949 GHz (74.96%) - 21,308,476 stalled-cycles-frontend:u # 0.73% frontend cycles idle (74.83%) - 855,007,726 stalled-cycles-backend:u # 29.24% backend cycles idle (74.83%) - 2,774,819,087 instructions:u # 0.95 insn per cycle - # 0.31 stalled cycles per insn (74.95%) - 1.011833195 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 3.211065e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.390139e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.474767e+07 ) sec^-1 +MeanMatrixElemValue = ( 6.630099e+02 +- 4.770719e+02 ) GeV^-2 +TOTAL : 0.497953 sec + 2,053,184,300 cycles # 2.823 GHz + 2,862,941,904 instructions # 1.39 insn per cycle + 0.785494017 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 1.412404e+00 -Avg ME (F77/CUDA) = 1.4131669530965212 -Relative difference = 0.0005401804983001964 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 1.412608e+00 +Avg ME (F77/CUDA) = 1.4132214346515752 +Relative difference = 0.00043425681546129636 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 3.235316e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.250553e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.250553e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.724764e+02 +- 2.665343e+02 ) GeV^-2 -TOTAL : 5.085842 sec - 17,880,394,593 cycles:u # 3.501 GHz (74.94%) - 2,390,994 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.94%) - 3,699,282,902 stalled-cycles-backend:u # 20.69% backend cycles idle (74.94%) - 55,296,875,849 instructions:u # 3.09 insn per cycle - # 0.07 stalled cycles per insn (74.96%) - 5.110215731 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.619709e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.634289e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.634289e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2 +TOTAL : 6.276631 sec + 18,176,411,104 cycles # 2.894 GHz + 55,238,700,170 instructions # 3.04 insn per cycle + 6.284146623 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1229) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.412998e+00 -Avg ME (F77/C++) = 1.4129978146120550 -Relative difference = 1.3120184529301602e-07 +Avg ME (F77/C++) = 1.4129977771372637 +Relative difference = 1.5772332039074602e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.080505e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.097747e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.097747e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.724763e+02 +- 2.665342e+02 ) GeV^-2 -TOTAL : 1.537653 sec - 5,440,452,169 cycles:u # 3.487 GHz (74.94%) - 2,164,642 stalled-cycles-frontend:u # 0.04% frontend cycles idle (74.88%) - 1,664,978,467 stalled-cycles-backend:u # 30.60% backend cycles idle (74.88%) - 16,160,698,434 instructions:u # 2.97 insn per cycle - # 0.10 stalled cycles per insn (74.88%) - 1.563750461 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 8.447433e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.602543e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.602543e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2 +TOTAL : 1.961648 sec + 5,691,843,956 cycles # 2.895 GHz + 16,128,541,176 instructions # 2.83 insn per cycle + 1.980848485 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 5205) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.412986e+00 -Avg ME (F77/C++) = 1.4129857118325333 -Relative difference = 2.039421953066926e-07 +Avg ME (F77/C++) = 1.4129864902818952 +Relative difference = 3.469828399449743e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.355325e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.435129e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.435129e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.743733e+02 +- 2.676611e+02 ) GeV^-2 -TOTAL : 0.716897 sec - 2,573,702,457 cycles:u # 3.481 GHz (74.68%) - 2,112,380 stalled-cycles-frontend:u # 0.08% frontend cycles idle (74.99%) - 824,658,506 stalled-cycles-backend:u # 32.04% backend cycles idle (75.12%) - 6,101,403,334 instructions:u # 2.37 insn per cycle - # 0.14 stalled cycles per insn (75.12%) - 0.742580939 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4860) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.757867e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.823085e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.823085e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.008855e+02 +- 5.002467e+01 ) GeV^-2 +TOTAL : 0.954501 sec + 2,591,810,421 cycles # 2.702 GHz + 6,085,915,267 instructions # 2.35 insn per cycle + 0.966912682 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4878) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413316e+00 -Avg ME (F77/C++) = 1.4133162680784324 -Relative difference = 1.896804623606238e-07 +Avg ME (F77/C++) = 1.4133158486847037 +Relative difference = 1.0706402269051248e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/check.exe -p 64 256 10 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.986474e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.069956e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.069956e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.008855e+02 +- 5.002467e+01 ) GeV^-2 +TOTAL : 0.846832 sec + 2,295,114,840 cycles # 2.696 GHz + 5,552,751,365 instructions # 2.42 insn per cycle + 0.861502194 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4415) (512y: 30) (512z: 0) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.413316e+00 +Avg ME (F77/C++) = 1.4133158486847037 +Relative difference = 1.0706402269051248e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/check.exe -p 64 256 10 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.460942e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.506292e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.506292e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.008856e+02 +- 5.002468e+01 ) GeV^-2 +TOTAL : 1.145570 sec + 2,022,184,795 cycles # 1.758 GHz + 3,286,748,929 instructions # 1.63 insn per cycle + 1.163999883 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1905) (512y: 28) (512z: 3597) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
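The "Avg ME" / "Relative difference" / "OK (relative difference <= 5E-3)" lines above encode a simple acceptance check between the C++ and Fortran averages. A minimal sketch of that arithmetic, assuming the straightforward |a-b|/|a| definition; the helper name below is illustrative and not code from this patch:

  // compare two average matrix elements as the cmpExe log lines report them
  #include <cmath>
  #include <cstdio>
  bool compareAvgME( double avgME1, double avgME2, double threshold = 5e-3 )
  {
    const double relDiff = std::fabs( avgME1 - avgME2 ) / std::fabs( avgME1 );
    std::printf( "Relative difference = %.17g\n", relDiff );
    return relDiff <= threshold; // the log prints "OK" iff within threshold
  }
  int main()
  {
    // values from the 512y block above: ~1.07e-07, well below 5E-3
    return compareAvgME( 1.413316e+00, 1.4133158486847037 ) ? 0 : 1;
  }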
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.413316e+00 +Avg ME (F77/C++) = 1.4133164031689205 +Relative difference = 2.852645271622733e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0_bridge.txt index 5b3fc6c5c0..8f1e29c773 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0_bridge.txt @@ -1,190 +1,240 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasNoCurand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) +RNDGEN=hasCurand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2024-01-31_14:40:39 +DATE: 2024-01-30_05:48:04 -On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 10 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) WARNING! 
Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.247862e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.496785e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.496785e+06 ) sec^-1 -MeanMatrixElemValue = ( 4.755516e+02 +- 2.671055e+02 ) GeV^-2 -TOTAL : 0.504260 sec - 1,461,991,234 cycles:u # 2.779 GHz (74.45%) - 10,944,552 stalled-cycles-frontend:u # 0.75% frontend cycles idle (74.09%) - 252,397,308 stalled-cycles-backend:u # 17.26% backend cycles idle (75.56%) - 1,861,835,376 instructions:u # 1.27 insn per cycle - # 0.14 stalled cycles per insn (76.18%) - 0.550688201 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 4.794241e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.099961e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.099961e+07 ) sec^-1 +MeanMatrixElemValue = ( 1.009071e+02 +- 5.002295e+01 ) GeV^-2 +TOTAL : 0.464854 sec + 1,913,128,801 cycles # 2.831 GHz + 2,814,269,280 instructions # 1.47 insn per cycle + 0.735191571 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) +WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) +==PROF== Profiling "sigmaKin": launch__registers_per_thread 254 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.122959e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.467701e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.467701e+07 ) sec^-1 -MeanMatrixElemValue = ( 2.855934e+03 +- 1.791981e+03 ) GeV^-2 -TOTAL : 1.067243 sec - 3,269,617,170 cycles:u # 2.964 GHz (74.39%) - 30,130,595 stalled-cycles-frontend:u # 0.92% frontend cycles idle (74.58%) - 861,248,742 stalled-cycles-backend:u # 26.34% backend cycles idle (74.61%) - 3,480,018,755 instructions:u # 1.06 insn per cycle - # 0.25 stalled cycles per insn (74.93%) - 1.122752145 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 4.563056e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.567773e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.567773e+07 ) sec^-1 +MeanMatrixElemValue = ( 6.737500e+02 +- 4.776370e+02 ) GeV^-2 +TOTAL : 0.649992 sec + 2,514,728,119 cycles # 2.840 GHz + 3,857,856,675 instructions # 1.53 insn per cycle + 0.945286461 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 1.412404e+00 -Avg ME (F77/CUDA) = 1.4131669530965212 -Relative difference = 0.0005401804983001964 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 1.412608e+00 +Avg ME (F77/CUDA) = 1.4132214346515752 +Relative difference = 0.00043425681546129636 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 3.234859e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.250186e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.250186e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.724764e+02 +- 2.665343e+02 ) GeV^-2 -TOTAL : 5.088747 sec - 17,884,436,952 cycles:u # 3.499 GHz (74.96%) - 2,327,651 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.96%) - 3,681,573,463 stalled-cycles-backend:u # 20.59% backend cycles idle (74.96%) - 55,280,013,344 instructions:u # 3.09 insn per cycle - # 0.07 stalled cycles per insn (74.91%) - 5.113324972 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.612332e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.626818e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.626818e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2 +TOTAL : 6.299552 sec + 18,207,767,275 cycles # 2.889 GHz + 55,242,943,760 instructions # 3.03 insn per cycle + 6.304483382 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1229) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.412998e+00 -Avg ME (F77/C++) = 1.4129978146120550 -Relative difference = 1.3120184529301602e-07 +Avg ME (F77/C++) = 1.4129977771372637 +Relative difference = 1.5772332039074602e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.070830e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.087647e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.087647e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.724763e+02 +- 2.665342e+02 ) GeV^-2 -TOTAL : 1.554368 sec - 5,491,975,937 cycles:u # 3.483 GHz (74.77%) - 2,162,040 stalled-cycles-frontend:u # 0.04% frontend cycles idle (75.15%) - 1,708,931,547 stalled-cycles-backend:u # 31.12% backend cycles idle (75.14%) - 16,215,779,203 instructions:u # 2.95 insn per cycle - # 0.11 stalled cycles per insn (75.15%) - 1.580187360 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 8.365917e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.522444e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.522444e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2 +TOTAL : 1.986287 sec + 5,717,011,577 cycles # 2.873 GHz + 16,175,954,346 instructions # 2.83 insn per cycle + 1.991587162 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 5205) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.412986e+00 -Avg ME (F77/C++) = 1.4129857118325333 -Relative difference = 2.039421953066926e-07 +Avg ME (F77/C++) = 1.4129864902818952 +Relative difference = 3.469828399449743e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.357776e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.438301e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.438301e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.743733e+02 +- 2.676611e+02 ) GeV^-2 -TOTAL : 0.718857 sec - 2,561,351,001 cycles:u # 3.454 GHz (74.82%) - 2,067,002 stalled-cycles-frontend:u # 0.08% frontend cycles idle (75.19%) - 820,609,810 stalled-cycles-backend:u # 32.04% backend cycles idle (75.19%) - 6,136,329,384 instructions:u # 2.40 insn per cycle - # 0.13 stalled cycles per insn (75.20%) - 0.744904138 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4860) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.741687e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.807547e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.807547e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.008855e+02 +- 5.002467e+01 ) GeV^-2 +TOTAL : 0.968315 sec + 2,618,792,433 cycles # 2.693 GHz + 6,122,206,815 instructions # 2.34 insn per cycle + 0.973667021 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4878) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.413316e+00 +Avg ME (F77/C++) = 1.4133158486847037 +Relative difference = 1.0706402269051248e-07 +OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! 
Instantiate host Bridge (nevt=16384) +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.976057e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.060749e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.060749e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.008855e+02 +- 5.002467e+01 ) GeV^-2 +TOTAL : 0.855993 sec + 2,321,654,642 cycles # 2.699 GHz + 5,589,002,861 instructions # 2.41 insn per cycle + 0.861171520 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4415) (512y: 30) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413316e+00 -Avg ME (F77/C++) = 1.4133162680784324 -Relative difference = 1.896804623606238e-07 +Avg ME (F77/C++) = 1.4133158486847037 +Relative difference = 1.0706402269051248e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! Instantiate host Bridge (nevt=16384) +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.455132e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.500155e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.500155e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.008856e+02 +- 5.002468e+01 ) GeV^-2 +TOTAL : 1.154878 sec + 2,044,999,339 cycles # 1.765 GHz + 3,327,504,110 instructions # 1.63 insn per cycle + 1.160035358 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1905) (512y: 28) (512z: 3597) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
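The "-p 64 256 10" arguments and the Bridge warnings above are consistent with a grid of 64 GPU blocks of 256 threads each, i.e. nevt = gpublocks * gputhreads = 16384 events per iteration (and "-p 2048 256 1" correspondingly gives nevt=524288). A small sketch of that sizing arithmetic, assuming the third argument is the iteration count:

  #include <cassert>
  #include <cstdio>
  int main()
  {
    const int gpublocks = 64, gputhreads = 256, niter = 10; // "-p 64 256 10"
    const int nevt = gpublocks * gputhreads; // matches "nevt=16384" above
    assert( nevt == 16384 );
    std::printf( "nevt=%d (= %d blocks x %d threads), niter=%d\n",
                 nevt, gpublocks, gputhreads, niter );
    return 0;
  }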
------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.413316e+00 +Avg ME (F77/C++) = 1.4133164031689205 +Relative difference = 2.852645271622733e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd1.txt index c5078df4b5..71f99cc0f9 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd1.txt @@ -1,181 +1,223 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasNoCurand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) +RNDGEN=hasCurand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd1' +CUDACPP_BUILDDIR='build.512y_f_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.none_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2024-01-31_13:54:22 +DATE: 2024-01-30_04:59:21 -On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/gcheck.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/gcheck.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.163509e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.451426e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.501420e+06 ) sec^-1 -MeanMatrixElemValue = ( 5.334114e+02 +- 3.089427e+02 ) GeV^-2 -TOTAL : 0.472511 sec - 1,392,395,013 cycles:u # 2.784 GHz (73.56%) - 8,100,702 stalled-cycles-frontend:u # 0.58% frontend cycles idle (74.41%) - 276,496,565 stalled-cycles-backend:u # 19.86% backend cycles idle (74.42%) - 1,721,815,009 instructions:u # 1.24 insn per cycle - # 0.16 stalled cycles per insn (75.78%) - 0.517855915 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 2.454028e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.326749e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.426065e+07 ) sec^-1 +MeanMatrixElemValue = ( 1.008472e+02 +- 5.002447e+01 ) GeV^-2 +TOTAL : 0.451891 sec + 1,884,922,304 cycles # 2.826 GHz + 2,675,153,942 instructions # 1.42 insn per cycle + 0.742708600 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/gcheck.exe -p 64 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 248 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.325083e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.685923e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.691170e+07 ) sec^-1 -MeanMatrixElemValue = ( 2.954952e+03 +- 1.880090e+03 ) GeV^-2 -TOTAL : 0.958757 sec - 2,928,770,710 cycles:u # 2.956 GHz (75.00%) - 21,347,446 stalled-cycles-frontend:u # 0.73% frontend cycles idle (74.80%) - 855,040,551 stalled-cycles-backend:u # 29.19% backend cycles idle (74.80%) - 2,819,342,051 instructions:u # 0.96 insn per cycle - # 0.30 stalled cycles per insn (74.98%) - 1.011115588 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 3.211971e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.383449e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.465566e+07 ) sec^-1 +MeanMatrixElemValue = ( 6.630099e+02 +- 4.770719e+02 ) GeV^-2 +TOTAL : 0.498447 sec + 2,066,378,383 cycles # 2.841 GHz + 2,912,189,828 instructions # 1.41 insn per cycle + 0.785425719 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 1.412404e+00 -Avg ME (F77/CUDA) = 1.4131669531526541 -Relative difference = 0.0005401805380429868 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 1.412608e+00 +Avg ME (F77/CUDA) = 1.4132214346515752 +Relative difference = 0.00043425681546129636 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 3.235914e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.251183e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.251183e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.724764e+02 +- 2.665343e+02 ) GeV^-2 -TOTAL : 5.084718 sec - 17,882,683,220 cycles:u # 3.502 GHz (74.94%) - 2,375,550 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.94%) - 2,987,825,837 stalled-cycles-backend:u # 16.71% backend cycles idle (74.94%) - 55,049,728,854 instructions:u # 3.08 insn per cycle - # 0.05 stalled cycles per insn (74.97%) - 5.109246107 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.621420e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.635929e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.635929e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2 +TOTAL : 6.272536 sec + 18,133,908,438 cycles # 2.889 GHz + 54,991,536,969 instructions # 3.03 insn per cycle + 6.280002049 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1171) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.412998e+00 -Avg ME (F77/C++) = 1.4129978146120550 -Relative difference = 1.3120184529301602e-07 +Avg ME (F77/C++) = 1.4129977771372637 +Relative difference = 1.5772332039074602e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.118880e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.137296e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.137296e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.724763e+02 +- 2.665342e+02 ) GeV^-2 -TOTAL : 1.485425 sec - 5,266,166,102 cycles:u # 3.492 GHz (74.86%) - 2,220,595 stalled-cycles-frontend:u # 0.04% frontend cycles idle (75.03%) - 1,527,619,356 stalled-cycles-backend:u # 29.01% backend cycles idle (75.07%) - 16,255,209,513 instructions:u # 3.09 insn per cycle - # 0.09 stalled cycles per insn (75.07%) - 1.511090515 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 8.675526e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.845155e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.845155e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2 +TOTAL : 1.910252 sec + 5,541,476,355 cycles # 2.894 GHz + 16,222,950,904 instructions # 2.93 insn per cycle + 1.926546393 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 5136) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.412986e+00 -Avg ME (F77/C++) = 1.4129857712652836 -Relative difference = 1.618803841657786e-07 +Avg ME (F77/C++) = 1.4129863487235070 +Relative difference = 2.4679898241023883e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.120936e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.185424e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.185424e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.743733e+02 +- 2.676611e+02 ) GeV^-2 -TOTAL : 0.793589 sec - 2,820,836,759 cycles:u # 3.457 GHz (74.52%) - 2,303,382 stalled-cycles-frontend:u # 0.08% frontend cycles idle (74.59%) - 801,753,862 stalled-cycles-backend:u # 28.42% backend cycles idle (74.91%) - 6,748,831,142 instructions:u # 2.39 insn per cycle - # 0.12 stalled cycles per insn (75.34%) - 0.819143454 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 5412) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.524928e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.573974e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.573974e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.008855e+02 +- 5.002467e+01 ) GeV^-2 +TOTAL : 1.096795 sec + 2,981,881,341 cycles # 2.708 GHz + 6,708,240,605 instructions # 2.25 insn per cycle + 1.109848469 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 5430) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413316e+00 -Avg ME (F77/C++) = 1.4133162680784324 -Relative difference = 1.896804623606238e-07 +Avg ME (F77/C++) = 1.4133158486847037 +Relative difference = 1.0706402269051248e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd1/check.exe -p 64 256 10 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.679205e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.738776e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.738776e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.008855e+02 +- 5.002467e+01 ) GeV^-2 +TOTAL : 0.997879 sec + 2,711,169,290 cycles # 2.704 GHz + 6,222,713,478 instructions # 2.30 insn per cycle + 1.012945753 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 5056) (512y: 24) (512z: 0) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.413316e+00 +Avg ME (F77/C++) = 1.4133158486847037 +Relative difference = 1.0706402269051248e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd1/check.exe -p 64 256 10 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.374736e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.414577e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.414577e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.008856e+02 +- 5.002468e+01 ) GeV^-2 +TOTAL : 1.216016 sec + 2,159,440,418 cycles # 1.769 GHz + 3,642,249,109 instructions # 1.69 insn per cycle + 1.228978695 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2070) (512y: 21) (512z: 3922) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. 
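The EvtsPerSec figures above appear to be events per second over the timed section, roughly nevt * niter / time: with the 512z numbers just above (nevt=16384, niter=10, TOTAL 1.216016 s) this gives about 1.35e+05, close to the reported 1.374736e+05 (TOTAL seemingly includes some untimed setup). A sketch of that estimate, under those assumptions:

  #include <cstdio>
  int main()
  {
    const double nevt = 16384.0, niter = 10.0, totalSec = 1.216016;
    // ~1.35e+05, a rough lower bound on the reported EvtsPerSec
    std::printf( "approx EvtsPerSec = %e\n", nevt * niter / totalSec );
    return 0;
  }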
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.413316e+00 +Avg ME (F77/C++) = 1.4133164031689205 +Relative difference = 2.852645271622733e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt index bf09fe872a..c3bf1d184f 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt @@ -1,181 +1,223 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasNoCurand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) +RNDGEN=hasCurand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2024-01-31_13:54:44 +DATE: 2024-01-30_04:59:47 -On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/gcheck.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/gcheck.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.873525e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.030688e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.036136e+06 ) sec^-1 -MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 -TOTAL : 0.533324 sec - 1,558,673,708 cycles:u # 2.775 GHz (74.98%) - 7,904,191 stalled-cycles-frontend:u # 0.51% frontend cycles idle (75.72%) - 278,417,769 stalled-cycles-backend:u # 17.86% backend cycles idle (75.85%) - 1,806,640,294 instructions:u # 1.16 insn per cycle - # 0.15 stalled cycles per insn (75.92%) - 0.583004824 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 8.711100e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.041363e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.056144e+07 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 0.470768 sec + 1,937,905,575 cycles # 2.825 GHz + 2,769,085,725 instructions # 1.43 insn per cycle + 0.764309757 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/gcheck.exe -p 64 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.613599e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.845121e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.850461e+06 ) sec^-1 -MeanMatrixElemValue = ( 2.948724e+03 +- 1.840727e+03 ) GeV^-2 -TOTAL : 1.136287 sec - 3,465,854,034 cycles:u # 2.952 GHz (74.81%) - 21,431,149 stalled-cycles-frontend:u # 0.62% frontend cycles idle (74.72%) - 855,206,102 stalled-cycles-backend:u # 24.68% backend cycles idle (75.04%) - 3,115,833,555 instructions:u # 0.90 insn per cycle - # 0.27 stalled cycles per insn (75.48%) - 1.194221843 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 1.077034e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.312199e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.326400e+07 ) sec^-1 +MeanMatrixElemValue = ( 6.734461e+02 +- 4.775415e+02 ) GeV^-2 +TOTAL : 0.614129 sec + 2,415,403,751 cycles # 2.830 GHz + 3,662,132,699 instructions # 1.52 insn per cycle + 0.915037914 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.413122e+00 -Avg ME (F77/CUDA) = 1.4131213755569483 -Relative difference = 4.4188898885662695e-07 +Avg ME (F77/CUDA) = 1.4131213755569487 +Relative difference = 4.418889885423659e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.885755e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.897495e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.897495e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 -TOTAL : 5.701399 sec - 20,036,013,217 cycles:u # 3.501 GHz (74.98%) - 2,942,876 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.98%) - 3,907,613,044 stalled-cycles-backend:u # 19.50% backend cycles idle (74.98%) - 59,165,739,340 instructions:u # 2.95 insn per cycle - # 0.07 stalled cycles per insn (74.98%) - 5.725948879 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.370924e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.382846e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.382846e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 6.936117 sec + 19,978,394,912 cycles # 2.879 GHz + 59,162,561,873 instructions # 2.96 insn per cycle + 6.944191465 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1149) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213859069593 Relative difference = 4.345647726386255e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.095746e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.147754e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.147754e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 -TOTAL : 2.711476 sec - 9,548,524,839 cycles:u # 3.492 GHz (74.85%) - 2,483,907 stalled-cycles-frontend:u # 0.03% frontend cycles idle (74.86%) - 2,429,422,838 stalled-cycles-backend:u # 25.44% backend cycles idle (74.95%) - 29,789,093,599 instructions:u # 3.12 insn per cycle - # 0.08 stalled cycles per insn (75.07%) - 2.737506225 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 4.694585e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.741387e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.741387e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 3.513998 sec + 10,104,341,088 cycles # 2.872 GHz + 29,763,867,436 instructions # 2.95 insn per cycle + 3.532062820 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 4873) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213792564823 Relative difference = 4.392710025734405e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.242709e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.264174e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.264174e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 -TOTAL : 1.341938 sec - 4,749,950,097 cycles:u # 3.481 GHz (74.86%) - 2,985,260 stalled-cycles-frontend:u # 0.06% frontend cycles idle (74.79%) - 1,573,333,068 stalled-cycles-backend:u # 33.12% backend cycles idle (74.79%) - 11,219,420,879 instructions:u # 2.36 insn per cycle - # 0.14 stalled cycles per insn (74.84%) - 1.367798392 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4563) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 9.157849e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.336120e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.336120e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 1.813275 sec + 4,888,809,789 cycles # 2.689 GHz + 11,200,775,616 instructions # 2.29 insn per cycle + 1.831194346 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4581) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213600217192 Relative difference = 4.5288254008796884e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd0/check.exe -p 64 256 10 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.059295e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.083013e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.083013e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 1.571072 sec + 4,240,948,322 cycles # 2.691 GHz + 10,146,075,765 instructions # 2.39 insn per cycle + 1.585395140 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4064) (512y: 73) (512z: 0) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.413122e+00 +Avg ME (F77/C++) = 1.4131213600217192 +Relative difference = 4.5288254008796884e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd0/check.exe -p 64 256 10 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.157625e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.268151e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.268151e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 2.315387 sec + 4,011,221,101 cycles # 1.729 GHz + 5,838,969,816 instructions # 1.46 insn per cycle + 2.328222904 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1778) (512y: 97) (512z: 3502) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.413122e+00 +Avg ME (F77/C++) = 1.4131213600217192 +Relative difference = 4.5288254008796884e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd1.txt index cca92229f9..0465a21327 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd1.txt @@ -1,181 +1,223 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasNoCurand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) +RNDGEN=hasCurand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd1' +CUDACPP_BUILDDIR='build.512y_m_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.none_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2024-01-31_13:55:09 +DATE: 2024-01-30_05:00:18 -On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/gcheck.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/gcheck.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = HIP:MIX+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.872976e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.021933e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.027485e+06 ) sec^-1 -MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 -TOTAL : 0.531980 sec - 1,602,270,971 cycles:u # 2.853 GHz (74.22%) - 7,715,458 stalled-cycles-frontend:u # 0.48% frontend cycles idle (75.00%) - 261,350,265 stalled-cycles-backend:u # 16.31% backend cycles idle (75.44%) - 1,855,168,819 instructions:u # 1.16 insn per cycle - # 0.14 stalled cycles per insn (75.52%) - 0.579289170 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 8.666023e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.032901e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.046936e+07 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 0.468226 sec + 1,937,873,508 cycles # 2.824 GHz + 2,759,069,461 instructions # 1.42 insn per cycle + 0.754370762 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/gcheck.exe -p 64 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = HIP:MIX+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.600681e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.840185e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.845245e+06 ) sec^-1 -MeanMatrixElemValue = ( 2.948724e+03 +- 1.840727e+03 ) GeV^-2 -TOTAL : 1.131599 sec - 3,445,297,026 cycles:u # 2.948 GHz (74.64%) - 21,298,915 stalled-cycles-frontend:u # 0.62% frontend cycles idle (74.96%) - 851,127,019 stalled-cycles-backend:u # 24.70% backend cycles idle (75.40%) - 3,162,433,332 instructions:u # 0.92 insn per cycle - # 0.27 stalled cycles per insn (75.37%) - 1.190120861 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 1.070939e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.304690e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.318717e+07 ) sec^-1 +MeanMatrixElemValue = ( 6.734461e+02 +- 4.775415e+02 ) GeV^-2 +TOTAL : 0.607643 sec + 2,403,195,178 cycles # 2.827 GHz + 3,555,740,714 instructions # 1.48 insn per cycle + 0.909921855 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 1.413122e+00 -Avg ME (F77/CUDA) = 1.4131213755569483 -Relative difference = 4.4188898885662695e-07 +Avg ME (F77/CUDA) = 1.4131213755569487 +Relative difference = 4.418889885423659e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.882570e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.894309e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.894309e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 -TOTAL : 5.707488 sec - 20,069,251,028 cycles:u # 3.503 GHz (74.97%) - 2,635,071 stalled-cycles-frontend:u # 0.01% frontend cycles idle (75.01%) - 3,505,383,952 stalled-cycles-backend:u # 17.47% backend cycles idle (75.01%) - 58,708,728,731 instructions:u # 2.93 insn per cycle - # 0.06 stalled cycles per insn (75.01%) - 5.732080718 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.404971e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.417275e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.417275e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 6.840299 sec + 19,736,673,501 cycles # 2.886 GHz + 58,709,690,472 instructions # 2.97 insn per cycle + 6.847451518 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1026) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213859069593 Relative difference = 4.345647726386255e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 6.161494e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.214740e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.214740e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 -TOTAL : 2.682706 sec - 9,449,572,209 cycles:u # 3.492 GHz (74.88%) - 2,338,158 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.87%) - 2,118,122,832 stalled-cycles-backend:u # 22.42% backend cycles idle (74.89%) - 30,196,934,771 instructions:u # 3.20 insn per cycle - # 0.07 stalled cycles per insn (75.01%) - 2.709003612 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 4.708829e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.755468e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.755468e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 3.503073 sec + 10,118,973,746 cycles # 2.885 GHz + 30,158,905,101 instructions # 2.98 insn per cycle + 3.519090284 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 4944) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213792564823 Relative difference = 4.392710025734405e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.219089e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.239793e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.239793e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 -TOTAL : 1.367376 sec - 4,836,893,280 cycles:u # 3.480 GHz (74.74%) - 2,207,379 stalled-cycles-frontend:u # 0.05% frontend cycles idle (74.53%) - 1,569,612,446 stalled-cycles-backend:u # 32.45% backend cycles idle (74.82%) - 11,665,002,188 instructions:u # 2.41 insn per cycle - # 0.13 stalled cycles per insn (75.26%) - 1.393332297 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4667) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 8.784663e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.950747e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.950747e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 1.889159 sec + 5,039,949,395 cycles # 2.661 GHz + 11,663,409,755 instructions # 2.31 insn per cycle + 1.981495827 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4685) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213600217192 Relative difference = 4.5288254008796884e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd1/check.exe -p 64 256 10 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 9.838137e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.004758e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.004758e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 1.689521 sec + 4,555,347,979 cycles # 2.689 GHz + 10,787,640,248 instructions # 2.37 insn per cycle + 1.702819632 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4159) (512y: 233) (512z: 0) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.413122e+00 +Avg ME (F77/C++) = 1.4131213600217192 +Relative difference = 4.5288254008796884e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd1/check.exe -p 64 256 10 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.077813e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.181685e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.181685e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 2.340568 sec + 4,064,413,524 cycles # 1.733 GHz + 6,073,601,897 instructions # 1.49 insn per cycle + 2.356439472 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1725) (512y: 104) (512z: 3609) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.413122e+00 +Avg ME (F77/C++) = 1.4131213600217192 +Relative difference = 4.5288254008796884e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt index 65cb87aab5..53bd28a5bd 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt @@ -1,181 +1,223 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasNoCurand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) +RNDGEN=hasCurand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-01-31_13:55:33 +DATE: 2024-01-30_05:00:48 -On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.342079e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.525537e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.526983e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 0.654104 sec - 1,995,712,276 cycles:u # 2.936 GHz (74.74%) - 2,415,481 stalled-cycles-frontend:u # 0.12% frontend cycles idle (75.62%) - 34,360,604 stalled-cycles-backend:u # 1.72% backend cycles idle (75.39%) - 2,158,157,518 instructions:u # 1.08 insn per cycle - # 0.02 stalled cycles per insn (75.40%) - 0.705404958 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 3.507010e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.536029e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.538733e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 0.531942 sec + 2,193,462,671 cycles # 2.834 GHz + 3,356,973,773 instructions # 1.53 insn per cycle + 0.849346656 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.229184e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.236148e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.236208e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.252232e+02 +- 1.234346e+02 ) GeV^-4 -TOTAL : 8.451568 sec - 29,043,491,099 cycles:u # 3.428 GHz (74.98%) - 11,890,412 stalled-cycles-frontend:u # 0.04% frontend cycles idle (75.01%) - 1,124,011,436 stalled-cycles-backend:u # 3.87% backend cycles idle (74.98%) - 22,740,172,566 instructions:u # 0.78 insn per cycle - # 0.05 stalled cycles per insn (75.02%) - 8.512180841 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 4.126743e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.160620e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.162100e+05 ) sec^-1 +MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 +TOTAL : 3.043092 sec + 9,489,937,514 cycles # 2.875 GHz + 19,463,317,431 instructions # 2.05 insn per cycle + 3.359518634 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 6.626675e-04 -Avg ME (F77/CUDA) = 6.6266731198158101E-004 -Relative difference = 2.837296517127185e-07 +Avg ME (F77/CUDA) = 6.6266731198158133E-004 +Relative difference = 2.837296512218831e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.214670e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.215541e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.215541e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 7.412895 sec - 26,034,504,301 cycles:u # 3.502 GHz (74.94%) - 9,000,011 stalled-cycles-frontend:u # 0.03% frontend cycles idle (74.97%) - 3,852,087,436 stalled-cycles-backend:u # 14.80% backend cycles idle (75.02%) - 81,739,960,011 instructions:u # 3.14 insn per cycle - # 0.05 stalled cycles per insn (75.04%) - 7.437382060 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.787937e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.788754e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.788754e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 9.184792 sec + 26,445,376,310 cycles # 2.879 GHz + 81,759,262,253 instructions # 3.09 insn per cycle + 9.200099621 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 6614) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141133E-004 Relative difference = 2.8372990776517314e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.984166e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.988749e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.988749e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 3.298200 sec - 11,606,582,111 cycles:u # 3.495 GHz (74.97%) - 4,624,697 stalled-cycles-frontend:u # 0.04% frontend cycles idle (74.95%) - 1,735,045,451 stalled-cycles-backend:u # 14.95% backend cycles idle (74.95%) - 39,243,660,592 instructions:u # 3.38 insn per cycle - # 0.04 stalled cycles per insn (74.95%) - 3.324118895 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.595033e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.598347e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.598347e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 4.572055 sec + 12,894,491,420 cycles # 2.818 GHz + 39,242,650,330 instructions # 3.04 insn per cycle + 4.588188651 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:12814) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141122E-004 Relative difference = 2.837299079287849e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.191588e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.194143e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.194143e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 1.384178 sec - 4,884,932,429 cycles:u # 3.472 GHz (74.98%) - 3,982,190 stalled-cycles-frontend:u # 0.08% frontend cycles idle (74.98%) - 594,600,674 stalled-cycles-backend:u # 12.17% backend cycles idle (74.98%) - 13,805,085,145 instructions:u # 2.83 insn per cycle - # 0.04 stalled cycles per insn (74.99%) - 1.409990161 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11041) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.988905e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.005063e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.005063e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 2.062926 sec + 5,559,157,847 cycles # 2.689 GHz + 13,789,744,695 instructions # 2.48 insn per cycle + 2.079268197 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11059) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198157309E-004 Relative difference = 2.837296636563793e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe -p 64 256 1 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 9.113130e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.134504e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.134504e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 1.809806 sec + 4,899,980,729 cycles # 2.701 GHz + 12,319,200,932 instructions # 2.51 insn per cycle + 1.824526773 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 9762) (512y: 94) (512z: 0) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266731198157309E-004 +Relative difference = 2.837296636563793e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe -p 64 256 1 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 6.926484e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.938620e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.938620e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 2.377893 sec + 4,078,713,187 cycles # 1.712 GHz + 6,287,612,851 instructions # 1.54 insn per cycle + 2.391138362 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1516) (512y: 94) (512z: 9019) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266731198157309E-004 +Relative difference = 2.837296636563793e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_bridge.txt index 63a4f3691e..ba45d149aa 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_bridge.txt @@ -1,190 +1,240 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasNoCurand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) +RNDGEN=hasCurand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-01-31_14:41:30 +DATE: 2024-01-30_05:49:05 -On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) WARNING! 
Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.314396e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.453907e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.453907e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 0.659716 sec - 1,965,200,705 cycles:u # 2.867 GHz (75.35%) - 2,789,644 stalled-cycles-frontend:u # 0.14% frontend cycles idle (75.45%) - 37,818,859 stalled-cycles-backend:u # 1.92% backend cycles idle (75.55%) - 2,155,673,862 instructions:u # 1.10 insn per cycle - # 0.02 stalled cycles per insn (75.18%) - 0.707979793 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 3.099881e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.447326e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.447326e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 0.520843 sec + 2,128,361,154 cycles # 2.833 GHz + 3,379,769,914 instructions # 1.59 insn per cycle + 0.811374229 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) +WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.208093e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.242810e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.242810e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.252232e+02 +- 1.234346e+02 ) GeV^-4 -TOTAL : 8.553727 sec - 29,338,209,795 cycles:u # 3.410 GHz (75.00%) - 22,527,728 stalled-cycles-frontend:u # 0.08% frontend cycles idle (75.00%) - 1,131,409,563 stalled-cycles-backend:u # 3.86% backend cycles idle (75.00%) - 23,563,180,220 instructions:u # 0.80 insn per cycle - # 0.05 stalled cycles per insn (74.99%) - 8.626260229 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 3.602295e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.096469e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.096469e+05 ) sec^-1 +MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 +TOTAL : 3.329754 sec + 10,358,756,104 cycles # 2.872 GHz + 22,944,085,739 instructions # 2.21 insn per cycle + 3.663397648 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 6.626675e-04 -Avg ME (F77/CUDA) = 6.6266731198158101E-004 -Relative difference = 2.837296517127185e-07 +Avg ME (F77/CUDA) = 6.6266731198158133E-004 +Relative difference = 2.837296512218831e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.220774e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.221672e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.221672e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 7.396401 sec - 25,960,894,256 cycles:u # 3.499 GHz (74.98%) - 1,712,367 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.98%) - 3,860,397,243 stalled-cycles-backend:u # 14.87% backend cycles idle (74.98%) - 81,732,223,719 instructions:u # 3.15 insn per cycle - # 0.05 stalled cycles per insn (74.99%) - 7.421348797 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.794771e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.795632e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.795632e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 9.153536 sec + 26,441,951,782 cycles # 2.888 GHz + 81,759,972,796 instructions # 3.09 insn per cycle + 9.158879879 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 6614) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141133E-004 Relative difference = 2.8372990776517314e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.017866e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.022441e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.022441e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 3.280001 sec - 11,548,271,811 cycles:u # 3.496 GHz (74.86%) - 851,534 stalled-cycles-frontend:u # 0.01% frontend cycles idle (75.05%) - 1,666,698,780 stalled-cycles-backend:u # 14.43% backend cycles idle (75.06%) - 39,231,416,956 instructions:u # 3.40 insn per cycle - # 0.04 stalled cycles per insn (75.06%) - 3.306510540 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.577595e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.580993e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.580993e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 4.598491 sec + 12,916,287,273 cycles # 2.806 GHz + 39,254,753,938 instructions # 3.04 insn per cycle + 4.603937867 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:12814) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141122E-004 Relative difference = 2.837299079287849e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.200156e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.202733e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.202733e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 1.378042 sec - 4,849,581,934 cycles:u # 3.462 GHz (74.88%) - 1,287,264 stalled-cycles-frontend:u # 0.03% frontend cycles idle (74.88%) - 601,990,040 stalled-cycles-backend:u # 12.41% backend cycles idle (74.88%) - 13,817,429,650 instructions:u # 2.85 insn per cycle - # 0.04 stalled cycles per insn (74.92%) - 1.404318947 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11041) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.852795e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.869019e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.869019e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 2.103011 sec + 5,568,678,671 cycles # 2.642 GHz + 13,799,771,926 instructions # 2.48 insn per cycle + 2.108561686 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11059) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266731198157309E-004 +Relative difference = 2.837296636563793e-07 +OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! 
Instantiate host Bridge (nevt=16384) +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 9.035305e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.056800e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.056800e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 1.829572 sec + 4,921,598,332 cycles # 2.684 GHz + 12,328,469,851 instructions # 2.50 insn per cycle + 1.835230648 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 9762) (512y: 94) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198157309E-004 Relative difference = 2.837296636563793e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! Instantiate host Bridge (nevt=16384) +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 6.926825e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.939647e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.939647e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 2.382359 sec + 4,075,002,441 cycles # 1.707 GHz + 6,297,411,526 instructions # 1.55 insn per cycle + 2.387952463 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1516) (512y: 94) (512z: 9019) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266731198157309E-004 +Relative difference = 2.837296636563793e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_common.txt index b374ab3593..2624aa384f 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_common.txt @@ -1,181 +1,223 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasNoCurand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) +RNDGEN=hasCurand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-01-31_14:54:05 +DATE: 2024-01-30_06:01:23 -On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.329430e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.511862e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.512447e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.497090e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.524372e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.526818e+05 ) sec^-1 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 0.651783 sec - 1,962,849,327 cycles:u # 2.904 GHz (75.27%) - 2,413,372 stalled-cycles-frontend:u # 0.12% frontend cycles idle (75.16%) - 33,678,909 stalled-cycles-backend:u # 1.72% backend cycles idle (75.16%) - 2,163,070,926 instructions:u # 1.10 insn per cycle - # 0.02 stalled cycles per insn (75.35%) - 0.694638226 seconds time elapsed +TOTAL : 0.512897 sec + 2,098,983,374 cycles # 2.834 GHz + 3,277,353,449 instructions # 1.56 insn per cycle + 0.803360908 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --common +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.240071e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.242972e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.243032e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.141054e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.174803e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.176256e+05 ) sec^-1 MeanMatrixElemValue = ( 1.252232e+02 +- 1.234346e+02 ) GeV^-4 -TOTAL : 8.410192 sec - 28,921,900,074 cycles:u # 3.425 GHz (75.02%) - 11,771,288 stalled-cycles-frontend:u # 0.04% frontend cycles idle (75.04%) - 1,138,813,727 stalled-cycles-backend:u # 3.94% backend cycles idle (74.98%) - 22,751,204,493 instructions:u # 0.79 insn per cycle - # 0.05 stalled cycles per insn (74.97%) - 8.471415614 seconds time elapsed +TOTAL : 3.134385 sec + 9,742,325,189 cycles # 2.872 GHz + 21,219,396,735 instructions # 2.18 insn per cycle + 3.451782991 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 6.626675e-04 -Avg ME (F77/CUDA) = 6.6266731198158101E-004 -Relative difference = 2.837296517127185e-07 +Avg ME (F77/CUDA) = 6.6266731198158133E-004 +Relative difference = 2.837296512218831e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe -p 64 256 1 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe -p 64 256 1 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.223034e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.223927e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.223927e+03 ) sec^-1 +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.789322e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.790133e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.790133e+03 ) sec^-1 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 7.385145 sec - 25,940,859,836 cycles:u # 3.502 GHz (74.95%) - 1,545,950 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.94%) - 4,038,405,772 stalled-cycles-backend:u # 15.57% backend cycles idle (74.95%) - 81,778,395,890 instructions:u # 3.15 insn per cycle - # 0.05 stalled cycles per insn (75.00%) - 7.409443065 seconds time elapsed +TOTAL : 9.178912 sec + 26,467,085,384 cycles # 2.885 GHz + 81,758,395,147 instructions # 3.09 insn per cycle + 9.184185479 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 6614) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141133E-004 Relative difference = 2.8372990776517314e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 1 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 1 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.018482e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.023068e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.023068e+03 ) sec^-1 +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.580434e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.583873e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.583873e+03 ) sec^-1 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 3.275778 sec - 11,537,497,698 cycles:u # 3.498 GHz (74.95%) - 829,315 stalled-cycles-frontend:u # 0.01% frontend cycles idle (75.02%) - 1,692,036,026 stalled-cycles-backend:u # 14.67% backend cycles idle (75.02%) - 39,243,269,509 instructions:u # 3.40 insn per cycle - # 0.04 stalled cycles per insn (75.02%) - 3.300021140 seconds time elapsed +TOTAL : 4.592041 sec + 12,908,303,532 cycles # 2.809 GHz + 39,241,301,392 instructions # 3.04 insn per cycle + 4.597199751 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:12814) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141122E-004 Relative difference = 2.837299079287849e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 1 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 1 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.191885e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.194427e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.194427e+04 ) sec^-1 +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 8.006274e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.022952e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.022952e+03 ) sec^-1 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 1.383808 sec - 4,888,153,917 cycles:u # 3.477 GHz (74.97%) - 3,032,785 stalled-cycles-frontend:u # 0.06% frontend cycles idle (74.97%) - 617,814,695 stalled-cycles-backend:u # 12.64% backend cycles idle (74.97%) - 13,805,762,990 instructions:u # 2.82 insn per cycle - # 0.04 stalled cycles per insn (74.98%) - 1.407829447 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11041) (512y: 0) (512z: 0) +TOTAL : 2.060575 sec + 5,561,277,799 cycles # 2.694 GHz + 13,787,529,346 instructions # 2.48 insn per cycle + 2.065507699 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11059) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198157309E-004 Relative difference = 2.837296636563793e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe -p 64 256 1 --common OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 9.108001e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.130506e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.130506e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 +TOTAL : 1.812494 sec + 4,903,037,786 cycles # 2.699 GHz + 12,315,866,756 instructions # 2.51 insn per cycle + 1.817504411 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 9762) (512y: 94) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266731198157309E-004 +Relative difference = 2.837296636563793e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe -p 64 256 1 --common OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 6.888313e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.900941e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.900941e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 +TOTAL : 2.393126 sec + 4,056,497,728 cycles # 1.692 GHz + 6,284,230,028 instructions # 1.55 insn per cycle + 2.398190383 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1516) (512y: 94) (512z: 9019) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266731198157309E-004 +Relative difference = 2.837296636563793e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_curhst.txt index fd6be47ed8..711141aac6 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_curhst.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_curhst.txt @@ -1,143 +1,223 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasNoCurand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) +RNDGEN=hasCurand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
make USEBUILDDIR=1 AVX=512y
-make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
make USEBUILDDIR=1 AVX=512z
-make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
-DATE: 2024-01-31_14:50:55
+DATE: 2024-01-30_05:57:54
-On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]:
+On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
=========================================================================
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --curhst OMP=
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --curhst OMP=
+WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
+Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CUD:DBL+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK
+FP precision = DOUBLE (NaN/abnormal=0, zero=0)
+EvtsPerSec[Rmb+ME] (23) = ( 3.493459e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.521487e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.524248e+05 ) sec^-1
+MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
+TOTAL : 0.509745 sec
+ 2,103,800,649 cycles # 2.836 GHz
+ 3,325,789,020 instructions # 1.58 insn per cycle
+ 0.801087014 seconds time elapsed
+runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --curhst
WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe: Aborted
- 55,467,149 cycles:u # 2.539 GHz (63.41%)
- 45,969 stalled-cycles-frontend:u # 0.08% frontend cycles idle (63.41%)
- 520,435 stalled-cycles-backend:u # 0.94% backend cycles idle (63.41%)
- 43,374,922 instructions:u # 0.78 insn per cycle
- # 0.01 stalled cycles per insn (59.26%)
- 0.022778455 seconds time elapsed
+==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
+==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
.........................................................................
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --curhst OMP=
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --curhst OMP=
WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe: Aborted
- 50,274,667 cycles:u # 2.342 GHz (62.76%)
- 46,338 stalled-cycles-frontend:u # 0.09% frontend cycles idle (62.77%)
- 274,830 stalled-cycles-backend:u # 0.55% backend cycles idle (62.77%)
- 44,684,416 instructions:u # 0.89 insn per cycle
- # 0.01 stalled cycles per insn (64.88%)
- 0.022435061 seconds time elapsed
+Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CUD:DBL+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK
+FP precision = DOUBLE (NaN/abnormal=0, zero=0)
+EvtsPerSec[Rmb+ME] (23) = ( 4.145753e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.180203e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.181702e+05 ) sec^-1
+MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4
+TOTAL : 3.079125 sec
+ 9,600,126,952 cycles # 2.878 GHz
+ 21,681,876,510 instructions # 2.26 insn per cycle
+ 3.393235673 seconds time elapsed
-------------------------------------------------------------------------
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2
Avg ME (C++/CUDA) = 6.626675e-04
-Avg ME (F77/CUDA) = 6.6266731198158101E-004
-Relative difference = 2.837296517127185e-07
+Avg ME (F77/CUDA) = 6.6266731198158133E-004
+Relative difference = 2.837296512218831e-07
OK (relative difference <= 5E-3)
=========================================================================
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP=
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP=
WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe: Aborted
- 56,504,472 cycles:u # 2.636 GHz (62.71%)
- 39,699 stalled-cycles-frontend:u # 0.07% frontend cycles idle (62.72%)
- 638,001 stalled-cycles-backend:u # 1.13% backend cycles idle (62.72%)
- 42,217,033 instructions:u # 0.75 insn per cycle
- # 0.02 stalled cycles per insn (59.30%)
- 0.022705545 seconds time elapsed
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
+FP precision = DOUBLE (NaN/abnormal=0, zero=0)
+Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
+OMP threads / `nproc --all` = 1 / 4
+EvtsPerSec[Rmb+ME] (23) = ( 1.797758e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.798583e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.798583e+03 ) sec^-1
+MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
+TOTAL : 9.135643 sec
+ 26,454,582,305 cycles # 2.896 GHz
+ 81,754,058,548 instructions # 3.09 insn per cycle
+ 9.140745485 seconds time elapsed
=Symbols in CPPProcess.o= (~sse4: 6614) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest.exe
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest.exe
[ PASSED ] 6 tests.
-------------------------------------------------------------------------
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2
Avg ME (C++/C++) = 6.626675e-04
Avg ME (F77/C++) = 6.6266731198141133E-004
Relative difference = 2.8372990776517314e-07
OK (relative difference <= 5E-3)
-------------------------------------------------------------------------
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP=
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP=
WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe: Aborted
- 55,692,026 cycles:u # 2.595 GHz (62.76%)
- 42,533 stalled-cycles-frontend:u # 0.08% frontend cycles idle (62.76%)
- 538,603 stalled-cycles-backend:u # 0.97% backend cycles idle (62.76%)
- 39,503,692 instructions:u # 0.71 insn per cycle
- # 0.01 stalled cycles per insn (64.30%)
- 0.022723669 seconds time elapsed
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
+FP precision = DOUBLE (NaN/abnormal=0, zero=0)
+Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
+OMP threads / `nproc --all` = 1 / 4
+EvtsPerSec[Rmb+ME] (23) = ( 3.597207e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.600539e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.600539e+03 ) sec^-1
+MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
+TOTAL : 4.569387 sec
+ 12,892,653,048 cycles # 2.819 GHz
+ 39,241,760,724 instructions # 3.04 insn per cycle
+ 4.574378716 seconds time elapsed
=Symbols in CPPProcess.o= (~sse4:12814) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest.exe
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest.exe
[ PASSED ] 6 tests.
-------------------------------------------------------------------------
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2
Avg ME (C++/C++) = 6.626675e-04
Avg ME (F77/C++) = 6.6266731198141122E-004
Relative difference = 2.837299079287849e-07
OK (relative difference <= 5E-3)
-------------------------------------------------------------------------
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP=
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP=
WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe: Aborted
- 53,931,580 cycles:u # 2.499 GHz (62.96%)
- 45,686 stalled-cycles-frontend:u # 0.08% frontend cycles idle (62.96%)
- 607,688 stalled-cycles-backend:u # 1.13% backend cycles idle (62.96%)
- 40,872,369 instructions:u # 0.76 insn per cycle
- # 0.01 stalled cycles per insn (64.59%)
- 0.022908456 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11041) (512y: 0) (512z: 0)
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
+FP precision = DOUBLE (NaN/abnormal=0, zero=0)
+Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
+OMP threads / `nproc --all` = 1 / 4
+EvtsPerSec[Rmb+ME] (23) = ( 7.978625e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.995447e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.995447e+03 ) sec^-1
+MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
+TOTAL : 2.065609 sec
+ 5,559,302,417 cycles # 2.687 GHz
+ 13,789,202,442 instructions # 2.48 insn per cycle
+ 2.071000161 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11059) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest.exe
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest.exe
[ PASSED ] 6 tests.
-------------------------------------------------------------------------
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2
Avg ME (C++/C++) = 6.626675e-04
Avg ME (F77/C++) = 6.6266731198157309E-004
Relative difference = 2.837296636563793e-07
OK (relative difference <= 5E-3)
-------------------------------------------------------------------------
-/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo)
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP=
+WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
+FP precision = DOUBLE (NaN/abnormal=0, zero=0)
+Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
+OMP threads / `nproc --all` = 1 / 4
+EvtsPerSec[Rmb+ME] (23) = ( 9.097696e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 9.119952e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 9.119952e+03 ) sec^-1
+MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
+TOTAL : 1.812868 sec
+ 4,896,837,509 cycles # 2.695 GHz
+ 12,317,770,581 instructions # 2.52 insn per cycle
+ 1.818257681 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 9762) (512y: 94) (512z: 0)
+-------------------------------------------------------------------------
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest.exe
+[ PASSED ] 6 tests.
-------------------------------------------------------------------------
-/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo)
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2
+Avg ME (C++/C++) = 6.626675e-04
+Avg ME (F77/C++) = 6.6266731198157309E-004
+Relative difference = 2.837296636563793e-07
+OK (relative difference <= 5E-3)
+-------------------------------------------------------------------------
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP=
+WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
+FP precision = DOUBLE (NaN/abnormal=0, zero=0)
+Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
+OMP threads / `nproc --all` = 1 / 4
+EvtsPerSec[Rmb+ME] (23) = ( 6.967466e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 6.979997e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.979997e+03 ) sec^-1
+MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
+TOTAL : 2.364567 sec
+ 4,060,623,360 cycles # 1.715 GHz
+ 6,286,167,500 instructions # 1.55 insn per cycle
+ 2.369629620 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1516) (512y: 94) (512z: 9019)
+-------------------------------------------------------------------------
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest.exe
+[ PASSED ] 6 tests.
+-------------------------------------------------------------------------
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2
+Avg ME (C++/C++) = 6.626675e-04
+Avg ME (F77/C++) = 6.6266731198157309E-004
+Relative difference = 2.837296636563793e-07
+OK (relative difference <= 5E-3)
=========================================================================
TEST COMPLETED
diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_rmbhst.txt
index d2d9dea879..de6151d7b3 100644
--- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_rmbhst.txt
+++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_rmbhst.txt
@@ -1,181 +1,226 @@
export CUDACPP_RUNTIME_ENABLEFPE=on
-Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg
-OMPFLAGS=
-AVX=avx2
+Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg
+OMPFLAGS=-fopenmp
+AVX=512y
FPTYPE=d
HELINL=0
HRDCOD=0
-RNDGEN=hasNoCurand
-Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1)
+RNDGEN=hasCurand
+Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1)
make: Nothing to be done for 'gtestlibs'.
-CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0'
+CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0'
make USEBUILDDIR=1 AVX=none
-make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
CUDACPP_BUILDDIR='build.none_d_inl0_hrd0'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
make USEBUILDDIR=1 AVX=sse4
-make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
make USEBUILDDIR=1 AVX=avx2
-make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
make USEBUILDDIR=1 AVX=512y
-make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
make USEBUILDDIR=1 AVX=512z
-make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
-DATE: 2024-01-31_14:48:00
+DATE: 2024-01-30_05:54:28
-On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]:
+On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
=========================================================================
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --rmbhst OMP=
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --rmbhst OMP=
WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+MESDEV/none+NAVBRK
+WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
+Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CUD:DBL+THX:CURHST+RMBHST+MESDEV/none+NAVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 7.265352e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 7.394013e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 7.395125e+04 ) sec^-1
-MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4
-TOTAL : 0.666267 sec
- 1,997,556,313 cycles:u # 2.914 GHz (75.49%)
- 2,860,510 stalled-cycles-frontend:u # 0.14% frontend cycles idle (75.50%)
- 33,947,641 stalled-cycles-backend:u # 1.70% backend cycles idle (75.50%)
- 2,182,905,169 instructions:u # 1.09 insn per cycle
- # 0.02 stalled cycles per insn (75.43%)
- 0.710655270 seconds time elapsed
+EvtsPerSec[Rmb+ME] (23) = ( 3.181803e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.496640e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.499302e+05 ) sec^-1
+MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
+TOTAL : 0.513215 sec
+ 2,110,243,378 cycles # 2.841 GHz
+ 3,364,158,559 instructions # 1.59 insn per cycle
+ 0.803846009 seconds time elapsed
+runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --rmbhst
+WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
+WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
+==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
+==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
.........................................................................
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --rmbhst OMP=
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --rmbhst OMP=
WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+MESDEV/none+NAVBRK
+WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
+Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CUD:DBL+THX:CURHST+RMBHST+MESDEV/none+NAVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 1.213332e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.245422e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.245484e+05 ) sec^-1
-MeanMatrixElemValue = ( 1.252232e+02 +- 1.234346e+02 ) GeV^-4
-TOTAL : 8.499679 sec
- 29,303,278,877 cycles:u # 3.431 GHz (74.92%)
- 22,905,012 stalled-cycles-frontend:u # 0.08% frontend cycles idle (74.97%)
- 1,135,894,583 stalled-cycles-backend:u # 3.88% backend cycles idle (75.00%)
- 23,523,863,711 instructions:u # 0.80 insn per cycle
- # 0.05 stalled cycles per insn (75.01%)
- 8.558405499 seconds time elapsed
+EvtsPerSec[Rmb+ME] (23) = ( 3.724341e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.176977e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.178501e+05 ) sec^-1
+MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4
+TOTAL : 3.211617 sec
+ 9,930,008,722 cycles # 2.863 GHz
+ 21,629,593,771 instructions # 2.18 insn per cycle
+ 3.536993543 seconds time elapsed
-------------------------------------------------------------------------
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2
Avg ME (C++/CUDA) = 6.626675e-04
-Avg ME (F77/CUDA) = 6.6266731198158101E-004
-Relative difference = 2.837296517127185e-07
+Avg ME (F77/CUDA) = 6.6266731198158133E-004
+Relative difference = 2.837296512218831e-07
OK (relative difference <= 5E-3)
=========================================================================
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP=
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP=
WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
-EvtsPerSec[Rmb+ME] (23) = ( 2.216718e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.217614e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.217614e+03 ) sec^-1
-MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4
-TOTAL : 7.406194 sec
- 26,035,489,997 cycles:u # 3.505 GHz (74.96%)
- 1,701,351 stalled-cycles-frontend:u # 0.01% frontend cycles idle (75.00%)
- 3,983,610,228 stalled-cycles-backend:u # 15.30% backend cycles idle (75.02%)
- 81,732,369,026 instructions:u # 3.14 insn per cycle
- # 0.05 stalled cycles per insn (75.02%)
- 7.430419339 seconds time elapsed
+OMP threads / `nproc --all` = 1 / 4
+EvtsPerSec[Rmb+ME] (23) = ( 1.795958e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.796814e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.796814e+03 ) sec^-1
+MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
+TOTAL : 9.141774 sec
+ 26,442,082,623 cycles # 2.892 GHz
+ 81,755,899,902 instructions # 3.09 insn per cycle
+ 9.146895276 seconds time elapsed
=Symbols in CPPProcess.o= (~sse4: 6614) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest.exe
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest.exe
[ PASSED ] 6 tests.
-------------------------------------------------------------------------
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2
Avg ME (C++/C++) = 6.626675e-04
Avg ME (F77/C++) = 6.6266731198141133E-004
Relative difference = 2.8372990776517314e-07
OK (relative difference <= 5E-3)
-------------------------------------------------------------------------
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP=
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP=
WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 5.018118e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.022700e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 5.022700e+03 ) sec^-1
-MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4
-TOTAL : 3.276016 sec
- 11,535,823,383 cycles:u # 3.497 GHz (74.95%)
- 846,680 stalled-cycles-frontend:u # 0.01% frontend cycles idle (75.02%)
- 1,656,392,067 stalled-cycles-backend:u # 14.36% backend cycles idle (75.02%)
- 39,237,814,528 instructions:u # 3.40 insn per cycle
- # 0.04 stalled cycles per insn (75.02%)
- 3.300280855 seconds time elapsed
+OMP threads / `nproc --all` = 1 / 4
+EvtsPerSec[Rmb+ME] (23) = ( 3.584252e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.587667e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.587667e+03 ) sec^-1
+MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
+TOTAL : 4.585664 sec
+ 12,903,354,074 cycles # 2.812 GHz
+ 39,243,037,589 instructions # 3.04 insn per cycle
+ 4.591083081 seconds time elapsed
=Symbols in CPPProcess.o= (~sse4:12814) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest.exe
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest.exe
[ PASSED ] 6 tests.
-------------------------------------------------------------------------
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2
Avg ME (C++/C++) = 6.626675e-04
Avg ME (F77/C++) = 6.6266731198141122E-004
Relative difference = 2.837299079287849e-07
OK (relative difference <= 5E-3)
-------------------------------------------------------------------------
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP=
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP=
WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 1.202443e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.205027e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.205027e+04 ) sec^-1
-MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4
-TOTAL : 1.371746 sec
- 4,849,521,512 cycles:u # 3.479 GHz (74.75%)
- 702,228 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.75%)
- 579,718,051 stalled-cycles-backend:u # 11.95% backend cycles idle (74.79%)
- 13,837,586,039 instructions:u # 2.85 insn per cycle
- # 0.04 stalled cycles per insn (74.95%)
- 1.395711173 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11041) (512y: 0) (512z: 0)
+OMP threads / `nproc --all` = 1 / 4
+EvtsPerSec[Rmb+ME] (23) = ( 7.993074e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 8.009513e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.009513e+03 ) sec^-1
+MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
+TOTAL : 2.061753 sec
+ 5,556,410,491 cycles # 2.690 GHz
+ 13,788,754,708 instructions # 2.48 insn per cycle
+ 2.066810636 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11059) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest.exe
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest.exe
[ PASSED ] 6 tests.
-------------------------------------------------------------------------
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2
Avg ME (C++/C++) = 6.626675e-04
Avg ME (F77/C++) = 6.6266731198157309E-004
Relative difference = 2.837296636563793e-07
OK (relative difference <= 5E-3)
-------------------------------------------------------------------------
-/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo)
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP=
+WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
+FP precision = DOUBLE (NaN/abnormal=0, zero=0)
+Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
+OMP threads / `nproc --all` = 1 / 4
+EvtsPerSec[Rmb+ME] (23) = ( 9.089775e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 9.111272e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 9.111272e+03 ) sec^-1
+MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
+TOTAL : 1.814284 sec
+ 4,898,229,262 cycles # 2.694 GHz
+ 12,317,871,193 instructions # 2.51 insn per cycle
+ 1.819291757 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 9762) (512y: 94) (512z: 0)
-------------------------------------------------------------------------
-/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo)
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest.exe
+[ PASSED ] 6 tests.
+-------------------------------------------------------------------------
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2
+Avg ME (C++/C++) = 6.626675e-04
+Avg ME (F77/C++) = 6.6266731198157309E-004
+Relative difference = 2.837296636563793e-07
+OK (relative difference <= 5E-3)
+-------------------------------------------------------------------------
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP=
+WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
+FP precision = DOUBLE (NaN/abnormal=0, zero=0)
+Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
+OMP threads / `nproc --all` = 1 / 4
+EvtsPerSec[Rmb+ME] (23) = ( 6.893591e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 6.906421e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.906421e+03 ) sec^-1
+MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
+TOTAL : 2.392306 sec
+ 4,056,818,337 cycles # 1.695 GHz
+ 6,287,135,022 instructions # 1.55 insn per cycle
+ 2.397424437 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1516) (512y: 94) (512z: 9019)
+-------------------------------------------------------------------------
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest.exe
+[ PASSED ] 6 tests.
+-------------------------------------------------------------------------
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2
+Avg ME (C++/C++) = 6.626675e-04
+Avg ME (F77/C++) = 6.6266731198157309E-004
+Relative difference = 2.837296636563793e-07
+OK (relative difference <= 5E-3)
=========================================================================
TEST COMPLETED
diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd1.txt
index 5b91a0822e..ce8b9bfd9b 100644
--- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd1.txt
+++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd1.txt
@@ -1,181 +1,223 @@
export CUDACPP_RUNTIME_ENABLEFPE=on
-Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg
-OMPFLAGS=
-AVX=avx2
+Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg
+OMPFLAGS=-fopenmp
+AVX=512y
FPTYPE=d
HELINL=0
HRDCOD=0
-RNDGEN=hasNoCurand
-Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1)
+RNDGEN=hasCurand
+Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1)
make: Nothing to be done for 'gtestlibs'.
-CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd1'
+CUDACPP_BUILDDIR='build.512y_d_inl0_hrd1'
make USEBUILDDIR=1 AVX=none
-make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
CUDACPP_BUILDDIR='build.none_d_inl0_hrd1'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
make USEBUILDDIR=1 AVX=sse4
-make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd1'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
make USEBUILDDIR=1 AVX=avx2
-make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd1'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
make USEBUILDDIR=1 AVX=512y
-make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
CUDACPP_BUILDDIR='build.512y_d_inl0_hrd1'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
make USEBUILDDIR=1 AVX=512z
-make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
CUDACPP_BUILDDIR='build.512z_d_inl0_hrd1'
make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
-DATE: 2024-01-31_13:56:10
+DATE: 2024-01-30_05:01:27
-On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]:
+On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
=========================================================================
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/gcheck.exe -p 64 256 1 OMP=
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/gcheck.exe -p 64 256 1 OMP=
WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1]
-Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK
+Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1]
+Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 1.385794e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.447460e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.448035e+05 ) sec^-1
-MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4
-TOTAL : 0.530879 sec
- 1,589,166,725 cycles:u # 2.860 GHz (73.97%)
- 2,465,598 stalled-cycles-frontend:u # 0.16% frontend cycles idle (75.37%)
- 33,426,297 stalled-cycles-backend:u # 2.10% backend cycles idle (75.53%)
- 1,838,676,697 instructions:u # 1.16 insn per cycle
- # 0.02 stalled cycles per insn (75.54%)
- 0.578463301 seconds time elapsed
+EvtsPerSec[Rmb+ME] (23) = ( 3.464704e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.493618e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.496312e+05 ) sec^-1
+MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
+TOTAL : 0.530379 sec
+ 2,191,459,528 cycles # 2.836 GHz
+ 3,378,194,635 instructions # 1.54 insn per cycle
+ 0.862349447 seconds time elapsed
+runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/gcheck.exe -p 64 256 1
+WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
+==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
+==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
.........................................................................
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP=
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP=
WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1]
-Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK
+Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1]
+Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 1.741198e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.747669e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.747790e+05 ) sec^-1
-MeanMatrixElemValue = ( 1.252232e+02 +- 1.234346e+02 ) GeV^-4
-TOTAL : 7.025241 sec
- 24,087,705,387 cycles:u # 3.411 GHz (74.98%)
- 11,687,983 stalled-cycles-frontend:u # 0.05% frontend cycles idle (74.97%)
- 1,123,050,418 stalled-cycles-backend:u # 4.66% backend cycles idle (74.97%)
- 18,923,158,094 instructions:u # 0.79 insn per cycle
- # 0.06 stalled cycles per insn (75.02%)
- 7.086487861 seconds time elapsed
+EvtsPerSec[Rmb+ME] (23) = ( 4.136041e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.170363e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.171805e+05 ) sec^-1
+MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4
+TOTAL : 3.033894 sec
+ 9,468,330,012 cycles # 2.874 GHz
+ 21,262,061,450 instructions # 2.25 insn per cycle
+ 3.350179318 seconds time elapsed
-------------------------------------------------------------------------
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/gcheck.exe --common -p 2 64 2
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/fgcheck.exe 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/gcheck.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/fgcheck.exe 2 64 2
Avg ME (C++/CUDA) = 6.626675e-04
-Avg ME (F77/CUDA) = 6.6266731198158101E-004
-Relative difference = 2.837296517127185e-07
+Avg ME (F77/CUDA) = 6.6266731198158133E-004
+Relative difference = 2.837296512218831e-07
OK (relative difference <= 5E-3)
=========================================================================
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/check.exe -p 64 256 1 OMP=
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/check.exe -p 64 256 1 OMP=
WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1]
-Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1]
+Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
-EvtsPerSec[Rmb+ME] (23) = ( 2.211410e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.212300e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.212300e+03 ) sec^-1
-MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4
-TOTAL : 7.423786 sec
- 26,074,419,907 cycles:u # 3.502 GHz (74.97%)
- 10,403,579 stalled-cycles-frontend:u # 0.04% frontend cycles idle (74.97%)
- 3,413,804,917 stalled-cycles-backend:u # 13.09% backend cycles idle (74.97%)
- 81,804,879,740 instructions:u # 3.14 insn per cycle
- # 0.04 stalled cycles per insn (74.94%)
- 7.448423564 seconds time elapsed
+OMP threads / `nproc --all` = 1 / 4
+EvtsPerSec[Rmb+ME] (23) = ( 1.798336e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.799213e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.799213e+03 ) sec^-1
+MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
+TOTAL : 9.130694 sec
+ 26,439,863,153 cycles # 2.895 GHz
+ 81,781,637,155 instructions # 3.09 insn per cycle
+ 9.163718345 seconds time elapsed
=Symbols in CPPProcess.o= (~sse4: 6589) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/runTest.exe
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/runTest.exe
[ PASSED ] 6 tests.
-------------------------------------------------------------------------
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/check.exe --common -p 2 64 2
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/fcheck.exe 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/check.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/fcheck.exe 2 64 2
Avg ME (C++/C++) = 6.626675e-04
Avg ME (F77/C++) = 6.6266731198141133E-004
Relative difference = 2.8372990776517314e-07
OK (relative difference <= 5E-3)
-------------------------------------------------------------------------
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd1/check.exe -p 64 256 1 OMP=
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd1/check.exe -p 64 256 1 OMP=
WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1]
-Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1]
+Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 5.002702e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 5.007290e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 5.007290e+03 ) sec^-1
-MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4
-TOTAL : 3.285875 sec
- 11,567,697,257 cycles:u # 3.496 GHz (74.88%)
- 1,070,210 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.87%)
- 1,512,067,030 stalled-cycles-backend:u # 13.07% backend cycles idle (74.93%)
- 39,276,843,409 instructions:u # 3.40 insn per cycle
- # 0.04 stalled cycles per insn (75.04%)
- 3.312043369 seconds time elapsed
+OMP threads / `nproc --all` = 1 / 4
+EvtsPerSec[Rmb+ME] (23) = ( 3.559639e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.562995e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.562995e+03 ) sec^-1
+MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
+TOTAL : 4.616847 sec
+ 12,919,257,236 cycles # 2.796 GHz
+ 39,249,733,665 instructions # 3.04 insn per cycle
+ 4.636578065 seconds time elapsed
=Symbols in CPPProcess.o= (~sse4:12771) (avx2: 0) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd1/runTest.exe
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd1/runTest.exe
[ PASSED ] 6 tests.
-------------------------------------------------------------------------
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd1/check.exe --common -p 2 64 2
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd1/fcheck.exe 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd1/check.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd1/fcheck.exe 2 64 2
Avg ME (C++/C++) = 6.626675e-04
Avg ME (F77/C++) = 6.6266731198141122E-004
Relative difference = 2.837299079287849e-07
OK (relative difference <= 5E-3)
-------------------------------------------------------------------------
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/check.exe -p 64 256 1 OMP=
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/check.exe -p 64 256 1 OMP=
WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1]
-Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1]
+Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
FP precision = DOUBLE (NaN/abnormal=0, zero=0)
Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 1.190584e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.193150e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.193150e+04 ) sec^-1
-MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4
-TOTAL : 1.385155 sec
- 4,903,114,933 cycles:u # 3.483 GHz (75.00%)
- 1,622,594 stalled-cycles-frontend:u # 0.03% frontend cycles idle (75.00%)
- 598,423,933 stalled-cycles-backend:u # 12.20% backend cycles idle (75.00%)
- 13,813,010,627 instructions:u # 2.82 insn per cycle
- # 0.04 stalled cycles per insn (75.00%)
- 1.410944151 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11030) (512y: 0) (512z: 0)
+OMP threads / `nproc --all` = 1 / 4
+EvtsPerSec[Rmb+ME] (23) = ( 8.030089e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 8.046612e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.046612e+03 ) sec^-1
+MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
+TOTAL : 2.052281 sec
+ 5,556,604,473 cycles # 2.701 GHz
+ 13,805,088,947 instructions # 2.48 insn per cycle
+ 2.071717259 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11048) (512y: 0) (512z: 0)
-------------------------------------------------------------------------
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/runTest.exe
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/runTest.exe
[ PASSED ] 6 tests.
-------------------------------------------------------------------------
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/check.exe --common -p 2 64 2
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/fcheck.exe 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/check.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/fcheck.exe 2 64 2
Avg ME (C++/C++) = 6.626675e-04
Avg ME (F77/C++) = 6.6266731198157309E-004
Relative difference = 2.837296636563793e-07
OK (relative difference <= 5E-3)
-------------------------------------------------------------------------
-/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo)
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd1/check.exe -p 64 256 1 OMP=
+WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1]
+Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
+FP precision = DOUBLE (NaN/abnormal=0, zero=0)
+Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
+OMP threads / `nproc --all` = 1 / 4
+EvtsPerSec[Rmb+ME] (23) = ( 9.135265e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 9.157006e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 9.157006e+03 ) sec^-1
+MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
+TOTAL : 1.804981 sec
+ 4,885,090,375 cycles # 2.700 GHz
+ 12,330,030,988 instructions # 2.52 insn per cycle
+ 1.821790981 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 9736) (512y: 94) (512z: 0)
-------------------------------------------------------------------------
-/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo)
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd1/runTest.exe
+[ PASSED ] 6 tests.
+-------------------------------------------------------------------------
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd1/check.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd1/fcheck.exe 2 64 2
+Avg ME (C++/C++) = 6.626675e-04
+Avg ME (F77/C++) = 6.6266731198157309E-004
+Relative difference = 2.837296636563793e-07
+OK (relative difference <= 5E-3)
+-------------------------------------------------------------------------
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd1/check.exe -p 64 256 1 OMP=
+WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1]
+Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
+FP precision = DOUBLE (NaN/abnormal=0, zero=0)
+Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
+OMP threads / `nproc --all` = 1 / 4
+EvtsPerSec[Rmb+ME] (23) = ( 6.917661e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 6.930225e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.930225e+03 ) sec^-1
+MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4
+TOTAL : 2.381269 sec
+ 4,053,625,505 cycles # 1.699 GHz
+ 6,293,972,632 instructions # 1.55 insn per cycle
+ 2.398074513 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1497) (512y: 94) (512z: 9019)
+-------------------------------------------------------------------------
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd1/runTest.exe
+[ PASSED ] 6 tests.
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266731198157309E-004 +Relative difference = 2.837296636563793e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd0.txt index a9fbe8bd9d..466f11943e 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd0.txt @@ -1,181 +1,223 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasNoCurand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) +RNDGEN=hasCurand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.avx2_d_inl1_hrd0' +CUDACPP_BUILDDIR='build.512y_d_inl1_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512y_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-01-31_14:22:23 +DATE: 2024-01-30_05:37:36 -On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/gcheck.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/gcheck.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.316764e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.519436e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.520510e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 0.655608 sec - 1,965,790,092 cycles:u # 2.900 GHz (75.21%) - 2,480,952 stalled-cycles-frontend:u # 0.13% frontend cycles idle (75.29%) - 33,557,267 stalled-cycles-backend:u # 1.71% backend cycles idle (75.04%) - 2,195,486,875 instructions:u # 1.12 insn per cycle - # 0.02 stalled cycles per insn (74.43%) - 0.703005371 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 3.224805e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.249067e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.252384e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 0.539977 sec + 2,169,108,553 cycles # 2.827 GHz + 3,309,870,321 instructions # 1.53 insn per cycle + 0.827427226 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/gcheck.exe -p 64 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/gcheck.exe -p 2048 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/gcheck.exe -p 2048 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.236213e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.239204e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.239260e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.252232e+02 +- 1.234346e+02 ) GeV^-4 -TOTAL : 8.496923 sec - 29,197,774,810 cycles:u # 3.429 GHz (75.02%) - 11,878,663 stalled-cycles-frontend:u # 0.04% frontend cycles idle (75.00%) - 1,389,627,915 stalled-cycles-backend:u # 4.76% backend cycles idle (74.98%) - 22,780,351,175 instructions:u # 0.78 insn per cycle - # 0.06 stalled cycles per insn (75.00%) - 8.558710646 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 3.771192e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.799798e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.801021e+05 ) sec^-1 +MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 +TOTAL : 3.309705 sec + 10,258,793,873 cycles # 2.876 GHz + 23,623,503,831 instructions # 2.30 insn per cycle + 3.624842760 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 6.626675e-04 -Avg ME (F77/CUDA) = 6.6266731198158101E-004 -Relative difference = 2.837296517127185e-07 +Avg ME (F77/CUDA) = 6.6266731198158122E-004 +Relative difference = 2.837296513854949e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 4.563607e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.563978e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.563978e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 35.944813 sec - 126,032,172,389 cycles:u # 3.504 GHz (74.99%) - 88,793,755 stalled-cycles-frontend:u # 0.07% frontend cycles idle (75.00%) - 18,058,658,215 stalled-cycles-backend:u # 14.33% backend cycles idle (75.00%) - 141,495,624,349 instructions:u # 1.12 insn per cycle - # 0.13 stalled cycles per insn (75.00%) - 35.969446771 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:21543) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 4.186471e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.186937e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.186937e+02 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 39.185516 sec + 112,945,518,025 cycles # 2.882 GHz + 141,519,786,794 instructions # 1.25 insn per cycle + 39.190901211 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:21365) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198140461E-004 Relative difference = 2.8372991790910424e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.578797e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.581112e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.581112e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 4.590582 sec - 16,141,671,363 cycles:u # 3.499 GHz (74.96%) - 6,645,999 stalled-cycles-frontend:u # 0.04% frontend cycles idle (75.03%) - 5,778,102,062 stalled-cycles-backend:u # 35.80% backend cycles idle (75.03%) - 37,534,757,291 instructions:u # 2.33 insn per cycle - # 0.15 stalled cycles per insn (75.03%) - 4.616614862 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.072790e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.075243e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.075243e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 5.347436 sec + 14,950,247,924 cycles # 2.794 GHz + 37,533,141,644 instructions # 2.51 insn per cycle + 5.352716029 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:68052) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141220E-004 Relative difference = 2.837299064562788e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.354714e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.364357e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.364357e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 2.237879 sec - 7,889,725,427 cycles:u # 3.490 GHz (74.91%) - 9,840,862 stalled-cycles-frontend:u # 0.12% frontend cycles idle (74.88%) - 4,384,220,733 stalled-cycles-backend:u # 55.57% backend cycles idle (74.88%) - 12,967,933,363 instructions:u # 1.64 insn per cycle - # 0.34 stalled cycles per insn (74.90%) - 2.263613144 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:46575) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.349404e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.363561e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.363561e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 2.242056 sec + 6,032,020,393 cycles # 2.685 GHz + 12,947,712,227 instructions # 2.15 insn per cycle + 2.247452761 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:46593) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198156778E-004 Relative difference = 2.837296716733571e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd0/check.exe -p 64 256 1 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 8.895381e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.916043e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.916043e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 1.857617 sec + 4,999,907,297 cycles # 2.689 GHz + 11,364,404,504 instructions # 2.27 insn per cycle + 1.863061758 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:40158) (512y: 279) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266731198156778E-004 +Relative difference = 2.837296716733571e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd0/check.exe -p 64 256 1 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.220172e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.234094e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.234094e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 2.282224 sec + 3,899,980,695 cycles # 1.706 GHz + 5,854,430,419 instructions # 1.50 insn per cycle + 2.287473513 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2112) (512y: 142) (512z:39211) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd0/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266731198156789E-004 +Relative difference = 2.837296715097453e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd1.txt index 06761d6418..5156a1b6a3 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd1.txt @@ -1,181 +1,223 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasNoCurand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) +RNDGEN=hasCurand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.avx2_d_inl1_hrd1' +CUDACPP_BUILDDIR='build.512y_d_inl1_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512y_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-01-31_14:23:33 +DATE: 2024-01-30_05:38:48 -On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/gcheck.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/gcheck.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.356237e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.417719e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.417952e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 0.536538 sec - 1,571,606,709 cycles:u # 2.804 GHz (74.36%) - 2,305,832 stalled-cycles-frontend:u # 0.15% frontend cycles idle (75.60%) - 38,025,331 stalled-cycles-backend:u # 2.42% backend cycles idle (75.67%) - 1,821,956,477 instructions:u # 1.16 insn per cycle - # 0.02 stalled cycles per insn (75.57%) - 0.583015065 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 3.248555e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.273233e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.276174e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 0.534739 sec + 2,168,342,582 cycles # 2.838 GHz + 3,393,710,794 instructions # 1.57 insn per cycle + 0.822006178 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/gcheck.exe -p 64 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/gcheck.exe -p 2048 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/gcheck.exe -p 2048 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.740825e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.746480e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.746592e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.252232e+02 +- 1.234346e+02 ) GeV^-4 -TOTAL : 7.028717 sec - 24,122,519,384 cycles:u # 3.413 GHz (75.01%) - 11,778,428 stalled-cycles-frontend:u # 0.05% frontend cycles idle (75.04%) - 1,137,368,186 stalled-cycles-backend:u # 4.71% backend cycles idle (75.00%) - 19,029,939,825 instructions:u # 0.79 insn per cycle - # 0.06 stalled cycles per insn (74.99%) - 7.090955914 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 3.787191e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.816239e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.817488e+05 ) sec^-1 +MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 +TOTAL : 3.282278 sec + 10,172,932,501 cycles # 2.876 GHz + 20,641,658,708 instructions # 2.03 insn per cycle + 3.596374566 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 6.626675e-04 -Avg ME (F77/CUDA) = 6.6266731198158101E-004 -Relative difference = 2.837296517127185e-07 +Avg ME (F77/CUDA) = 6.6266731198158122E-004 +Relative difference = 2.837296513854949e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 4.559718e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.560092e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.560092e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 35.974834 sec - 126,130,590,034 cycles:u # 3.504 GHz (75.00%) - 24,529,804 stalled-cycles-frontend:u # 0.02% frontend cycles idle (75.00%) - 19,190,115,318 stalled-cycles-backend:u # 15.21% backend cycles idle (75.00%) - 141,679,486,407 instructions:u # 1.12 insn per cycle - # 0.14 stalled cycles per insn (75.00%) - 35.999556566 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:21831) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 4.152053e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.152498e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.152498e+02 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 39.509903 sec + 113,989,864,763 cycles # 2.886 GHz + 141,709,117,860 instructions # 1.24 insn per cycle + 39.515181315 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:21615) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198140461E-004 Relative difference = 2.8372991790910424e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 3.638494e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.640842e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.640842e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 4.515199 sec - 15,865,964,729 cycles:u # 3.497 GHz (74.97%) - 4,260,249 stalled-cycles-frontend:u # 0.03% frontend cycles idle (74.97%) - 7,290,072,550 stalled-cycles-backend:u # 45.95% backend cycles idle (74.97%) - 37,630,400,088 instructions:u # 2.37 insn per cycle - # 0.19 stalled cycles per insn (74.90%) - 4.541354971 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.077703e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.080226e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.080226e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 5.341972 sec + 14,900,472,017 cycles # 2.788 GHz + 37,594,155,695 instructions # 2.52 insn per cycle + 5.347186768 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:68056) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141220E-004 Relative difference = 2.837299064562788e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 7.729319e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.740074e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.740074e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 2.129602 sec - 7,509,265,676 cycles:u # 3.489 GHz (74.86%) - 764,819 stalled-cycles-frontend:u # 0.01% frontend cycles idle (75.02%) - 4,248,991,619 stalled-cycles-backend:u # 56.58% backend cycles idle (75.10%) - 12,850,957,521 instructions:u # 1.71 insn per cycle - # 0.33 stalled cycles per insn (75.10%) - 2.155477884 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:45645) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.479123e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.493428e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.493428e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 2.203003 sec + 5,937,038,542 cycles # 2.690 GHz + 12,831,821,287 instructions # 2.16 insn per cycle + 2.208347742 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:45663) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198156778E-004 Relative difference = 2.837296716733571e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd1/check.exe -p 64 256 1 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 8.959391e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.980227e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.980227e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 1.840604 sec + 4,989,362,539 cycles # 2.704 GHz + 11,359,801,014 instructions # 2.28 insn per cycle + 1.846082122 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:39855) (512y: 212) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd1/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266731198156778E-004 +Relative difference = 2.837296716733571e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd1/check.exe -p 64 256 1 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.264695e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.278525e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.278525e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 2.267781 sec + 3,893,427,498 cycles # 1.714 GHz + 5,843,815,532 instructions # 1.50 insn per cycle + 2.273034135 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1687) (512y: 116) (512z:38946) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd1/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266731198156789E-004 +Relative difference = 2.837296715097453e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt index 096085a906..aecab864cd 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt @@ -1,181 +1,223 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasNoCurand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) +RNDGEN=hasCurand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-01-31_13:56:44 +DATE: 2024-01-30_05:02:06 -On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.416294e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.687411e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.688288e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.202247e-01 +- 3.251485e-01 ) GeV^-4 -TOTAL : 0.442494 sec - 1,237,425,658 cycles:u # 2.645 GHz (74.37%) - 2,811,171 stalled-cycles-frontend:u # 0.23% frontend cycles idle (75.38%) - 45,430,504 stalled-cycles-backend:u # 3.67% backend cycles idle (75.91%) - 1,560,108,059 instructions:u # 1.26 insn per cycle - # 0.03 stalled cycles per insn (75.76%) - 0.491868540 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 6.329622e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.381296e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.387810e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4 +TOTAL : 0.486367 sec + 1,996,254,093 cycles # 2.831 GHz + 2,951,017,935 instructions # 1.48 insn per cycle + 0.792595596 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.684493e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.714250e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.714674e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.213664e+02 +- 1.195366e+02 ) GeV^-4 -TOTAL : 3.319944 sec - 11,188,825,919 cycles:u # 3.338 GHz (74.87%) - 27,886,740 stalled-cycles-frontend:u # 0.25% frontend cycles idle (74.92%) - 1,148,242,085 stalled-cycles-backend:u # 10.26% backend cycles idle (74.94%) - 9,094,703,062 instructions:u # 0.81 insn per cycle - # 0.13 stalled cycles per insn (74.93%) - 3.374531897 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 8.619469e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.695026e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.698446e+05 ) sec^-1 +MeanMatrixElemValue = ( 6.664703e+00 +- 5.072736e+00 ) GeV^-4 +TOTAL : 1.718352 sec + 5,604,348,056 cycles # 2.870 GHz + 11,484,891,091 instructions # 2.05 insn per cycle + 2.010002941 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 6.626791e-04 -Avg ME (F77/CUDA) = 6.6270899361878938E-004 -Relative difference = 4.511024836808726e-05 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 6.626454e-04 +Avg ME (F77/CUDA) = 6.6262659968156085E-004 +Relative difference = 2.8371612387547027e-05 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.452205e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.453244e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.453244e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.208458e-01 +- 3.253446e-01 ) GeV^-4 -TOTAL : 6.693914 sec - 23,513,432,599 cycles:u # 3.501 GHz (74.99%) - 1,337,526 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.99%) - 2,827,462,762 stalled-cycles-backend:u # 12.02% backend cycles idle (74.99%) - 75,882,073,474 instructions:u # 3.23 insn per cycle - # 0.04 stalled cycles per insn (74.99%) - 6.718000010 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.963446e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.964435e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.964435e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4 +TOTAL : 8.362127 sec + 24,202,873,915 cycles # 2.893 GHz + 75,878,244,924 instructions # 3.14 insn per cycle + 8.372784572 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 3898) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627487e-04 -Avg ME (F77/C++) = 6.6274866115424713E-004 -Relative difference = 5.861309557415831e-08 +Avg ME (F77/C++) = 6.6274870439686495E-004 +Relative difference = 6.634286759220428e-09 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 9.889595e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.906996e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.906996e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.208459e-01 +- 3.253446e-01 ) GeV^-4 -TOTAL : 1.664449 sec - 5,885,960,259 cycles:u # 3.489 GHz (74.92%) - 775,103 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.87%) - 868,726,269 stalled-cycles-backend:u # 14.76% backend cycles idle (74.87%) - 20,132,161,373 instructions:u # 3.42 insn per cycle - # 0.04 stalled cycles per insn (74.87%) - 1.690018521 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.122204e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.135618e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.135618e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.060119e+00 +- 2.367901e+00 ) GeV^-4 +TOTAL : 2.311249 sec + 6,498,315,380 cycles # 2.806 GHz + 20,115,878,445 instructions # 3.10 insn per cycle + 2.327706318 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:13237) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627485e-04 -Avg ME (F77/C++) = 6.6274845946848876E-004 -Relative difference = 6.115670001294808e-08 +Avg ME (F77/C++) = 6.6274853360924479E-004 +Relative difference = 5.071191384964548e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.336602e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.346692e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.346692e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.214980e-01 +- 3.255523e-01 ) GeV^-4 -TOTAL : 0.708050 sec - 2,530,864,472 cycles:u # 3.463 GHz (74.93%) - 722,207 stalled-cycles-frontend:u # 0.03% frontend cycles idle (74.83%) - 253,709,095 stalled-cycles-backend:u # 10.02% backend cycles idle (74.83%) - 7,056,928,449 instructions:u # 2.79 insn per cycle - # 0.04 stalled cycles per insn (74.83%) - 0.734034664 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11586) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.585863e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.592266e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.592266e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 +TOTAL : 1.042848 sec + 2,820,748,390 cycles # 2.693 GHz + 7,038,277,049 instructions # 2.50 insn per cycle + 1.060611053 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11604) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627195e-04 -Avg ME (F77/C++) = 6.6271947045332125E-004 -Relative difference = 4.4583988847766445e-08 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.627193e-04 +Avg ME (F77/C++) = 6.6271927529261421E-004 +Relative difference = 3.728182620967159e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe -p 64 256 1 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.805764e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.814413e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.814413e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 +TOTAL : 0.916917 sec + 2,479,527,909 cycles # 2.691 GHz + 6,280,728,930 instructions # 2.53 insn per cycle + 0.937569165 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10320) (512y: 50) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.627193e-04 +Avg ME (F77/C++) = 6.6271927529261421E-004 +Relative difference = 3.728182620967159e-08 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe -p 64 256 1 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.395801e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.400853e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.400853e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4 +TOTAL : 1.183787 sec + 2,037,112,677 cycles # 1.714 GHz + 3,249,000,234 instructions # 1.59 insn per cycle + 1.203517458 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2165) (512y: 48) (512z: 9219) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.627195e-04 +Avg ME (F77/C++) = 6.6271952818273971E-004 +Relative difference = 4.252589469696448e-08 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_bridge.txt index 8812663b4a..cfd5bd9f60 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_bridge.txt @@ -1,190 +1,240 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasNoCurand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) +RNDGEN=hasCurand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-01-31_14:42:06 +DATE: 2024-01-30_05:49:44 -On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) WARNING! 
Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.511327e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.675698e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.675698e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.202335e-01 +- 3.251521e-01 ) GeV^-4 -TOTAL : 0.445455 sec - 1,282,528,041 cycles:u # 2.719 GHz (74.08%) - 3,250,260 stalled-cycles-frontend:u # 0.25% frontend cycles idle (74.46%) - 33,737,049 stalled-cycles-backend:u # 2.63% backend cycles idle (74.58%) - 1,622,372,256 instructions:u # 1.26 insn per cycle - # 0.02 stalled cycles per insn (75.52%) - 0.495132750 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 5.575134e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.304295e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.304295e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.048178e+00 +- 2.364571e+00 ) GeV^-4 +TOTAL : 0.470892 sec + 1,938,590,562 cycles # 2.832 GHz + 2,932,139,577 instructions # 1.51 insn per cycle + 0.742517096 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) +WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.253302e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.695076e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.695076e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.213799e+02 +- 1.195366e+02 ) GeV^-4 -TOTAL : 3.453691 sec - 11,542,236,235 cycles:u # 3.309 GHz (75.01%) - 38,052,369 stalled-cycles-frontend:u # 0.33% frontend cycles idle (75.03%) - 1,137,356,098 stalled-cycles-backend:u # 9.85% backend cycles idle (75.03%) - 9,943,319,874 instructions:u # 0.86 insn per cycle - # 0.11 stalled cycles per insn (74.97%) - 3.511572406 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 7.189558e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.483327e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.483327e+05 ) sec^-1 +MeanMatrixElemValue = ( 6.641710e+00 +- 4.994249e+00 ) GeV^-4 +TOTAL : 1.911946 sec + 6,179,624,048 cycles # 2.874 GHz + 12,701,880,125 instructions # 2.06 insn per cycle + 2.209063416 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 6.626791e-04 -Avg ME (F77/CUDA) = 6.6270899361878938E-004 -Relative difference = 4.511024836808726e-05 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 6.626454e-04 +Avg ME (F77/CUDA) = 6.6262659968156085E-004 +Relative difference = 2.8371612387547027e-05 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.451008e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.452057e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.452057e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.208458e-01 +- 3.253446e-01 ) GeV^-4 -TOTAL : 6.699230 sec - 23,539,412,014 cycles:u # 3.502 GHz (74.98%) - 1,325,927 stalled-cycles-frontend:u # 0.01% frontend cycles idle (75.01%) - 2,764,954,908 stalled-cycles-backend:u # 11.75% backend cycles idle (75.01%) - 75,882,013,709 instructions:u # 3.22 insn per cycle - # 0.04 stalled cycles per insn (75.01%) - 6.723781000 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.966573e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.967552e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.967552e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4 +TOTAL : 8.353267 sec + 24,210,307,332 cycles # 2.898 GHz + 75,882,231,103 instructions # 3.13 insn per cycle + 8.358202878 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 3898) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627487e-04 -Avg ME (F77/C++) = 6.6274866115424713E-004 -Relative difference = 5.861309557415831e-08 +Avg ME (F77/C++) = 6.6274870439686495E-004 +Relative difference = 6.634286759220428e-09 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 9.884448e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.902255e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.902255e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.208459e-01 +- 3.253446e-01 ) GeV^-4 -TOTAL : 1.667427 sec - 5,876,182,869 cycles:u # 3.476 GHz (74.92%) - 755,115 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.92%) - 885,630,582 stalled-cycles-backend:u # 15.07% backend cycles idle (74.92%) - 20,136,475,777 instructions:u # 3.43 insn per cycle - # 0.04 stalled cycles per insn (74.93%) - 1.693626257 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.010932e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.023878e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.023878e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.060119e+00 +- 2.367901e+00 ) GeV^-4 +TOTAL : 2.350780 sec + 6,507,988,967 cycles # 2.764 GHz + 20,124,211,431 instructions # 3.09 insn per cycle + 2.355993372 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:13237) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627485e-04 -Avg ME (F77/C++) = 6.6274845946848876E-004 -Relative difference = 6.115670001294808e-08 +Avg ME (F77/C++) = 6.6274853360924479E-004 +Relative difference = 5.071191384964548e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.350229e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.360492e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.360492e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.214980e-01 +- 3.255523e-01 ) GeV^-4 -TOTAL : 0.705971 sec - 2,503,907,768 cycles:u # 3.438 GHz (74.78%) - 805,820 stalled-cycles-frontend:u # 0.03% frontend cycles idle (74.74%) - 250,706,028 stalled-cycles-backend:u # 10.01% backend cycles idle (74.74%) - 7,078,577,500 instructions:u # 2.83 insn per cycle - # 0.04 stalled cycles per insn (74.84%) - 0.731878359 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11586) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.585110e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.591932e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.591932e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 +TOTAL : 1.046222 sec + 2,830,060,229 cycles # 2.694 GHz + 7,047,238,365 instructions # 2.49 insn per cycle + 1.051506977 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11604) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627195e-04 -Avg ME (F77/C++) = 6.6271947045332125E-004 -Relative difference = 4.4583988847766445e-08 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.627193e-04 +Avg ME (F77/C++) = 6.6271927529261421E-004 +Relative difference = 3.728182620967159e-08 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! 
Instantiate host Bridge (nevt=16384) +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.805765e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.814390e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.814390e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 +TOTAL : 0.919776 sec + 2,488,595,721 cycles # 2.693 GHz + 6,289,461,030 instructions # 2.53 insn per cycle + 0.925186931 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10320) (512y: 50) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.627193e-04 +Avg ME (F77/C++) = 6.6271927529261421E-004 +Relative difference = 3.728182620967159e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! Instantiate host Bridge (nevt=16384) +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.390787e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.395884e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.395884e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4 +TOTAL : 1.191044 sec + 2,045,888,825 cycles # 1.712 GHz + 3,258,286,239 instructions # 1.59 insn per cycle + 1.196330024 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2165) (512y: 48) (512z: 9219) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.627195e-04 +Avg ME (F77/C++) = 6.6271952818273971E-004 +Relative difference = 4.252589469696448e-08 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_common.txt index b57e941bfc..18818d76f2 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_common.txt @@ -1,181 +1,223 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasNoCurand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) +RNDGEN=hasCurand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-01-31_14:54:41 +DATE: 2024-01-30_06:02:02 -On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.484047e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.683097e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.684518e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.202247e-01 +- 3.251485e-01 ) GeV^-4 -TOTAL : 0.462588 sec - 1,263,747,733 cycles:u # 2.709 GHz (75.37%) - 2,821,312 stalled-cycles-frontend:u # 0.22% frontend cycles idle (74.27%) - 33,721,485 stalled-cycles-backend:u # 2.67% backend cycles idle (74.04%) - 1,629,313,626 instructions:u # 1.29 insn per cycle - # 0.02 stalled cycles per insn (74.32%) - 0.505592407 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 6.319163e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.372298e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.378244e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.159397e-01 +- 3.238804e-01 ) GeV^-4 +TOTAL : 0.470285 sec + 1,953,187,910 cycles # 2.826 GHz + 2,879,626,230 instructions # 1.47 insn per cycle + 0.750725256 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --common +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.679193e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.711323e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.711744e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.213664e+02 +- 1.195366e+02 ) GeV^-4 -TOTAL : 3.315193 sec - 11,143,533,112 cycles:u # 3.330 GHz (74.93%) - 28,071,485 stalled-cycles-frontend:u # 0.25% frontend cycles idle (74.90%) - 1,147,162,310 stalled-cycles-backend:u # 10.29% backend cycles idle (74.97%) - 8,994,649,560 instructions:u # 0.81 insn per cycle - # 0.13 stalled cycles per insn (75.11%) - 3.366939054 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 8.571852e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.645768e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.649137e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.094367e+02 +- 1.071509e+02 ) GeV^-4 +TOTAL : 1.807519 sec + 5,850,952,354 cycles # 2.861 GHz + 11,909,032,858 instructions # 2.04 insn per cycle + 2.113707665 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 6.626791e-04 -Avg ME (F77/CUDA) = 6.6270899361878938E-004 -Relative difference = 4.511024836808726e-05 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 6.626454e-04 +Avg ME (F77/CUDA) = 6.6262659968156085E-004 +Relative difference = 2.8371612387547027e-05 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe -p 64 256 1 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe -p 64 256 1 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.453118e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.454148e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.454148e+03 ) sec^-1 +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.964186e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.965186e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.965186e+03 ) sec^-1 MeanMatrixElemValue = ( 4.208458e-01 +- 3.253446e-01 ) GeV^-4 -TOTAL : 6.691448 sec - 23,508,412,612 cycles:u # 3.502 GHz (74.98%) - 1,325,577 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.98%) - 2,760,550,099 stalled-cycles-backend:u # 11.74% backend cycles idle (74.98%) - 75,863,951,980 instructions:u # 3.23 insn per cycle - # 0.04 stalled cycles per insn (74.98%) - 6.715424956 seconds time elapsed +TOTAL : 8.362848 sec + 24,219,340,843 cycles # 2.896 GHz + 75,878,803,024 instructions # 3.13 insn per cycle + 8.367752014 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 3898) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627487e-04 -Avg ME (F77/C++) = 6.6274866115424713E-004 -Relative difference = 5.861309557415831e-08 +Avg ME (F77/C++) = 6.6274870439686495E-004 +Relative difference = 6.634286759220428e-09 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 1 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 1 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 9.874463e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 9.891747e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 9.891747e+03 ) sec^-1
-MeanMatrixElemValue = ( 4.208459e-01 +- 3.253446e-01 ) GeV^-4
-TOTAL : 1.667003 sec
- 5,891,590,353 cycles:u # 3.488 GHz (74.90%)
- 696,983 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.90%)
- 876,571,432 stalled-cycles-backend:u # 14.88% backend cycles idle (74.90%)
- 20,133,226,301 instructions:u # 3.42 insn per cycle
- # 0.04 stalled cycles per insn (74.91%)
- 1.690910397 seconds time elapsed
+OMP threads / `nproc --all` = 1 / 4
+EvtsPerSec[Rmb+ME] (23) = ( 7.106063e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.119817e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.119817e+03 ) sec^-1
+MeanMatrixElemValue = ( 4.208458e-01 +- 3.253446e-01 ) GeV^-4
+TOTAL : 2.317534 sec
+ 6,502,161,706 cycles # 2.801 GHz
+ 20,113,148,136 instructions # 3.09 insn per cycle
+ 2.322610994 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4:13237) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest.exe
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest.exe
 [ PASSED ] 6 tests.
 -------------------------------------------------------------------------
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2
 Avg ME (C++/C++) = 6.627485e-04
-Avg ME (F77/C++) = 6.6274845946848876E-004
-Relative difference = 6.115670001294808e-08
+Avg ME (F77/C++) = 6.6274853360924479E-004
+Relative difference = 5.071191384964548e-08
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 1 --common OMP=
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 1 --common OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
 Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 2.358073e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.368273e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.368273e+04 ) sec^-1
-MeanMatrixElemValue = ( 4.214980e-01 +- 3.255523e-01 ) GeV^-4
-TOTAL : 0.701644 sec
- 2,504,924,592 cycles:u # 3.462 GHz (74.58%)
- 477,115 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.58%)
- 242,307,045 stalled-cycles-backend:u # 9.67% backend cycles idle (74.58%)
- 7,093,468,748 instructions:u # 2.83 insn per cycle
- # 0.03 stalled cycles per insn (74.76%)
- 0.725253014 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11586) (512y: 0) (512z: 0)
+OMP threads / `nproc --all` = 1 / 4
+EvtsPerSec[Rmb+ME] (23) = ( 1.586948e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.593562e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.593562e+04 ) sec^-1
+MeanMatrixElemValue = ( 4.214979e-01 +- 3.255522e-01 ) GeV^-4
+TOTAL : 1.043186 sec
+ 2,822,730,977 cycles # 2.696 GHz
+ 7,035,059,102 instructions # 2.49 insn per cycle
+ 1.048122577 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11604) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest.exe
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest.exe
 [ PASSED ] 6 tests.
 -------------------------------------------------------------------------
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 6.627195e-04
-Avg ME (F77/C++) = 6.6271947045332125E-004
-Relative difference = 4.4583988847766445e-08
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2
+Avg ME (C++/C++) = 6.627193e-04
+Avg ME (F77/C++) = 6.6271927529261421E-004
+Relative difference = 3.728182620967159e-08
+OK (relative difference <= 5E-3)
+-------------------------------------------------------------------------
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe -p 64 256 1 --common OMP=
+WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK
+FP precision = FLOAT (NaN/abnormal=0, zero=0)
+Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
+OMP threads / `nproc --all` = 1 / 4
+EvtsPerSec[Rmb+ME] (23) = ( 1.807119e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.816011e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.816011e+04 ) sec^-1
+MeanMatrixElemValue = ( 4.214979e-01 +- 3.255522e-01 ) GeV^-4
+TOTAL : 0.917444 sec
+ 2,481,419,746 cycles # 2.693 GHz
+ 6,275,834,953 instructions # 2.53 insn per cycle
+ 0.922842065 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10320) (512y: 50) (512z: 0)
+-------------------------------------------------------------------------
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest.exe
+[ PASSED ] 6 tests.
+-------------------------------------------------------------------------
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2
+Avg ME (C++/C++) = 6.627193e-04
+Avg ME (F77/C++) = 6.6271927529261421E-004
+Relative difference = 3.728182620967159e-08
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
-/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo)
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe -p 64 256 1 --common OMP=
+WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK
+FP precision = FLOAT (NaN/abnormal=0, zero=0)
+Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
+OMP threads / `nproc --all` = 1 / 4
+EvtsPerSec[Rmb+ME] (23) = ( 1.399447e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.404609e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.404609e+04 ) sec^-1
+MeanMatrixElemValue = ( 4.214981e-01 +- 3.255523e-01 ) GeV^-4
+TOTAL : 1.182503 sec
+ 2,042,245,375 cycles # 1.722 GHz
+ 3,246,419,225 instructions # 1.59 insn per cycle
+ 1.187753193 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2165) (512y: 48) (512z: 9219)
 -------------------------------------------------------------------------
-/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo)
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest.exe
+[ PASSED ] 6 tests.
+-------------------------------------------------------------------------
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2
+Avg ME (C++/C++) = 6.627195e-04
+Avg ME (F77/C++) = 6.6271952818273971E-004
+Relative difference = 4.252589469696448e-08
+OK (relative difference <= 5E-3)
 =========================================================================
 TEST COMPLETED
diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_curhst.txt
index 0f279d1f96..e0bdb664e1 100644
--- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_curhst.txt
+++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_curhst.txt
@@ -1,143 +1,223 @@
 export CUDACPP_RUNTIME_ENABLEFPE=on
-Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg
-OMPFLAGS=
-AVX=avx2
+Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg
+OMPFLAGS=-fopenmp
+AVX=512y
 FPTYPE=d
 HELINL=0
 HRDCOD=0
-RNDGEN=hasNoCurand
-Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1)
+RNDGEN=hasCurand
+Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1)
 make: Nothing to be done for 'gtestlibs'.
-CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0'
+CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0'
 make USEBUILDDIR=1 AVX=none
-make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 CUDACPP_BUILDDIR='build.none_f_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 make USEBUILDDIR=1 AVX=sse4
-make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 make USEBUILDDIR=1 AVX=avx2
-make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 make USEBUILDDIR=1 AVX=512y
-make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 make USEBUILDDIR=1 AVX=512z
-make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
-DATE: 2024-01-31_14:51:10
+DATE: 2024-01-30_05:58:32
-On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]:
+On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --curhst OMP=
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --curhst OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe: Aborted
- 53,660,520 cycles:u # 2.471 GHz (63.19%)
- 42,812 stalled-cycles-frontend:u # 0.08% frontend cycles idle (63.19%)
- 627,310 stalled-cycles-backend:u # 1.17% backend cycles idle (63.19%)
- 41,361,739 instructions:u # 0.77 insn per cycle
- # 0.02 stalled cycles per insn (64.98%)
- 0.022613916 seconds time elapsed
+Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CUD:FLT+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK
+FP precision = FLOAT (NaN/abnormal=0, zero=0)
+EvtsPerSec[Rmb+ME] (23) = ( 6.316613e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 6.368482e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.375052e+05 ) sec^-1
+MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4
+TOTAL : 0.466730 sec
+ 1,919,349,851 cycles # 2.829 GHz
+ 2,893,848,641 instructions # 1.51 insn per cycle
+ 0.736177730 seconds time elapsed
+runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --curhst
+WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
+==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
+==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
 .........................................................................
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --curhst OMP=
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --curhst OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe: Aborted
- 44,320,877 cycles:u # 2.057 GHz (62.89%)
- 57,125 stalled-cycles-frontend:u # 0.13% frontend cycles idle (62.90%)
- 498,199 stalled-cycles-backend:u # 1.12% backend cycles idle (62.90%)
- 46,848,203 instructions:u # 1.06 insn per cycle
- # 0.01 stalled cycles per insn (69.98%)
- 0.022399963 seconds time elapsed
+Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CUD:FLT+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK
+FP precision = FLOAT (NaN/abnormal=0, zero=0)
+EvtsPerSec[Rmb+ME] (23) = ( 8.573214e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 8.646713e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.650131e+05 ) sec^-1
+MeanMatrixElemValue = ( 6.664703e+00 +- 5.072736e+00 ) GeV^-4
+TOTAL : 1.756106 sec
+ 5,695,142,145 cycles # 2.868 GHz
+ 11,326,470,226 instructions # 1.99 insn per cycle
+ 2.046387591 seconds time elapsed
 -------------------------------------------------------------------------
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2
-Avg ME (C++/CUDA) = 6.626791e-04
-Avg ME (F77/CUDA) = 6.6270899361878938E-004
-Relative difference = 4.511024836808726e-05
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2
+Avg ME (C++/CUDA) = 6.626454e-04
+Avg ME (F77/CUDA) = 6.6262659968156085E-004
+Relative difference = 2.8371612387547027e-05
 OK (relative difference <= 5E-3)
 =========================================================================
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP=
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe: Aborted
- 59,645,586 cycles:u # 2.770 GHz (62.87%)
- 31,816 stalled-cycles-frontend:u # 0.05% frontend cycles idle (62.88%)
- 595,900 stalled-cycles-backend:u # 1.00% backend cycles idle (62.88%)
- 37,227,581 instructions:u # 0.62 insn per cycle
- # 0.02 stalled cycles per insn (62.87%)
- 0.022799665 seconds time elapsed
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
+FP precision = FLOAT (NaN/abnormal=0, zero=0)
+Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
+OMP threads / `nproc --all` = 1 / 4
+EvtsPerSec[Rmb+ME] (23) = ( 1.965319e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.966317e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.966317e+03 ) sec^-1
+MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4
+TOTAL : 8.353416 sec
+ 24,206,918,909 cycles # 2.897 GHz
+ 75,878,282,077 instructions # 3.13 insn per cycle
+ 8.358253425 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 3898) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest.exe
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest.exe
 [ PASSED ] 6 tests.
 -------------------------------------------------------------------------
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2
 Avg ME (C++/C++) = 6.627487e-04
-Avg ME (F77/C++) = 6.6274866115424713E-004
-Relative difference = 5.861309557415831e-08
+Avg ME (F77/C++) = 6.6274870439686495E-004
+Relative difference = 6.634286759220428e-09
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP=
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe: Aborted
- 52,420,208 cycles:u # 2.424 GHz (63.04%)
- 35,421 stalled-cycles-frontend:u # 0.07% frontend cycles idle (63.05%)
- 608,287 stalled-cycles-backend:u # 1.16% backend cycles idle (63.05%)
- 42,179,278 instructions:u # 0.80 insn per cycle
- # 0.01 stalled cycles per insn (64.63%)
- 0.022892465 seconds time elapsed
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
+FP precision = FLOAT (NaN/abnormal=0, zero=0)
+Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
+OMP threads / `nproc --all` = 1 / 4
+EvtsPerSec[Rmb+ME] (23) = ( 6.994720e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.007761e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.007761e+03 ) sec^-1
+MeanMatrixElemValue = ( 4.060119e+00 +- 2.367901e+00 ) GeV^-4
+TOTAL : 2.353332 sec
+ 6,524,875,303 cycles # 2.768 GHz
+ 20,114,868,262 instructions # 3.08 insn per cycle
+ 2.358279130 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4:13237) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest.exe
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest.exe
 [ PASSED ] 6 tests.
 -------------------------------------------------------------------------
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2
 Avg ME (C++/C++) = 6.627485e-04
-Avg ME (F77/C++) = 6.6274845946848876E-004
-Relative difference = 6.115670001294808e-08
+Avg ME (F77/C++) = 6.6274853360924479E-004
+Relative difference = 5.071191384964548e-08
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP=
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe: Aborted
- 56,394,214 cycles:u # 2.623 GHz (62.82%)
- 44,678 stalled-cycles-frontend:u # 0.08% frontend cycles idle (62.82%)
- 611,735 stalled-cycles-backend:u # 1.08% backend cycles idle (62.82%)
- 42,481,271 instructions:u # 0.75 insn per cycle
- # 0.01 stalled cycles per insn (58.94%)
- 0.022737225 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11586) (512y: 0) (512z: 0)
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
+FP precision = FLOAT (NaN/abnormal=0, zero=0)
+Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
+OMP threads / `nproc --all` = 1 / 4
+EvtsPerSec[Rmb+ME] (23) = ( 1.578556e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.585147e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.585147e+04 ) sec^-1
+MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4
+TOTAL : 1.047733 sec
+ 2,820,818,870 cycles # 2.682 GHz
+ 7,037,506,961 instructions # 2.49 insn per cycle
+ 1.053002937 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11604) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest.exe
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest.exe
 [ PASSED ] 6 tests.
 -------------------------------------------------------------------------
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 6.627195e-04
-Avg ME (F77/C++) = 6.6271947045332125E-004
-Relative difference = 4.4583988847766445e-08
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2
+Avg ME (C++/C++) = 6.627193e-04
+Avg ME (F77/C++) = 6.6271927529261421E-004
+Relative difference = 3.728182620967159e-08
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
-/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo)
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP=
+WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
+FP precision = FLOAT (NaN/abnormal=0, zero=0)
+Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
+OMP threads / `nproc --all` = 1 / 4
+EvtsPerSec[Rmb+ME] (23) = ( 1.765542e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.773827e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.773827e+04 ) sec^-1
+MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4
+TOTAL : 0.937560 sec
+ 2,478,872,591 cycles # 2.633 GHz
+ 6,279,446,291 instructions # 2.53 insn per cycle
+ 0.942558881 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10320) (512y: 50) (512z: 0)
+-------------------------------------------------------------------------
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest.exe
+[ PASSED ] 6 tests.
 -------------------------------------------------------------------------
-/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo)
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2
+Avg ME (C++/C++) = 6.627193e-04
+Avg ME (F77/C++) = 6.6271927529261421E-004
+Relative difference = 3.728182620967159e-08
+OK (relative difference <= 5E-3)
+-------------------------------------------------------------------------
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP=
+WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
+FP precision = FLOAT (NaN/abnormal=0, zero=0)
+Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
+OMP threads / `nproc --all` = 1 / 4
+EvtsPerSec[Rmb+ME] (23) = ( 1.394421e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.399630e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.399630e+04 ) sec^-1
+MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4
+TOTAL : 1.184942 sec
+ 2,037,351,256 cycles # 1.714 GHz
+ 3,247,924,134 instructions # 1.59 insn per cycle
+ 1.189828303 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2165) (512y: 48) (512z: 9219)
+-------------------------------------------------------------------------
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest.exe
+[ PASSED ] 6 tests.
+-------------------------------------------------------------------------
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2
+Avg ME (C++/C++) = 6.627195e-04
+Avg ME (F77/C++) = 6.6271952818273971E-004
+Relative difference = 4.252589469696448e-08
+OK (relative difference <= 5E-3)
 =========================================================================
 TEST COMPLETED
diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_rmbhst.txt
index 98d2ae55ac..d4941d3986 100644
--- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_rmbhst.txt
+++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_rmbhst.txt
@@ -1,181 +1,226 @@
 export CUDACPP_RUNTIME_ENABLEFPE=on
-Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg
-OMPFLAGS=
-AVX=avx2
+Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg
+OMPFLAGS=-fopenmp
+AVX=512y
 FPTYPE=d
 HELINL=0
 HRDCOD=0
-RNDGEN=hasNoCurand
-Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1)
+RNDGEN=hasCurand
+Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1)
 make: Nothing to be done for 'gtestlibs'.
-CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0'
+CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0'
 make USEBUILDDIR=1 AVX=none
-make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 CUDACPP_BUILDDIR='build.none_f_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 make USEBUILDDIR=1 AVX=sse4
-make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 make USEBUILDDIR=1 AVX=avx2
-make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 make USEBUILDDIR=1 AVX=512y
-make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 make USEBUILDDIR=1 AVX=512z
-make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
-DATE: 2024-01-31_14:48:36
+DATE: 2024-01-30_05:55:07
-On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]:
+On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --rmbhst OMP=
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --rmbhst OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+MESDEV/none+NAVBRK
+WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
+Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CUD:FLT+THX:CURHST+RMBHST+MESDEV/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 2.524884e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.683094e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.683954e+05 ) sec^-1
-MeanMatrixElemValue = ( 4.202335e-01 +- 3.251521e-01 ) GeV^-4
-TOTAL : 0.439268 sec
- 1,258,352,153 cycles:u # 2.692 GHz (75.14%)
- 3,340,197 stalled-cycles-frontend:u # 0.27% frontend cycles idle (74.82%)
- 33,637,961 stalled-cycles-backend:u # 2.67% backend cycles idle (74.32%)
- 1,655,103,779 instructions:u # 1.32 insn per cycle
- # 0.02 stalled cycles per insn (74.30%)
- 0.482678047 seconds time elapsed
+EvtsPerSec[Rmb+ME] (23) = ( 5.730552e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 6.395791e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.401561e+05 ) sec^-1
+MeanMatrixElemValue = ( 4.048178e+00 +- 2.364571e+00 ) GeV^-4
+TOTAL : 0.472770 sec
+ 1,942,039,449 cycles # 2.839 GHz
+ 2,914,569,721 instructions # 1.50 insn per cycle
+ 0.744083634 seconds time elapsed
+runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --rmbhst
+WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
+WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
+==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
+==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
 .........................................................................
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --rmbhst OMP=
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --rmbhst OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+MESDEV/none+NAVBRK
+WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost
+Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CUD:FLT+THX:CURHST+RMBHST+MESDEV/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 4.282605e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.707454e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.707873e+05 ) sec^-1
-MeanMatrixElemValue = ( 1.213799e+02 +- 1.195366e+02 ) GeV^-4
-TOTAL : 3.418285 sec
- 11,531,026,630 cycles:u # 3.339 GHz (74.90%)
- 38,956,621 stalled-cycles-frontend:u # 0.34% frontend cycles idle (74.92%)
- 1,147,015,037 stalled-cycles-backend:u # 9.95% backend cycles idle (74.98%)
- 9,884,251,524 instructions:u # 0.86 insn per cycle
- # 0.12 stalled cycles per insn (74.98%)
- 3.471091734 seconds time elapsed
+EvtsPerSec[Rmb+ME] (23) = ( 7.426812e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 8.621213e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.624728e+05 ) sec^-1
+MeanMatrixElemValue = ( 6.641710e+00 +- 4.994249e+00 ) GeV^-4
+TOTAL : 1.841459 sec
+ 5,951,272,102 cycles # 2.874 GHz
+ 12,317,260,326 instructions # 2.07 insn per cycle
+ 2.133121829 seconds time elapsed
 -------------------------------------------------------------------------
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2
-Avg ME (C++/CUDA) = 6.626791e-04
-Avg ME (F77/CUDA) = 6.6270899361878938E-004
-Relative difference = 4.511024836808726e-05
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2
+Avg ME (C++/CUDA) = 6.626454e-04
+Avg ME (F77/CUDA) = 6.6262659968156085E-004
+Relative difference = 2.8371612387547027e-05
 OK (relative difference <= 5E-3)
 =========================================================================
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP=
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
-EvtsPerSec[Rmb+ME] (23) = ( 2.454097e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.455129e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.455129e+03 ) sec^-1
-MeanMatrixElemValue = ( 4.208458e-01 +- 3.253446e-01 ) GeV^-4
-TOTAL : 6.688804 sec
- 23,511,496,407 cycles:u # 3.504 GHz (74.97%)
- 1,304,107 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.97%)
- 2,796,839,675 stalled-cycles-backend:u # 11.90% backend cycles idle (74.97%)
- 75,915,720,981 instructions:u # 3.23 insn per cycle
- # 0.04 stalled cycles per insn (74.93%)
- 6.712942623 seconds time elapsed
+OMP threads / `nproc --all` = 1 / 4
+EvtsPerSec[Rmb+ME] (23) = ( 1.960613e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.961562e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.961562e+03 ) sec^-1
+MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4
+TOTAL : 8.374119 sec
+ 24,216,955,817 cycles # 2.891 GHz
+ 75,878,033,044 instructions # 3.13 insn per cycle
+ 8.378947710 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 3898) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest.exe
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest.exe
 [ PASSED ] 6 tests.
 -------------------------------------------------------------------------
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2
 Avg ME (C++/C++) = 6.627487e-04
-Avg ME (F77/C++) = 6.6274866115424713E-004
-Relative difference = 5.861309557415831e-08
+Avg ME (F77/C++) = 6.6274870439686495E-004
+Relative difference = 6.634286759220428e-09
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP=
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 9.876098e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 9.893495e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 9.893495e+03 ) sec^-1
-MeanMatrixElemValue = ( 4.208459e-01 +- 3.253446e-01 ) GeV^-4
-TOTAL : 1.666665 sec
- 5,888,181,296 cycles:u # 3.487 GHz (74.89%)
- 699,454 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.89%)
- 877,010,524 stalled-cycles-backend:u # 14.89% backend cycles idle (74.89%)
- 20,135,030,726 instructions:u # 3.42 insn per cycle
- # 0.04 stalled cycles per insn (74.90%)
- 1.690484284 seconds time elapsed
+OMP threads / `nproc --all` = 1 / 4
+EvtsPerSec[Rmb+ME] (23) = ( 7.136107e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.149132e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.149132e+03 ) sec^-1
+MeanMatrixElemValue = ( 4.060119e+00 +- 2.367901e+00 ) GeV^-4
+TOTAL : 2.306790 sec
+ 6,504,696,579 cycles # 2.815 GHz
+ 20,114,676,918 instructions # 3.09 insn per cycle
+ 2.311724672 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4:13237) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest.exe
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest.exe
 [ PASSED ] 6 tests.
 -------------------------------------------------------------------------
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2
 Avg ME (C++/C++) = 6.627485e-04
-Avg ME (F77/C++) = 6.6274845946848876E-004
-Relative difference = 6.115670001294808e-08
+Avg ME (F77/C++) = 6.6274853360924479E-004
+Relative difference = 5.071191384964548e-08
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP=
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 2.356734e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.366903e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.366903e+04 ) sec^-1
-MeanMatrixElemValue = ( 4.214980e-01 +- 3.255523e-01 ) GeV^-4
-TOTAL : 0.702011 sec
- 2,503,642,455 cycles:u # 3.458 GHz (74.59%)
- 524,618 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.59%)
- 242,217,672 stalled-cycles-backend:u # 9.67% backend cycles idle (74.59%)
- 7,096,971,568 instructions:u # 2.83 insn per cycle
- # 0.03 stalled cycles per insn (74.75%)
- 0.725726716 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11586) (512y: 0) (512z: 0)
+OMP threads / `nproc --all` = 1 / 4
+EvtsPerSec[Rmb+ME] (23) = ( 1.585387e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.592052e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.592052e+04 ) sec^-1
+MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4
+TOTAL : 1.043340 sec
+ 2,821,286,489 cycles # 2.694 GHz
+ 7,037,435,358 instructions # 2.49 insn per cycle
+ 1.048505999 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11604) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest.exe
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest.exe
 [ PASSED ] 6 tests.
 -------------------------------------------------------------------------
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 6.627195e-04
-Avg ME (F77/C++) = 6.6271947045332125E-004
-Relative difference = 4.4583988847766445e-08
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2
+Avg ME (C++/C++) = 6.627193e-04
+Avg ME (F77/C++) = 6.6271927529261421E-004
+Relative difference = 3.728182620967159e-08
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
-/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo)
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP=
+WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
+FP precision = FLOAT (NaN/abnormal=0, zero=0)
+Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
+OMP threads / `nproc --all` = 1 / 4
+EvtsPerSec[Rmb+ME] (23) = ( 1.743919e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.751789e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.751789e+04 ) sec^-1
+MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4
+TOTAL : 0.949190 sec
+ 2,568,265,414 cycles # 2.694 GHz
+ 6,279,620,229 instructions # 2.45 insn per cycle
+ 0.954345697 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10320) (512y: 50) (512z: 0)
+-------------------------------------------------------------------------
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest.exe
+[ PASSED ] 6 tests.
 -------------------------------------------------------------------------
-/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo)
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2
+Avg ME (C++/C++) = 6.627193e-04
+Avg ME (F77/C++) = 6.6271927529261421E-004
+Relative difference = 3.728182620967159e-08
+OK (relative difference <= 5E-3)
+-------------------------------------------------------------------------
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP=
+WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
+FP precision = FLOAT (NaN/abnormal=0, zero=0)
+Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
+OMP threads / `nproc --all` = 1 / 4
+EvtsPerSec[Rmb+ME] (23) = ( 1.404393e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.409463e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.409463e+04 ) sec^-1
+MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4
+TOTAL : 1.176805 sec
+ 2,037,562,738 cycles # 1.726 GHz
+ 3,247,895,210 instructions # 1.59 insn per cycle
+ 1.182054069 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2165) (512y: 48) (512z: 9219)
+-------------------------------------------------------------------------
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest.exe
+[ PASSED ] 6 tests.
+-------------------------------------------------------------------------
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2
+Avg ME (C++/C++) = 6.627195e-04
+Avg ME (F77/C++) = 6.6271952818273971E-004
+Relative difference = 4.252589469696448e-08
+OK (relative difference <= 5E-3)
 =========================================================================
 TEST COMPLETED
diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd1.txt
index a8df518f17..391ab3d24f 100644
--- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd1.txt
+++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd1.txt
@@ -1,181 +1,223 @@
 export CUDACPP_RUNTIME_ENABLEFPE=on
-Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg
-OMPFLAGS=
-AVX=avx2
+Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg
+OMPFLAGS=-fopenmp
+AVX=512y
 FPTYPE=d
 HELINL=0
 HRDCOD=0
-RNDGEN=hasNoCurand
-Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1)
+RNDGEN=hasCurand
+Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1)
 make: Nothing to be done for 'gtestlibs'.
-CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd1'
+CUDACPP_BUILDDIR='build.512y_f_inl0_hrd1'
 make USEBUILDDIR=1 AVX=none
-make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 CUDACPP_BUILDDIR='build.none_f_inl0_hrd1'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 make USEBUILDDIR=1 AVX=sse4
-make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd1'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 make USEBUILDDIR=1 AVX=avx2
-make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd1'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 make USEBUILDDIR=1 AVX=512y
-make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 CUDACPP_BUILDDIR='build.512y_f_inl0_hrd1'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 make USEBUILDDIR=1 AVX=512z
-make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 CUDACPP_BUILDDIR='build.512z_f_inl0_hrd1'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
-DATE: 2024-01-31_13:57:12
+DATE: 2024-01-30_05:02:36
-On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]:
+On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/gcheck.exe -p 64 256 1 OMP=
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/gcheck.exe -p 64 256 1 OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1]
-Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK
+Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1]
+Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 2.482914e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.674266e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.675640e+05 ) sec^-1
-MeanMatrixElemValue = ( 4.202247e-01 +- 3.251485e-01 ) GeV^-4
-TOTAL : 0.441735 sec
- 1,257,000,745 cycles:u # 2.694 GHz (74.18%)
- 2,938,235 stalled-cycles-frontend:u # 0.23% frontend cycles idle (74.26%)
- 34,932,732 stalled-cycles-backend:u # 2.78% backend cycles idle (74.36%)
- 1,595,954,562 instructions:u # 1.27 insn per cycle
- # 0.02 stalled cycles per insn (75.81%)
- 0.487039944 seconds time elapsed
+EvtsPerSec[Rmb+ME] (23) = ( 6.280133e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 6.331305e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.337921e+05 ) sec^-1
+MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4
+TOTAL : 0.487324 sec
+ 1,984,658,948 cycles # 2.819 GHz
+ 2,919,152,547 instructions # 1.47 insn per cycle
+ 0.799038148 seconds time elapsed
+runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/gcheck.exe -p 64 256 1
+WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
+==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
+==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
 .........................................................................
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP=
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1]
-Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK
+Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1]
+Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 4.703269e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.734220e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.734654e+05 ) sec^-1
-MeanMatrixElemValue = ( 1.213664e+02 +- 1.195366e+02 ) GeV^-4
-TOTAL : 3.331567 sec
- 11,165,331,380 cycles:u # 3.340 GHz (74.67%)
- 27,887,783 stalled-cycles-frontend:u # 0.25% frontend cycles idle (74.78%)
- 1,142,800,192 stalled-cycles-backend:u # 10.24% backend cycles idle (75.09%)
- 9,036,921,384 instructions:u # 0.81 insn per cycle
- # 0.13 stalled cycles per insn (75.12%)
- 3.382507531 seconds time elapsed
+EvtsPerSec[Rmb+ME] (23) = ( 8.572518e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 8.647175e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 8.650566e+05 ) sec^-1
+MeanMatrixElemValue = ( 6.664703e+00 +- 5.072736e+00 ) GeV^-4
+TOTAL : 1.731714 sec
+ 5,664,416,860 cycles # 2.869 GHz
+ 11,423,818,247 instructions # 2.02 insn per cycle
+ 2.033192051 seconds time elapsed
 -------------------------------------------------------------------------
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/gcheck.exe --common -p 2 64 2
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/fgcheck.exe 2 64 2
-Avg ME (C++/CUDA) = 6.626791e-04
-Avg ME (F77/CUDA) = 6.6270899361878938E-004
-Relative difference = 4.511024836808726e-05
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/gcheck.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/fgcheck.exe 2 64 2
+Avg ME (C++/CUDA) = 6.626454e-04
+Avg ME (F77/CUDA) = 6.6262659968156085E-004
+Relative difference = 2.8371612387547027e-05
 OK (relative difference <= 5E-3)
 =========================================================================
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/check.exe -p 64 256 1 OMP=
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/check.exe -p 64 256 1 OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1]
-Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1]
+Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
-EvtsPerSec[Rmb+ME] (23) = ( 2.449944e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.450980e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.450980e+03 ) sec^-1
-MeanMatrixElemValue = ( 4.208458e-01 +- 3.253446e-01 ) GeV^-4
-TOTAL : 6.699894 sec
- 23,548,513,811 cycles:u # 3.503 GHz (75.01%)
- 1,267,687 stalled-cycles-frontend:u # 0.01% frontend cycles idle (75.01%)
- 2,567,037,364 stalled-cycles-backend:u # 10.90% backend cycles idle (75.01%)
- 75,796,936,744 instructions:u # 3.22 insn per cycle
- # 0.03 stalled cycles per insn (75.01%)
- 6.724269019 seconds time elapsed
+OMP threads / `nproc --all` = 1 / 4
+EvtsPerSec[Rmb+ME] (23) = ( 1.928583e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.929543e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.929543e+03 ) sec^-1
+MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4
+TOTAL : 8.512607 sec
+ 24,191,141,745 cycles # 2.843 GHz
+ 75,807,282,467 instructions # 3.13 insn per cycle
+ 8.524714483 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 3848) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/runTest.exe
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/runTest.exe
 [ PASSED ] 6 tests.
 -------------------------------------------------------------------------
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/check.exe --common -p 2 64 2
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/fcheck.exe 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/check.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/fcheck.exe 2 64 2
 Avg ME (C++/C++) = 6.627487e-04
-Avg ME (F77/C++) = 6.6274866108667618E-004
-Relative difference = 5.871505118544242e-08
+Avg ME (F77/C++) = 6.6274870430095556E-004
+Relative difference = 6.489572191632735e-09
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/check.exe -p 64 256 1 OMP=
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/check.exe -p 64 256 1 OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1]
-Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1]
+Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 9.895398e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 9.912823e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 9.912823e+03 ) sec^-1
-MeanMatrixElemValue = ( 4.208459e-01 +- 3.253446e-01 ) GeV^-4
-TOTAL : 1.663257 sec
- 5,880,803,726 cycles:u # 3.489 GHz (74.90%)
- 703,258 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.85%)
- 835,914,961 stalled-cycles-backend:u # 14.21% backend cycles idle (74.85%)
- 20,173,522,806 instructions:u # 3.43 insn per cycle
- # 0.04 stalled cycles per insn (74.71%)
- 1.688905666 seconds time elapsed
+OMP threads / `nproc --all` = 1 / 4
+EvtsPerSec[Rmb+ME] (23) = ( 7.113368e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.126874e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.126874e+03 ) sec^-1
+MeanMatrixElemValue = ( 4.060119e+00 +- 2.367901e+00 ) GeV^-4
+TOTAL : 2.313934 sec
+ 6,500,918,155 cycles # 2.804 GHz
+ 20,111,364,543 instructions # 3.09 insn per cycle
+ 2.332783497 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4:13231) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/runTest.exe
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/runTest.exe
 [ PASSED ] 6 tests.
 -------------------------------------------------------------------------
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/check.exe --common -p 2 64 2
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/fcheck.exe 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/check.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/fcheck.exe 2 64 2
 Avg ME (C++/C++) = 6.627485e-04
-Avg ME (F77/C++) = 6.6274845946848876E-004
-Relative difference = 6.115670001294808e-08
+Avg ME (F77/C++) = 6.6274853360924479E-004
+Relative difference = 5.071191384964548e-08
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/check.exe -p 64 256 1 OMP=
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/check.exe -p 64 256 1 OMP=
 WARNING!
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.350572e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.360751e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.360751e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.214980e-01 +- 3.255523e-01 ) GeV^-4 -TOTAL : 0.703636 sec - 2,513,257,412 cycles:u # 3.463 GHz (74.80%) - 1,099,873 stalled-cycles-frontend:u # 0.04% frontend cycles idle (74.69%) - 295,163,738 stalled-cycles-backend:u # 11.74% backend cycles idle (74.65%) - 7,089,697,038 instructions:u # 2.82 insn per cycle - # 0.04 stalled cycles per insn (74.31%) - 0.729216779 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11569) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.589760e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.596530e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.596530e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 +TOTAL : 1.040223 sec + 2,815,442,217 cycles # 2.695 GHz + 7,038,519,370 instructions # 2.50 insn per cycle + 1.057514134 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11587) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627195e-04 -Avg ME (F77/C++) = 6.6271947045332125E-004 -Relative difference = 4.4583988847766445e-08 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.627193e-04 +Avg ME (F77/C++) = 6.6271927529261421E-004 +Relative difference = 3.728182620967159e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd1/check.exe -p 64 256 1 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.751311e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.759469e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.759469e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 +TOTAL : 0.945030 sec + 2,478,506,957 cycles # 2.610 GHz + 6,280,796,881 instructions # 2.53 insn per cycle + 0.988336476 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10302) (512y: 50) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.627193e-04 +Avg ME (F77/C++) = 6.6271927529261421E-004 +Relative difference = 3.728182620967159e-08 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd1/check.exe -p 64 256 1 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.386273e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.391271e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.391271e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4 +TOTAL : 1.191730 sec + 2,039,311,665 cycles # 1.704 GHz + 3,248,072,614 instructions # 1.59 insn per cycle + 1.208300824 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2140) (512y: 48) (512z: 9219) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. 
+-------------------------------------------------------------------------
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd1/check.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd1/fcheck.exe 2 64 2
+Avg ME (C++/C++) = 6.627195e-04
+Avg ME (F77/C++) = 6.6271952818273971E-004
+Relative difference = 4.252589469696448e-08
+OK (relative difference <= 5E-3)
 =========================================================================
 TEST COMPLETED
diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd0.txt
index c6f116f62e..77eae3ae9c 100644
--- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd0.txt
+++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd0.txt
@@ -1,181 +1,223 @@
 export CUDACPP_RUNTIME_ENABLEFPE=on
-Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg
-OMPFLAGS=
-AVX=avx2
+Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg
+OMPFLAGS=-fopenmp
+AVX=512y
 FPTYPE=d
 HELINL=0
 HRDCOD=0
-RNDGEN=hasNoCurand
-Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1)
+RNDGEN=hasCurand
+Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1)
 make: Nothing to be done for 'gtestlibs'.
-CUDACPP_BUILDDIR='build.avx2_f_inl1_hrd0'
+CUDACPP_BUILDDIR='build.512y_f_inl1_hrd0'
 make USEBUILDDIR=1 AVX=none
-make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 CUDACPP_BUILDDIR='build.none_f_inl1_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 make USEBUILDDIR=1 AVX=sse4
-make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 CUDACPP_BUILDDIR='build.sse4_f_inl1_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 make USEBUILDDIR=1 AVX=avx2
-make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 CUDACPP_BUILDDIR='build.avx2_f_inl1_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 make USEBUILDDIR=1 AVX=512y
-make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 CUDACPP_BUILDDIR='build.512y_f_inl1_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 make USEBUILDDIR=1 AVX=512z
-make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 CUDACPP_BUILDDIR='build.512z_f_inl1_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
-DATE: 2024-01-31_14:24:42
+DATE: 2024-01-30_05:40:00
-On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]:
+On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/gcheck.exe -p 64 256 1 OMP=
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/gcheck.exe -p 64 256 1 OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=1] [hardcodePARAM=0]
-Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK
+Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=1] [hardcodePARAM=0]
+Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 2.493346e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.687069e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.687946e+05 ) sec^-1
-MeanMatrixElemValue = ( 4.202247e-01 +- 3.251485e-01 ) GeV^-4
-TOTAL : 0.440319 sec
- 1,273,508,778 cycles:u # 2.739 GHz (73.60%)
- 2,778,353 stalled-cycles-frontend:u # 0.22% frontend cycles idle (74.20%)
- 35,103,722 stalled-cycles-backend:u # 2.76% backend cycles idle (75.55%)
- 1,611,965,427 instructions:u # 1.27 insn per cycle
- # 0.02 stalled cycles per insn (75.74%)
- 0.486403940 seconds time elapsed
+EvtsPerSec[Rmb+ME] (23) = ( 5.547321e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.587547e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.593359e+05 ) sec^-1
+MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4
+TOTAL : 0.493771 sec
+ 2,067,778,848 cycles # 2.808 GHz
+ 3,079,367,454 instructions # 1.49 insn per cycle
+ 0.793803934 seconds time elapsed
+runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/gcheck.exe -p 64 256 1
+WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
+==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
+==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
 .........................................................................
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/gcheck.exe -p 2048 256 1 OMP=
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/gcheck.exe -p 2048 256 1 OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=1] [hardcodePARAM=0]
-Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK
+Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=1] [hardcodePARAM=0]
+Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 4.682520e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.715611e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.716036e+05 ) sec^-1
-MeanMatrixElemValue = ( 1.213664e+02 +- 1.195366e+02 ) GeV^-4
-TOTAL : 3.338845 sec
- 11,168,920,412 cycles:u # 3.333 GHz (74.87%)
- 27,880,455 stalled-cycles-frontend:u # 0.25% frontend cycles idle (74.95%)
- 1,145,991,851 stalled-cycles-backend:u # 10.26% backend cycles idle (74.94%)
- 9,103,853,238 instructions:u # 0.82 insn per cycle
- # 0.13 stalled cycles per insn (74.93%)
- 3.392372577 seconds time elapsed
+EvtsPerSec[Rmb+ME] (23) = ( 7.730139e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.790957e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.793762e+05 ) sec^-1
+MeanMatrixElemValue = ( 6.664703e+00 +- 5.072736e+00 ) GeV^-4
+TOTAL : 1.861027 sec
+ 6,035,258,548 cycles # 2.873 GHz
+ 13,088,532,370 instructions # 2.17 insn per cycle
+ 2.157681364 seconds time elapsed
 -------------------------------------------------------------------------
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/gcheck.exe --common -p 2 64 2
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/fgcheck.exe 2 64 2
-Avg ME (C++/CUDA) = 6.626791e-04
-Avg ME (F77/CUDA) = 6.6270899361878938E-004
-Relative difference = 4.511024836808726e-05
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/gcheck.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/fgcheck.exe 2 64 2
+Avg ME (C++/CUDA) = 6.626454e-04
+Avg ME (F77/CUDA) = 6.6262660579844562E-004
+Relative difference = 2.836238137986709e-05
 OK (relative difference <= 5E-3)
 =========================================================================
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/check.exe -p 64 256 1 OMP=
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/check.exe -p 64 256 1 OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0]
+Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
-EvtsPerSec[Rmb+ME] (23) = ( 6.267558e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.268235e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.268235e+02 ) sec^-1
-MeanMatrixElemValue = ( 4.204931e-01 +- 3.252404e-01 ) GeV^-4
-TOTAL : 26.172768 sec
- 91,809,761,052 cycles:u # 3.505 GHz (74.99%)
- 520,748,208 stalled-cycles-frontend:u # 0.57% frontend cycles idle (74.99%)
- 7,042,480,285 stalled-cycles-backend:u # 7.67% backend cycles idle (74.99%)
- 134,080,128,680 instructions:u # 1.46 insn per cycle
- # 0.05 stalled cycles per insn (74.99%)
- 26.197030587 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4:16252) (avx2: 0) (512y: 0) (512z: 0)
+OMP threads / `nproc --all` = 1 / 4
+EvtsPerSec[Rmb+ME] (23) = ( 5.418156e+02 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.418889e+02 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.418889e+02 ) sec^-1
+MeanMatrixElemValue = ( 4.059968e+00 +- 2.367799e+00 ) GeV^-4
+TOTAL : 30.278912 sec
+ 87,193,893,967 cycles # 2.880 GHz
+ 133,999,567,781 instructions # 1.54 insn per cycle
+ 30.284052553 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4:16123) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/runTest.exe
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/runTest.exe
 [ PASSED ] 6 tests.
 -------------------------------------------------------------------------
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/check.exe --common -p 2 64 2
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 6.627534e-04
-Avg ME (F77/C++) = 6.6275340697351248E-004
-Relative difference = 1.052203199451665e-08
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/check.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/fcheck.exe 2 64 2
+Avg ME (C++/C++) = 6.627535e-04
+Avg ME (F77/C++) = 6.6275354356437610E-004
+Relative difference = 6.573239683366044e-08
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/check.exe -p 64 256 1 OMP=
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/check.exe -p 64 256 1 OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0]
+Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 8.369298e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.382288e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.382288e+03 ) sec^-1
-MeanMatrixElemValue = ( 4.211992e-01 +- 3.254573e-01 ) GeV^-4
-TOTAL : 1.965775 sec
- 6,952,016,013 cycles:u # 3.497 GHz (74.84%)
- 661,883 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.95%)
- 3,318,805,188 stalled-cycles-backend:u # 47.74% backend cycles idle (75.05%)
- 19,180,596,065 instructions:u # 2.76 insn per cycle
- # 0.17 stalled cycles per insn (75.05%)
- 1.991242612 seconds time elapsed
+OMP threads / `nproc --all` = 1 / 4
+EvtsPerSec[Rmb+ME] (23) = ( 6.858617e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 6.871131e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.871131e+03 ) sec^-1
+MeanMatrixElemValue = ( 4.059961e+00 +- 2.367791e+00 ) GeV^-4
+TOTAL : 2.400232 sec
+ 6,719,203,240 cycles # 2.795 GHz
+ 19,163,412,782 instructions # 2.85 insn per cycle
+ 2.405407499 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4:68898) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/runTest.exe
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/runTest.exe
 [ PASSED ] 6 tests.
 -------------------------------------------------------------------------
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/check.exe --common -p 2 64 2
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/fcheck.exe 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/check.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/fcheck.exe 2 64 2
 Avg ME (C++/C++) = 6.627486e-04
-Avg ME (F77/C++) = 6.6274857053714997E-004
-Relative difference = 4.445554471174176e-08
+Avg ME (F77/C++) = 6.6274859783433532E-004
+Relative difference = 3.2677016209485094e-09
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/check.exe -p 64 256 1 OMP=
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/check.exe -p 64 256 1 OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=0]
-Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0]
+Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 1.458741e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.462654e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.462654e+04 ) sec^-1
-MeanMatrixElemValue = ( 4.211846e-01 +- 3.254638e-01 ) GeV^-4
-TOTAL : 1.130561 sec
- 4,016,590,780 cycles:u # 3.483 GHz (74.97%)
- 536,332 stalled-cycles-frontend:u # 0.01% frontend cycles idle (75.03%)
- 2,228,245,775 stalled-cycles-backend:u # 55.48% backend cycles idle (75.03%)
- 6,764,434,762 instructions:u # 1.68 insn per cycle
- # 0.33 stalled cycles per insn (75.03%)
- 1.157743533 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2:48607) (512y: 0) (512z: 0)
+OMP threads / `nproc --all` = 1 / 4
+EvtsPerSec[Rmb+ME] (23) = ( 1.418642e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.423893e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.423893e+04 ) sec^-1
+MeanMatrixElemValue = ( 4.060903e+00 +- 2.367376e+00 ) GeV^-4
+TOTAL : 1.168526 sec
+ 3,140,858,608 cycles # 2.683 GHz
+ 6,747,205,943 instructions # 2.15 insn per cycle
+ 1.173847287 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2:48625) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/runTest.exe
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/runTest.exe
 [ PASSED ] 6 tests.
 -------------------------------------------------------------------------
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/check.exe --common -p 2 64 2
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 6.627274e-04
-Avg ME (F77/C++) = 6.6272735722101156E-004
-Relative difference = 6.454990161554483e-08
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/check.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/fcheck.exe 2 64 2
+Avg ME (C++/C++) = 6.627272e-04
+Avg ME (F77/C++) = 6.6272724143469353E-004
+Relative difference = 6.252149235286529e-08
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
-/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo)
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd0/check.exe -p 64 256 1 OMP=
+WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0]
+Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
+FP precision = FLOAT (NaN/abnormal=0, zero=0)
+Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
+OMP threads / `nproc --all` = 1 / 4
+EvtsPerSec[Rmb+ME] (23) = ( 1.703185e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.710717e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.710717e+04 ) sec^-1
+MeanMatrixElemValue = ( 4.060903e+00 +- 2.367376e+00 ) GeV^-4
+TOTAL : 0.972109 sec
+ 2,610,520,883 cycles # 2.675 GHz
+ 5,931,408,487 instructions # 2.27 insn per cycle
+ 0.977161465 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2:42219) (512y: 24) (512z: 0)
+-------------------------------------------------------------------------
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd0/runTest.exe
+[ PASSED ] 6 tests.
 -------------------------------------------------------------------------
-/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo)
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd0/check.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd0/fcheck.exe 2 64 2
+Avg ME (C++/C++) = 6.627272e-04
+Avg ME (F77/C++) = 6.6272724143469353E-004
+Relative difference = 6.252149235286529e-08
+OK (relative difference <= 5E-3)
+-------------------------------------------------------------------------
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd0/check.exe -p 64 256 1 OMP=
+WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0]
+Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
+FP precision = FLOAT (NaN/abnormal=0, zero=0)
+Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
+OMP threads / `nproc --all` = 1 / 4
+EvtsPerSec[Rmb+ME] (23) = ( 1.380375e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.385342e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.385342e+04 ) sec^-1
+MeanMatrixElemValue = ( 4.060905e+00 +- 2.367377e+00 ) GeV^-4
+TOTAL : 1.197476 sec
+ 2,050,152,648 cycles # 1.706 GHz
+ 3,435,996,672 instructions # 1.68 insn per cycle
+ 1.202741015 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4188) (512y: 9) (512z:44489)
+-------------------------------------------------------------------------
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd0/runTest.exe
+[ PASSED ] 6 tests.
+-------------------------------------------------------------------------
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd0/check.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd0/fcheck.exe 2 64 2
+Avg ME (C++/C++) = 6.627275e-04
+Avg ME (F77/C++) = 6.6272748295826550E-004
+Relative difference = 2.5714542480216212e-08
+OK (relative difference <= 5E-3)
 =========================================================================
 TEST COMPLETED
diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd1.txt
index e09e89969a..0e738d355a 100644
--- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd1.txt
+++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd1.txt
@@ -1,181 +1,223 @@
 export CUDACPP_RUNTIME_ENABLEFPE=on
-Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg
-OMPFLAGS=
-AVX=avx2
+Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg
+OMPFLAGS=-fopenmp
+AVX=512y
 FPTYPE=d
 HELINL=0
 HRDCOD=0
-RNDGEN=hasNoCurand
-Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1)
+RNDGEN=hasCurand
+Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1)
 make: Nothing to be done for 'gtestlibs'.
-CUDACPP_BUILDDIR='build.avx2_f_inl1_hrd1'
+CUDACPP_BUILDDIR='build.512y_f_inl1_hrd1'
 make USEBUILDDIR=1 AVX=none
-make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 CUDACPP_BUILDDIR='build.none_f_inl1_hrd1'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 make USEBUILDDIR=1 AVX=sse4
-make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 CUDACPP_BUILDDIR='build.sse4_f_inl1_hrd1'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 make USEBUILDDIR=1 AVX=avx2
-make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 CUDACPP_BUILDDIR='build.avx2_f_inl1_hrd1'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 make USEBUILDDIR=1 AVX=512y
-make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 CUDACPP_BUILDDIR='build.512y_f_inl1_hrd1'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 make USEBUILDDIR=1 AVX=512z
-make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
 CUDACPP_BUILDDIR='build.512z_f_inl1_hrd1'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
+make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg'
-DATE: 2024-01-31_14:25:31
+DATE: 2024-01-30_05:40:55
-On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]:
+On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/gcheck.exe -p 64 256 1 OMP=
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/gcheck.exe -p 64 256 1 OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=1] [hardcodePARAM=1]
-Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK
+Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=1] [hardcodePARAM=1]
+Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 2.469687e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 2.672014e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 2.672626e+05 ) sec^-1
-MeanMatrixElemValue = ( 4.202247e-01 +- 3.251485e-01 ) GeV^-4
-TOTAL : 0.443251 sec
- 1,282,818,741 cycles:u # 2.745 GHz (73.89%)
- 2,895,680 stalled-cycles-frontend:u # 0.23% frontend cycles idle (74.09%)
- 33,452,918 stalled-cycles-backend:u # 2.61% backend cycles idle (74.34%)
- 1,622,110,235 instructions:u # 1.26 insn per cycle
- # 0.02 stalled cycles per insn (75.15%)
- 0.490883722 seconds time elapsed
+EvtsPerSec[Rmb+ME] (23) = ( 5.495403e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.535166e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.540654e+05 ) sec^-1
+MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4
+TOTAL : 0.492553 sec
+ 2,044,553,803 cycles # 2.834 GHz
+ 3,023,997,415 instructions # 1.48 insn per cycle
+ 0.781011529 seconds time elapsed
+runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/gcheck.exe -p 64 256 1
+WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
+==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
+==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
 .........................................................................
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/gcheck.exe -p 2048 256 1 OMP=
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/gcheck.exe -p 2048 256 1 OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=1] [hardcodePARAM=1]
-Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK
+Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=1] [hardcodePARAM=1]
+Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
-EvtsPerSec[Rmb+ME] (23) = ( 4.697685e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 4.729954e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 4.730376e+05 ) sec^-1
-MeanMatrixElemValue = ( 1.213664e+02 +- 1.195366e+02 ) GeV^-4
-TOTAL : 3.310115 sec
- 11,125,820,088 cycles:u # 3.330 GHz (74.83%)
- 28,002,737 stalled-cycles-frontend:u # 0.25% frontend cycles idle (74.87%)
- 1,144,368,068 stalled-cycles-backend:u # 10.29% backend cycles idle (75.15%)
- 8,957,167,166 instructions:u # 0.81 insn per cycle
- # 0.13 stalled cycles per insn (75.24%)
- 3.362032400 seconds time elapsed
+EvtsPerSec[Rmb+ME] (23) = ( 7.639095e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.697524e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.700186e+05 ) sec^-1
+MeanMatrixElemValue = ( 6.664703e+00 +- 5.072736e+00 ) GeV^-4
+TOTAL : 1.866900 sec
+ 6,069,227,607 cycles # 2.871 GHz
+ 11,631,061,560 instructions # 1.92 insn per cycle
+ 2.173672620 seconds time elapsed
 -------------------------------------------------------------------------
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/gcheck.exe --common -p 2 64 2
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/fgcheck.exe 2 64 2
-Avg ME (C++/CUDA) = 6.626791e-04
-Avg ME (F77/CUDA) = 6.6270899361878938E-004
-Relative difference = 4.511024836808726e-05
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/gcheck.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/fgcheck.exe 2 64 2
+Avg ME (C++/CUDA) = 6.626454e-04
+Avg ME (F77/CUDA) = 6.6262660579844562E-004
+Relative difference = 2.836238137986709e-05
 OK (relative difference <= 5E-3)
 =========================================================================
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/check.exe -p 64 256 1 OMP=
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/check.exe -p 64 256 1 OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=1]
-Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1]
+Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
-EvtsPerSec[Rmb+ME] (23) = ( 6.233355e+02 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 6.234020e+02 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 6.234020e+02 ) sec^-1
-MeanMatrixElemValue = ( 4.204931e-01 +- 3.252404e-01 ) GeV^-4
-TOTAL : 26.316160 sec
- 92,312,051,453 cycles:u # 3.505 GHz (74.99%)
- 451,235,615 stalled-cycles-frontend:u # 0.49% frontend cycles idle (75.00%)
- 7,117,997,834 stalled-cycles-backend:u # 7.71% backend cycles idle (75.00%)
- 134,002,336,312 instructions:u # 1.45 insn per cycle
- # 0.05 stalled cycles per insn (75.00%)
- 26.340478391 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4:16105) (avx2: 0) (512y: 0) (512z: 0)
+OMP threads / `nproc --all` = 1 / 4
+EvtsPerSec[Rmb+ME] (23) = ( 5.528053e+02 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.528817e+02 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.528817e+02 ) sec^-1
+MeanMatrixElemValue = ( 4.059968e+00 +- 2.367799e+00 ) GeV^-4
+TOTAL : 29.676151 sec
+ 85,692,453,161 cycles # 2.888 GHz
+ 134,120,579,675 instructions # 1.57 insn per cycle
+ 29.681167734 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4:16109) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/runTest.exe
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/runTest.exe
 [ PASSED ] 6 tests.
 -------------------------------------------------------------------------
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/check.exe --common -p 2 64 2
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 6.627535e-04
-Avg ME (F77/C++) = 6.6275346486299042E-004
-Relative difference = 5.301670926116898e-08
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/check.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/fcheck.exe 2 64 2
+Avg ME (C++/C++) = 6.627536e-04
+Avg ME (F77/C++) = 6.6275357377482830E-004
+Relative difference = 3.95700176737784e-08
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/check.exe -p 64 256 1 OMP=
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/check.exe -p 64 256 1 OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=1]
-Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1]
+Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 8.419571e+03 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 8.432174e+03 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 8.432174e+03 ) sec^-1
-MeanMatrixElemValue = ( 4.211992e-01 +- 3.254573e-01 ) GeV^-4
-TOTAL : 1.953922 sec
- 6,896,140,793 cycles:u # 3.489 GHz (74.95%)
- 725,375 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.91%)
- 3,060,527,991 stalled-cycles-backend:u # 44.38% backend cycles idle (74.91%)
- 19,245,542,364 instructions:u # 2.79 insn per cycle
- # 0.16 stalled cycles per insn (74.91%)
- 1.979830500 seconds time elapsed
+OMP threads / `nproc --all` = 1 / 4
+EvtsPerSec[Rmb+ME] (23) = ( 6.924333e+03 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 6.936823e+03 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.936823e+03 ) sec^-1
+MeanMatrixElemValue = ( 4.059961e+00 +- 2.367791e+00 ) GeV^-4
+TOTAL : 2.377362 sec
+ 6,721,293,685 cycles # 2.823 GHz
+ 19,223,635,236 instructions # 2.86 insn per cycle
+ 2.382317911 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4:68882) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/runTest.exe
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/runTest.exe
 [ PASSED ] 6 tests.
 -------------------------------------------------------------------------
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/check.exe --common -p 2 64 2
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/fcheck.exe 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/check.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/fcheck.exe 2 64 2
 Avg ME (C++/C++) = 6.627486e-04
-Avg ME (F77/C++) = 6.6274857044990032E-004
-Relative difference = 4.4587192899226015e-08
+Avg ME (F77/C++) = 6.6274859765498573E-004
+Relative difference = 3.538316437387639e-09
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/check.exe -p 64 256 1 OMP=
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/check.exe -p 64 256 1 OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=1] [hardcodePARAM=1]
-Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1]
+Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
 FP precision = FLOAT (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
-EvtsPerSec[Rmb+ME] (23) = ( 1.500433e+04 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.504571e+04 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.504571e+04 ) sec^-1
-MeanMatrixElemValue = ( 4.211846e-01 +- 3.254638e-01 ) GeV^-4
-TOTAL : 1.099115 sec
- 3,910,134,327 cycles:u # 3.486 GHz (74.91%)
- 558,653 stalled-cycles-frontend:u # 0.01% frontend cycles idle (75.04%)
- 2,182,277,989 stalled-cycles-backend:u # 55.81% backend cycles idle (75.04%)
- 6,705,411,876 instructions:u # 1.71 insn per cycle
- # 0.33 stalled cycles per insn (75.04%)
- 1.124840402 seconds time elapsed
-=Symbols in CPPProcess.o= (~sse4: 0) (avx2:47398) (512y: 0) (512z: 0)
+OMP threads / `nproc --all` = 1 / 4
+EvtsPerSec[Rmb+ME] (23) = ( 1.449646e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.455242e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.455242e+04 ) sec^-1
+MeanMatrixElemValue = ( 4.060903e+00 +- 2.367376e+00 ) GeV^-4
+TOTAL : 1.140025 sec
+ 3,079,658,771 cycles # 2.692 GHz
+ 6,686,222,708 instructions # 2.17 insn per cycle
+ 1.145080651 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2:47416) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/runTest.exe
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/runTest.exe
 [ PASSED ] 6 tests.
 -------------------------------------------------------------------------
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/check.exe --common -p 2 64 2
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/fcheck.exe 2 64 2
-Avg ME (C++/C++) = 6.627274e-04
-Avg ME (F77/C++) = 6.6272735755491807E-004
-Relative difference = 6.404606472340801e-08
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/check.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/fcheck.exe 2 64 2
+Avg ME (C++/C++) = 6.627272e-04
+Avg ME (F77/C++) = 6.6272724133897148E-004
+Relative difference = 6.237705578619894e-08
 OK (relative difference <= 5E-3)
 -------------------------------------------------------------------------
-/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo)
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd1/check.exe -p 64 256 1 OMP=
+WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1]
+Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
+FP precision = FLOAT (NaN/abnormal=0, zero=0)
+Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
+OMP threads / `nproc --all` = 1 / 4
+EvtsPerSec[Rmb+ME] (23) = ( 1.717993e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.725785e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.725785e+04 ) sec^-1
+MeanMatrixElemValue = ( 4.060903e+00 +- 2.367376e+00 ) GeV^-4
+TOTAL : 0.963197 sec
+ 2,607,305,399 cycles # 2.696 GHz
+ 5,935,632,787 instructions # 2.28 insn per cycle
+ 0.968307475 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2:41564) (512y: 18) (512z: 0)
+-------------------------------------------------------------------------
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd1/runTest.exe
+[ PASSED ] 6 tests.
 -------------------------------------------------------------------------
-/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo)
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd1/check.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd1/fcheck.exe 2 64 2
+Avg ME (C++/C++) = 6.627272e-04
+Avg ME (F77/C++) = 6.6272724133897148E-004
+Relative difference = 6.237705578619894e-08
+OK (relative difference <= 5E-3)
+-------------------------------------------------------------------------
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd1/check.exe -p 64 256 1 OMP=
+WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
+Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1]
+Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
+FP precision = FLOAT (NaN/abnormal=0, zero=0)
+Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
+OMP threads / `nproc --all` = 1 / 4
+EvtsPerSec[Rmb+ME] (23) = ( 1.382587e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.387561e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.387561e+04 ) sec^-1
+MeanMatrixElemValue = ( 4.060905e+00 +- 2.367377e+00 ) GeV^-4
+TOTAL : 1.195178 sec
+ 2,050,651,524 cycles # 1.710 GHz
+ 3,422,960,187 instructions # 1.67 insn per cycle
+ 1.200266882 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3375) (512y: 11) (512z:43966)
+-------------------------------------------------------------------------
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd1/runTest.exe
+[ PASSED ] 6 tests.
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.627275e-04 +Avg ME (F77/C++) = 6.6272749650985591E-004 +Relative difference = 5.26633351741962e-09 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt index d26c28a736..7714401e20 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt @@ -1,181 +1,223 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasNoCurand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) +RNDGEN=hasCurand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-01-31_13:57:39 +DATE: 2024-01-30_05:03:07 -On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/gcheck.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/gcheck.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.354215e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.516811e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.517858e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 0.655808 sec - 1,958,889,401 cycles:u # 2.870 GHz (75.38%) - 2,412,382 stalled-cycles-frontend:u # 0.12% frontend cycles idle (75.40%) - 37,254,762 stalled-cycles-backend:u # 1.90% backend cycles idle (75.03%) - 2,178,606,509 instructions:u # 1.11 insn per cycle - # 0.02 stalled cycles per insn (75.16%) - 0.708127648 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 3.456900e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.484722e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.487399e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 0.528831 sec + 2,192,645,567 cycles # 2.833 GHz + 3,378,106,633 instructions # 1.54 insn per cycle + 0.861052908 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/gcheck.exe -p 64 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.238397e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.241190e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.241245e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.252232e+02 +- 1.234346e+02 ) GeV^-4 -TOTAL : 8.397194 sec - 28,888,354,559 cycles:u # 3.424 GHz (75.01%) - 11,624,614 stalled-cycles-frontend:u # 0.04% frontend cycles idle (75.01%) - 1,121,527,055 stalled-cycles-backend:u # 3.88% backend cycles idle (75.02%) - 22,649,300,700 instructions:u # 0.78 insn per cycle - # 0.05 stalled cycles per insn (75.06%) - 8.457715515 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 4.113905e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.147620e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.149017e+05 ) sec^-1 +MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 +TOTAL : 3.049267 sec + 9,507,642,735 cycles # 2.871 GHz + 19,066,132,971 instructions # 2.01 insn per cycle + 3.371164553 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 6.626675e-04 Avg ME (F77/CUDA) = 6.6266732376103494E-004 Relative difference = 2.659538381540814e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.162585e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.163416e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.163416e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 7.591256 sec - 26,650,428,737 cycles:u # 3.501 GHz (74.99%) - 44,584,150 stalled-cycles-frontend:u # 0.17% frontend cycles idle (74.99%) - 3,994,903,658 stalled-cycles-backend:u # 14.99% backend cycles idle (74.99%) - 82,451,104,184 instructions:u # 3.09 insn per cycle - # 0.05 stalled cycles per insn (74.99%) - 7.615919588 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.769606e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.770411e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.770411e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 9.278224 sec + 26,812,823,901 cycles # 2.889 GHz + 82,462,709,559 instructions # 3.08 insn per cycle + 9.289930135 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 6623) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731406016235E-004 Relative difference = 2.8059296349552523e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.081118e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.085777e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.085777e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 3.235482 sec - 11,379,481,573 cycles:u # 3.493 GHz (74.96%) - 3,503,367 stalled-cycles-frontend:u # 0.03% frontend cycles idle (74.96%) - 1,193,771,329 stalled-cycles-backend:u # 10.49% backend cycles idle (74.96%) - 38,541,307,073 instructions:u # 3.39 insn per cycle - # 0.03 stalled cycles per insn (74.96%) - 3.261507264 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.509625e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.512894e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.512894e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 4.686363 sec + 12,638,766,565 cycles # 2.696 GHz + 38,538,047,706 instructions # 3.05 insn per cycle + 4.708715306 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:12755) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266730246908442E-004 Relative difference = 2.98084507782618e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.217108e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.219788e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.219788e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 1.355326 sec - 4,803,691,181 cycles:u # 3.486 GHz (74.91%) - 846,490 stalled-cycles-frontend:u # 0.02% frontend cycles idle (75.04%) - 519,948,195 stalled-cycles-backend:u # 10.82% backend cycles idle (75.04%) - 13,592,579,448 instructions:u # 2.83 insn per cycle - # 0.04 stalled cycles per insn (75.04%) - 1.381084672 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10926) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 8.005037e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.021640e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.021640e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 2.058850 sec + 5,538,789,085 cycles # 2.684 GHz + 13,583,257,196 instructions # 2.45 insn per cycle + 2.079297542 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10944) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266730409276836E-004 Relative difference = 2.9563428359824236e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/check.exe -p 64 256 1 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 9.175649e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.196938e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.196938e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 1.797590 sec + 4,843,535,516 cycles # 2.687 GHz + 12,110,039,110 instructions # 2.50 insn per cycle + 1.813279758 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 9682) (512y: 76) (512z: 0) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266730409276836E-004 +Relative difference = 2.9563428359824236e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/check.exe -p 64 256 1 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 6.862805e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.874864e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.874864e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 2.399875 sec + 4,096,013,404 cycles # 1.704 GHz + 6,283,624,620 instructions # 1.53 insn per cycle + 2.418716991 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1528) (512y: 76) (512z: 9010) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266730409276836E-004 +Relative difference = 2.9563428359824236e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd1.txt index 7f51395c68..9cdb5ea5b9 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd1.txt @@ -1,181 +1,223 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasNoCurand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) +RNDGEN=hasCurand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd1' +CUDACPP_BUILDDIR='build.512y_m_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-01-31_13:58:15 +DATE: 2024-01-30_05:03:46 -On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/gcheck.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/gcheck.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = HIP:MIX+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.358889e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.423680e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.423885e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 0.535324 sec - 1,548,672,373 cycles:u # 2.766 GHz (75.36%) - 2,323,136 stalled-cycles-frontend:u # 0.15% frontend cycles idle (75.39%) - 50,258,647 stalled-cycles-backend:u # 3.25% backend cycles idle (75.85%) - 1,831,558,733 instructions:u # 1.18 insn per cycle - # 0.03 stalled cycles per insn (75.10%) - 0.583324308 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 3.463401e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.491582e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.494105e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 0.528345 sec + 2,191,366,155 cycles # 2.835 GHz + 3,376,981,873 instructions # 1.54 insn per cycle + 0.868249282 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/gcheck.exe -p 64 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = HIP:MIX+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.738857e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.744595e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.744713e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.252232e+02 +- 1.234346e+02 ) GeV^-4 -TOTAL : 7.034363 sec - 24,120,218,080 cycles:u # 3.411 GHz (74.97%) - 11,414,117 stalled-cycles-frontend:u # 0.05% frontend cycles idle (75.01%) - 1,120,074,241 stalled-cycles-backend:u # 4.64% backend cycles idle (75.00%) - 18,957,844,450 instructions:u # 0.79 insn per cycle - # 0.06 stalled cycles per insn (75.04%) - 7.093773300 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 4.141311e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.175321e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.176779e+05 ) sec^-1 +MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 +TOTAL : 3.034775 sec + 9,461,569,572 cycles # 2.871 GHz + 21,570,730,622 instructions # 2.28 insn per cycle + 3.354365055 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 6.626675e-04 Avg ME (F77/CUDA) = 6.6266732376103494E-004 Relative difference = 2.659538381540814e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 2.200217e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.201081e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.201081e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 7.461298 sec - 26,216,002,251 cycles:u # 3.503 GHz (74.99%) - 8,570,456 stalled-cycles-frontend:u # 0.03% frontend cycles idle (74.99%) - 3,485,137,026 stalled-cycles-backend:u # 13.29% backend cycles idle (74.99%) - 82,338,842,956 instructions:u # 3.14 insn per cycle - # 0.04 stalled cycles per insn (74.99%) - 7.485833169 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.763986e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.764820e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.764820e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 9.308542 sec + 26,818,191,963 cycles # 2.880 GHz + 82,362,969,124 instructions # 3.07 insn per cycle + 9.331807277 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 6491) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731406016235E-004 Relative difference = 2.8059296349552523e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.063768e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.068407e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.068407e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 3.246351 sec - 11,439,475,472 cycles:u # 3.499 GHz (74.89%) - 5,196,887 stalled-cycles-frontend:u # 0.05% frontend cycles idle (74.97%) - 1,364,315,247 stalled-cycles-backend:u # 11.93% backend cycles idle (75.04%) - 38,562,841,783 instructions:u # 3.37 insn per cycle - # 0.04 stalled cycles per insn (75.04%) - 3.272390742 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.494755e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.497969e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.497969e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 4.702744 sec + 12,651,856,685 cycles # 2.688 GHz + 38,557,643,348 instructions # 3.05 insn per cycle + 4.723006762 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:12729) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266730246908442E-004 Relative difference = 2.98084507782618e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.213273e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.215906e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.215906e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 1.359387 sec - 4,814,914,164 cycles:u # 3.484 GHz (74.73%) - 701,661 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.84%) - 488,467,217 stalled-cycles-backend:u # 10.14% backend cycles idle (75.08%) - 13,606,707,744 instructions:u # 2.83 insn per cycle - # 0.04 stalled cycles per insn (75.11%) - 1.386481494 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10908) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 8.057026e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.073448e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.073448e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 2.045356 sec + 5,503,322,263 cycles # 2.685 GHz + 13,599,131,001 instructions # 2.47 insn per cycle + 2.065937163 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10926) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266730409276836E-004 Relative difference = 2.9563428359824236e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd1/check.exe -p 64 256 1 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 9.173965e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.195231e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.195231e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 1.797623 sec + 4,836,406,491 cycles # 2.684 GHz + 12,123,840,407 instructions # 2.51 insn per cycle + 1.816744592 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 9659) (512y: 76) (512z: 0) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266730409276836E-004 +Relative difference = 2.9563428359824236e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd1/check.exe -p 64 256 1 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 6.872297e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.884618e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.884618e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 2.396265 sec + 4,088,419,794 cycles # 1.703 GHz + 6,289,480,909 instructions # 1.54 insn per cycle + 2.414194012 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1508) (512y: 76) (512z: 9009) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.626675e-04 +Avg ME (F77/C++) = 6.6266730409276836E-004 +Relative difference = 2.9563428359824236e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt index 362d9e06ac..10dc25694a 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt @@ -1,181 +1,223 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasNoCurand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) +RNDGEN=hasCurand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2024-01-31_14:00:16 +DATE: 2024-01-30_05:06:15 -On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.905901e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.911485e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.911576e+01 ) sec^-1 -MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 -TOTAL : 9.522044 sec - 32,979,124,021 cycles:u # 3.453 GHz (74.97%) - 3,578,135 stalled-cycles-frontend:u # 0.01% frontend cycles idle (75.00%) - 7,104,095 stalled-cycles-backend:u # 0.02% backend cycles idle (75.03%) - 26,013,812,715 instructions:u # 0.79 insn per cycle - # 0.00 stalled cycles per insn (75.04%) - 9.574363198 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 4.064289e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.064686e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.064874e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 +TOTAL : 2.459219 sec + 7,914,579,350 cycles # 2.876 GHz + 17,414,362,649 instructions # 2.20 insn per cycle + 2.856648920 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe -p 1 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.470769e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.474626e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.474648e+03 ) sec^-1 -MeanMatrixElemValue = ( 1.221264e+00 +- 1.219329e+00 ) GeV^-6 -TOTAL : 9.160361 sec - 31,690,243,585 cycles:u # 3.450 GHz (74.99%) - 3,785,612 stalled-cycles-frontend:u # 0.01% frontend cycles idle (75.00%) - 60,055,016 stalled-cycles-backend:u # 0.19% backend cycles idle (75.01%) - 25,048,526,645 instructions:u # 0.79 insn per cycle - # 0.00 stalled cycles per insn (75.02%) - 9.207355038 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 9.261836e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.264181e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.264456e+03 ) sec^-1 +MeanMatrixElemValue = ( 1.856249e-04 +- 8.329951e-05 ) GeV^-6 +TOTAL : 4.001930 sec + 12,466,660,301 cycles # 2.881 GHz + 28,598,806,424 instructions # 2.29 insn per cycle + 4.385332309 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 9.872263e-03 -Avg ME (F77/CUDA) = 9.8722595284406710E-003 -Relative difference = 3.516477760164775e-07 +Avg ME (F77/CUDA) = 9.8722595284406640E-003 +Relative difference = 3.5164777671934515e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.019899e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.019928e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.019928e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 -TOTAL : 5.178513 sec - 18,195,723,872 cycles:u # 3.499 GHz (74.93%) - 30,630,770 stalled-cycles-frontend:u # 0.17% frontend cycles idle (74.94%) - 2,065,218,530 stalled-cycles-backend:u # 11.35% backend cycles idle (74.99%) - 55,194,104,763 instructions:u # 3.03 insn per cycle - # 0.04 stalled cycles per insn (75.06%) - 5.202702937 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.667093e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.667308e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.667308e+01 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 +TOTAL : 6.887821 sec + 18,997,365,246 cycles # 2.759 GHz + 55,182,817,229 instructions # 2.90 insn per cycle + 6.894930966 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:44874) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595285514851E-003 Relative difference = 3.5163655122073967e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.232094e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.232222e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.232222e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 -TOTAL : 2.367099 sec - 8,357,225,941 cycles:u # 3.498 GHz (74.92%) - 1,713,232 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.89%) - 783,489,125 stalled-cycles-backend:u # 9.37% backend cycles idle (74.89%) - 27,125,618,956 instructions:u # 3.25 insn per cycle - # 0.03 stalled cycles per insn (74.81%) - 2.392556781 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.565125e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.565211e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.565211e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 +TOTAL : 3.384131 sec + 9,789,568,447 cycles # 2.893 GHz + 27,057,217,068 instructions # 2.76 insn per cycle + 3.398188002 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:97234) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595285514851E-003 Relative difference = 3.5163655122073967e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.162576e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.163242e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.163242e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 -TOTAL : 1.024351 sec - 3,648,814,216 cycles:u # 3.487 GHz (74.85%) - 1,279,142 stalled-cycles-frontend:u # 0.04% frontend cycles idle (74.77%) - 302,436,489 stalled-cycles-backend:u # 8.29% backend cycles idle (74.77%) - 9,598,531,237 instructions:u # 2.63 insn per cycle - # 0.03 stalled cycles per insn (74.77%) - 1.049480308 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:84261) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.331784e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.332213e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.332213e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 +TOTAL : 1.593056 sec + 4,251,132,724 cycles # 2.667 GHz + 9,566,982,441 instructions # 2.25 insn per cycle + 1.603318722 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:84279) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595285411531E-003 Relative difference = 3.516375977906115e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/check.exe -p 1 256 2 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.782288e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.782847e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.782847e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 +TOTAL : 1.405349 sec + 3,719,980,949 cycles # 2.646 GHz + 8,451,730,597 instructions # 2.27 insn per cycle + 1.418908281 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:79441) (512y: 90) (512z: 0) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 9.872263e-03 +Avg ME (F77/C++) = 9.8722595285411531E-003 +Relative difference = 3.516375977906115e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/check.exe -p 1 256 2 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.332107e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.332611e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.332611e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 +TOTAL : 1.593467 sec + 2,690,971,905 cycles # 1.687 GHz + 4,249,909,932 instructions # 1.58 insn per cycle + 1.609272621 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2166) (512y: 90) (512z:78318) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
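Note on the cross-checks: the "Relative difference" lines printed by the cmpExe steps in these logs compare the 7-digit averaged matrix element of one implementation against the full-precision value of another, and accept the run when they agree within 5E-3. A minimal standalone sketch of that tolerance test, using Avg ME values copied from the log itself (illustrative only, not the repo's cmpExe implementation):

// Sketch of the cmpExe tolerance test; the two input values are copied from
// the log above, and the |a/b - 1| definition is an assumption that
// reproduces the printed ~3.5e-07.
#include <cmath>
#include <cstdio>

int main()
{
  const double avgMeCpp = 9.872263e-03;           // Avg ME (C++/C++), 7 significant digits
  const double avgMeF77 = 9.8722595285411531E-03; // Avg ME (F77/C++), full double precision
  const double relDiff = std::fabs( avgMeF77 / avgMeCpp - 1 );
  std::printf( "Relative difference = %.16g\n", relDiff );
  std::printf( relDiff <= 5E-3 ? "OK (relative difference <= 5E-3)\n"
                               : "ERROR (relative difference > 5E-3)\n" );
  return relDiff <= 5E-3 ? 0 : 1;
}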
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 9.872263e-03 +Avg ME (F77/C++) = 9.8722595285411531E-003 +Relative difference = 3.516375977906115e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0_bridge.txt index 316f7c5721..14598d99fd 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0_bridge.txt @@ -1,190 +1,240 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasNoCurand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) +RNDGEN=hasCurand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2024-01-31_14:42:34 +DATE: 2024-01-30_05:50:15 -On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe -p 1 256 2 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe -p 1 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=256, gpublocks=1, gputhreads=256, gpublocks*gputhreads=256) WARNING! 
Set grid in Bridge (nevt=256, gpublocks=1, gputhreads=256, gpublocks*gputhreads=256) -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.903771e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.904490e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.904490e+01 ) sec^-1 -MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 -TOTAL : 9.602069 sec - 33,267,707,940 cycles:u # 3.455 GHz (74.99%) - 3,635,257 stalled-cycles-frontend:u # 0.01% frontend cycles idle (75.00%) - 7,352,981 stalled-cycles-backend:u # 0.02% backend cycles idle (75.01%) - 26,291,061,699 instructions:u # 0.79 insn per cycle - # 0.00 stalled cycles per insn (75.03%) - 9.654521726 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 4.062580e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.063573e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.063573e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 +TOTAL : 2.393250 sec + 7,805,787,223 cycles # 2.878 GHz + 17,759,546,689 instructions # 2.28 insn per cycle + 2.771767839 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe -p 1 256 1 --bridge +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +WARNING! Instantiate device Bridge (nevt=256, gpublocks=1, gputhreads=256, gpublocks*gputhreads=256) +WARNING! Set grid in Bridge (nevt=256, gpublocks=1, gputhreads=256, gpublocks*gputhreads=256) +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) WARNING! 
Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:DBL+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.467512e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.471029e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.471029e+03 ) sec^-1 -MeanMatrixElemValue = ( 1.221264e+00 +- 1.219329e+00 ) GeV^-6 -TOTAL : 9.169872 sec - 31,755,546,975 cycles:u # 3.453 GHz (74.97%) - 4,329,583 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.95%) - 57,451,118 stalled-cycles-backend:u # 0.18% backend cycles idle (74.97%) - 25,082,541,299 instructions:u # 0.79 insn per cycle - # 0.00 stalled cycles per insn (75.00%) - 9.216880999 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 9.205412e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.241153e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.241153e+03 ) sec^-1 +MeanMatrixElemValue = ( 1.856249e-04 +- 8.329951e-05 ) GeV^-6 +TOTAL : 3.999133 sec + 12,487,046,648 cycles # 2.887 GHz + 29,181,392,973 instructions # 2.34 insn per cycle + 4.379902707 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 9.872263e-03 -Avg ME (F77/CUDA) = 9.8722595284406710E-003 -Relative difference = 3.516477760164775e-07 +Avg ME (F77/CUDA) = 9.8722595284406640E-003 +Relative difference = 3.5164777671934515e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=256) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.021926e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.021953e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.021953e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 -TOTAL : 5.168329 sec - 18,190,353,086 cycles:u # 3.505 GHz (74.91%) - 29,000,533 stalled-cycles-frontend:u # 0.16% frontend cycles idle (75.03%) - 2,110,974,888 stalled-cycles-backend:u # 11.60% backend cycles idle (75.03%) - 55,195,082,247 instructions:u # 3.03 insn per cycle - # 0.04 stalled cycles per insn (75.03%) - 5.192332090 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.924049e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.924280e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.924280e+01 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 +TOTAL : 6.668799 sec + 18,978,883,548 cycles # 2.845 GHz + 55,181,310,686 instructions # 2.91 insn per cycle + 6.673958990 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:44874) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595285514851E-003 Relative difference = 3.5163655122073967e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=256) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.228820e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.228947e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.228947e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 -TOTAL : 2.370786 sec - 8,357,476,280 cycles:u # 3.492 GHz (74.93%) - 1,525,757 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.93%) - 763,908,253 stalled-cycles-backend:u # 9.14% backend cycles idle (74.93%) - 27,111,418,735 instructions:u # 3.24 insn per cycle - # 0.03 stalled cycles per insn (74.84%) - 2.396266638 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.558442e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.558530e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.558530e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 +TOTAL : 3.394020 sec + 9,815,752,501 cycles # 2.889 GHz + 27,056,612,659 instructions # 2.76 insn per cycle + 3.399148950 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:97234) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595285514851E-003 Relative difference = 3.5163655122073967e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=256) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.231422e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.232115e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.232115e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 -TOTAL : 1.011154 sec - 3,602,330,298 cycles:u # 3.485 GHz (74.57%) - 908,225 stalled-cycles-frontend:u # 0.03% frontend cycles idle (74.66%) - 243,683,993 stalled-cycles-backend:u # 6.76% backend cycles idle (75.01%) - 9,599,377,384 instructions:u # 2.66 insn per cycle - # 0.03 stalled cycles per insn (75.23%) - 1.036728920 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:84261) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.345002e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.345461e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.345461e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 +TOTAL : 1.587609 sec + 4,248,692,453 cycles # 2.674 GHz + 9,567,437,136 instructions # 2.25 insn per cycle + 1.592590793 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:84279) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 9.872263e-03 +Avg ME (F77/C++) = 9.8722595285411531E-003 +Relative difference = 3.516375977906115e-07 +OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! 
Instantiate host Bridge (nevt=256) +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.873515e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.874138e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.874138e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 +TOTAL : 1.369431 sec + 3,692,449,005 cycles # 2.689 GHz + 8,450,968,058 instructions # 2.29 insn per cycle + 1.374284426 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:79441) (512y: 90) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595285411531E-003 Relative difference = 3.516375977906115e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! Instantiate host Bridge (nevt=256) +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.369341e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.369854e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.369854e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 +TOTAL : 1.574041 sec + 2,686,211,452 cycles # 1.702 GHz + 4,249,274,815 instructions # 1.58 insn per cycle + 1.579137128 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2166) (512y: 90) (512z:78318) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
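Note on the grid arithmetic: the Bridge warnings earlier in this log make the meaning of the "-p <gpublocks> <gputhreads> <iterations>" arguments explicit: each iteration processes nevt = gpublocks * gputhreads events, and the Bridge rejects inconsistent grids. A hedged sketch of that consistency check (the function name is a hypothetical stand-in, not the repo's API):

// Sketch of the nevt = gpublocks * gputhreads consistency check echoed by
// the Bridge warnings above; setGridSketch is a hypothetical stand-in.
#include <cstdio>
#include <stdexcept>

void setGridSketch( unsigned int nevt, unsigned int gpublocks, unsigned int gputhreads )
{
  if( nevt != gpublocks * gputhreads )
    throw std::runtime_error( "nevt != gpublocks*gputhreads" );
  std::printf( "Set grid (nevt=%u, gpublocks=%u, gputhreads=%u, gpublocks*gputhreads=%u)\n",
               nevt, gpublocks, gputhreads, gpublocks * gputhreads );
}

int main()
{
  setGridSketch( 256, 1, 256 );    // "-p 1 256 2" above: 1 block of 256 threads
  setGridSketch( 16384, 64, 256 ); // "-p 64 256 1" above: 64 blocks of 256 threads
  return 0;
}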
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 9.872263e-03 +Avg ME (F77/C++) = 9.8722595285411531E-003 +Relative difference = 3.516375977906115e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd1.txt index 1d53f5cee3..869fccfa2f 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd1.txt @@ -1,181 +1,223 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasNoCurand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) +RNDGEN=hasCurand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd1' +CUDACPP_BUILDDIR='build.512y_d_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.none_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2024-01-31_14:01:57 +DATE: 2024-01-30_05:07:21 -On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/gcheck.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/gcheck.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.848562e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.854180e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.854240e+01 ) sec^-1 -MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 -TOTAL : 9.652377 sec - 33,505,735,625 cycles:u # 3.461 GHz (74.93%) - 3,572,159 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.95%) - 8,527,968 stalled-cycles-backend:u # 0.03% backend cycles idle (75.01%) - 26,418,167,401 instructions:u # 0.79 insn per cycle - # 0.00 stalled cycles per insn (75.02%) - 9.700899936 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 4.062893e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.063296e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.063519e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 +TOTAL : 2.463696 sec + 7,904,124,830 cycles # 2.867 GHz + 17,962,469,169 instructions # 2.27 insn per cycle + 2.863348110 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/gcheck.exe -p 1 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/gcheck.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/gcheck.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = HIP:DBL+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.481649e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.485248e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.485281e+03 ) sec^-1 -MeanMatrixElemValue = ( 1.221264e+00 +- 1.219329e+00 ) GeV^-6 -TOTAL : 9.149586 sec - 31,710,895,432 cycles:u # 3.455 GHz (74.97%) - 4,043,797 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.98%) - 45,699,251 stalled-cycles-backend:u # 0.14% backend cycles idle (75.00%) - 25,027,683,835 instructions:u # 0.79 insn per cycle - # 0.00 stalled cycles per insn (75.04%) - 9.197900477 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 9.275434e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.277655e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.278066e+03 ) sec^-1 +MeanMatrixElemValue = ( 1.856249e-04 +- 8.329951e-05 ) GeV^-6 +TOTAL : 4.004406 sec + 12,472,043,203 cycles # 2.872 GHz + 27,476,431,943 instructions # 2.20 insn per cycle + 4.397866058 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 9.872263e-03 -Avg ME (F77/CUDA) = 9.8722595284406710E-003 -Relative difference = 3.516477760164775e-07 +Avg ME (F77/CUDA) = 9.8722595284406640E-003 +Relative difference = 3.5164777671934515e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.024961e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.024989e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.024989e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 -TOTAL : 5.153075 sec - 18,122,203,390 cycles:u # 3.502 GHz (74.96%) - 29,439,456 stalled-cycles-frontend:u # 0.16% frontend cycles idle (74.96%) - 2,282,705,716 stalled-cycles-backend:u # 12.60% backend cycles idle (74.96%) - 55,175,170,632 instructions:u # 3.04 insn per cycle - # 0.04 stalled cycles per insn (74.96%) - 5.177202891 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.993181e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.993429e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.993429e+01 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 +TOTAL : 6.616864 sec + 18,937,214,023 cycles # 2.863 GHz + 55,162,675,285 instructions # 2.91 insn per cycle + 6.624084944 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:44747) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595285514851E-003 Relative difference = 3.5163655122073967e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd1/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd1/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.235560e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.235688e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.235688e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 -TOTAL : 2.362983 sec - 8,327,226,063 cycles:u # 3.491 GHz (74.85%) - 2,207,034 stalled-cycles-frontend:u # 0.03% frontend cycles idle (74.87%) - 817,488,808 stalled-cycles-backend:u # 9.82% backend cycles idle (74.94%) - 27,104,075,495 instructions:u # 3.25 insn per cycle - # 0.03 stalled cycles per insn (75.09%) - 2.388617027 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.560244e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.560337e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.560337e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 +TOTAL : 3.391033 sec + 9,810,909,577 cycles # 2.891 GHz + 27,064,931,751 instructions # 2.76 insn per cycle + 3.404410372 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:97230) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595285514851E-003 Relative difference = 3.5163655122073967e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd1/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd1/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.166153e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.166819e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.166819e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 -TOTAL : 1.023339 sec - 3,644,451,901 cycles:u # 3.486 GHz (74.84%) - 997,870 stalled-cycles-frontend:u # 0.03% frontend cycles idle (74.75%) - 269,479,223 stalled-cycles-backend:u # 7.39% backend cycles idle (74.75%) - 9,606,641,224 instructions:u # 2.64 insn per cycle - # 0.03 stalled cycles per insn (74.79%) - 1.048702145 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:84231) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.366743e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.367151e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.367151e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 +TOTAL : 1.577213 sec + 4,241,194,499 cycles # 2.687 GHz + 9,570,392,055 instructions # 2.26 insn per cycle + 1.590680511 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:84249) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595285411531E-003 Relative difference = 3.516375977906115e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd1/check.exe -p 1 256 2 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.823083e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.823621e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.823621e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 +TOTAL : 1.389663 sec + 3,742,544,913 cycles # 2.690 GHz + 8,455,558,047 instructions # 2.26 insn per cycle + 1.401942381 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:79386) (512y: 90) (512z: 0) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 9.872263e-03 +Avg ME (F77/C++) = 9.8722595285411531E-003 +Relative difference = 3.516375977906115e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd1/check.exe -p 1 256 2 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.367545e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.368096e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.368096e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 +TOTAL : 1.578445 sec + 2,686,793,480 cycles # 1.702 GHz + 4,251,847,609 instructions # 1.58 insn per cycle + 1.591347897 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2130) (512y: 90) (512z:78289) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. 
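Note on the recurring FPE warning: every run in these logs opens with "CUDACPP_RUNTIME_ENABLEFPE is set", i.e. floating-point-exception trapping is an opt-in controlled by an environment variable. A minimal sketch of that mechanism, assuming the glibc feenableexcept extension (one plausible implementation, not necessarily the repo's):

// Opt-in FPE trapping sketch; feenableexcept is a glibc extension exposed via
// <cfenv>/<fenv.h> when _GNU_SOURCE is defined (g++ on Linux defines it by default).
#include <cfenv>
#include <cstdio>
#include <cstdlib>

int main()
{
  if( std::getenv( "CUDACPP_RUNTIME_ENABLEFPE" ) != nullptr )
  {
    std::printf( "WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions\n" );
    feenableexcept( FE_INVALID | FE_DIVBYZERO | FE_OVERFLOW ); // trap NaNs, x/0 and overflow
  }
  // ... the actual event generation and checks would run here ...
  return 0;
}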
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 9.872263e-03 +Avg ME (F77/C++) = 9.8722595285411531E-003 +Relative difference = 3.516375977906115e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt index bfc75014f0..a75bd83e48 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt @@ -1,181 +1,223 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasNoCurand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) +RNDGEN=hasCurand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2024-01-31_14:03:39 +DATE: 2024-01-30_05:08:27 -On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.756123e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.759235e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.759270e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.927921e-03 +- 4.922372e-03 ) GeV^-6 -TOTAL : 4.678163 sec - 16,049,952,313 cycles:u # 3.411 GHz (74.99%) - 2,790,953 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.99%) - 6,464,230 stalled-cycles-backend:u # 0.04% backend cycles idle (74.90%) - 13,048,590,801 instructions:u # 0.81 insn per cycle - # 0.00 stalled cycles per insn (74.89%) - 4.728278426 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 6.769847e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.770754e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.771164e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.186984e-05 +- 9.824899e-06 ) GeV^-6 +TOTAL : 1.703094 sec + 5,571,181,653 cycles # 2.867 GHz + 11,974,166,174 instructions # 2.15 insn per cycle + 2.057946232 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe -p 1 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.158642e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.173754e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.173876e+03 ) sec^-1 -MeanMatrixElemValue = ( 1.216523e+00 +- 1.214588e+00 ) GeV^-6 -TOTAL : 4.796927 sec - 16,485,997,216 cycles:u # 3.418 GHz (74.95%) - 3,094,127 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.95%) - 52,025,079 stalled-cycles-backend:u # 0.32% backend cycles idle (74.85%) - 13,325,972,509 instructions:u # 0.81 insn per cycle - # 0.00 stalled cycles per insn (74.91%) - 4.844549271 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 2.318486e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.319261e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.319430e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.856829e-04 +- 8.333435e-05 ) GeV^-6 +TOTAL : 1.904733 sec + 6,266,697,659 cycles # 2.868 GHz + 13,596,680,456 instructions # 2.17 insn per cycle + 2.241129899 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 9.855155e-03 -Avg ME (F77/CUDA) = 9.8696023209835834E-003 -Relative difference = 0.0014659658811639687 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 9.849636e-03 +Avg ME (F77/CUDA) = 9.8712405367667715E-003 +Relative difference = 0.0021934350433631634 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.095569e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.095603e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.095603e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.924324e-03 +- 4.918778e-03 ) GeV^-6 -TOTAL : 4.820874 sec - 16,955,395,720 cycles:u # 3.501 GHz (74.90%) - 14,012,697 stalled-cycles-frontend:u # 0.08% frontend cycles idle (74.92%) - 1,919,969,828 stalled-cycles-backend:u # 11.32% backend cycles idle (75.00%) - 51,814,509,039 instructions:u # 3.06 insn per cycle - # 0.04 stalled cycles per insn (75.06%) - 4.845302656 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 8.651013e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.651286e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.651286e+01 ) sec^-1 +MeanMatrixElemValue = ( 1.187013e-05 +- 9.825040e-06 ) GeV^-6 +TOTAL : 6.115216 sec + 17,580,950,028 cycles # 2.876 GHz + 51,788,424,956 instructions # 2.95 insn per cycle + 6.122234952 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:27812) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.847961e-03 -Avg ME (F77/C++) = 9.8479612087414119E-003 -Relative difference = 2.1196409216982896e-08 +Avg ME (F77/C++) = 9.8479612087330436E-003 +Relative difference = 2.119555946686223e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.577122e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.577657e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.577657e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.924322e-03 +- 4.918776e-03 ) GeV^-6 -TOTAL : 1.155184 sec - 4,095,286,029 cycles:u # 3.479 GHz (74.86%) - 845,198 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.86%) - 406,702,774 stalled-cycles-backend:u # 9.93% backend cycles idle (74.86%) - 13,777,565,579 instructions:u # 3.36 insn per cycle - # 0.03 stalled cycles per insn (74.89%) - 1.180459682 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.365857e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.366295e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.366295e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187013e-05 +- 9.825038e-06 ) GeV^-6 +TOTAL : 1.576617 sec + 4,544,162,423 cycles # 2.878 GHz + 13,760,085,205 instructions # 3.03 insn per cycle + 1.587566374 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:97762) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 9.847957e-03 -Avg ME (F77/C++) = 9.8479574833965355E-003 -Relative difference = 4.9085971470122835e-08 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 9.847955e-03 +Avg ME (F77/C++) = 9.8479546894727158E-003 +Relative difference = 3.1532159158088894e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.020655e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.020912e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.020912e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.946830e-03 +- 4.941261e-03 ) GeV^-6 -TOTAL : 0.518680 sec - 1,875,512,901 cycles:u # 3.467 GHz (75.01%) - 1,293,148 stalled-cycles-frontend:u # 0.07% frontend cycles idle (74.87%) - 191,096,015 stalled-cycles-backend:u # 10.19% backend cycles idle (74.87%) - 4,860,923,155 instructions:u # 2.59 insn per cycle - # 0.04 stalled cycles per insn (74.87%) - 0.543969116 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:84813) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 6.652038e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.653755e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.653755e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187187e-05 +- 9.826763e-06 ) GeV^-6 +TOTAL : 0.803941 sec + 2,147,173,176 cycles # 2.667 GHz + 4,827,637,015 instructions # 2.25 insn per cycle + 0.818354401 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:84831) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 9.892973e-03 +Avg ME (F77/C++) = 9.8929728159608508E-003 +Relative difference = 1.8603017364363385e-08 +OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/check.exe -p 1 256 2 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.264093e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.266084e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.266084e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187187e-05 +- 9.826763e-06 ) GeV^-6 +TOTAL : 0.735161 sec + 1,890,652,826 cycles # 2.565 GHz + 4,260,215,320 instructions # 2.25 insn per cycle + 0.752160652 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:80038) (512y: 46) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.892973e-03 -Avg ME (F77/C++) = 9.8929728161012351E-003 -Relative difference = 1.8588827066662492e-08 +Avg ME (F77/C++) = 9.8929728159608508E-003 +Relative difference = 1.8603017364363385e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/check.exe -p 1 256 2 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 6.595587e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.597618e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.597618e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187188e-05 +- 9.826770e-06 ) GeV^-6 +TOTAL : 0.809620 sec + 1,357,631,253 cycles # 1.673 GHz + 2,149,171,041 instructions # 1.58 insn per cycle + 0.843747051 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2820) (512y: 44) (512z:78510) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 9.892980e-03 +Avg ME (F77/C++) = 9.8929802670331551E-003 +Relative difference = 2.699218597469717e-08 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0_bridge.txt index 2e35431afe..dd846fe890 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0_bridge.txt @@ -1,190 +1,240 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasNoCurand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) +RNDGEN=hasCurand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2024-01-31_14:44:15 +DATE: 2024-01-30_05:51:20 -On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe -p 1 256 2 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe -p 1 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=256, gpublocks=1, gputhreads=256, gpublocks*gputhreads=256) WARNING! 
Set grid in Bridge (nevt=256, gpublocks=1, gputhreads=256, gpublocks*gputhreads=256) -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.717389e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.717731e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.717731e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.935145e-03 +- 4.929588e-03 ) GeV^-6 -TOTAL : 4.722233 sec - 16,232,287,395 cycles:u # 3.418 GHz (74.90%) - 2,799,618 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.96%) - 7,657,548 stalled-cycles-backend:u # 0.05% backend cycles idle (75.03%) - 13,101,972,238 instructions:u # 0.81 insn per cycle - # 0.00 stalled cycles per insn (75.07%) - 4.773465845 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 6.783457e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.785575e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.785575e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187094e-05 +- 9.825664e-06 ) GeV^-6 +TOTAL : 1.618103 sec + 5,426,392,715 cycles # 2.867 GHz + 11,041,442,286 instructions # 2.03 insn per cycle + 1.951599799 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe -p 1 256 1 --bridge +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +WARNING! Instantiate device Bridge (nevt=256, gpublocks=1, gputhreads=256, gpublocks*gputhreads=256) +WARNING! Set grid in Bridge (nevt=256, gpublocks=1, gputhreads=256, gpublocks*gputhreads=256) +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) WARNING! 
Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:FLT+CXS:COMMON+RMBHST+BRDDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.157734e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.172831e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.172831e+03 ) sec^-1 -MeanMatrixElemValue = ( 1.258769e+00 +- 1.256832e+00 ) GeV^-6 -TOTAL : 4.797989 sec - 16,471,525,685 cycles:u # 3.415 GHz (74.93%) - 3,709,532 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.92%) - 51,132,370 stalled-cycles-backend:u # 0.31% backend cycles idle (74.87%) - 13,336,883,973 instructions:u # 0.81 insn per cycle - # 0.00 stalled cycles per insn (74.88%) - 4.844853430 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 2.306053e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.319762e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.319762e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.856441e-04 +- 8.331096e-05 ) GeV^-6 +TOTAL : 1.925915 sec + 6,319,647,912 cycles # 2.872 GHz + 13,785,417,374 instructions # 2.18 insn per cycle + 2.259607907 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 9.855155e-03 -Avg ME (F77/CUDA) = 9.8696023209835834E-003 -Relative difference = 0.0014659658811639687 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 9.849636e-03 +Avg ME (F77/CUDA) = 9.8712405367667715E-003 +Relative difference = 0.0021934350433631634 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=256) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.092875e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.092906e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.092906e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.924324e-03 +- 4.918778e-03 ) GeV^-6 -TOTAL : 4.833035 sec - 17,000,113,882 cycles:u # 3.502 GHz (74.96%) - 14,451,376 stalled-cycles-frontend:u # 0.09% frontend cycles idle (74.96%) - 1,952,835,950 stalled-cycles-backend:u # 11.49% backend cycles idle (74.96%) - 51,813,611,140 instructions:u # 3.05 insn per cycle - # 0.04 stalled cycles per insn (74.96%) - 4.857155972 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 8.635189e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.635467e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.635467e+01 ) sec^-1 +MeanMatrixElemValue = ( 1.187013e-05 +- 9.825040e-06 ) GeV^-6 +TOTAL : 6.119375 sec + 17,637,027,949 cycles # 2.881 GHz + 51,787,792,256 instructions # 2.94 insn per cycle + 6.124243714 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:27812) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.847961e-03 -Avg ME (F77/C++) = 9.8479612087414119E-003 -Relative difference = 2.1196409216982896e-08 +Avg ME (F77/C++) = 9.8479612087330436E-003 +Relative difference = 2.119555946686223e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=256) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.562792e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.563331e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.563331e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.924322e-03 +- 4.918776e-03 ) GeV^-6 -TOTAL : 1.158831 sec - 4,108,365,889 cycles:u # 3.479 GHz (74.94%) - 1,184,231 stalled-cycles-frontend:u # 0.03% frontend cycles idle (74.94%) - 419,202,788 stalled-cycles-backend:u # 10.20% backend cycles idle (74.94%) - 13,778,170,314 instructions:u # 3.35 insn per cycle - # 0.03 stalled cycles per insn (74.95%) - 1.184304500 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.362357e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.362789e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.362789e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187013e-05 +- 9.825038e-06 ) GeV^-6 +TOTAL : 1.576085 sec + 4,544,551,937 cycles # 2.877 GHz + 13,759,350,934 instructions # 3.03 insn per cycle + 1.581388093 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:97762) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 9.847957e-03 -Avg ME (F77/C++) = 9.8479574833965355E-003 -Relative difference = 4.9085971470122835e-08 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 9.847955e-03 +Avg ME (F77/C++) = 9.8479546894727158E-003 +Relative difference = 3.1532159158088894e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=256) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.038104e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.038375e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.038375e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.946830e-03 +- 4.941261e-03 ) GeV^-6 -TOTAL : 0.510431 sec - 1,846,138,323 cycles:u # 3.465 GHz (74.63%) - 427,215 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.48%) - 151,340,586 stalled-cycles-backend:u # 8.20% backend cycles idle (74.48%) - 4,873,469,205 instructions:u # 2.64 insn per cycle - # 0.03 stalled cycles per insn (74.59%) - 0.535898006 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:84813) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 6.701025e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.702845e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.702845e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187187e-05 +- 9.826763e-06 ) GeV^-6 +TOTAL : 0.794089 sec + 2,138,661,629 cycles # 2.680 GHz + 4,826,930,405 instructions # 2.26 insn per cycle + 0.798991405 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:84831) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.892973e-03 -Avg ME (F77/C++) = 9.8929728161012351E-003 -Relative difference = 1.8588827066662492e-08 +Avg ME (F77/C++) = 9.8929728159608508E-003 +Relative difference = 1.8603017364363385e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! 
Instantiate host Bridge (nevt=256) +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.613418e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.615510e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.615510e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187187e-05 +- 9.826763e-06 ) GeV^-6 +TOTAL : 0.699716 sec + 1,882,009,512 cycles # 2.675 GHz + 4,259,439,384 instructions # 2.26 insn per cycle + 0.704552121 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:80038) (512y: 46) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 9.892973e-03 +Avg ME (F77/C++) = 9.8929728159608508E-003 +Relative difference = 1.8603017364363385e-08 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! Instantiate host Bridge (nevt=256) +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 6.688489e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.690546e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.690546e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187188e-05 +- 9.826770e-06 ) GeV^-6 +TOTAL : 0.795848 sec + 1,355,819,871 cycles # 1.696 GHz + 2,148,215,879 instructions # 1.58 insn per cycle + 0.800761416 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2820) (512y: 44) (512z:78510) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 9.892980e-03 +Avg ME (F77/C++) = 9.8929802670331551E-003 +Relative difference = 2.699218597469717e-08 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd1.txt index 5518cc9752..90b9187b98 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd1.txt @@ -1,181 +1,223 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasNoCurand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) +RNDGEN=hasCurand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd1' +CUDACPP_BUILDDIR='build.512y_f_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.none_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2024-01-31_14:04:43 +DATE: 2024-01-30_05:09:16 -On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/gcheck.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/gcheck.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.778497e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.781890e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.781921e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.927921e-03 +- 4.922372e-03 ) GeV^-6 -TOTAL : 4.628898 sec - 15,907,548,391 cycles:u # 3.417 GHz (74.91%) - 2,873,282 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.86%) - 7,959,031 stalled-cycles-backend:u # 0.05% backend cycles idle (74.88%) - 12,933,345,737 instructions:u # 0.81 insn per cycle - # 0.00 stalled cycles per insn (74.95%) - 4.679899063 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 6.764318e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.765250e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.765666e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.186984e-05 +- 9.824899e-06 ) GeV^-6 +TOTAL : 1.705167 sec + 5,556,067,435 cycles # 2.852 GHz + 10,985,634,618 instructions # 1.98 insn per cycle + 2.060310498 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/gcheck.exe -p 1 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/gcheck.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/gcheck.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = HIP:FLT+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.132093e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.150020e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.150141e+03 ) sec^-1 -MeanMatrixElemValue = ( 1.216523e+00 +- 1.214588e+00 ) GeV^-6 -TOTAL : 4.798027 sec - 16,471,697,372 cycles:u # 3.415 GHz (75.01%) - 3,060,544 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.93%) - 46,584,268 stalled-cycles-backend:u # 0.28% backend cycles idle (74.95%) - 13,318,650,654 instructions:u # 0.81 insn per cycle - # 0.00 stalled cycles per insn (74.94%) - 4.845336497 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 2.344230e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.345038e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.345205e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.856829e-04 +- 8.333435e-05 ) GeV^-6 +TOTAL : 1.928465 sec + 6,365,812,176 cycles # 2.870 GHz + 12,742,048,160 instructions # 2.00 insn per cycle + 2.275067290 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/fgcheck.exe 2 64 2 -Avg ME (C++/CUDA) = 9.855155e-03 -Avg ME (F77/CUDA) = 9.8696023209835834E-003 -Relative difference = 0.0014659658811639687 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 9.849636e-03 +Avg ME (F77/CUDA) = 9.8712405367667715E-003 +Relative difference = 0.0021934350433631634 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.091904e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.091934e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.091934e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.924324e-03 +- 4.918778e-03 ) GeV^-6 -TOTAL : 4.836743 sec - 16,996,773,057 cycles:u # 3.498 GHz (74.97%) - 17,073,316 stalled-cycles-frontend:u # 0.10% frontend cycles idle (74.97%) - 1,656,388,245 stalled-cycles-backend:u # 9.75% backend cycles idle (74.97%) - 51,786,416,825 instructions:u # 3.05 insn per cycle - # 0.03 stalled cycles per insn (75.00%) - 4.860981674 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 8.700294e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.700564e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.700564e+01 ) sec^-1 +MeanMatrixElemValue = ( 1.187013e-05 +- 9.825040e-06 ) GeV^-6 +TOTAL : 6.077926 sec + 17,558,502,709 cycles # 2.889 GHz + 51,759,109,121 instructions # 2.95 insn per cycle + 6.085026833 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:27678) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.847961e-03 -Avg ME (F77/C++) = 9.8479612087396841E-003 -Relative difference = 2.119623377106246e-08 +Avg ME (F77/C++) = 9.8479612087313262E-003 +Relative difference = 2.1195385077844924e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 4.559383e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.559952e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.559952e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.924322e-03 +- 4.918776e-03 ) GeV^-6 -TOTAL : 1.159399 sec - 4,119,848,450 cycles:u # 3.487 GHz (75.03%) - 611,148 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.95%) - 401,429,588 stalled-cycles-backend:u # 9.74% backend cycles idle (74.95%) - 13,796,637,357 instructions:u # 3.35 insn per cycle - # 0.03 stalled cycles per insn (74.95%) - 1.184833688 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.376771e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.377174e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.377174e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187013e-05 +- 9.825038e-06 ) GeV^-6 +TOTAL : 1.572289 sec + 4,548,603,521 cycles # 2.891 GHz + 13,758,604,883 instructions # 3.02 insn per cycle + 1.583710945 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:97728) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 9.847957e-03 -Avg ME (F77/C++) = 9.8479574833965355E-003 -Relative difference = 4.9085971470122835e-08 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 9.847955e-03 +Avg ME (F77/C++) = 9.8479546894727158E-003 +Relative difference = 3.1532159158088894e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 1.020958e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.021239e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.021239e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.946830e-03 +- 4.941261e-03 ) GeV^-6 -TOTAL : 0.518489 sec - 1,875,147,196 cycles:u # 3.469 GHz (75.01%) - 964,959 stalled-cycles-frontend:u # 0.05% frontend cycles idle (74.85%) - 170,331,058 stalled-cycles-backend:u # 9.08% backend cycles idle (74.85%) - 4,863,058,667 instructions:u # 2.59 insn per cycle - # 0.04 stalled cycles per insn (74.85%) - 0.543762355 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:84775) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 6.592179e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.593820e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.593820e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187187e-05 +- 9.826763e-06 ) GeV^-6 +TOTAL : 0.808909 sec + 2,140,416,404 cycles # 2.637 GHz + 4,826,824,873 instructions # 2.26 insn per cycle + 0.906681144 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:84793) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 9.892973e-03 +Avg ME (F77/C++) = 9.8929728159608508E-003 +Relative difference = 1.8603017364363385e-08 +OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd1/check.exe -p 1 256 2 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.677326e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.679741e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.679741e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187187e-05 +- 9.826763e-06 ) GeV^-6 +TOTAL : 0.695609 sec + 1,868,752,206 cycles # 2.678 GHz + 4,259,067,854 instructions # 2.28 insn per cycle + 0.708960929 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:79978) (512y: 46) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.892973e-03 -Avg ME (F77/C++) = 9.8929728161012351E-003 -Relative difference = 1.8588827066662492e-08 +Avg ME (F77/C++) = 9.8929728159608508E-003 +Relative difference = 1.8603017364363385e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd1/check.exe -p 1 256 2 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 6.775075e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.777182e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.777182e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187188e-05 +- 9.826770e-06 ) GeV^-6 +TOTAL : 0.788693 sec + 1,354,650,321 cycles # 1.715 GHz + 2,148,091,187 instructions # 1.59 insn per cycle + 0.801177717 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2776) (512y: 44) (512z:78501) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. 
------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 9.892980e-03 +Avg ME (F77/C++) = 9.8929802670331551E-003 +Relative difference = 2.699218597469717e-08 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt index 5178c8cf68..4eda45e114 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt @@ -1,181 +1,223 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasNoCurand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) +RNDGEN=hasCurand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2024-01-31_14:05:47 +DATE: 2024-01-30_05:10:05 -On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/gcheck.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/gcheck.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.387501e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.392699e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.392765e+01 ) sec^-1 -MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 -TOTAL : 10.047124 sec - 34,815,668,507 cycles:u # 3.456 GHz (75.00%) - 3,625,381 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.94%) - 8,034,622 stalled-cycles-backend:u # 0.02% backend cycles idle (74.94%) - 27,505,142,119 instructions:u # 0.79 insn per cycle - # 0.00 stalled cycles per insn (75.01%) - 10.097930678 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 4.692959e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.693612e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.693848e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 +TOTAL : 2.179406 sec + 7,155,207,889 cycles # 2.861 GHz + 14,615,335,571 instructions # 2.04 insn per cycle + 2.559881855 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/gcheck.exe -p 1 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/gcheck.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/gcheck.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = HIP:MIX+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.191872e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.195128e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.195154e+03 ) sec^-1 -MeanMatrixElemValue = ( 1.221264e+00 +- 1.219329e+00 ) GeV^-6 -TOTAL : 9.583926 sec - 33,210,848,611 cycles:u # 3.456 GHz (74.96%) - 3,988,741 stalled-cycles-frontend:u # 0.01% frontend cycles idle (75.00%) - 56,320,623 stalled-cycles-backend:u # 0.17% backend cycles idle (75.03%) - 26,173,670,767 instructions:u # 0.79 insn per cycle - # 0.00 stalled cycles per insn (75.03%) - 9.633227442 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 1.111470e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.111782e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.111825e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.856249e-04 +- 8.329951e-05 ) GeV^-6 +TOTAL : 3.413893 sec + 10,746,707,284 cycles # 2.875 GHz + 23,674,149,917 instructions # 2.20 insn per cycle + 3.796927749 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 9.872263e-03 -Avg ME (F77/CUDA) = 9.8722599015656533E-003 -Relative difference = 3.138524921691728e-07 +Avg ME (F77/CUDA) = 9.8722599015656498E-003 +Relative difference = 3.1385249252060663e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.017456e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.017484e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.017484e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 -TOTAL : 5.191105 sec - 18,261,367,497 cycles:u # 3.503 GHz (74.99%) - 32,360,708 stalled-cycles-frontend:u # 0.18% frontend cycles idle (74.99%) - 2,211,411,467 stalled-cycles-backend:u # 12.11% backend cycles idle (74.99%) - 55,409,256,470 instructions:u # 3.03 insn per cycle - # 0.04 stalled cycles per insn (74.99%) - 5.215335296 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.884803e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.885022e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.885022e+01 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 +TOTAL : 6.704107 sec + 19,257,123,030 cycles # 2.874 GHz + 55,394,447,460 instructions # 2.88 insn per cycle + 6.709385430 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:44898) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595861831675E-003 Relative difference = 3.457988134687711e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.327899e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.328033e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.328033e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 -TOTAL : 2.269681 sec - 8,004,167,296 cycles:u # 3.492 GHz (74.87%) - 1,017,916 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.87%) - 775,823,367 stalled-cycles-backend:u # 9.69% backend cycles idle (74.90%) - 25,920,796,565 instructions:u # 3.24 insn per cycle - # 0.03 stalled cycles per insn (75.01%) - 2.295300917 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.509946e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.510039e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.510039e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 +TOTAL : 3.502177 sec + 9,384,694,038 cycles # 2.677 GHz + 25,874,743,625 instructions # 2.76 insn per cycle + 3.507349921 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:96804) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722594844308162E-003 Relative difference = 3.5610570575237004e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.411886e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.412622e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.412622e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 -TOTAL : 0.977034 sec - 3,480,538,606 cycles:u # 3.483 GHz (74.73%) - 1,928,349 stalled-cycles-frontend:u # 0.06% frontend cycles idle (75.07%) - 290,676,915 stalled-cycles-backend:u # 8.35% backend cycles idle (75.19%) - 9,134,232,930 instructions:u # 2.62 insn per cycle - # 0.03 stalled cycles per insn (75.19%) - 1.002459987 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:83802) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.557555e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.558062e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.558062e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 +TOTAL : 1.490188 sec + 4,000,749,453 cycles # 2.678 GHz + 9,119,038,902 instructions # 2.28 insn per cycle + 1.495279789 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:83820) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722594324461913E-003 Relative difference = 3.613714310412983e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd0/check.exe -p 1 256 2 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 4.057405e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.058069e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.058069e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 +TOTAL : 1.307627 sec + 3,513,640,690 cycles # 2.679 GHz + 8,029,011,845 instructions # 2.29 insn per cycle + 1.312711431 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:79028) (512y: 70) (512z: 0) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd0/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 9.872263e-03 +Avg ME (F77/C++) = 9.8722594324461913E-003 +Relative difference = 3.613714310412983e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd0/check.exe -p 1 256 2 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.350506e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.351010e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.351010e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 +TOTAL : 1.581908 sec + 2,606,864,065 cycles # 1.673 GHz + 4,077,382,976 instructions # 1.56 insn per cycle + 1.587144818 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1903) (512y: 70) (512z:78042) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 9.872263e-03 +Avg ME (F77/C++) = 9.8722594324461913E-003 +Relative difference = 3.613714310412983e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd1.txt index 6ac6df302f..328b61834e 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd1.txt @@ -1,181 +1,223 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasNoCurand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) +RNDGEN=hasCurand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd1' +CUDACPP_BUILDDIR='build.512y_m_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.none_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2024-01-31_14:07:31 +DATE: 2024-01-30_05:11:08 -On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/gcheck.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/gcheck.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = HIP:MIX+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.497198e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.502430e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.502469e+01 ) sec^-1 -MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 -TOTAL : 9.923800 sec - 34,389,805,654 cycles:u # 3.456 GHz (75.01%) - 3,707,069 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.99%) - 7,302,220 stalled-cycles-backend:u # 0.02% backend cycles idle (74.99%) - 27,148,100,293 instructions:u # 0.79 insn per cycle - # 0.00 stalled cycles per insn (74.99%) - 9.977924992 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 4.684370e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.684951e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.685153e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 +TOTAL : 2.181079 sec + 7,148,088,261 cycles # 2.853 GHz + 14,239,530,947 instructions # 1.99 insn per cycle + 2.562146879 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/gcheck.exe -p 1 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/gcheck.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/gcheck.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_HIP [clang 14.0.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = HIP:MIX+CXS:COMMON+RMBDEV+MESDEV/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.224267e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.227630e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.227658e+03 ) sec^-1 -MeanMatrixElemValue = ( 1.221264e+00 +- 1.219329e+00 ) GeV^-6 -TOTAL : 9.529095 sec - 33,040,071,235 cycles:u # 3.458 GHz (74.96%) - 3,926,744 stalled-cycles-frontend:u # 0.01% frontend cycles idle (74.98%) - 55,408,476 stalled-cycles-backend:u # 0.17% backend cycles idle (75.03%) - 26,027,529,168 instructions:u # 0.79 insn per cycle - # 0.00 stalled cycles per insn (75.05%) - 9.578892739 seconds time elapsed +EvtsPerSec[Rmb+ME] (23) = ( 1.111591e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.111914e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.111956e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.856249e-04 +- 8.329951e-05 ) GeV^-6 +TOTAL : 3.413150 sec + 10,755,861,454 cycles # 2.876 GHz + 23,518,245,564 instructions # 2.19 insn per cycle + 3.796500341 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/fgcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/fgcheck.exe 2 64 2 Avg ME (C++/CUDA) = 9.872263e-03 -Avg ME (F77/CUDA) = 9.8722599015656533E-003 -Relative difference = 3.138524921691728e-07 +Avg ME (F77/CUDA) = 9.8722599015656498E-003 +Relative difference = 3.1385249252060663e-07 OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.020515e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.020542e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.020542e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 -TOTAL : 5.175128 sec - 18,196,693,033 cycles:u # 3.502 GHz (74.91%) - 28,359,547 stalled-cycles-frontend:u # 0.16% frontend cycles idle (74.92%) - 2,265,467,080 stalled-cycles-backend:u # 12.45% backend cycles idle (74.97%) - 55,450,832,000 instructions:u # 3.05 insn per cycle - # 0.04 stalled cycles per insn (75.04%) - 5.199149057 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.912565e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.912803e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.912803e+01 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 +TOTAL : 6.680088 sec + 19,228,329,737 cycles # 2.877 GHz + 55,419,296,273 instructions # 2.88 insn per cycle + 6.685533383 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:44806) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595861831675E-003 Relative difference = 3.457988134687711e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 2.338198e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.338337e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.338337e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 -TOTAL : 2.259325 sec - 7,979,854,438 cycles:u # 3.497 GHz (74.80%) - 1,426,621 stalled-cycles-frontend:u # 0.02% frontend cycles idle (74.98%) - 851,798,980 stalled-cycles-backend:u # 10.67% backend cycles idle (75.11%) - 25,830,275,782 instructions:u # 3.24 insn per cycle - # 0.03 stalled cycles per insn (75.11%) - 2.285086237 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.515454e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.515537e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.515537e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 +TOTAL : 3.490021 sec + 9,348,051,078 cycles # 2.676 GHz + 25,823,110,897 instructions # 2.76 insn per cycle + 3.495053121 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:96765) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722594844308162E-003 Relative difference = 3.5610570575237004e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -EvtsPerSec[Rmb+ME] (23) = ( 5.471821e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.472557e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.472557e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 -TOTAL : 0.966338 sec - 3,444,953,941 cycles:u # 3.485 GHz (75.03%) - 1,775,471 stalled-cycles-frontend:u # 0.05% frontend cycles idle (74.92%) - 287,668,953 stalled-cycles-backend:u # 8.35% backend cycles idle (74.92%) - 9,128,413,240 instructions:u # 2.65 insn per cycle - # 0.03 stalled cycles per insn (74.92%) - 0.991652980 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:83360) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.556805e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.557285e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.557285e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 +TOTAL : 1.490221 sec + 4,003,060,439 cycles # 2.680 GHz + 9,098,942,911 instructions # 2.27 insn per cycle + 1.495311791 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:83378) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/runTest.exe +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722594324461913E-003 Relative difference = 3.613714310412983e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd1/check.exe -p 1 256 2 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 4.083203e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.083821e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.083821e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 +TOTAL : 1.299137 sec + 3,488,850,980 cycles # 2.678 GHz + 8,010,474,997 instructions # 2.30 insn per cycle + 1.304443015 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:78540) (512y: 70) (512z: 0) ------------------------------------------------------------------------- -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd1/check.exe is not supported (no avx512vl in /proc/cpuinfo) +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 9.872263e-03 +Avg ME (F77/C++) = 9.8722594324461913E-003 +Relative difference = 3.613714310412983e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd1/check.exe -p 1 256 2 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.440905e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.441442e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.441442e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 +TOTAL : 1.541232 sec + 2,598,862,718 cycles # 1.682 GHz + 4,064,975,706 instructions # 1.56 insn per cycle + 1.546247038 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1420) (512y: 70) (512z:78026) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 9.872263e-03 +Avg ME (F77/C++) = 9.8722594324461913E-003 +Relative difference = 3.613714310412983e-07 +OK (relative difference <= 5E-3) ========================================================================= TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt index 69902e7516..5667ce458e 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt @@ -1,108 +1,223 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasNoCurand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) +RNDGEN=hasCurand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2024-01-31_13:58:50 +DATE: 2024-01-30_05:04:25 -On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 10 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 2.650880e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.304183e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.677107e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.453043 sec + 1,889,864,608 cycles # 2.824 GHz + 2,684,689,341 instructions # 1.42 insn per cycle + 0.749142975 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe: Segmentation fault - 762,819,983 cycles:u # 2.210 GHz (74.53%) - 2,579,368 stalled-cycles-frontend:u # 0.34% frontend cycles idle (75.81%) - 24,249,934 stalled-cycles-backend:u # 3.18% backend cycles idle (75.94%) - 1,261,592,345 instructions:u # 1.65 insn per cycle - # 0.02 stalled cycles per insn (74.30%) - 0.374903403 seconds time elapsed +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe: Segmentation fault - 2,642,925,381 cycles:u # 2.746 GHz (75.05%) - 14,864,654 stalled-cycles-frontend:u # 0.56% frontend cycles idle (75.07%) - 291,338,514 stalled-cycles-backend:u # 11.02% backend cycles idle (74.64%) - 2,547,011,174 instructions:u # 0.96 insn per cycle - # 0.11 stalled cycles per insn (74.36%) - 0.985874750 seconds time elapsed +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 3.266493e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.111955e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.526543e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.602505e+02 +- 2.116328e+02 ) GeV^-2 +TOTAL : 0.538455 sec + 2,216,644,376 cycles # 2.828 GHz + 3,102,394,165 instructions # 1.40 insn per cycle + 0.841378524 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 -Memory access fault by GPU node-4 (Agent handle: 0x6939ee0) on address 0x1456d31c9000. Reason: Unknown. - -Program received signal SIGABRT: Process abort signal. - -Backtrace for this error: -#0 0x145968539dbf in ??? -#1 0x145968539d2b in ??? -#2 0x14596853b3e4 in ??? -#3 0x145960a0cb64 in ??? -#4 0x145960a09b38 in ??? -#5 0x1459609c7496 in ??? -#6 0x1459684d36e9 in ??? -#7 0x14596860749e in ??? -#8 0xffffffffffffffff in ??? -Avg ME (C++/CUDA) = -Avg ME (F77/CUDA) = -ERROR! 
Fortran calculation (F77/CUDA) crashed +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 1.424749e-01 +Avg ME (F77/CUDA) = 0.14247482467490466 +Relative difference = 5.286902838873106e-07 +OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.173381e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.193114e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.193114e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.914935e+02 +- 1.163297e+02 ) GeV^-2 -TOTAL : 1.419762 sec - 5,029,284,507 cycles:u # 3.488 GHz (74.89%) - 2,404,209 stalled-cycles-frontend:u # 0.05% frontend cycles idle (75.04%) - 674,718,308 stalled-cycles-backend:u # 13.42% backend cycles idle (75.04%) - 13,812,210,452 instructions:u # 2.75 insn per cycle - # 0.05 stalled cycles per insn (75.04%) - 1.444206774 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 9.822300e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.003024e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.003024e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 1.695633 sec + 4,892,910,077 cycles # 2.883 GHz + 13,801,787,359 instructions # 2.82 insn per cycle + 1.705964185 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1166) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/runTest.exe -Memory access fault by GPU node-4 (Agent handle: 0x63f5d0) on address 0x153411a99000. Reason: Unknown. +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.424749e-01 +Avg ME (F77/C++) = 0.14247482467499481 +Relative difference = 5.286896511435107e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/check.exe -p 64 256 10 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.896648e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.972375e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.972375e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.887238 sec + 2,571,261,116 cycles # 2.883 GHz + 7,401,200,610 instructions # 2.88 insn per cycle + 0.906229412 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 2895) (avx2: 0) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.424749e-01 +Avg ME (F77/C++) = 0.14247482467499475 +Relative difference = 5.286896515331313e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/check.exe -p 64 256 10 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.154928e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.367723e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.367723e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.543357 sec + 1,480,133,709 cycles # 2.701 GHz + 3,136,765,286 instructions # 2.12 insn per cycle + 0.561297241 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2890) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.424749e-01 +Avg ME (F77/C++) = 0.14247482467492595 +Relative difference = 5.286901344678233e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/check.exe -p 64 256 10 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.571891e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.844626e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.844626e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.482249 sec + 1,314,348,676 cycles # 2.699 GHz + 2,923,288,921 instructions # 2.22 insn per cycle + 0.498803372 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2543) (512y: 93) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.424749e-01 +Avg ME (F77/C++) = 0.14247482467492595 +Relative difference = 5.286901344678233e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/check.exe -p 64 256 10 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.408041e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.532332e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.532332e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.706292 sec + 1,273,944,985 cycles # 1.792 GHz + 1,900,262,296 instructions # 1.49 insn per cycle + 0.723222352 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1135) (512y: 62) (512z: 2165) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.424749e-01 +Avg ME (F77/C++) = 0.14247482467492595 +Relative difference = 5.286901344678233e-07 +OK (relative difference <= 5E-3) +========================================================================= + +TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0_bridge.txt index 32380b0244..7b59743406 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0_bridge.txt @@ -1,115 +1,240 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasNoCurand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) +RNDGEN=hasCurand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2024-01-31_14:41:01 +DATE: 2024-01-30_05:48:29 -On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 10 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 10 --bridge OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) +WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 3.408359e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.101986e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.101986e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.481872 sec + 1,962,034,459 cycles # 2.824 GHz + 2,925,170,965 instructions # 1.49 insn per cycle + 0.753942373 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe: Aborted - 933,839,268 cycles:u # 2.433 GHz (74.16%) - 2,759,499 stalled-cycles-frontend:u # 0.30% frontend cycles idle (74.40%) - 38,991,447 stalled-cycles-backend:u # 4.18% backend cycles idle (75.69%) - 1,391,319,621 instructions:u # 1.49 insn per cycle - # 0.03 stalled cycles per insn (75.61%) - 0.714875699 seconds time elapsed +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe: Aborted - 3,210,881,286 cycles:u # 2.813 GHz (74.81%) - 30,342,476 stalled-cycles-frontend:u # 0.94% frontend cycles idle (75.10%) - 856,548,279 stalled-cycles-backend:u # 26.68% backend cycles idle (75.42%) - 3,341,623,817 instructions:u # 1.04 insn per cycle - # 0.26 stalled cycles per insn (75.24%) - 1.262099462 seconds time elapsed +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 3.119182e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.257748e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.257748e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.602505e+02 +- 2.116328e+02 ) GeV^-2 +TOTAL : 0.770186 sec + 2,924,566,680 cycles # 2.837 GHz + 4,475,846,392 instructions # 1.53 insn per cycle + 1.089161093 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 -Memory access fault by GPU node-4 (Agent handle: 0x6939ee0) on address 0x153f85189000. Reason: Unknown. - -Program received signal SIGABRT: Process abort signal. - -Backtrace for this error: -#0 0x15421a4fbdbf in ??? -#1 0x15421a4fbd2b in ??? -#2 0x15421a4fd3e4 in ??? -#3 0x1542129ceb64 in ??? -#4 0x1542129cbb38 in ??? -#5 0x154212989496 in ??? -#6 0x15421a4956e9 in ??? -#7 0x15421a5c949e in ??? -#8 0xffffffffffffffff in ??? -Avg ME (C++/CUDA) = -Avg ME (F77/CUDA) = -ERROR! Fortran calculation (F77/CUDA) crashed +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 1.424749e-01 +Avg ME (F77/CUDA) = 0.14247482467490466 +Relative difference = 5.286902838873106e-07 +OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.172468e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.192155e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.192155e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.914935e+02 +- 1.163297e+02 ) GeV^-2 -TOTAL : 1.424936 sec - 5,033,997,033 cycles:u # 3.476 GHz (74.84%) - 2,462,644 stalled-cycles-frontend:u # 0.05% frontend cycles idle (75.07%) - 665,583,775 stalled-cycles-backend:u # 13.22% backend cycles idle (75.15%) - 13,809,895,327 instructions:u # 2.74 insn per cycle - # 0.05 stalled cycles per insn (75.15%) - 1.450408660 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 9.824024e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.002926e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.002926e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 1.698586 sec + 4,927,814,709 cycles # 2.894 GHz + 13,806,118,322 instructions # 2.80 insn per cycle + 1.704123738 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1166) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/runTest.exe -Memory access fault by GPU node-4 (Agent handle: 0x63f5d0) on address 0x14e81f6b9000. Reason: Unknown. +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.424749e-01 +Avg ME (F77/C++) = 0.14247482467499481 +Relative difference = 5.286896511435107e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! 
Instantiate host Bridge (nevt=16384) +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.886173e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.963508e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.963508e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.900867 sec + 2,618,017,951 cycles # 2.892 GHz + 7,450,102,141 instructions # 2.85 insn per cycle + 0.906367581 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 2895) (avx2: 0) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.424749e-01 +Avg ME (F77/C++) = 0.14247482467499475 +Relative difference = 5.286896515331313e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! Instantiate host Bridge (nevt=16384) +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.122916e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.345144e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.345144e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.557525 sec + 1,528,674,468 cycles # 2.721 GHz + 3,187,083,360 instructions # 2.08 insn per cycle + 0.563020024 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2890) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.424749e-01 +Avg ME (F77/C++) = 0.14247482467492595 +Relative difference = 5.286901344678233e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! Instantiate host Bridge (nevt=16384) +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.528840e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.810605e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.810605e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.496872 sec + 1,359,999,193 cycles # 2.712 GHz + 2,973,904,476 instructions # 2.19 insn per cycle + 0.502643224 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2543) (512y: 93) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.424749e-01 +Avg ME (F77/C++) = 0.14247482467492595 +Relative difference = 5.286901344678233e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! 
Instantiate host Bridge (nevt=16384) +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.332416e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.457397e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.457397e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.738182 sec + 1,327,509,915 cycles # 1.788 GHz + 1,939,124,841 instructions # 1.46 insn per cycle + 0.743808066 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1135) (512y: 62) (512z: 2165) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.424749e-01 +Avg ME (F77/C++) = 0.14247482467492595 +Relative difference = 5.286901344678233e-07 +OK (relative difference <= 5E-3) +========================================================================= + +TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd1.txt index bccb6906c3..4deacb88f2 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd1.txt @@ -1,108 +1,223 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasNoCurand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) +RNDGEN=hasCurand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd1' +CUDACPP_BUILDDIR='build.512y_d_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.none_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2024-01-31_13:59:04 +DATE: 2024-01-30_05:04:44 -On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/gcheck.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/gcheck.exe -p 64 256 10 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 2.642894e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.200887e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.567165e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.451244 sec + 1,883,873,657 cycles # 2.821 GHz + 2,671,262,226 instructions # 1.42 insn per cycle + 0.747348766 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/gcheck.exe -p 64 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/gcheck.exe: Segmentation fault - 740,918,663 cycles:u # 2.126 GHz (74.83%) - 2,590,438 stalled-cycles-frontend:u # 0.35% frontend cycles idle (76.36%) - 35,134,179 stalled-cycles-backend:u # 4.74% backend cycles idle (78.16%) - 1,223,153,146 instructions:u # 1.65 insn per cycle - # 0.03 stalled cycles per insn (74.88%) - 0.399372258 seconds time elapsed +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/gcheck.exe: Segmentation fault - 2,633,479,698 cycles:u # 2.738 GHz (75.04%) - 21,030,407 stalled-cycles-frontend:u # 0.80% frontend cycles idle (75.06%) - 860,456,505 stalled-cycles-backend:u # 32.67% backend cycles idle (74.56%) - 2,537,896,925 instructions:u # 0.96 insn per cycle - # 0.34 stalled cycles per insn (74.39%) - 0.985747886 seconds time elapsed +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 3.228371e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.990649e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.395918e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.602505e+02 +- 2.116328e+02 ) GeV^-2 +TOTAL : 0.540844 sec + 2,218,030,903 cycles # 2.829 GHz + 3,154,136,532 instructions # 1.42 insn per cycle + 0.843504278 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/fgcheck.exe 2 64 2 -Memory access fault by GPU node-4 (Agent handle: 0x6939e30) on address 0x152882c79000. Reason: Unknown. - -Program received signal SIGABRT: Process abort signal. 
- -Backtrace for this error: -#0 0x152b17febdbf in ??? -#1 0x152b17febd2b in ??? -#2 0x152b17fed3e4 in ??? -#3 0x152b104beb64 in ??? -#4 0x152b104bbb38 in ??? -#5 0x152b10479496 in ??? -#6 0x152b17f856e9 in ??? -#7 0x152b180b949e in ??? -#8 0xffffffffffffffff in ??? -Avg ME (C++/CUDA) = -Avg ME (F77/CUDA) = -ERROR! Fortran calculation (F77/CUDA) crashed +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 1.424749e-01 +Avg ME (F77/CUDA) = 0.14247482467490466 +Relative difference = 5.286902838873106e-07 +OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.173724e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.193378e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.193378e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.914935e+02 +- 1.163297e+02 ) GeV^-2 -TOTAL : 1.419080 sec - 5,007,260,230 cycles:u # 3.475 GHz (75.02%) - 2,377,599 stalled-cycles-frontend:u # 0.05% frontend cycles idle (75.02%) - 879,033,575 stalled-cycles-backend:u # 17.56% backend cycles idle (75.02%) - 13,839,303,650 instructions:u # 2.76 insn per cycle - # 0.06 stalled cycles per insn (75.03%) - 1.443186270 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 9.831536e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.003712e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.003712e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 1.690067 sec + 4,884,610,591 cycles # 2.883 GHz + 13,807,943,276 instructions # 2.83 insn per cycle + 1.700194727 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1161) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/runTest.exe -Memory access fault by GPU node-4 (Agent handle: 0x666280) on address 0x14c889299000. Reason: Unknown. +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.424749e-01 +Avg ME (F77/C++) = 0.14247482467499481 +Relative difference = 5.286896511435107e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd1/check.exe -p 64 256 10 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.876876e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.953061e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.953061e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.896918 sec + 2,573,000,483 cycles # 2.854 GHz + 7,407,132,972 instructions # 2.88 insn per cycle + 0.971480588 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 2892) (avx2: 0) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.424749e-01 +Avg ME (F77/C++) = 0.14247482467499475 +Relative difference = 5.286896515331313e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd1/check.exe -p 64 256 10 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.133331e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.344053e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.344053e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.546739 sec + 1,486,856,812 cycles # 2.696 GHz + 3,137,676,944 instructions # 2.11 insn per cycle + 0.563341736 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2875) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.424749e-01 +Avg ME (F77/C++) = 0.14247482467492595 +Relative difference = 5.286901344678233e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd1/check.exe -p 64 256 10 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.567673e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.839669e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.839669e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.482732 sec + 1,314,507,412 cycles # 2.697 GHz + 2,925,746,939 instructions # 2.23 insn per cycle + 0.501062508 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2527) (512y: 93) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.424749e-01 +Avg ME (F77/C++) = 0.14247482467492595 +Relative difference = 5.286901344678233e-07 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd1/check.exe -p 64 256 10 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.394430e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.516439e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.516439e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.710071 sec + 1,273,890,672 cycles # 1.782 GHz + 1,899,944,131 instructions # 1.49 insn per cycle + 0.727352268 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1118) (512y: 62) (512z: 2165) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.424749e-01 +Avg ME (F77/C++) = 0.14247482467492595 +Relative difference = 5.286901344678233e-07 +OK (relative difference <= 5E-3) +========================================================================= + +TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt index b69cf112a4..1362a87ac8 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt @@ -1,108 +1,223 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasNoCurand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) +RNDGEN=hasCurand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. 
-CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2024-01-31_13:59:19 +DATE: 2024-01-30_05:05:03 -On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 10 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 5.327203e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.210086e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.349272e+08 ) sec^-1 +MeanMatrixElemValue = ( 2.018174e+01 +- 1.429492e+01 ) GeV^-2 +TOTAL : 0.446159 sec + 1,908,363,704 cycles # 2.829 GHz + 2,678,040,252 instructions # 1.40 insn per cycle + 0.749417997 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe: Segmentation fault - 760,941,120 cycles:u # 2.190 GHz (75.67%) - 2,577,448 stalled-cycles-frontend:u # 0.34% frontend cycles idle (74.40%) - 21,547,300 stalled-cycles-backend:u # 2.83% backend cycles idle (75.80%) - 1,198,745,029 instructions:u # 1.58 insn per cycle - # 0.02 stalled cycles per insn (76.69%) - 0.375099933 seconds time elapsed +==PROF== Profiling "sigmaKin": launch__registers_per_thread 167 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe: Segmentation fault - 2,539,411,178 cycles:u # 2.802 GHz (75.28%) - 21,101,958 stalled-cycles-frontend:u # 0.83% frontend cycles idle (75.32%) - 853,164,959 stalled-cycles-backend:u # 33.60% backend cycles idle (75.30%) - 2,424,669,574 instructions:u # 0.95 insn per cycle - # 0.35 stalled cycles per insn (75.22%) - 0.944209481 seconds time elapsed +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 7.267889e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.817352e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.969269e+08 ) sec^-1 +MeanMatrixElemValue = ( 2.571361e+02 +- 2.114021e+02 ) GeV^-2 +TOTAL : 0.483640 sec + 2,013,701,507 cycles # 2.833 GHz + 2,869,047,503 instructions # 1.42 insn per cycle + 0.770237631 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 -Memory access fault by GPU node-4 (Agent handle: 0x6937f00) on address 0x150f923fc000. Reason: Unknown. - -Program received signal SIGABRT: Process abort signal. - -Backtrace for this error: -#0 0x151227767dbf in ??? -#1 0x151227767d2b in ??? -#2 0x1512277693e4 in ??? -#3 0x15121fc3ab64 in ??? -#4 0x15121fc37b38 in ??? -#5 0x15121fbf5496 in ??? -#6 0x1512277016e9 in ??? -#7 0x15122783549e in ??? -#8 0xffffffffffffffff in ??? -Avg ME (C++/CUDA) = -Avg ME (F77/CUDA) = -ERROR! Fortran calculation (F77/CUDA) crashed +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 1.424226e-01 +Avg ME (F77/CUDA) = 0.14247488790821983 +Relative difference = 0.00036713209996037764 +OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.428037e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.458241e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.458241e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.945525e+02 +- 1.186197e+02 ) GeV^-2 -TOTAL : 1.168612 sec - 4,133,128,597 cycles:u # 3.471 GHz (75.15%) - 2,028,802 stalled-cycles-frontend:u # 0.05% frontend cycles idle (75.15%) - 246,889,606 stalled-cycles-backend:u # 5.97% backend cycles idle (74.82%) - 12,624,578,314 instructions:u # 3.05 insn per cycle - # 0.02 stalled cycles per insn (74.82%) - 1.192857887 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.109983e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.136218e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.136218e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018563e+01 +- 1.429902e+01 ) GeV^-2 +TOTAL : 1.498447 sec + 4,345,988,139 cycles # 2.893 GHz + 12,596,967,872 instructions # 2.90 insn per cycle + 1.511882134 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 773) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/runTest.exe -Memory access fault by GPU node-4 (Agent handle: 0x61d1a0) on address 0x14bb83fc4000. Reason: Unknown. +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.424686e-01 +Avg ME (F77/C++) = 0.14246860569653919 +Relative difference = 3.998452420257791e-08 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/check.exe -p 64 256 10 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.116392e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.330955e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.330955e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018563e+01 +- 1.429902e+01 ) GeV^-2 +TOTAL : 0.547469 sec + 1,595,191,710 cycles # 2.889 GHz + 4,246,785,925 instructions # 2.66 insn per cycle + 0.566121323 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 3265) (avx2: 0) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.424686e-01 +Avg ME (F77/C++) = 0.14246860808920836 +Relative difference = 5.677888572434963e-08 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/check.exe -p 64 256 10 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 5.705122e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.431194e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.431194e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2 +TOTAL : 0.309154 sec + 853,106,357 cycles # 2.719 GHz + 1,916,236,758 instructions # 2.25 insn per cycle + 0.322202646 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3488) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.424749e-01 +Avg ME (F77/C++) = 0.14247490815036912 +Relative difference = 5.7205649062398515e-08 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/check.exe -p 64 256 10 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 6.291153e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.186493e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.186493e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2 +TOTAL : 0.282114 sec + 781,605,305 cycles # 2.726 GHz + 1,797,850,243 instructions # 2.30 insn per cycle + 0.301017972 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3186) (512y: 15) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.424749e-01 +Avg ME (F77/C++) = 0.14247490815036912 +Relative difference = 5.7205649062398515e-08 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/check.exe -p 64 256 10 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 4.544342e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.998908e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.998908e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018829e+01 +- 1.429922e+01 ) GeV^-2 +TOTAL : 0.384301 sec + 720,859,118 cycles # 1.854 GHz + 1,288,039,773 instructions # 1.79 insn per cycle + 0.402338897 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1730) (512y: 24) (512z: 2387) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.424749e-01 +Avg ME (F77/C++) = 0.14247490450137867 +Relative difference = 3.159418737238044e-08 +OK (relative difference <= 5E-3) +========================================================================= + +TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0_bridge.txt index 3197cced27..8cb59221d4 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0_bridge.txt @@ -1,115 +1,240 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasNoCurand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) +RNDGEN=hasCurand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2024-01-31_14:41:16 +DATE: 2024-01-30_05:48:48 -On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 10 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 10 --bridge OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost +WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) +WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 5.444132e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.000397e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.000397e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.017654e+01 +- 1.429184e+01 ) GeV^-2 +TOTAL : 0.460393 sec + 1,902,959,760 cycles # 2.835 GHz + 2,813,040,217 instructions # 1.48 insn per cycle + 0.731144371 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe: Segmentation fault - 802,769,560 cycles:u # 2.308 GHz (73.70%) - 2,883,518 stalled-cycles-frontend:u # 0.36% frontend cycles idle (75.71%) - 21,503,278 stalled-cycles-backend:u # 2.68% backend cycles idle (74.79%) - 1,254,584,929 instructions:u # 1.56 insn per cycle - # 0.02 stalled cycles per insn (74.79%) - 0.371915527 seconds time elapsed +==PROF== Profiling "sigmaKin": launch__registers_per_thread 167 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe: Segmentation fault - 2,881,932,730 cycles:u # 2.848 GHz (74.69%) - 29,590,208 stalled-cycles-frontend:u # 1.03% frontend cycles idle (75.61%) - 854,011,561 stalled-cycles-backend:u # 29.63% backend cycles idle (75.50%) - 3,070,710,792 instructions:u # 1.07 insn per cycle - # 0.28 stalled cycles per insn (75.46%) - 1.032695848 seconds time elapsed +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 4.962207e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.533260e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.533260e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.609942e+02 +- 2.115590e+02 ) GeV^-2 +TOTAL : 0.631701 sec + 2,471,933,418 cycles # 2.836 GHz + 3,725,494,141 instructions # 1.51 insn per cycle + 0.929474422 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 -Memory access fault by GPU node-4 (Agent handle: 0x6937f00) on address 0x1463d05f4000. Reason: Unknown. - -Program received signal SIGABRT: Process abort signal. - -Backtrace for this error: -#0 0x14666595fdbf in ??? -#1 0x14666595fd2b in ??? -#2 0x1466659613e4 in ??? -#3 0x14665de32b64 in ??? -#4 0x14665de2fb38 in ??? -#5 0x14665dded496 in ??? -#6 0x1466658f96e9 in ??? -#7 0x146665a2d49e in ??? -#8 0xffffffffffffffff in ??? -Avg ME (C++/CUDA) = -Avg ME (F77/CUDA) = -ERROR! Fortran calculation (F77/CUDA) crashed +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 1.424226e-01 +Avg ME (F77/CUDA) = 0.14247488790821983 +Relative difference = 0.00036713209996037764 +OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.426628e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.456791e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.456791e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.945525e+02 +- 1.186197e+02 ) GeV^-2 -TOTAL : 1.171749 sec - 4,150,620,502 cycles:u # 3.475 GHz (74.60%) - 2,341,963 stalled-cycles-frontend:u # 0.06% frontend cycles idle (74.71%) - 249,108,394 stalled-cycles-backend:u # 6.00% backend cycles idle (75.00%) - 12,628,090,084 instructions:u # 3.04 insn per cycle - # 0.02 stalled cycles per insn (75.23%) - 1.196363960 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.095228e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.121318e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.121318e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018563e+01 +- 1.429902e+01 ) GeV^-2 +TOTAL : 1.522701 sec + 4,367,827,655 cycles # 2.862 GHz + 12,601,331,452 instructions # 2.89 insn per cycle + 1.527862957 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 773) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/runTest.exe -Memory access fault by GPU node-4 (Agent handle: 0x61d1a0) on address 0x15313d7cc000. Reason: Unknown. +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.424686e-01 +Avg ME (F77/C++) = 0.14246860569653919 +Relative difference = 3.998452420257791e-08 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! 
Instantiate host Bridge (nevt=16384) +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.075499e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.292736e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.292736e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018563e+01 +- 1.429902e+01 ) GeV^-2 +TOTAL : 0.559885 sec + 1,623,222,211 cycles # 2.878 GHz + 4,293,732,841 instructions # 2.65 insn per cycle + 0.565168184 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 3265) (avx2: 0) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.424686e-01 +Avg ME (F77/C++) = 0.14246860808920836 +Relative difference = 5.677888572434963e-08 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! Instantiate host Bridge (nevt=16384) +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 5.618798e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.338072e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.338072e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2 +TOTAL : 0.317968 sec + 874,954,516 cycles # 2.715 GHz + 1,952,010,632 instructions # 2.23 insn per cycle + 0.323135602 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3488) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.424749e-01 +Avg ME (F77/C++) = 0.14247490815036912 +Relative difference = 5.7205649062398515e-08 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! Instantiate host Bridge (nevt=16384) +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 6.140069e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.015278e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.015278e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2 +TOTAL : 0.296217 sec + 805,080,990 cycles # 2.697 GHz + 1,834,280,964 instructions # 2.28 insn per cycle + 0.301462842 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3186) (512y: 15) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.424749e-01 +Avg ME (F77/C++) = 0.14247490815036912 +Relative difference = 5.7205649062398515e-08 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! 
Instantiate host Bridge (nevt=16384) +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 4.472935e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.920053e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.920053e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018829e+01 +- 1.429922e+01 ) GeV^-2 +TOTAL : 0.395002 sec + 745,120,207 cycles # 1.866 GHz + 1,329,072,598 instructions # 1.78 insn per cycle + 0.400211929 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1730) (512y: 24) (512z: 2387) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.424749e-01 +Avg ME (F77/C++) = 0.14247490450137867 +Relative difference = 3.159418737238044e-08 +OK (relative difference <= 5E-3) +========================================================================= + +TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd1.txt index 0cf6149acc..a71ead3e03 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd1.txt @@ -1,108 +1,223 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux -OMPFLAGS= -AVX=avx2 +Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux +OMPFLAGS=-fopenmp +AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -RNDGEN=hasNoCurand -Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1) +RNDGEN=hasCurand +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. -CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd1' +CUDACPP_BUILDDIR='build.512y_f_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.none_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. 
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2024-01-31_13:59:33 +DATE: 2024-01-30_05:05:20 -On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/gcheck.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/gcheck.exe -p 64 256 10 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 5.328749e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.215965e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.352409e+08 ) sec^-1 +MeanMatrixElemValue = ( 2.018174e+01 +- 1.429492e+01 ) GeV^-2 +TOTAL : 0.447516 sec + 1,904,038,107 cycles # 2.819 GHz + 2,679,740,960 instructions # 1.41 insn per cycle + 0.754557698 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/gcheck.exe -p 64 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/gcheck.exe: Segmentation fault - 758,925,185 cycles:u # 2.206 GHz (74.25%) - 2,574,941 stalled-cycles-frontend:u # 0.34% frontend cycles idle (74.49%) - 28,586,343 stalled-cycles-backend:u # 3.77% backend cycles idle (77.25%) - 1,212,929,372 instructions:u # 1.60 insn per cycle - # 0.02 stalled cycles per insn (77.63%) - 0.366863310 seconds time elapsed +==PROF== Profiling "sigmaKin": launch__registers_per_thread 167 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% ......................................................................... -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/gcheck.exe: Segmentation fault - 2,598,152,662 cycles:u # 2.864 GHz (74.45%) - 20,928,138 stalled-cycles-frontend:u # 0.81% frontend cycles idle (75.39%) - 845,268,440 stalled-cycles-backend:u # 32.53% backend cycles idle (75.31%) - 2,412,052,205 instructions:u # 0.93 insn per cycle - # 0.35 stalled cycles per insn (75.26%) - 0.930068923 seconds time elapsed +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 7.182679e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.774687e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.914662e+08 ) sec^-1 +MeanMatrixElemValue = ( 2.571361e+02 +- 2.114021e+02 ) GeV^-2 +TOTAL : 0.483893 sec + 2,007,596,287 cycles # 2.824 GHz + 2,863,986,921 instructions # 1.43 insn per cycle + 0.770182944 seconds time elapsed ------------------------------------------------------------------------- -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/gcheck.exe --common -p 2 64 2 -cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/fgcheck.exe 2 64 2 -Memory access fault by GPU node-4 (Agent handle: 0x6937e50) on address 0x15510025c000. Reason: Unknown. - -Program received signal SIGABRT: Process abort signal. 
- -Backtrace for this error: -#0 0x1553955c7dbf in ??? -#1 0x1553955c7d2b in ??? -#2 0x1553955c93e4 in ??? -#3 0x15538da9ab64 in ??? -#4 0x15538da97b38 in ??? -#5 0x15538da55496 in ??? -#6 0x1553955616e9 in ??? -#7 0x15539569549e in ??? -#8 0xffffffffffffffff in ??? -Avg ME (C++/CUDA) = -Avg ME (F77/CUDA) = -ERROR! Fortran calculation (F77/CUDA) crashed +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 1.424226e-01 +Avg ME (F77/CUDA) = 0.14247488790821983 +Relative difference = 0.00036713209996037764 +OK (relative difference <= 5E-3) ========================================================================= -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -EvtsPerSec[Rmb+ME] (23) = ( 1.424138e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.454255e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.454255e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.945525e+02 +- 1.186197e+02 ) GeV^-2 -TOTAL : 1.171627 sec - 4,157,880,455 cycles:u # 3.484 GHz (74.58%) - 2,042,320 stalled-cycles-frontend:u # 0.05% frontend cycles idle (74.76%) - 521,141,426 stalled-cycles-backend:u # 12.53% backend cycles idle (75.04%) - 12,614,300,431 instructions:u # 3.03 insn per cycle - # 0.04 stalled cycles per insn (75.21%) - 1.195482351 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.104449e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.131163e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.131163e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018563e+01 +- 1.429902e+01 ) GeV^-2 +TOTAL : 1.505306 sec + 4,350,737,729 cycles # 2.883 GHz + 12,588,700,465 instructions # 2.89 insn per cycle + 1.517040580 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 759) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/runTest.exe -Memory access fault by GPU node-4 (Agent handle: 0x643e60) on address 0x14ffe51a4000. Reason: Unknown. +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. 
+------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.424686e-01 +Avg ME (F77/C++) = 0.14246860569653919 +Relative difference = 3.998452420257791e-08 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd1/check.exe -p 64 256 10 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.107801e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.322563e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.322563e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018563e+01 +- 1.429902e+01 ) GeV^-2 +TOTAL : 0.548678 sec + 1,589,053,041 cycles # 2.872 GHz + 4,241,478,972 instructions # 2.67 insn per cycle + 0.565533397 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 3248) (avx2: 0) (512y: 0) (512z: 0) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 1.424686e-01 +Avg ME (F77/C++) = 0.14246860808920836 +Relative difference = 5.677888572434963e-08 +OK (relative difference <= 5E-3) +------------------------------------------------------------------------- +runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd1/check.exe -p 64 256 10 OMP= +WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
+Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1]
+Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
+FP precision = FLOAT (NaN/abnormal=0, zero=0)
+Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
+OMP threads / `nproc --all` = 1 / 4
+EvtsPerSec[Rmb+ME] (23) = ( 5.682195e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 6.406347e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 6.406347e+05 ) sec^-1
+MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2
+TOTAL : 0.310286 sec
+ 851,032,417 cycles # 2.702 GHz
+ 1,913,907,734 instructions # 2.25 insn per cycle
+ 0.327654627 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3463) (512y: 0) (512z: 0)
+-------------------------------------------------------------------------
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd1/runTest.exe
+[ PASSED ] 6 tests.
+-------------------------------------------------------------------------
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd1/check.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd1/fcheck.exe 2 64 2
+Avg ME (C++/C++) = 1.424749e-01
+Avg ME (F77/C++) = 0.14247490815036912
+Relative difference = 5.7205649062398515e-08
+OK (relative difference <= 5E-3)
+-------------------------------------------------------------------------
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd1/check.exe -p 64 256 10 OMP=
+WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
+Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1]
+Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
+FP precision = FLOAT (NaN/abnormal=0, zero=0)
+Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES]
+OMP threads / `nproc --all` = 1 / 4
+EvtsPerSec[Rmb+ME] (23) = ( 6.251030e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.131063e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.131063e+05 ) sec^-1
+MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2
+TOTAL : 0.283621 sec
+ 779,432,148 cycles # 2.704 GHz
+ 1,795,928,128 instructions # 2.30 insn per cycle
+ 0.301196370 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3164) (512y: 15) (512z: 0)
+-------------------------------------------------------------------------
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd1/runTest.exe
+[ PASSED ] 6 tests.
+-------------------------------------------------------------------------
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd1/check.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd1/fcheck.exe 2 64 2
+Avg ME (C++/C++) = 1.424749e-01
+Avg ME (F77/C++) = 0.14247490815036912
+Relative difference = 5.7205649062398515e-08
+OK (relative difference <= 5E-3)
+-------------------------------------------------------------------------
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd1/check.exe -p 64 256 10 OMP=
+WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
+Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1]
+Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
+FP precision = FLOAT (NaN/abnormal=0, zero=0)
+Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES]
+OMP threads / `nproc --all` = 1 / 4
+EvtsPerSec[Rmb+ME] (23) = ( 4.530328e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 4.979352e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 4.979352e+05 ) sec^-1
+MeanMatrixElemValue = ( 2.018829e+01 +- 1.429922e+01 ) GeV^-2
+TOTAL : 0.386557 sec
+ 722,333,254 cycles # 1.844 GHz
+ 1,287,373,146 instructions # 1.78 insn per cycle
+ 0.407217093 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1709) (512y: 24) (512z: 2387)
+-------------------------------------------------------------------------
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd1/runTest.exe
+[ PASSED ] 6 tests.
+-------------------------------------------------------------------------
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd1/check.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd1/fcheck.exe 2 64 2
+Avg ME (C++/C++) = 1.424749e-01
+Avg ME (F77/C++) = 0.14247490450137867
+Relative difference = 3.159418737238044e-08
+OK (relative difference <= 5E-3)
+=========================================================================
+
+TEST COMPLETED
diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt
index 190fa5e35f..3f17b073e2 100644
--- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt
+++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt
@@ -1,108 +1,223 @@
 export CUDACPP_RUNTIME_ENABLEFPE=on
-Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux
-OMPFLAGS=
-AVX=avx2
+Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux
+OMPFLAGS=-fopenmp
+AVX=512y
 FPTYPE=d
 HELINL=0
 HRDCOD=0
-RNDGEN=hasNoCurand
-Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1)
+RNDGEN=hasCurand
+Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1)
 make: Nothing to be done for 'gtestlibs'.
-CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0'
+CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0'
 make USEBUILDDIR=1 AVX=none
-make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
 CUDACPP_BUILDDIR='build.none_m_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
 make USEBUILDDIR=1 AVX=sse4
-make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
 CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
 make USEBUILDDIR=1 AVX=avx2
-make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
 CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
 make USEBUILDDIR=1 AVX=512y
-make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
 CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
 make USEBUILDDIR=1 AVX=512z
-make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
 CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
-DATE: 2024-01-31_13:59:47
+DATE: 2024-01-30_05:05:38
-On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]:
+On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/gcheck.exe -p 64 256 10 OMP=
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/gcheck.exe -p 64 256 10 OMP=
+WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
+Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
+FP precision = MIXED (NaN/abnormal=0, zero=0)
+EvtsPerSec[Rmb+ME] (23) = ( 2.696364e+07 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.334716e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.710197e+07 ) sec^-1
+MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2
+TOTAL : 0.455743 sec
+ 1,899,569,009 cycles # 2.822 GHz
+ 2,690,270,670 instructions # 1.42 insn per cycle
+ 0.752301124 seconds time elapsed
+runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/gcheck.exe -p 64 256 1
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/gcheck.exe: Segmentation fault
- 748,191,462 cycles:u # 2.174 GHz (74.27%)
- 2,454,240 stalled-cycles-frontend:u # 0.33% frontend cycles idle (75.57%)
- 27,578,613 stalled-cycles-backend:u # 3.69% backend cycles idle (76.65%)
- 1,231,148,808 instructions:u # 1.65 insn per cycle
- # 0.02 stalled cycles per insn (74.50%)
- 0.367922296 seconds time elapsed
+==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
+==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
 .........................................................................
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP=
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/gcheck.exe: Segmentation fault
- 2,684,163,642 cycles:u # 2.783 GHz (74.18%)
- 21,756,682 stalled-cycles-frontend:u # 0.81% frontend cycles idle (73.20%)
- 853,260,207 stalled-cycles-backend:u # 31.79% backend cycles idle (75.20%)
- 2,469,863,103 instructions:u # 0.92 insn per cycle
- # 0.35 stalled cycles per insn (76.02%)
- 0.985386297 seconds time elapsed
+Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
+FP precision = MIXED (NaN/abnormal=0, zero=0)
+EvtsPerSec[Rmb+ME] (23) = ( 3.256330e+07 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.134663e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.562668e+07 ) sec^-1
+MeanMatrixElemValue = ( 2.602505e+02 +- 2.116328e+02 ) GeV^-2
+TOTAL : 0.544746 sec
+ 2,203,075,600 cycles # 2.810 GHz
+ 3,150,811,707 instructions # 1.43 insn per cycle
+ 0.843284004 seconds time elapsed
 -------------------------------------------------------------------------
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/gcheck.exe --common -p 2 64 2
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/fgcheck.exe 2 64 2
-Memory access fault by GPU node-4 (Agent handle: 0x6939ee0) on address 0x154ce61e9000. Reason: Unknown.
-
-Program received signal SIGABRT: Process abort signal.
-
-Backtrace for this error:
-#0 0x154f7b54edbf in ???
-#1 0x154f7b54ed2b in ???
-#2 0x154f7b5503e4 in ???
-#3 0x154f73a21b64 in ???
-#4 0x154f73a1eb38 in ???
-#5 0x154f739dc496 in ???
-#6 0x154f7b4e86e9 in ???
-#7 0x154f7b61c49e in ???
-#8 0xffffffffffffffff in ???
-Avg ME (C++/CUDA) =
-Avg ME (F77/CUDA) =
-ERROR! Fortran calculation (F77/CUDA) crashed
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/gcheck.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/fgcheck.exe 2 64 2
+Avg ME (C++/CUDA) = 1.424749e-01
+Avg ME (F77/CUDA) = 0.14247482577104625
+Relative difference = 5.209967070245855e-07
+OK (relative difference <= 5E-3)
 =========================================================================
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/check.exe -p 64 256 10 OMP=
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/check.exe -p 64 256 10 OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=0]
-Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK
+Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
-EvtsPerSec[Rmb+ME] (23) = ( 1.167145e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.186639e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.186639e+05 ) sec^-1
-MeanMatrixElemValue = ( 1.914935e+02 +- 1.163297e+02 ) GeV^-2
-TOTAL : 1.427162 sec
- 5,051,946,561 cycles:u # 3.484 GHz (74.72%)
- 2,305,632 stalled-cycles-frontend:u # 0.05% frontend cycles idle (75.18%)
- 850,828,752 stalled-cycles-backend:u # 16.84% backend cycles idle (75.18%)
- 13,842,447,352 instructions:u # 2.74 insn per cycle
- # 0.06 stalled cycles per insn (75.19%)
- 1.452171813 seconds time elapsed
+OMP threads / `nproc --all` = 1 / 4
+EvtsPerSec[Rmb+ME] (23) = ( 9.796791e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.000139e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.000139e+05 ) sec^-1
+MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2
+TOTAL : 1.696226 sec
+ 4,903,205,903 cycles # 2.884 GHz
+ 13,824,553,372 instructions # 2.82 insn per cycle
+ 1.707005330 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 1135) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/runTest.exe
-Memory access fault by GPU node-4 (Agent handle: 0x63f5d0) on address 0x146962c59000. Reason: Unknown.
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/runTest.exe
+[ PASSED ] 6 tests.
+-------------------------------------------------------------------------
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/fcheck.exe 2 64 2
+Avg ME (C++/C++) = 1.424749e-01
+Avg ME (F77/C++) = 0.14247482734618697
+Relative difference = 5.099411406595165e-07
+OK (relative difference <= 5E-3)
+-------------------------------------------------------------------------
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd0/check.exe -p 64 256 10 OMP=
+WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
+Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
+FP precision = MIXED (NaN/abnormal=0, zero=0)
+Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
+OMP threads / `nproc --all` = 1 / 4
+EvtsPerSec[Rmb+ME] (23) = ( 1.870381e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.944831e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.944831e+05 ) sec^-1
+MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2
+TOTAL : 0.899365 sec
+ 2,603,553,029 cycles # 2.880 GHz
+ 7,349,607,266 instructions # 2.82 insn per cycle
+ 0.916195330 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 2967) (avx2: 0) (512y: 0) (512z: 0)
+-------------------------------------------------------------------------
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd0/runTest.exe
+[ PASSED ] 6 tests.
+-------------------------------------------------------------------------
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd0/fcheck.exe 2 64 2
+Avg ME (C++/C++) = 1.424749e-01
+Avg ME (F77/C++) = 0.14247482734618697
+Relative difference = 5.099411406595165e-07
+OK (relative difference <= 5E-3)
+-------------------------------------------------------------------------
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd0/check.exe -p 64 256 10 OMP=
+WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
+Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
+FP precision = MIXED (NaN/abnormal=0, zero=0)
+Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
+OMP threads / `nproc --all` = 1 / 4
+EvtsPerSec[Rmb+ME] (23) = ( 3.167537e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.382178e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.382178e+05 ) sec^-1
+MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2
+TOTAL : 0.541013 sec
+ 1,471,630,021 cycles # 2.697 GHz
+ 3,084,577,547 instructions # 2.10 insn per cycle
+ 0.558891839 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3008) (512y: 0) (512z: 0)
+-------------------------------------------------------------------------
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd0/runTest.exe
+[ PASSED ] 6 tests.
+-------------------------------------------------------------------------
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd0/fcheck.exe 2 64 2
+Avg ME (C++/C++) = 1.424749e-01
+Avg ME (F77/C++) = 0.14247482643254802
+Relative difference = 5.163537715318965e-07
+OK (relative difference <= 5E-3)
+-------------------------------------------------------------------------
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd0/check.exe -p 64 256 10 OMP=
+WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
+Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
+FP precision = MIXED (NaN/abnormal=0, zero=0)
+Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
+OMP threads / `nproc --all` = 1 / 4
+EvtsPerSec[Rmb+ME] (23) = ( 3.661938e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.948590e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.948590e+05 ) sec^-1
+MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2
+TOTAL : 0.471097 sec
+ 1,285,426,170 cycles # 2.700 GHz
+ 2,873,286,331 instructions # 2.24 insn per cycle
+ 0.489244149 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2653) (512y: 96) (512z: 0)
+-------------------------------------------------------------------------
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd0/runTest.exe
+[ PASSED ] 6 tests.
+-------------------------------------------------------------------------
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd0/fcheck.exe 2 64 2
+Avg ME (C++/C++) = 1.424749e-01
+Avg ME (F77/C++) = 0.14247482643254802
+Relative difference = 5.163537715318965e-07
+OK (relative difference <= 5E-3)
+-------------------------------------------------------------------------
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd0/check.exe -p 64 256 10 OMP=
+WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
+Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0]
+Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
+FP precision = MIXED (NaN/abnormal=0, zero=0)
+Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
+OMP threads / `nproc --all` = 1 / 4
+EvtsPerSec[Rmb+ME] (23) = ( 2.322096e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.437722e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.437722e+05 ) sec^-1
+MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2
+TOTAL : 0.731479 sec
+ 1,311,962,532 cycles # 1.782 GHz
+ 1,915,335,630 instructions # 1.46 insn per cycle
+ 0.746286183 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1493) (512y: 70) (512z: 2164)
+-------------------------------------------------------------------------
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd0/runTest.exe
+[ PASSED ] 6 tests.
+-------------------------------------------------------------------------
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd0/check.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd0/fcheck.exe 2 64 2
+Avg ME (C++/C++) = 1.424749e-01
+Avg ME (F77/C++) = 0.14247482643254802
+Relative difference = 5.163537715318965e-07
+OK (relative difference <= 5E-3)
+=========================================================================
+
+TEST COMPLETED
diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd1.txt
index 87faa41e06..7294ddea09 100644
--- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd1.txt
+++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd1.txt
@@ -1,108 +1,223 @@
 export CUDACPP_RUNTIME_ENABLEFPE=on
-Building in /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux
-OMPFLAGS=
-AVX=avx2
+Building in /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux
+OMPFLAGS=-fopenmp
+AVX=512y
 FPTYPE=d
 HELINL=0
 HRDCOD=0
-RNDGEN=hasNoCurand
-Building in BUILDDIR=build.avx2_d_inl0_hrd0 for tag=avx2_d_inl0_hrd0_hasNoCurand (USEBUILDDIR is set = 1)
+RNDGEN=hasCurand
+Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand (USEBUILDDIR is set = 1)
 make: Nothing to be done for 'gtestlibs'.
-CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd1'
+CUDACPP_BUILDDIR='build.512y_m_inl0_hrd1'
 make USEBUILDDIR=1 AVX=none
-make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
 CUDACPP_BUILDDIR='build.none_m_inl0_hrd1'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
 make USEBUILDDIR=1 AVX=sse4
-make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
 CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd1'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
 make USEBUILDDIR=1 AVX=avx2
-make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
 CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd1'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
 make USEBUILDDIR=1 AVX=512y
-make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
 CUDACPP_BUILDDIR='build.512y_m_inl0_hrd1'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
 make USEBUILDDIR=1 AVX=512z
-make[1]: Entering directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
 CUDACPP_BUILDDIR='build.512z_m_inl0_hrd1'
 make[1]: Nothing to be done for 'all'.
-make[1]: Leaving directory '/pfs/lustrep1/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
+make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux'
-DATE: 2024-01-31_14:00:02
+DATE: 2024-01-30_05:05:56
-On uan04 [CPU: AMD EPYC 7A53 64-Core Processor] [GPU: AMD INSTINCT MI200]:
+On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]:
 =========================================================================
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/gcheck.exe -p 64 256 10 OMP=
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/gcheck.exe -p 64 256 10 OMP=
+WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
+Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1]
+Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
+FP precision = MIXED (NaN/abnormal=0, zero=0)
+EvtsPerSec[Rmb+ME] (23) = ( 2.635631e+07 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 5.151573e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 5.502163e+07 ) sec^-1
+MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2
+TOTAL : 0.454560 sec
+ 1,887,319,720 cycles # 2.810 GHz
+ 2,686,521,155 instructions # 1.42 insn per cycle
+ 0.777570467 seconds time elapsed
+runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/gcheck.exe -p 64 256 1
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/gcheck.exe: Segmentation fault
- 727,187,414 cycles:u # 2.061 GHz (75.95%)
- 2,715,028 stalled-cycles-frontend:u # 0.37% frontend cycles idle (74.84%)
- 41,823,987 stalled-cycles-backend:u # 5.75% backend cycles idle (70.58%)
- 1,257,681,608 instructions:u # 1.73 insn per cycle
- # 0.03 stalled cycles per insn (74.11%)
- 0.375261475 seconds time elapsed
+==PROF== Profiling "sigmaKin": launch__registers_per_thread 255
+==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100%
 .........................................................................
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP=
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-/users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/gcheck.exe: Segmentation fault
- 2,633,015,947 cycles:u # 2.735 GHz (75.09%)
- 21,153,768 stalled-cycles-frontend:u # 0.80% frontend cycles idle (74.68%)
- 867,174,013 stalled-cycles-backend:u # 32.93% backend cycles idle (73.99%)
- 2,529,840,015 instructions:u # 0.96 insn per cycle
- # 0.34 stalled cycles per insn (74.62%)
- 0.985921030 seconds time elapsed
+Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1]
+Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK
+FP precision = MIXED (NaN/abnormal=0, zero=0)
+EvtsPerSec[Rmb+ME] (23) = ( 3.262333e+07 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 7.007147e+07 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 7.410963e+07 ) sec^-1
+MeanMatrixElemValue = ( 2.602505e+02 +- 2.116328e+02 ) GeV^-2
+TOTAL : 0.538673 sec
+ 2,205,224,099 cycles # 2.822 GHz
+ 3,150,366,927 instructions # 1.43 insn per cycle
+ 0.838863876 seconds time elapsed
 -------------------------------------------------------------------------
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/gcheck.exe --common -p 2 64 2
-cmpExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/fgcheck.exe 2 64 2
-Memory access fault by GPU node-4 (Agent handle: 0x6939e30) on address 0x14e191159000. Reason: Unknown.
-
-Program received signal SIGABRT: Process abort signal.
-
-Backtrace for this error:
-#0 0x14e4264c7dbf in ???
-#1 0x14e4264c7d2b in ???
-#2 0x14e4264c93e4 in ???
-#3 0x14e41e99ab64 in ???
-#4 0x14e41e997b38 in ???
-#5 0x14e41e955496 in ???
-#6 0x14e4264616e9 in ???
-#7 0x14e42659549e in ???
-#8 0xffffffffffffffff in ???
-Avg ME (C++/CUDA) =
-Avg ME (F77/CUDA) =
-ERROR! Fortran calculation (F77/CUDA) crashed
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/gcheck.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/fgcheck.exe 2 64 2
+Avg ME (C++/CUDA) = 1.424749e-01
+Avg ME (F77/CUDA) = 0.14247482577104625
+Relative difference = 5.209967070245855e-07
+OK (relative difference <= 5E-3)
 =========================================================================
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/check.exe -p 64 256 10 OMP=
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/check.exe -p 64 256 10 OMP=
 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
-Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.2.0] [inlineHel=0] [hardcodePARAM=1]
-Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK
+Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1]
+Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK
 FP precision = MIXED (NaN/abnormal=0, zero=0)
 Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD)
-EvtsPerSec[Rmb+ME] (23) = ( 1.168746e+05 ) sec^-1
-EvtsPerSec[MatrixElems] (3) = ( 1.188251e+05 ) sec^-1
-EvtsPerSec[MECalcOnly] (3a) = ( 1.188251e+05 ) sec^-1
-MeanMatrixElemValue = ( 1.914935e+02 +- 1.163297e+02 ) GeV^-2
-TOTAL : 1.425088 sec
- 5,052,848,302 cycles:u # 3.491 GHz (74.63%)
- 2,441,774 stalled-cycles-frontend:u # 0.05% frontend cycles idle (74.88%)
- 796,259,995 stalled-cycles-backend:u # 15.76% backend cycles idle (75.11%)
- 13,840,635,874 instructions:u # 2.74 insn per cycle
- # 0.06 stalled cycles per insn (75.13%)
- 1.449175351 seconds time elapsed
+OMP threads / `nproc --all` = 1 / 4
+EvtsPerSec[Rmb+ME] (23) = ( 9.769998e+04 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 9.971532e+04 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 9.971532e+04 ) sec^-1
+MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2
+TOTAL : 1.700454 sec
+ 4,910,062,395 cycles # 2.880 GHz
+ 13,831,764,171 instructions # 2.82 insn per cycle
+ 1.712052278 seconds time elapsed
 =Symbols in CPPProcess.o= (~sse4: 1130) (avx2: 0) (512y: 0) (512z: 0)
 -------------------------------------------------------------------------
-runExe /users/valassia/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/runTest.exe
-Memory access fault by GPU node-4 (Agent handle: 0x666280) on address 0x146fe5329000. Reason: Unknown.
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/runTest.exe
+[ PASSED ] 6 tests.
+-------------------------------------------------------------------------
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/check.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/fcheck.exe 2 64 2
+Avg ME (C++/C++) = 1.424749e-01
+Avg ME (F77/C++) = 0.14247482734618697
+Relative difference = 5.099411406595165e-07
+OK (relative difference <= 5E-3)
+-------------------------------------------------------------------------
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd1/check.exe -p 64 256 10 OMP=
+WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
+Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1]
+Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK
+FP precision = MIXED (NaN/abnormal=0, zero=0)
+Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES]
+OMP threads / `nproc --all` = 1 / 4
+EvtsPerSec[Rmb+ME] (23) = ( 1.857842e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 1.932046e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 1.932046e+05 ) sec^-1
+MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2
+TOTAL : 0.905414 sec
+ 2,615,099,772 cycles # 2.873 GHz
+ 7,353,136,311 instructions # 2.81 insn per cycle
+ 0.925236073 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 2957) (avx2: 0) (512y: 0) (512z: 0)
+-------------------------------------------------------------------------
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd1/runTest.exe
+[ PASSED ] 6 tests.
+-------------------------------------------------------------------------
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd1/check.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd1/fcheck.exe 2 64 2
+Avg ME (C++/C++) = 1.424749e-01
+Avg ME (F77/C++) = 0.14247482734618697
+Relative difference = 5.099411406595165e-07
+OK (relative difference <= 5E-3)
+-------------------------------------------------------------------------
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd1/check.exe -p 64 256 10 OMP=
+WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
+Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1]
+Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK
+FP precision = MIXED (NaN/abnormal=0, zero=0)
+Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES]
+OMP threads / `nproc --all` = 1 / 4
+EvtsPerSec[Rmb+ME] (23) = ( 3.160999e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.374264e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.374264e+05 ) sec^-1
+MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2
+TOTAL : 0.541919 sec
+ 1,475,084,747 cycles # 2.698 GHz
+ 3,084,915,220 instructions # 2.09 insn per cycle
+ 0.559487031 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2986) (512y: 0) (512z: 0)
+-------------------------------------------------------------------------
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd1/runTest.exe
+[ PASSED ] 6 tests.
+-------------------------------------------------------------------------
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd1/check.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd1/fcheck.exe 2 64 2
+Avg ME (C++/C++) = 1.424749e-01
+Avg ME (F77/C++) = 0.14247482643254802
+Relative difference = 5.163537715318965e-07
+OK (relative difference <= 5E-3)
+-------------------------------------------------------------------------
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd1/check.exe -p 64 256 10 OMP=
+WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
+Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1]
+Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK
+FP precision = MIXED (NaN/abnormal=0, zero=0)
+Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES]
+OMP threads / `nproc --all` = 1 / 4
+EvtsPerSec[Rmb+ME] (23) = ( 3.676411e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 3.967587e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 3.967587e+05 ) sec^-1
+MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2
+TOTAL : 0.469154 sec
+ 1,285,211,957 cycles # 2.712 GHz
+ 2,875,140,516 instructions # 2.24 insn per cycle
+ 0.485058196 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2636) (512y: 96) (512z: 0)
+-------------------------------------------------------------------------
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd1/runTest.exe
+[ PASSED ] 6 tests.
+-------------------------------------------------------------------------
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd1/check.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd1/fcheck.exe 2 64 2
+Avg ME (C++/C++) = 1.424749e-01
+Avg ME (F77/C++) = 0.14247482643254802
+Relative difference = 5.163537715318965e-07
+OK (relative difference <= 5E-3)
+-------------------------------------------------------------------------
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd1/check.exe -p 64 256 10 OMP=
+WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions
+Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1]
+Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK
+FP precision = MIXED (NaN/abnormal=0, zero=0)
+Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES]
+OMP threads / `nproc --all` = 1 / 4
+EvtsPerSec[Rmb+ME] (23) = ( 2.334432e+05 ) sec^-1
+EvtsPerSec[MatrixElems] (3) = ( 2.451352e+05 ) sec^-1
+EvtsPerSec[MECalcOnly] (3a) = ( 2.451352e+05 ) sec^-1
+MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2
+TOTAL : 0.728206 sec
+ 1,313,839,367 cycles # 1.794 GHz
+ 1,915,620,790 instructions # 1.46 insn per cycle
+ 0.743678029 seconds time elapsed
+=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1476) (512y: 70) (512z: 2164)
+-------------------------------------------------------------------------
+runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd1/runTest.exe
+[ PASSED ] 6 tests.
+-------------------------------------------------------------------------
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd1/check.exe --common -p 2 64 2
+cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd1/fcheck.exe 2 64 2
+Avg ME (C++/C++) = 1.424749e-01
+Avg ME (F77/C++) = 0.14247482643254802
+Relative difference = 5.163537715318965e-07
+OK (relative difference <= 5E-3)
+=========================================================================
+
+TEST COMPLETED